In [65]:
import re
from pprint import pprint

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
from gensim.models import KeyedVectors, Word2Vec
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity


# Display full cell contents in DataFrame output (no column-width truncation).
pd.set_option('display.max_colwidth', None)
# Regex matching any run of characters other than ASCII letters, digits and spaces.
SPECIAL_CHARS = '[^A-Za-z0-9 ]+'
# English stop-word list taken from NLTK's `stopwords` corpus reader.
STOP_WORDS = stopwords.words('english')

# 2. Functions

In [2]:
def preprocess_text(text):
    """
    Lower-case a text, drop stop words, then strip special characters.

    Stop words are matched against the lower-cased, space-separated tokens
    *before* special characters are removed, so a token with punctuation
    attached (e.g. "the,") is not recognised as a stop word.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        The remaining tokens re-joined with spaces, with every character
        matched by SPECIAL_CHARS removed.
    """
    # Reuse the notebook-level constants instead of shadowing them and
    # re-reading the stop-word corpus on every call; a set gives O(1)
    # membership tests instead of scanning a list for each token.
    stop_words = set(STOP_WORDS)
    kept = [word for word in text.lower().split(' ') if word not in stop_words]
    return re.sub(SPECIAL_CHARS, '', ' '.join(kept))

In [3]:
def tokenise(doc):
    """Run `doc` through the global spaCy pipeline `nlp` and return the text of each token as a list."""
    texts = []
    for token in nlp(doc):
        texts.append(token.text)
    return texts

In [4]:
def tokenise_lemma(doc):
    """
    Tokenise `doc` with the global spaCy pipeline `nlp` and return the
    lemma of every token as a single space-separated string.
    """
    lemmas = (token.lemma_ for token in nlp(doc))
    return ' '.join(lemmas)

In [5]:
# One way to get a vector representation for each title: average the word
# vectors of its description. More investigation is needed later.

def get_vectors(first_map, second_map):
    """
    Build one vector per title by averaging the word vectors of its tokens.

    Parameters
    ----------
    first_map : dict
        Maps a title to the list of tokens of its description.
    second_map : word-vector lookup
        Any object that returns a vector for ``second_map[token]`` and raises
        ``KeyError`` for tokens it does not know (e.g. a pretrained
        word-vector model).

    Returns
    -------
    dict
        Maps each title to the element-wise mean of the vectors of its known
        tokens. A title without a single known token is left out: the mean of
        an empty list is a NaN scalar (plus a RuntimeWarning), which cannot be
        compared with real vectors afterwards.
    """
    first_vec = dict()
    for title, description in first_map.items():
        found = list()
        for element in description:  # element = one tokenised word
            try:
                found.append(second_map[element])
            except KeyError:
                # Out-of-vocabulary token: it does not contribute to the mean.
                continue
        if found:
            first_vec[title] = np.mean(found, axis=0)

    return first_vec

In [36]:
def search(name, df):
    """Return the rows of `df` whose 'title' equals `name`, ignoring case."""
    wanted = name.lower()
    titles = df['title'].str.lower()
    return df.loc[titles == wanted]

In [42]:
def get_topN_similar(lookup_id, title_vec, df, N=10):
    """
    Return the titles most similar to `lookup_id` as a DataFrame.

    Delegates the ranking to `get_most_similar` instead of repeating its
    loop, then keeps the best-ranked entries and tabulates them.

    Parameters
    ----------
    lookup_id : hashable
        Key of the query title in `title_vec`.
    title_vec : dict
        Maps each title to its vector.
    df : pandas.DataFrame
        Frame handed to `get_most_similar` to look up `originals`/`genres`.
    N : int, default 10
        Number of neighbours wanted. N + 1 entries are kept because the
        query title itself takes part in the ranking.

    Returns
    -------
    pandas.DataFrame
        Columns 'title', 'similarity', 'originals', 'genres', ordered by
        decreasing similarity, with at most N + 1 rows.
    """
    ranked = get_most_similar(lookup_id, title_vec, df)
    return pd.DataFrame(columns=['title', 'similarity', 'originals', 'genres'],
                        data=ranked[:N + 1])

In [37]:
def get_most_similar(lookup_id, title_vec, df):
    """
    Rank every title in `title_vec` by cosine similarity to `lookup_id`.

    Parameters
    ----------
    lookup_id : hashable
        Key of the query title in `title_vec`.
    title_vec : dict
        Maps each title to its vector (an object with `.reshape`).
    df : pandas.DataFrame
        Frame searched by title (via `search`) for the `originals` and
        `genres` values reported next to each similarity.

    Returns
    -------
    list of tuple
        ``(title, similarity, originals, genres)`` for every key of
        `title_vec` (the query included), sorted by decreasing similarity.
        `originals` and `genres` are the `.values` arrays of the rows of
        `df` that match the title.

    Raises
    ------
    KeyError
        If `lookup_id` is not a key of `title_vec`.
    """
    # The query vector is identical for every comparison: reshape it once.
    query = title_vec[lookup_id].reshape(1, -1)

    sim = list()
    for uid, vec in title_vec.items():
        thisSim = cosine_similarity(vec.reshape(1, -1), query)
        # `search` scans the whole frame, so do it once per title and read
        # both columns from the same result.
        rows = search(uid, df)
        sim.append((uid, thisSim[0][0], rows.originals.values, rows.genres.values))

    return sorted(sim, key=lambda x: x[1], reverse=True)

In [7]:
def filter_df(keyword):
    """
    Return the rows of `netflixDf` whose title matches `keyword`.

    Matching is case-insensitive for a single title as well as for a
    collection of titles.

    Parameters
    ----------
    keyword : str or iterable of str
        One title, or several titles (list, tuple, set, ...).

    Returns
    -------
    pandas.DataFrame
        The matching rows of `netflixDf`.
    """
    if isinstance(keyword, str):
        keyword = [keyword]
    # Lower-case both sides so every input form honours the documented
    # case-insensitivity; non-string entries are passed through untouched.
    wanted = [title.lower() if isinstance(title, str) else title for title in keyword]
    return netflixDf.loc[netflixDf['title'].str.lower().isin(wanted)]

In [8]:
def markerX(key, values):
    """
    Return the rows of `netflixDf` whose lower-cased `key` column is in
    `values`, sorted by 'pca_2' in descending order.
    """
    # NOTE(review): 'pca_2' is not among the columns read from the CSV in this
    # notebook; presumably it is added to netflixDf elsewhere - confirm.
    is_selected = netflixDf[key].str.lower().isin(values)
    selected = netflixDf.loc[is_selected]
    return selected.sort_values(by='pca_2', ascending=False)

def others(key, values):
    """Return the rows of `netflixDf` whose lower-cased `key` column is NOT in `values`."""
    is_selected = netflixDf[key].str.lower().isin(values)
    return netflixDf.loc[~is_selected]

# 3. Analyse Pipeline

## Terms explained
Document -> a bunch of texts <br>
Corpus -> a bunch of documents <br>
Vectors -> a mathematically convenient representation of a document (a bunch of texts) <br>
Models -> an algorithm for transforming vectors from one representation to another <br>

## Read the dataset/ Load the spacy pretrained model

In [9]:
# Read the dataset, keeping only the listed columns.
netflixDf = pd.read_csv(
    'finalDataset_v2.csv',
    usecols=['title', 'type', 'description', 'genres', 'originals'],
)

In [10]:
# Load the pretrained spaCy pipeline 'en_core_web_sm'; `tokenise` and
# `tokenise_lemma` use it through the global name `nlp`.
nlp = spacy.load('en_core_web_sm')

## Create the Corpus of tv and movie

In [24]:
# Split the catalogue by the value of the 'type' column.
movieDf = netflixDf[netflixDf['type'] == 'movie']
tvshowDf = netflixDf[netflixDf['type'] == 'tvshow']

In [25]:
# One document per row: the description values, in DataFrame order.
movieCorpus = movieDf['description'].tolist()
tvshowCorpus = tvshowDf['description'].tolist()

In [26]:
# Tokenise every document of each corpus.
movieTkDocs = list(map(tokenise, movieCorpus))
tvshowTkDocs = list(map(tokenise, tvshowCorpus))

In [27]:
# Map each lower-cased title (easier to search later) to its tokenised
# description. If two rows share a lower-cased title, the later row wins.
movieMap = {
    title: tokens
    for title, tokens in zip(movieDf['title'].str.lower(), movieTkDocs)
}
tvshowMap = {
    title: tokens
    for title, tokens in zip(tvshowDf['title'].str.lower(), tvshowTkDocs)
}

### Google news is much faster
It might take a little while to load (the vectors are pretrained, so nothing is trained here)

In [15]:
# Pretrained word2vec vectors in binary format; the relative path means the
# file is expected in the working directory.
path = "GoogleNews-vectors-negative300.bin"
w2v = KeyedVectors.load_word2vec_format(path, binary=True)
# Loading is comparatively fast: it takes less than 2 minutes.

In [None]:
# Alternative: use the Wikipedia-trained model. It takes around 20 minutes to load.
# MODEL_FILE = "enwiki_20180420_300d.txt"
# w2v = KeyedVectors.load_word2vec_format(MODEL_FILE)

In [29]:
# One averaged vector per title, computed from the pretrained word vectors `w2v`.
movieTitleVec = get_vectors(movieMap, w2v)
tvshowTitleVec = get_vectors(tvshowMap, w2v)

In [43]:
# TV shows ranked by similarity to 'house of cards' (the vector keys are lower-cased titles).
get_topN_similar('house of cards', tvshowTitleVec, tvshowDf)

Unnamed: 0,title,similarity,originals,genres
0,house of cards,1.0,[1],"[drama,sport,crime]"
1,bloodline,0.946771,[1],"[drama,thriller,crime]"
2,reign,0.946442,[0],"[drama,fantasy,history]"
3,broadchurch,0.944031,[0],"[crime,drama,mystery,thriller]"
4,peaky blinders,0.943539,[1],"[crime,drama]"
5,the end of the f***ing world,0.942934,[1],"[action-and-adventure,comedy,crime,drama,romance,thriller]"
6,hostages_2013,0.942573,[0],"[drama,thriller]"
7,collateral,0.942508,[1],"[crime,drama,mystery,thriller]"
8,breaking bad,0.942227,[0],"[crime,drama,thriller]"
9,sherlock,0.942069,[0],"[action-and-adventure,crime,drama,mystery,thriller]"


In [39]:
# Same ranking as raw tuples; show only the first 15 entries.
get_most_similar('house of cards', tvshowTitleVec, tvshowDf)[:15]

[('house of cards',
  1.0000002,
  array([1]),
  array(['drama,sport,crime'], dtype=object)),
 ('bloodline',
  0.9467709,
  array([1]),
  array(['drama,thriller,crime'], dtype=object)),
 ('reign',
  0.94644237,
  array([0]),
  array(['drama,fantasy,history'], dtype=object)),
 ('broadchurch',
  0.9440308,
  array([0]),
  array(['crime,drama,mystery,thriller'], dtype=object)),
 ('peaky blinders',
  0.9435389,
  array([1]),
  array(['crime,drama'], dtype=object)),
 ('the end of the f***ing world',
  0.9429343,
  array([1]),
  array(['action-and-adventure,comedy,crime,drama,romance,thriller'],
        dtype=object)),
 ('hostages_2013',
  0.9425726,
  array([0]),
  array(['drama,thriller'], dtype=object)),
 ('collateral',
  0.9425075,
  array([1]),
  array(['crime,drama,mystery,thriller'], dtype=object)),
 ('breaking bad',
  0.94222677,
  array([0]),
  array(['crime,drama,thriller'], dtype=object)),
 ('sherlock',
  0.94206876,
  array([0]),
  array(['action-and-adventure,crime,drama,mystery

# Let's try using our own data to train

In [44]:
# Re-select the TV shows and renumber the rows from 0. reset_index() is
# chained so a new frame is returned, rather than mutating (inplace=True) a
# frame that was sliced out of netflixDf.
tvshowDf = netflixDf.loc[netflixDf['type']=='tvshow'].reset_index(drop=True)

In [53]:
# Concatenate every remaining column of a row into one comma-separated string.
everything = (
    tvshowDf
    .drop(columns=['type', 'originals'])
    .apply(lambda row: ','.join(row.astype(str)), axis=1)
)

In [57]:
# Inspect what kind of object the row-wise join produced.
type(everything)

pandas.core.series.Series

In [56]:
# Check the (rows, columns) shape of the TV-show frame.
tvshowDf.shape

(2060, 5)

In [58]:
# List of documents, one joined string per TV show.
# This replaces the description-only corpus built earlier under the same name.
tvshowCorpus = everything.tolist()

In [60]:
# Tokenise every joined document.
tvshowTkDocs = list(map(tokenise, tvshowCorpus))

In [61]:
# Lower-cased title -> tokens of its joined document (a later duplicate title wins).
tvshowMap = {
    title: tokens
    for title, tokens in zip(tvshowDf['title'].str.lower(), tvshowTkDocs)
}

In [69]:
# Train a skip-gram (sg=1) word2vec model on the tokenised TV-show documents.
# NOTE(review): `size` is the keyword used by gensim before 4.0 (renamed
# `vector_size` in 4.0) - confirm against the installed version.
self_w2v = Word2Vec(
    tvshowTkDocs,
    min_count=1,
    size=300,
    workers=3,
    window=3,
    sg=1,
)

In [70]:
# Pass the trained word vectors (`.wv`) rather than the model: get_vectors
# looks words up with `[...]`, and indexing a Word2Vec model object directly
# is deprecated in gensim 3.x and unsupported from 4.0 on.
tvshowTitleVec = get_vectors(tvshowMap, self_w2v.wv)

  temp.append(second_map[element]) #secondmap is w2v model which should have a responding word vecotr for the tokenise word


In [72]:
# TV shows ranked by similarity to 'our planet', using the current tvshowTitleVec.
get_topN_similar('our planet', tvshowTitleVec, tvshowDf)

Unnamed: 0,title,similarity,originals,genres
0,our planet,1.0,[1],[documentary]
1,the confession killer,0.998883,[1],"[documentary,crime]"
2,hollywood,0.998619,[1],"[drama,history]"
3,seven seconds,0.998587,[1],"[crime,drama]"
4,unbelievable,0.998535,[1],"[crime,drama]"
5,jeffrey epstein: filthy rich,0.998476,[1],"[documentary,crime]"
6,wild wild country,0.99846,[1],"[documentary,cult,crime]"
7,gypsy,0.998393,[1],"[thriller,drama]"
8,the honeymoon stand up special,0.99838,[1],[comedy]
9,november 13: attack on paris,0.998354,[1],[documentary]


In [68]:
# Same query as the earlier 'house of cards' cell.
# NOTE(review): the stored output differs from the other 'house of cards'
# cells, presumably because tvshowTitleVec was rebuilt between runs (the
# execution counts are out of order) - re-run top to bottom to confirm.
get_topN_similar('house of cards', tvshowTitleVec, tvshowDf)

Unnamed: 0,title,similarity,originals,genres
0,house of cards,1.0,[1],"[drama,sport,crime]"
1,orange is the new black,0.999537,[1],"[comedy,lgbtq,crime,drama]"
2,breaking bad,0.999109,[0],"[crime,drama,thriller]"
3,versailles,0.99905,[0],"[biography,drama,history,romance]"
4,hart of dixie,0.999011,[0],"[comedy,drama,romance]"
5,gilmore girls,0.99899,[0],"[comedy,drama]"
6,quantico,0.998956,[0],"[drama,crime,mystery,thriller]"
7,jane the virgin,0.998928,[0],"[drama,family,romance,comedy]"
8,zoo_2015,0.998911,[0],"[drama,thriller,mystery,science-fiction,documentary,romance]"
9,damnation,0.998865,[0],"[drama,crime]"


In [63]:
# Exact repeat of the previous cell's call.
# NOTE(review): the stored output is presumably left over from an earlier run
# with different title vectors - confirm or remove this duplicate cell.
get_topN_similar('house of cards', tvshowTitleVec, tvshowDf)

Unnamed: 0,title,similarity,originals,genres
0,house of cards,1.0,[1],"[drama,sport,crime]"
1,broadchurch,0.942771,[0],"[crime,drama,mystery,thriller]"
2,peaky blinders,0.942188,[1],"[crime,drama]"
3,bloodline,0.941971,[1],"[drama,thriller,crime]"
4,breaking bad,0.936305,[0],"[crime,drama,thriller]"
5,reign,0.935107,[0],"[drama,fantasy,history]"
6,orange is the new black,0.934063,[1],"[comedy,lgbtq,crime,drama]"
7,hostages_2013,0.93358,[0],"[drama,thriller]"
8,you,0.933569,[0],"[crime,drama,romance,thriller]"
9,collateral,0.932469,[1],"[crime,drama,mystery,thriller]"


# Try Doc2Vec

In [73]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [75]:
# Tag each document with its title and fit a Doc2Vec model on the tagged documents.
# NOTE(review): `title_text` is not defined anywhere in the visible notebook -
# presumably a {title: token list} mapping like `tvshowMap`; it must be
# defined before this cell can run on a fresh kernel.
documents = [TaggedDocument(doc, [title]) for title, doc in title_text.items()]
model = Doc2Vec(documents, vector_size=300,  min_count=1, workers=4, epochs=40)

In [423]:
from gensim.test.utils import get_tmpfile

# Get a file path for the model from gensim's get_tmpfile helper.
fname = get_tmpfile("my_doc2vec_model")

# Save the model, then replace the in-memory model with the reloaded copy.
model.save(fname)
model = Doc2Vec.load(fname)  # you can continue training with the loaded model!

In [440]:
# Drop training-only state to save memory. The doctag vectors are kept:
# the `model.docvecs.most_similar(...)` queries that follow need them, and
# discarding them here breaks those cells on a top-to-bottom run.
model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=False)

In [77]:
# Tags whose document vectors are most similar to the 'our planet' tag.
model.docvecs.most_similar('our planet')

[('rapture', 0.7993944883346558),
 ('roman empire', 0.7990072965621948),
 ('(t)error', 0.7692530155181885),
 ('babies', 0.7634181380271912),
 ('the royal house of windsor', 0.7517811059951782),
 ('abstract: the art of design', 0.7497245669364929),
 ('metro', 0.7471555471420288),
 ('flavorful origins', 0.7420789003372192),
 ('alive and kicking', 0.7419354915618896),
 ('cloroformo', 0.737167239189148)]

In [439]:
# Tags whose document vectors are most similar to the 'house of cards' tag.
model.docvecs.most_similar('house of cards')

[('the adderall diaries', 0.9996110200881958),
 ('lionheart', 0.9995298385620117),
 ("ricardo o'farrill: abrazo genial", 0.9994028806686401),
 ('sierra burgess is a loser', 0.9990977048873901),
 ('see you in time', 0.9990734457969666),
 ('the 2000s', 0.9990618824958801),
 ('class rank', 0.9990559220314026),
 ('ninja assassin', 0.9988771080970764),
 ('marching orders', 0.9986553192138672),
 ('chasing trane', 0.9986506700515747)]