In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import ssl
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from gensim.models import KeyedVectors, Word2Vec

SPECIAL_CHARS = '[^A-Za-z0-9 ]+'

In [None]:
def preprocess(text):
    tokenized = [word for sent in [re.sub(SPECIAL_CHARS, '', element).split(' ') for 
                                   element in nltk.sent_tokenize(text)] for word in sent]
    lowered = [word.lower() for word in tokenized]
    return lowered

In [None]:
dataset = pd.read_csv('dataset.csv')

In [None]:
ls = list()
for ix in range(dataset.shape[0]):
    content = list()
    for e in dataset.iloc[ix,4:9]:
        content.append(str(e))
    s = ','.join(content)
    ls.append(s)

In [None]:
dataset['everything'] = ls

In [None]:
description_texts = [preprocess(text) for text in dataset['everything'].tolist()]

In [None]:
# # Take out the stopwords
# for i in range(len(description_texts)):
#     description_texts[i] = [word for word in description_texts[i] if word not in stopwords.words('english')]

# I use movie title as unique key. So I map out the title and the tokenised sentences

title_text = dict(zip(dataset['title'].tolist(), description_texts))

In [None]:
model = Word2Vec(min_count=20,
                window=2,
                size=300,
                sample=6e-5,
                alpha=0.03,
                min_alpha=0.0007,
                negative=20,
                )

In [None]:
def get_vectors(first_map, second_map):
    first_vec  = dict()
    for uid, content in first_map.items():
        temp = list()
        for element in content:
            try:
                temp.append(second_map[element])
            except KeyError:
                pass
        first_vec[uid] = np.mean(temp, axis=0)
    
    return first_vec


def get_most_similar(lookup_id):

    sim = list()
    
    lookup_map = title_vec
    subject_map = title_vec
#     else:
#         raise ValueError('Invalid value for parameter kind.')
        
    for uid, vec in lookup_map.items():
        thisSim = cosine_similarity(vec.reshape(1, -1), subject_map[lookup_id].reshape(1, -1))
        sim.append((uid, thisSim[0][0]))

    return sorted(sim, key=lambda x: x[1], reverse=True)

def top_10_similar(title):
    
    x = get_most_similar(title)[1:11]
#     for e in x:
# #         rating = dataset.loc[dataset['title']==e[0]]['rating_value'].values.tolist()[0]
# #         votes = dataset.loc[dataset['title']==e[0]]['votes'].values.tolist()[0]
# #         print(f"Movie title: {e[0]}\nScores: {rating}\nVotes: {votes}\nSimilarity: {e[1]}\n")
#         genres = test.loc[test['title']==e[0]['genres'].values.tolist()]
#         print(f"Movie title: {e[0]}\nGenres: {genres}\nSimilarity: {e[1]}\n")
    print(x)

In [None]:
title_vec = get_vectors(title_text, model)

In [None]:
top_10_similar('House of Cards')

# Test 6k dataset

In [None]:
data6k = pd.read_csv('netflixMovieDb.csv')
data6k = data6k.append(pd.read_csv('netflixTvshowDb.csv')).append(pd.read_csv('dMoviesDb.csv')).append(pd.read_csv('dTvshowsDb.csv'))

In [None]:
data6k.shape

In [None]:
ls = list()
for ix in range(data6k.shape[0]):
    content = list()
    for e in data6k.iloc[ix,4:9]:
        content.append(str(e))
    s = ','.join(content)
    ls.append(s)
    
data6k['everything'] = ls

In [None]:
description_texts = [preprocess(text) for text in data6k['everything'].tolist()]

In [None]:
# Take out the stopwords
for i in range(len(description_texts)):
    description_texts[i] = [word for word in description_texts[i] if word not in stopwords.words('english')]

# I use movie title as unique key. So I map out the title and the tokenised sentences

title_text = dict(zip(data6k['title'].tolist(), description_texts))

In [None]:
model = Word2Vec(description_texts, min_count=2)

In [None]:
title_vec = get_vectors(title_text, model)

In [None]:
top_10_similar('Ultraman')

In [None]:
top_10_similar('House of Cards')

# Spacy test

In [17]:
import spacy
from spacy import displacy
from spacy.matcher import Matcher
import pandas as pd
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import KeyedVectors
import re

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
net_originals = pd.read_csv('dataset.csv')

In [5]:
corpus = net_originals.description.values.tolist() #list of docs


In [8]:
def tokenise(doc):
    return [token.text for token in nlp(doc)]

In [9]:
token_corpus = [tokenise(doc) for doc in corpus]

In [14]:
title_text = dict(zip(net_originals['title'].str.lower().tolist(), token_corpus))
# lower the title (easy for search)

In [18]:
path = "GoogleNews-vectors-negative300.bin"
w2v = KeyedVectors.load_word2vec_format(path, binary=True)

In [19]:
def get_vectors(first_map, second_map):
    first_vec  = dict()
    for uid, content in first_map.items():
        temp = list()
        for element in content:
            try:
                temp.append(second_map[element])
            except KeyError:
                pass
        first_vec[uid] = np.mean(temp, axis=0)
    
    return first_vec

<gensim.models.keyedvectors.Word2VecKeyedVectors at 0x7fa4ade4a100>

In [20]:
title_vec = (title_text, w2v)

In [21]:
title_vec

({'#realityhigh': ['When',
   'nerdy',
   'high',
   'schooler',
   'Dani',
   'finally',
   'attracts',
   'the',
   'interest',
   'of',
   'her',
   'longtime',
   'crush',
   ',',
   'she',
   'lands',
   'in',
   'the',
   'cross',
   'hairs',
   'of',
   'his',
   'ex',
   ',',
   'a',
   'social',
   'media',
   'celebrity',
   '.'],
  '13th': ['In',
   'this',
   'thought',
   '-',
   'provoking',
   'documentary',
   ',',
   'scholars',
   ',',
   'activists',
   'and',
   'politicians',
   'analyze',
   'the',
   'criminalization',
   'of',
   'African',
   'Americans',
   'and',
   'the',
   'U.S.',
   'prison',
   'boom',
   '.'],
  '15 august': ['On',
   'India',
   "'s",
   'Independence',
   'Day',
   ',',
   'a',
   'zany',
   'mishap',
   'in',
   'a',
   'Mumbai',
   'chawl',
   'disrupts',
   'a',
   'young',
   'love',
   'story',
   'while',
   'compelling',
   'the',
   'residents',
   'to',
   'unite',
   'in',
   'aid',
   'of',
   'a',
   'little',
   'boy',
  

In [None]:
nTv = pd.read_csv('dataset.csv')

In [None]:
tv_li = nTv.loc[nTv['type']=='tvshow'].title.tolist()

In [None]:
tv_li

In [None]:
doc = nTv.loc[nTv['title']=='The Witcher'].description.values.tolist()[0]
doc

In [None]:
doc = nlp(doc)

In [None]:
for token in doc:
    print(token.text)