In [103]:
import pandas as pd
import numpy as np
from ast import literal_eval
from nltk.stem.snowball import SnowballStemmer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [104]:
md = pd.read_csv('/Users/ayushsingh/Desktop/Movie-Reco/data/movies_metadata.csv')

In [105]:
credits = pd.read_csv('/Users/ayushsingh/Desktop/Movie-Reco/data/credits.csv')
keywords = pd.read_csv('/Users/ayushsingh/Desktop/Movie-Reco/data/keywords.csv')

In [106]:
md['genres'] = md['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
md = md.drop([19730, 29503, 35587])
md['year'] = pd.to_datetime(md['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [107]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
md['id'] = md['id'].astype('int')

In [108]:
md = md.merge(credits, on='id')
md = md.merge(keywords, on='id')

In [109]:
md['cast'] = md['cast'].apply(literal_eval)

In [110]:
md['crew'] = md['crew'].apply(literal_eval)

In [111]:
md['keywords'] = md['keywords'].apply(literal_eval)

In [112]:
md['cast_size'] = md['cast'].apply(lambda x: len(x))
md['crew_size'] = md['crew'].apply(lambda x: len(x))

In [113]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [114]:
md['director'] = md['crew'].apply(get_director)

In [115]:
md['cast'] = md['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
md['cast'] = md['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)

In [116]:
md['keywords'] = md['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [117]:
md['cast'] = md['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [118]:
md['director'] = md['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
md['director'] = md['director'].apply(lambda x: [x,x, x])

In [119]:
s = md.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'
s = s.value_counts()
s = s[s > 1]

In [120]:
stemmer = SnowballStemmer('english')
stemmer.stem('dogs')

'dog'

In [121]:
def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words

In [122]:
md['keywords'] = md['keywords'].apply(filter_keywords)
md['keywords'] = md['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
md['keywords'] = md['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [123]:
md['soup'] = md['keywords'] + md['cast'] + md['director'] + md['genres']
md['soup'] = md['soup'].apply(lambda x: ' '.join(x))

In [124]:
import time
import re
from datasketch import MinHash, MinHashLSHForest

In [125]:
#Preprocess will split a string of text into individual tokens/shingles based on whitespace.
def preprocess(text):
    text = re.sub(r'[^\w\s]','',text)
    tokens = text.lower()
    tokens = tokens.split()
    return tokens

In [126]:
#print('The shingles (tokens) are:', preprocess(md['soup'][0]))

In [131]:
#Number of Permutations
permutations = 10

In [128]:
def get_forest(data, perms):
    start_time = time.time()
    
    minhash = []
    
    for text in data['soup']:
        tokens = preprocess(text)
        m = MinHash(num_perm=perms)
        for s in tokens:
            m.update(s.encode('utf8'))
        minhash.append(m)
        
    forest = MinHashLSHForest(num_perm=perms)
    
    for i,m in enumerate(minhash):
        forest.add(i,m)
        
    forest.index()
    
    print('It took %s seconds to build forest.' %(time.time()-start_time))
    
    return forest

In [129]:
def predict(text, database, perms, num_results, forest):
    start_time = time.time()
    
    tokens = preprocess(text)
    m = MinHash(num_perm=perms)
    for s in tokens:
        m.update(s.encode('utf8'))
        
    idx_array = np.array(forest.query(m, num_results))
    if len(idx_array) == 0:
        return None # if your query is empty, return none
    
    result = database.iloc[idx_array]['title']
    
    print('It took %s seconds to query forest.' %(time.time()-start_time))
    
    return result

In [132]:
forest = get_forest(md, permutations)

It took 23.90125298500061 seconds to build forest.


In [133]:
md = md.reset_index()
titles = md['title']
indices = pd.Series(md.index, index=md['title'])
def get_movie_indice(title, indices):
    return indices[title]

In [134]:
title = 'Inception'

user_movie = md['soup'][get_movie_indice(title, indices)]
user_movie

'lossoflov dream kidnap sleep subconsci heist redempt femalehero leonardodicaprio josephgordon-levitt ellenpage christophernolan christophernolan christophernolan Action Thriller Science Fiction Mystery Adventure'

In [136]:
num_recommendations = 20
title = 'The Dark Knight Rises'

user_movie = md['soup'][get_movie_indice(title, indices)]

result = predict(user_movie, md, permutations, num_recommendations, forest)
print('\n Top Recommendation(s) is(are) \n', result)

It took 0.005944967269897461 seconds to query forest.

 Top Recommendation(s) is(are) 
 3                 Waiting to Exhale
7                      Tom and Huck
13                            Nixon
15                           Casino
23                           Powder
26                     Now and Then
27                       Persuasion
29                   Shanghai Triad
30                  Dangerous Minds
33                             Babe
36           Across the Sea of Time
38                         Clueless
39         Cry, the Beloved Country
40                      Richard III
41                  Dead Presidents
42                      Restoration
45    How To Make An American Quilt
48            When Night Is Falling
49               The Usual Suspects
50                   Guardian Angel
Name: title, dtype: object


In [137]:
title = 'The Dark Knight Rises'

user_movie = md['soup'][get_movie_indice(title, indices)]
user_movie

'dccomic crimefight terrorist secretident burglar hostagedrama timebomb gothamc vigilant cover-up superhero villai tragichero terror destruct catwoman catburglar imax flood criminalunderworld batman christianbale michaelcaine garyoldman christophernolan christophernolan christophernolan Action Crime Drama Thriller'

In [138]:
title = 'Waiting to Exhale'

user_movie2 = md['soup'][get_movie_indice(title, indices)]
user_movie2

'basedonnovel interracialrelationship singlemoth divorc chickflick whitneyhouston angelabassett lorettadevine forestwhitaker forestwhitaker forestwhitaker Comedy Drama Romance'