In [1]:
import pandas as pd
import numpy as np
from ast import literal_eval
from nltk.stem.snowball import SnowballStemmer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
md = pd.read_csv('/Users/ayushsingh/Desktop/Movie-Reco/data/movies_metadata.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
credits = pd.read_csv('/Users/ayushsingh/Desktop/Movie-Reco/data/credits.csv')
keywords = pd.read_csv('/Users/ayushsingh/Desktop/Movie-Reco/data/keywords.csv')

In [4]:
md['genres'] = md['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
md = md.drop([19730, 29503, 35587])
md['year'] = pd.to_datetime(md['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [5]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
md['id'] = md['id'].astype('int')

In [6]:
md = md.merge(credits, on='id')
md = md.merge(keywords, on='id')

In [7]:
md['cast'] = md['cast'].apply(literal_eval)

In [8]:
md['crew'] = md['crew'].apply(literal_eval)

In [9]:
md['keywords'] = md['keywords'].apply(literal_eval)

In [11]:
md['cast_size'] = md['cast'].apply(lambda x: len(x))
md['crew_size'] = md['crew'].apply(lambda x: len(x))

In [14]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [15]:
md['director'] = md['crew'].apply(get_director)

In [16]:
md['cast'] = md['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
md['cast'] = md['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)

In [17]:
md['keywords'] = md['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [18]:
md['cast'] = md['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [19]:
md['director'] = md['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
md['director'] = md['director'].apply(lambda x: [x,x, x])

In [20]:
s = md.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'
s = s.value_counts()
s = s[s > 1]

In [21]:
stemmer = SnowballStemmer('english')
stemmer.stem('dogs')

'dog'

In [22]:
def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words

In [23]:
md['keywords'] = md['keywords'].apply(filter_keywords)
md['keywords'] = md['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
md['keywords'] = md['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [24]:
md['soup'] = md['keywords'] + md['cast'] + md['director'] + md['genres']
md['soup'] = md['soup'].apply(lambda x: ' '.join(x))

In [28]:
import time
import re
from datasketch import MinHash, MinHashLSHForest

In [29]:
#Preprocess will split a string of text into individual tokens/shingles based on whitespace.
def preprocess(text):
    text = re.sub(r'[^\w\s]','',text)
    tokens = text.lower()
    tokens = tokens.split()
    return tokens

In [30]:
#print('The shingles (tokens) are:', preprocess(md['soup'][0]))

The shingles (tokens) are: ['jealousi', 'toy', 'boy', 'friendship', 'friend', 'rivalri', 'boynextdoor', 'newtoy', 'toycomestolif', 'tomhanks', 'timallen', 'donrickles', 'johnlasseter', 'johnlasseter', 'johnlasseter', 'animation', 'comedy', 'family']


In [89]:
#Number of Permutations
permutations = 256

In [90]:
def get_forest(data, perms):
    start_time = time.time()
    
    minhash = []
    
    for text in data['soup']:
        tokens = preprocess(text)
        m = MinHash(num_perm=perms)
        for s in tokens:
            m.update(s.encode('utf8'))
        minhash.append(m)
        
    forest = MinHashLSHForest(num_perm=perms)
    
    for i,m in enumerate(minhash):
        forest.add(i,m)
        
    forest.index()
    
    print('It took %s seconds to build forest.' %(time.time()-start_time))
    
    return forest

In [91]:
def predict(text, database, perms, num_results, forest):
    start_time = time.time()
    
    tokens = preprocess(text)
    m = MinHash(num_perm=perms)
    for s in tokens:
        m.update(s.encode('utf8'))
        
    idx_array = np.array(forest.query(m, num_results))
    if len(idx_array) == 0:
        return None # if your query is empty, return none
    
    result = database.iloc[idx_array]['title']
    
    print('It took %s seconds to query forest.' %(time.time()-start_time))
    
    return result

In [92]:
forest = get_forest(md, permutations)

It took 105.61240291595459 seconds to build forest.


In [93]:
md = md.reset_index()
titles = md['title']
indices = pd.Series(md.index, index=md['title'])
def get_movie_indice(title, indices):
    return indices[title]

ValueError: cannot insert level_0, already exists

In [95]:
title = 'Inception'

user_movie = md['soup'][get_movie_indice(title, indices)]
user_movie

'lossoflov dream kidnap sleep subconsci heist redempt femalehero leonardodicaprio josephgordon-levitt ellenpage christophernolan christophernolan christophernolan Action Thriller Science Fiction Mystery Adventure'

In [98]:
num_recommendations = 20
title = 'Toy Story'

user_movie = md['soup'][get_movie_indice(title, indices)]

result = predict(user_movie, md, permutations, num_recommendations, forest)
print('\n Top Recommendation(s) is(are) \n', result)

It took 0.01534581184387207 seconds to query forest.

 Top Recommendation(s) is(are) 
 0                                              Toy Story
9090                                            LadyBugs
28291                                Cody the Robosapien
22915                                     The Lego Movie
37637    The Tangerine Bear: Home in Time for Christmas!
23176                              Mr. Peabody & Sherman
26001                         Toy Story That Time Forgot
811                          The Adventures of Pinocchio
22571                         Mio in the Land of Faraway
21556           And You Thought Your Parents Were Weird!
7736                                  War of the Buttons
16971                                    Mars Needs Moms
21324     Lego Batman: The Movie - DC Super Heroes Unite
589                                            Pinocchio
32468                                  The Biscuit Eater
18282                                               Hugo
3