In [60]:
#Python Imports
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import pandas as pd
import re
from pandas.io.json import json_normalize
import json
import pandas as pd
from surprise import SVD, accuracy
from surprise.model_selection import cross_validate, train_test_split
from surprise import Dataset
from surprise import Reader

from collections import defaultdict
from sklearn.decomposition import TruncatedSVD

from sklearn import metrics

import sys

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
from ast import literal_eval

In [61]:
def convert_ids(ids_in_csv):
    return pd.to_numeric(ids_in_csv, errors='coerce').astype('int64')

def convert_to_float(ids_in_csv):
    return pd.to_numeric(ids_in_csv, errors='coerce').astype('float64')

def to_json(csv_entry):
    return json.loads(re.sub('\'', '"', csv_entry))


def get_top_n(predictions, n=200):
    '''SUPRISE API
    Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    '''

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

def get_movie_name(movie_id):
    return ratings_with_movie_names[ratings_with_movie_names.id == movie_id]['title'].iloc[0]


def print_user_prediction(userId, predictions_dict, meta_df):
    users_viewed_movies = ratings_with_movie_names[ratings_with_movie_names['userId'] == userId][
        ['rating', 'original_title']]
    print(f'User {userId} has viewed the following movies:\n')

    for row in users_viewed_movies.itertuples():
        rating = row[1]
        original_title = row[2]
        print(f'\t{original_title}, Rating: {rating}')

    print(f'\nThe following movies are recommended for User {userId}\n')
    recommended_movies = [get_movie_name(mov_id[0], meta_df) for mov_id in predictions_dict[userId]]

    for movie in recommended_movies:
        print(f'\t{movie}')

def get_movie_name(movie_id, movie_meta_df):
    return movie_meta_df[movie_meta_df.id == movie_id]['title'].iloc[0]

def get_movie_id(title, movie_meta_df):
    return movie_meta_df[movie_meta_df.title == title]['id'].iloc[0]


def get_all_movies_in_cluster(cluster_number, cluster_dict, meta_df):
    movies = cluster_dict[cluster_number]
    return [get_movie_name(mov, meta_df) for mov in movies]

def get_cluster_number(movie, cluster_zip):
    for cluster, movie_id in cluster_zip:

        if movie_id == movie:
            return cluster

    raise Exception('Movie not found in cluster')

In [3]:
movies_metadata_df1 = pd.read_csv('the-movies-dataset/movies_metadata.csv'
                                 , converters={ 'id': lambda x: convert_ids(x)
                                               , 'imdb_id': lambda x: convert_ids(x)
                                               ,'popularity': lambda x: convert_to_float(x)
                                               ,'genres': lambda x: to_json(x)}
                                 , usecols=['id', 'original_title'
                                                , 'genres' #'homepage'
                                                , 'overview', 'popularity', 'poster_path'
                                                , 'release_date', 'revenue', 'runtime'
                                                , 'spoken_languages', 'title'
                                                , 'vote_average', 'vote_count']
                                , dtype={'populariy': np.float64}
                                , parse_dates=True)


movies_lookup_df = pd.read_csv('the-movies-dataset/movies_metadata.csv'
                        , converters={'id': lambda x: convert_ids(x), 'imdb_id': lambda x: convert_ids(x)}
                       ,usecols=['id', 'title'])

#####################################
##SVD DATA SET
movies_df = pd.read_csv('the-movies-dataset/movies_metadata.csv'
                        , converters={'id': lambda x: convert_ids(x), 'imdb_id': lambda x: convert_ids(x)}
                       ,usecols=['id', 'original_title', 'belongs_to_collection'
                                 , 'budget', 'genres', 'homepage'
                                 ,'imdb_id', 'overview', 'popularity', 'poster_path'
                                 , 'production_companies','release_date', 'revenue', 'runtime',
                                 'spoken_languages', 'status', 'tagline', 'title', 'video',
                                 'vote_average', 'vote_count'])
#####################################

ratings_df = pd.read_csv('the-movies-dataset/ratings_small.csv')



# content_filter_df = pd.read_pickle('content_filter_df.pkl')
# content_filter_df = content_filter_df[['id',
#  'popularity',
#  #'release_date',
#  'vote_average',
#  'release_year',
#  0,1,2,3,4,5, 6, 7, 8,9,10,11,12,13,14,15,16,17,18,19]]
# content_filter_df = content_filter_df.dropna()

# idx = pd.Index(content_filter_df['id'])
# idx
# content_filter_df.index = idx

FileNotFoundError: File b'the-movies-dataset/movies_metadata.csv' does not exist

In [4]:
###May need Fuzzy matching, but for now:
movies_df = movies_df[movies_df.spoken_languages == """[{'iso_639_1': 'en', 'name': 'English'}]"""]

In [5]:
ratings_with_movie_names = ratings_df.merge(movies_df[['id', 'original_title']], how='left', left_on='movieId', right_on='id')
ratings_with_movie_names = ratings_with_movie_names[ratings_with_movie_names.original_title.isnull() == False]

In [6]:
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(ratings_with_movie_names[['userId', 'movieId', 'rating']], reader)
trainset = data.build_full_trainset()
testset = trainset.build_anti_testset()

# SVD : Collaborative Filtering

In [7]:
algo = SVD(verbose=True)
algo.fit(trainset)

cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, n_jobs=-1, verbose=True)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8861  0.9130  0.8963  0.8826  0.9074  0.8971  0.0118  
MAE (testset)     0.6847  0.7031  0.6988  0.6800  0.6946  0.6922  0.0087  
Fit time          2.35    2.55    2.55    2.28    1.89    2.32    0.24    
Test time         0.08    0.10    0.10    0.06    0.05    0.08    0.02    


{'test_rmse': array([0.88605672, 0.91297384, 0.89632064, 0.88259872, 0.90740724]),
 'test_mae': array([0.68471755, 0.70313438, 0.69877428, 0.67995533, 0.69460831]),
 'fit_time': (2.3541698455810547,
  2.5501461029052734,
  2.545943021774292,
  2.2793657779693604,
  1.893211841583252),
 'test_time': (0.08416199684143066,
  0.09843826293945312,
  0.10104489326477051,
  0.05861711502075195,
  0.052559852600097656)}

In [18]:
predictions = algo.test(testset)
### Tune this value to get fewer results faster, but less options to choose from
top_n = get_top_n(predictions)


predicted_movies_by_name = defaultdict(list)

### This builds the dictionary of predicted movies for all users
for key, value in top_n.items():
    predicted_movies_by_name[key] = [get_movie_name(mov_id[0], movies_metadata_df1) for mov_id in value]

In [19]:
from collections import namedtuple
UserFavoriteRating = namedtuple('UserFavoriteRating', ['title', 'rating'])

def users_top_n_movies(n, userId, predictions_dict, meta_df):
    users_viewed_movies = ratings_with_movie_names[ratings_with_movie_names['userId'] == userId][['rating', 'original_title']]
    
    viewed_movies = []

    for row in users_viewed_movies.itertuples():
        rating = row[1]
        original_title = row[2]
        film = UserFavoriteRating(original_title, rating)
        viewed_movies.append(film)
    
    sorted(viewed_movies, key=lambda film: film[1])
    
    return viewed_movies[0:n]

# GET A USERS TOP RATED MOVIES

In [20]:
users_top_n_movies(6, 10, predicted_movies_by_name, movies_metadata_df1)

[UserFavoriteRating(title='Star Trek: The Motion Picture', rating=4.0),
 UserFavoriteRating(title='The Million Dollar Hotel', rating=4.0),
 UserFavoriteRating(title='Eyes Wide Shut', rating=4.0),
 UserFavoriteRating(title='The Conversation', rating=3.0),
 UserFavoriteRating(title='Point Break', rating=3.0),
 UserFavoriteRating(title='A Brief History of Time', rating=5.0)]

# At this point, you should have user personas to get a pool of movies to choose from, not simply pre-made users:

In [21]:
print_user_prediction(47, top_n, movies_metadata_df1)

User 47 has viewed the following movies:

	48 Hrs., Rating: 4.0
	Back to the Future Part II, Rating: 4.0
	20,000 Leagues Under the Sea, Rating: 5.0
	Muriel's Wedding, Rating: 4.0
	High Noon, Rating: 4.0
	Terminator 3: Rise of the Machines, Rating: 4.0
	Faster, Pussycat! Kill! Kill!, Rating: 4.0
	Silent Hill, Rating: 4.0
	The Conversation, Rating: 3.0
	To Kill a Mockingbird, Rating: 3.0

The following movies are recommended for User 47

	Sleepless in Seattle
	The Million Dollar Hotel
	The Good Thief
	Lonely Hearts
	Galaxy Quest
	Nell
	Flags of Our Fathers
	Hard Target
	License to Wed
	Point Break
	Birdman of Alcatraz
	My Darling Clementine
	Mission: Impossible II
	While You Were Sleeping
	Frankenstein
	The Thomas Crown Affair
	Murder She Said
	End of the World
	Boat
	Beetlejuice
	Beverly Hills Cop III
	My Name Is Bruce
	Men in Black II
	Confession of a Child of the Century
	Straw Dogs
	Edward Scissorhands
	The Bachelor
	Dr. Jekyll and Mr. Hyde
	Space Jam
	The 39 Steps
	Cold Mountain
	Bo

In [22]:
UserFavoriteRating = namedtuple('UserFavoriteRating', ['title', 'rating'])
def collab_filter_recommendations(user, top_ns, movie_meta_df):
    
    predictions = top_ns[user]
    
    return [UserFavoriteRating(get_movie_name(pred[0], movie_meta_df), pred[1]) for pred in predictions]

In [23]:
collab_filter_recommendations(47, top_n, movies_metadata_df1)

[UserFavoriteRating(title='Sleepless in Seattle', rating=4.868342810017726),
 UserFavoriteRating(title='The Million Dollar Hotel', rating=4.698752943845523),
 UserFavoriteRating(title='The Good Thief', rating=4.690038051145996),
 UserFavoriteRating(title='Lonely Hearts', rating=4.686957743582364),
 UserFavoriteRating(title='Galaxy Quest', rating=4.665849789470337),
 UserFavoriteRating(title='Nell', rating=4.654714414811269),
 UserFavoriteRating(title='Flags of Our Fathers', rating=4.628800442160122),
 UserFavoriteRating(title='Hard Target', rating=4.608794397057453),
 UserFavoriteRating(title='License to Wed', rating=4.607982223059793),
 UserFavoriteRating(title='Point Break', rating=4.5884399893310475),
 UserFavoriteRating(title='Birdman of Alcatraz', rating=4.531716309511345),
 UserFavoriteRating(title='My Darling Clementine', rating=4.526230373403165),
 UserFavoriteRating(title='Mission: Impossible II', rating=4.522152931720601),
 UserFavoriteRating(title='While You Were Sleeping', 

# CONTENT FILTERING

In [200]:
movies_metadata_df = pd.read_csv('/Users/ayushsingh/Desktop/Movie-Reco/data/movies_metadata.csv'
                                 , converters={ 'id': lambda x: convert_ids(x)
                                               , 'imdb_id': lambda x: convert_ids(x)
                                               ,'popularity': lambda x: convert_to_float(x)
                                               ,'genres': lambda x: to_json(x)}
                                 , usecols=['id', 'original_title'
                                                , 'genres' #'homepage'
                                                , 'overview', 'popularity', 'poster_path'
                                                , 'release_date', 'revenue', 'runtime'
                                                , 'spoken_languages', 'title'
                                                , 'vote_average', 'vote_count']
                                , dtype={'populariy': np.float64}
                                , parse_dates=True)
movies_metadata_df = movies_metadata_df.drop_duplicates(subset=['id'])

In [147]:
def get_recommendations(title, tfidf_matrix):
    # Get the index of the movie that matches the title
    idx = movies_df[title]
    cosine_sim = linear_kernel(tfidf_matrix[idx], tfidf_matrix)
    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[0]))
    
    pickle.dump(tfidf_matrix ,open("feature.pkl","wb"))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[0:21]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    #print(idx, title , movies_metadata_df['title'].iloc[idx],movies_metadata_df['genres'].iloc[idx])
    return movies_metadata_df['title'].iloc[movie_indices]

In [148]:
movies_metadata_df.shape

(45434, 13)

In [201]:
# Load keywords and credits
credits = pd.read_csv('/Users/ayushsingh/Desktop/Movie-Reco/data/credits.csv')
keywords = pd.read_csv('/Users/ayushsingh/Desktop/Movie-Reco/data/keywords.csv')

credits = credits.drop_duplicates(subset=['id'])
keywords = keywords.drop_duplicates(subset=['id'])

# Convert IDs to int. Required for merging
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
movies_metadata_df['id'] = movies_metadata_df['id'].astype('int')

# Merge keywords and credits into your main metadata dataframe
movies_metadata_df = credits.merge(movies_metadata_df, on='id')
movies_metadata_df = movies_metadata_df.merge(keywords, on='id')


In [150]:
movies_df = pd.Series(movies_metadata_df.index, index=movies_metadata_df['title']).drop_duplicates()

In [151]:
movies_metadata_df.head()

Unnamed: 0,cast,crew,id,genres,original_title,overview,popularity,poster_path,release_date,revenue,runtime,spoken_languages,title,vote_average,vote_count,keywords
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Toy Story,7.7,5415.0,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Jumanji,6.9,2413.0,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Grumpier Old Men,6.5,92.0,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Waiting to Exhale,6.1,34.0,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862,"[{'id': 35, 'name': 'Comedy'}]",Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Father of the Bride Part II,5.7,173.0,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [202]:
features = ['cast', 'crew', 'keywords']
for feature in features:
    movies_metadata_df[feature] = movies_metadata_df[feature].apply(literal_eval)

In [203]:
movies_metadata_df['genres'] = movies_metadata_df['genres'].fillna('[]')
movies_metadata_df['genres'] = movies_metadata_df['genres'].apply(lambda x: literal_eval(str(x)))

In [204]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

# Returns the list top 3 elements or entire list; whichever is more.
def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > 3:
            names = names[:3]
        return names

    #Return empty list in case of missing/malformed data
    return []

# Returns the list top  elements or entire list; whichever is more.
def get_list_(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        return names

    #Return empty list in case of missing/malformed data
    return []

def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''
        
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])

In [205]:
# Define new director, cast, genres and keywords features that are in a suitable form.
movies_metadata_df['director'] = movies_metadata_df['crew'].apply(get_director)

features = ['cast', 'genres']
for feature in features:
    movies_metadata_df[feature] = movies_metadata_df[feature].apply(get_list)
    
movies_metadata_df['keywords'] = movies_metadata_df['keywords'].apply(get_list_)

In [206]:
print(movies_metadata_df['keywords'][13])

['usa president', 'presidential election', 'watergate scandal', 'biography', 'government', 'historical figure']


In [207]:
s = movies_metadata_df.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'

In [208]:
s = s.value_counts()

In [209]:
s = s[s > 1]

In [210]:
stemmer = SnowballStemmer('english')

In [211]:
s

woman director          3039
independent film        1914
murder                  1285
based on novel           822
musical                  726
sex                      679
violence                 647
nudity                   629
revenge                  618
biography                613
suspense                 581
love                     565
female nudity            555
sport                    535
police                   451
duringcreditsstinger     440
teenager                 434
sequel                   432
friendship               404
world war ii             390
drug                     358
stand-up comedy          349
prison                   348
high school              314
martial arts             310
suicide                  306
silent film              304
rape                     304
film noir                302
kidnapping               300
                        ... 
aunt nephew incest         2
chaos and mayham           2
snake venom                2
popular girl  

In [279]:
def filter_keywords(x):
    words = []
    for i in x:
            if i in s:
                words.append(i)
    return words

def apply_stemmer(x):
    return [stemmer.stem(i) for i in x]

In [272]:
for i in movies_metadata_df:
    i = filter_keywords(i)

In [277]:
'cuba' in s

True

In [273]:
movies_metadata_df['keywords']

0        [jealousy, toy, boy, friendship, friends, riva...
1        [board game, disappearance, based on children'...
2        [fishing, best friend, duringcreditsstinger, o...
3        [based on novel, interracial relationship, sin...
4        [baby, midlife crisis, confidence, aging, daug...
5        [robbery, detective, bank, obsession, chase, s...
6        [paris, brother brother relationship, chauffeu...
7                                                       []
8          [terrorist, hostage, explosive, vice president]
9        [cuba, falsely accused, secret identity, compu...
10       [white house, usa president, new love, widower...
11                                        [dracula, spoof]
12       [wolf, dog-sledding race, alaska, dog, goose, ...
13       [usa president, presidential election, waterga...
14       [exotic island, treasure, map, ship, scalp, pi...
15       [poker, drug abuse, 1970s, overdose, illegal p...
16       [bowling, based on novel, servant, country lif.

In [280]:
movies_metadata_df['keywords'] = movies_metadata_df['keywords'].apply(apply_stemmer)

In [282]:
movies_metadata_df['keywords']

0        [jealousi, toy, boy, friendship, friend, rival...
1        [board gam, disappear, based on children's boo...
2         [fish, best friend, duringcreditssting, old men]
3        [based on novel, interracial relationship, sin...
4        [babi, midlife crisi, confid, age, daughter, m...
5        [robberi, detect, bank, obsess, chase, shoot, ...
6        [pari, brother brother relationship, chauffeur...
7                                                       []
8                 [terrorist, hostag, explos, vice presid]
9        [cuba, falsely accus, secret ident, computer v...
10       [white hous, usa presid, new lov, widow, wildl...
11                                        [dracula, spoof]
12       [wolf, dog-sledding rac, alaska, dog, goos, be...
13       [usa presid, presidential elect, watergate sca...
14       [exotic island, treasur, map, ship, scalp, pirat]
15       [poker, drug abus, 1970s, overdos, illegal pro...
16       [bowl, based on novel, servant, country lif, j.

In [307]:
# Apply clean_data function to your features.
features = ['cast', 'director', 'genres']

for feature in features:
    movies_metadata_df[feature] = movies_metadata_df[feature].apply(clean_data)

# Create a new soup feature
movies_metadata_df['soup'] = movies_metadata_df.apply(create_soup, axis=1)


In [315]:

count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = count.fit_transform(movies_metadata_df['soup'])
count_matrix

<44097x361271 sparse matrix of type '<class 'numpy.int64'>'
	with 900213 stored elements in Compressed Sparse Row format>

In [322]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [362]:
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores.sort(key = lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

print(get_recommendations('The Dark Knight').head(10))

0           862
1          8844
2         15602
3         31357
4         11862
5           949
6         11860
7         45325
8          9091
9           710
10         9087
11        12110
12        21032
13        10858
14         1408
15          524
16         4584
17            5
18         9273
19        11517
20         8012
21         1710
22         9691
23        12665
24          451
25        16420
26         9263
27        17015
28          902
29        37557
          ...  
45401     63266
45402     45527
45403    455661
45404    327237
45405     84710
45406     39562
45407     14008
45408     44330
45409     49279
45410     44333
45411     49277
45412     49271
45413     44324
45414    122036
45415     14885
45416     49280
45417    106807
45418    276895
45419    404604
45420    420346
45421     67179
45422     84419
45423    390959
45424    289923
45425    222848
45426     30840
45428    111109
45429     67758
45430    227506
45431    461257
Name: id, Length: 44097,

In [333]:
sim_scores = list(enumerate(cosine_sim[12477]))

In [339]:

sim_scores.sort(key = lambda x: x[1], reverse=True)

In [357]:
sim_scores

[(12477, 1.0),
 (15049, 0.29411764705882354),
 (28738, 0.29250896965085227),
 (2904, 0.27022640946662696),
 (9210, 0.269069117598525),
 (5327, 0.25048971643405976),
 (10167, 0.25048971643405976),
 (22081, 0.24253562503633297),
 (37582, 0.24253562503633297),
 (43637, 0.24253562503633297),
 (13655, 0.23529411764705882),
 (2771, 0.22256595362986287),
 (3595, 0.22256595362986287),
 (5122, 0.22256595362986287),
 (10945, 0.22256595362986287),
 (15717, 0.22256595362986287),
 (21439, 0.2193817272381392),
 (32114, 0.2193817272381392),
 (39621, 0.2193817272381392),
 (42700, 0.2193817272381392),
 (1195, 0.21170244960998527),
 (17347, 0.21170244960998527),
 (20538, 0.21170244960998527),
 (5308, 0.21110016546037452),
 (26155, 0.21110016546037452),
 (13825, 0.20228869496966945),
 (15424, 0.20228869496966945),
 (18826, 0.20228869496966945),
 (40629, 0.20228869496966945),
 (8250, 0.20180183819889375),
 (10646, 0.20180183819889375),
 (11492, 0.20180183819889375),
 (13622, 0.20180183819889375),
 (22506,

In [355]:
movies_metadata_df['title'][6218]

'High Society'

# Graph Filter

In [24]:
pip install neo4j

Note: you may need to restart the kernel to use updated packages.


In [25]:
pip install neomodel

Note: you may need to restart the kernel to use updated packages.


In [27]:
from neo4j import GraphDatabase

uri = "bolt://localhost:11002"
driver = GraphDatabase.driver(uri, auth=('neo4j', 'ayush'))

In [31]:
session = driver.session()
query = "MATCH (n) RETURN count(n)"
no_count = session.run(query)
for i in no_count:
    print(i)
print(no_count)

<Record count(n)=10371>
<neo4j.BoltStatementResult object at 0x1660dd310>


In [32]:
import json
from datetime import datetime
from pprint import pprint
import codecs
import os
import json
import logging
import random
from py2neo import Graph, Path, Node, Relationship, cypher

In [39]:
#walk_limit defines the number of nodes it can hop
walk_limit = 5 

#All types of relationships possible, will be helpful in defining probabilities
relationships = ["FAVORITE","GENRES","SIMILAR","WATCHED"]

In [40]:
def generateRandomPath():
    #query to select a random node to start with
    query = '''
        START t=node(*) 
        MATCH (a)-[]->(t) 
        RETURN a
        SKIP {n} LIMIT 1'''
    
    #get total number of nodes
    node_count_query =  "MATCH (n) RETURN count(n)"
    node_count = 10371
    session = driver.session()
    
    #select a random node
    #random_node = (session.run(query,{"n":random.randint(0,node_count - 1), "m":1}))
    random_node = (session.run(query,{"n":862, "m":1}))

    #print(type(random_node))
    #x = random_node.single()[0]
#     print(x)
#     print(x.id)
    #print(random_node.single()[0])
    
#     for i in random_node:
#         print (i)
    r_node =  random_node.single()[0]
    return walk(r_node, 0) # start random walk and return the final node

In [41]:
path = []

In [42]:
def walk(start_node, current_length, prev_node=None, max_length = walk_limit):
    #print(type(start_node.labels))
    if (start_node.labels == {'Movies'}):
        #print(start_node)
        path.append(start_node)
    else:
        current_length-=1
    #return if path length exceed limit
    if (current_length >= max_length):
        return {start_node:current_length}
    
    #query to get all incoming and outgoing paths from a node
    query = """match p=(n)-[]-() where id(n)={id} return p"""
    session = driver.session()
    rels = session.run(query,{"id":start_node.id})
#     for i in rels:
#         print(i)
    #Convert the output to a list
    relationships = rels.data()
    #print(relationships)
    size = len(relationships)
    
    #select a random node
    node = random.randint(0,size-1)
    next_node = relationships[node]["p"].end_node
    #print(relationships[node]["p"].end_node)
    current_node = relationships[node]["p"].start_node
    
    #If next random node is similar to previous node, choose another
    if(next_node.id == current_node.id and size > 1):
        node = random.randint(0,size-1)
        next_node = relationships[node]["p"].end_node
        current_node = relationships[node]["p"].start_node
    #if previous node is the only possible path, terminate
    elif(next_node.id == current_node.id and size == 1):
        
        return {current_node:current_length}
        
    return walk(next_node, current_length + 1, current_node)

In [43]:
print(generateRandomPath())
#for i in range(5):
    
for i in path:
    print(i)

{<Node id=728 labels={'Movies'} properties={'rating_mean': '3.5833333333333335', 'title': 'Flirting With Disaster (1996)', 'movieId': '125'}>: 5}
<Node id=3385 labels={'Movies'} properties={'rating_mean': '4.0375', 'title': 'Road Warrior, The (Mad Max 2) (1981)', 'movieId': '3703'}>
<Node id=2456 labels={'Movies'} properties={'rating_mean': '3.383720930232558', 'title': 'Fly, The (1986)', 'movieId': '2455'}>
<Node id=1803 labels={'Movies'} properties={'rating_mean': '2.94', 'title': 'Event Horizon (1997)', 'movieId': '1590'}>
<Node id=1815 labels={'Movies'} properties={'rating_mean': '2.4615384615384617', 'title': 'Mimic (1997)', 'movieId': '1603'}>
<Node id=3480 labels={'Movies'} properties={'rating_mean': '2.2948717948717947', 'title': 'Hollow Man (2000)', 'movieId': '3826'}>
<Node id=728 labels={'Movies'} properties={'rating_mean': '3.5833333333333335', 'title': 'Flirting With Disaster (1996)', 'movieId': '125'}>


# GRAPH CODE HERE

In [28]:
def get_count_first_degree_films_of(tx, title):
    for record in tx.run("MATCH (origin:MOVIE)-[:APPEARED_IN]-(actor)-[:APPEARED_IN]-(first_movie:MOVIE)"
                         "WHERE origin.title = {title} "
                         "RETURN count(*)", title=title):
        print(record[0])
        
def get_first_degree_films_of(tx, title):
    nodes = []
    
    for record in tx.run("MATCH (origin:MOVIE)-[:APPEARED_IN]-(actor)-[:APPEARED_IN]-(first_movie:MOVIE)"
                         "WHERE origin.title = {title} "
                         "RETURN first_movie", title=title):
        
        nodes.append(record.data())
    
    return nodes

first_degree_away = None
with driver.session() as session:
    first_degree_away = session.read_transaction(get_first_degree_films_of, 'Superman')

IndexError: list index out of range

In [16]:
GraphMember = namedtuple('GraphMember', ['title', 'movie_id'])

def neo4j_results_to_tuples(results):
    return [GraphMember(node['first_movie'].get('title'), node['first_movie'].get('movie_id')) for node in results]

NameError: name 'namedtuple' is not defined

In [None]:
neo4j_results_to_tuples(first_degree_away)

In [None]:
def get_first_degree_films_of(tx, title):
    nodes = []

    for record in tx.run("MATCH (origin:MOVIE)-[:APPEARED_IN]-(actor)-[:APPEARED_IN]-(first_movie:MOVIE)"
                         "WHERE origin.title = {title} "
                         "RETURN first_movie", title=title):
        nodes.append(record.data())

    return nodes

# Get First Degree Away Films of Sleepless in Seattle

In [None]:
first_degree_away = None
with driver.session() as session:
    first_degree_away = session.read_transaction(get_first_degree_films_of, 'Sleepless in Seattle')

In [None]:
def neo4j_results_to_tuples(results):
    return [GraphMember(node['first_movie'].get('title'), node['first_movie'].get('movie_id')) for node in results]

In [None]:
def get_connected_movies(list_favorite_movies, driver):
    list_connected_movies = []

    with driver.session() as session:
        for movie in list_favorite_movies:
            first_degree_away_films = session.read_transaction(get_first_degree_films_of, movie)
            film_tups = neo4j_results_to_tuples(first_degree_away_films)
            list_connected_movies.extend(film_tups)

    return list_connected_movies

## Run a test with a user

In [None]:
user_number = 321
predictions_dict = predicted_movies_by_name
get_graph_on = users_top_n_movies(10, user_number, predictions_dict, movies_metadata_df)

In [None]:
get_graph_on

In [None]:
def get_top_three_favs(user_id):
    
    favorite_seen_movie_array = users_top_n_movies(200, user_id, predicted_movies_by_name, movies_metadata_df1)
    sorted_seen_movies = sorted(favorite_seen_movie_array, key=lambda k: k[1], reverse=True)
    return sorted_seen_movies[0:3]

In [None]:
get_top_three_favs(47)

# Collab Filter List

In [None]:
collab_filtered_movies = sorted(collab_filter_recommendations(47, top_n, movies_metadata_df1), key=lambda k: k[1], reverse=True)
collab_filtered_movies

## Content Filtered

In [None]:
# #get_movie_id('The Endless Summer', movies_metadata_df1)
# content_filtered_tups = top_n_closest_content_filtered(50, 321, content_filter_df)
# content_filtered_tups

In [None]:
content_filtered_tups = get_recommendations('Sherlock Holmes',count_matrix)
content_filtered_tups

In [None]:
favorite_list = get_top_three_favs(321)
movie_names = [movie[0] for movie in favorite_list]

In [None]:
first_degree_away_films = get_connected_movies([mov[0] for mov in top_three], driver)

In [None]:
graph_id_content_filter = set({movie[1] for movie in first_degree_away})

In [None]:
graph_name_collab_filter = set({movie[0] for movie in first_degree_away})

In [None]:
content_id_set = set(movie[0] for movie in content_filtered_tups)

In [None]:
collab_movie_name_set = set(movie[0] for movie in collab_filtered_movies)

In [None]:
graph_name_collab_filter & collab_movie_name_set

In [None]:
content_id_set & graph_id_content_filter

In [None]:
get_movie_name(550, movies_metadata_df1)