In [16]:
import pandas as pd
import numpy as np
import re
import sklearn.metrics.pairwise as pw
from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances

In [34]:
movies = pd.read_csv(r'D:\python\MovieRecommender-master\data\movies.csv')
ratings = pd.read_csv(r'D:\python\MovieRecommender-master\data\ratings.csv')

movies_rated = pd.read_excel(r'D:\python\MovieRecommender-master\data\bbianchi.xlsx',index_col='Movies')

In [35]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [36]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [37]:
display(movies['movieId'].value_counts().sort_values(ascending=False).head()) 
display(movies['title'].value_counts().sort_values(ascending=False).head(10) > 1)

83969     1
101577    1
26629     1
45062     1
79879     1
Name: movieId, dtype: int64

Saturn 3 (1980)                            True
War of the Worlds (2005)                   True
Eros (2004)                                True
Confessions of a Dangerous Mind (2002)     True
Emma (1996)                                True
Jingle All the Way (1996)                 False
Number 23, The (2007)                     False
Assignment, The (1997)                    False
Flawless (1999)                           False
Contempt (Mépris, Le) (1963)              False
Name: title, dtype: bool

In [38]:
duplicate_movies = movies.groupby('title').filter(lambda x: len(x) == 2)
duplic_ids = duplicate_movies['movieId'].values
#Duplicated titles
duplicate_movies = duplicate_movies[['movieId','title']]
# Checking the id with most reviews
review_count = pd.DataFrame(ratings[ratings['movieId'].isin(duplic_ids)]['movieId'].value_counts())
review_count.reset_index(inplace=True)
review_count.columns = ['movieId','count']
duplicated_df = pd.merge(duplicate_movies, review_count, on='movieId')
display(duplicated_df)
## Getting duplicates with low review count
duplicated_df.sort_values(by=['title','count'],ascending=[True,False])
duplicated_ids = duplicated_df.drop_duplicates(subset ="title", 
                     keep = 'last', inplace = False)['movieId']

Unnamed: 0,movieId,title,count
0,838,Emma (1996),30
1,2851,Saturn 3 (1980),4
2,6003,Confessions of a Dangerous Mind (2002),15
3,26958,Emma (1996),1
4,32600,Eros (2004),1
5,34048,War of the Worlds (2005),50
6,64997,War of the Worlds (2005),2
7,144606,Confessions of a Dangerous Mind (2002),1
8,147002,Eros (2004),1
9,168358,Saturn 3 (1980),1


In [39]:
# Removing duplicated ids with low review count from movie database
movies = movies.loc[~movies['movieId'].isin(duplicated_ids)]
# Removing duplicated ids with low review count from rating database
ratings = ratings.loc[~ratings['movieId'].isin(duplicated_ids)]

In [40]:
genres = list(set('|'.join(list(movies["genres"].unique())).split('|')))
genres.remove('(no genres listed)')

#Creating dummy columns for each genre
for genre in genres:
    movies[genre] = movies['genres'].map(lambda val: 1 if genre in val else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [41]:
#Creating colum with film year
movies['year'] = movies['title'].map(lambda val: int(re.search('\(([0-9]{4})\)',val).group(1)) 
                                     if re.search('\(([0-9]{4})\)',val)!= None 
                                     else 0)   
# Film Decade
for decade in range(1930,2020,10):
    movies['decade_'+str(decade)] = np.where((movies['year'] < decade+10) & (movies['year'] >= decade) ,1,0)
#     print('column created','decade_' + str(decade))
    
movies['decade_none'] = np.where(movies['year'] == 0 ,1,0)
movies['decade_other'] = np.where((movies['year'] != 0) & (movies['year'] <1930) ,1,0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in t

In [42]:
movies.drop('genres', axis=1,inplace= True)  
ratings.drop('timestamp', axis=1,inplace= True) 
movies_rated.dropna(axis=0, inplace=True)

In [43]:
df = pd.merge(ratings, movies, on='movieId')
print(df.shape)
# df.head(2)

(100830, 35)


In [44]:
def item_based_recom(input_dataframe,input_film_name):    
    pivot_item_based = pd.pivot_table(input_dataframe,
                                      index='title',
                                      columns=['userId'], values='rating')  
    sparse_pivot = sparse.csr_matrix(pivot_item_based.fillna(0))
    recommender = pw.cosine_similarity(sparse_pivot)
    recommender_df = pd.DataFrame(recommender, 
                                  columns=pivot_item_based.index,
                                  index=pivot_item_based.index)
    ## Item Rating Based Cosine Similarity
    cosine_df = pd.DataFrame(recommender_df[film_name].sort_values(ascending=False))
    cosine_df.reset_index(level=0, inplace=True)
    cosine_df.columns = ['title','cosine_sim']
    return cosine_df

In [45]:
categories = ['Film-Noir', 'Adventure', 'Children',
           'IMAX', 'Crime', 'Documentary', 'Fantasy', 'Musical', 'Romance',
           'Mystery', 'Thriller', 'Animation', 'Action', 'Comedy', 'War', 'Drama',
           'Western', 'Sci-Fi', 'Horror']

people_who_rated = {611: "Bernardo Bianchi"}
films_rated = movies_rated.to_dict()

# film_name = 'Inception (2010)' 
film_name ='Iron Man 2 (2010)' 
user_id = 611

In [50]:
def generate_recomendations(df,film_name,input_films_rated,top_results=5,cat=categories):
    print("User name: " + "Favorite Movie:", film_name+'\n\n')
    print("Films you might enjoy based that you watched", film_name)
    ## Item Rating Based Cosine Similarity
    cos_sim = item_based_recom(df,film_name)
    display(cos_sim[1:top_results+1])
    
generate_recomendations(df,film_name,films_rated,5)

User name: Favorite Movie: Iron Man 2 (2010)


Films you might enjoy based that you watched Iron Man 2 (2010)


Unnamed: 0,title,cosine_sim
1,X-Men: First Class (2011),0.699524
2,"Avengers, The (2012)",0.695324
3,Iron Man 3 (2013),0.677566
4,Iron Man (2008),0.641986
5,Guardians of the Galaxy (2014),0.638427
