In [7]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
import time

In [8]:
# Read the two MovieLens "latest small" tables used below:
# per-user ratings and the movie catalogue.
ratings = pd.read_csv('ml-latest-small/ratings.csv')
movies = pd.read_csv('ml-latest-small/movies.csv')

In [9]:
# The timestamp column is not used by any later cell shown in this notebook.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps the
# cell safe to re-run: a second execution no longer raises KeyError because
# the column is already gone.
ratings = ratings.drop(columns='timestamp', errors='ignore')

In [10]:
# Catalogue rows whose title occurs more than once, and their movie ids.
duplicate_movies = movies.groupby('title').filter(lambda x: len(x) > 1)
duplic_ids = duplicate_movies['movieId'].values
duplicate_movies = duplicate_movies[['movieId', 'title']]

# Number of ratings each of those ids received.
review_count = (
    ratings.loc[ratings['movieId'].isin(duplic_ids), 'movieId']
    .value_counts()
    .rename_axis('movieId')
    .reset_index(name='count')
)

# Left merge so that a duplicate id with no ratings at all is kept (count 0)
# instead of silently disappearing from the comparison.
duplicated_df = pd.merge(duplicate_movies, review_count, on='movieId', how='left')
duplicated_df['count'] = duplicated_df['count'].fillna(0).astype(int)
display(duplicated_df)

# Within each title put the most-reviewed id first (ties broken by the lower
# movieId so the result is deterministic). The sorted frame must be assigned:
# sort_values returns a new frame and does not sort in place.
duplicated_df = duplicated_df.sort_values(
    by=['title', 'count', 'movieId'], ascending=[True, False, True]
)

# Every id after the first one of its title is a low-review-count duplicate
# that should be removed.
duplicated_ids = duplicated_df.loc[
    duplicated_df.duplicated(subset='title', keep='first'), 'movieId'
]

Unnamed: 0,movieId,title,count
0,838,Emma (1996),30
1,2851,Saturn 3 (1980),4
2,6003,Confessions of a Dangerous Mind (2002),15
3,26958,Emma (1996),1
4,32600,Eros (2004),1
5,34048,War of the Worlds (2005),50
6,64997,War of the Worlds (2005),2
7,144606,Confessions of a Dangerous Mind (2002),1
8,147002,Eros (2004),1
9,168358,Saturn 3 (1980),1


In [11]:
# Removing duplicated ids with low review count from movie database
movies = movies.loc[~movies['movieId'].isin(duplicated_ids)]
# Removing duplicated ids with low review count from rating database
ratings = ratings.loc[~ratings['movieId'].isin(duplicated_ids)]

In [12]:
# Persist the de-duplicated tables (without the pandas index column).
for frame, out_path in ((movies, 'movies_preprocessed.csv'),
                        (ratings, 'ratings_preprocessed.csv')):
    frame.to_csv(out_path, index=False)

In [13]:
def get_index_from_title(title, catalog=None):
    """Return the ``movieId`` of the first catalogue row whose title equals ``title``.

    Parameters
    ----------
    title : str
        Title to look up; compared with ``==`` against the ``title`` column,
        so it must match exactly.
    catalog : pandas.DataFrame, optional
        Frame with ``movieId`` and ``title`` columns. When omitted, the
        notebook-level ``movies`` frame is used.

    Returns
    -------
    The ``movieId`` of the first matching row.

    Raises
    ------
    IndexError
        If no row has that title. The message names the missing title
        instead of the bare "list index out of range".
    """
    if catalog is None:
        catalog = movies
    matches = catalog.loc[catalog['title'] == title, 'movieId'].tolist()
    if not matches:
        raise IndexError(f'no movie titled {title!r} in the catalogue')
    return matches[0]

In [14]:
# Hand-entered ratings for the new user, keyed by the exact catalogue title.
ratings_dict = {
    'Lion King, The (1994)': 5,
    'Dark Knight, The (2008)': 5,
    'Mystic River (2003)': 5,
    'Perks of Being a Wallflower, The (2012)': 4,
    'Scott Pilgrim vs. the World (2010)': 4,
}

# Id under which the new user's ratings are stored.
user_id = 999

# Translate each rated title into its movieId, in dictionary order.
ids = [get_index_from_title(rated_title) for rated_title in ratings_dict]

# The same user id repeated once per rated movie.
id_list = [user_id] * len(ratings_dict)

# One (userId, movieId, rating) row per rated movie, labelled with the same
# column names as the ratings table.
rating_rows = [
    (uid, movie_id, score)
    for uid, movie_id, score in zip(id_list, ids, ratings_dict.values())
]
user_ratings = pd.DataFrame(rating_rows, columns=ratings.columns)
user_ratings

Unnamed: 0,userId,movieId,rating
0,999,364,5
1,999,58559,5
2,999,6870,5
3,999,96821,4
4,999,79702,4


In [45]:
# Add the new user's rows to the ratings table.
# Any rows already stored for this user are removed first, so re-running the
# cell replaces the user's ratings instead of appending them a second time.
# ignore_index=True gives the result a fresh 0..n-1 index instead of repeating
# the labels 0..4 carried over from user_ratings.
ratings = pd.concat(
    [ratings[ratings['userId'] != user_id], user_ratings],
    ignore_index=True,
)
ratings

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
0,999,364,5.0
1,999,58559,5.0
2,999,6870,5.0
3,999,96821,4.0


In [46]:
# Attach title and genres to every rating by joining on movieId
# (inner join, the pandas default).
combined = movies.merge(ratings, on='movieId')
combined

Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5
...,...,...,...,...,...
100830,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,184,4.0
100831,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,184,3.5
100832,193585,Flint (2017),Drama,184,3.5
100833,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,184,3.5


In [47]:
# Reshape to one row per user and one column per title; a cell holds that
# user's rating for the title (pivot_table's default aggregation, the mean,
# applies if a user/title pair occurs more than once).
user_by_title = combined.pivot_table(values='rating', index='userId', columns='title')

# Movies a user did not rate become 0.
moviemat = user_by_title.fillna(0)
moviemat

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.5,3.5,0.0,0.0,0.0
609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
610,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,...,0.0,4.0,3.5,3.0,0.0,0.0,2.0,1.5,0.0,0.0


In [48]:
# Store the mostly-zero user x title matrix in compressed sparse row form.
dense_ratings = moviemat.to_numpy()
moviemat_sparse = csr_matrix(dense_ratings)
moviemat_sparse

<611x9719 sparse matrix of type '<class 'numpy.float64'>'
	with 100835 stored elements in Compressed Sparse Row format>

In [49]:
# Pairwise cosine similarity between the rows (users) of the rating matrix.
cosine_sim = cosine_similarity(X=moviemat_sparse)

In [50]:
# k: how many of the most similar users (the "top k" neighbours) to use

k = 10

In [51]:
def user_based_recomm(target_user=None, k=10, top_n=20):
    """Recommend movies to one user from the ratings of their most similar users.

    The function reads the notebook-level ``movies`` and ``ratings`` frames,
    builds a user x title rating matrix (unrated cells are 0), and scores
    every title by the mean of the ``k`` nearest users' ratings, each rating
    multiplied by that user's cosine similarity to the target user. Because
    unrated cells count as 0, a title rated by few neighbours scores low.

    Parameters
    ----------
    target_user : optional
        ``userId`` to recommend for. Defaults to the notebook-level
        ``user_id``.
    k : int, default 10
        Number of most similar users to use.
    top_n : int, default 20
        Maximum number of recommendations returned.

    Returns
    -------
    pandas.DataFrame
        The columns of ``movies`` plus ``'mean rating'`` (the score described
        above), sorted by score in descending order, at most ``top_n`` rows.
        Titles the target user has already rated are excluded.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    KeyError
        If ``target_user`` has no row in the rating matrix.
    """
    if target_user is None:
        target_user = user_id
    if k < 1:
        raise ValueError('k must be at least 1')

    combined = pd.merge(movies, ratings, on='movieId')
    moviemat = combined.pivot_table(index='userId', columns='title', values='rating').fillna(0)
    if target_user not in moviemat.index:
        raise KeyError(f'user {target_user!r} has no ratings for any known movie')

    # Only the target user's similarities are needed, so compare that single
    # row against all rows instead of building the full user x user matrix.
    moviemat_sparse = csr_matrix(moviemat.values)
    target_pos = moviemat.index.get_loc(target_user)
    similarities = pd.Series(
        cosine_similarity(moviemat_sparse[target_pos], moviemat_sparse).ravel(),
        index=moviemat.index,
    )

    # Remove the target user by label rather than assuming they sort first:
    # another user with similarity 1.0 could otherwise take their place.
    neighbours = similarities.drop(target_user).nlargest(k)

    # Weight each neighbour's ratings by their similarity, then average the
    # weighted ratings per title across the neighbours.
    weighted = moviemat.loc[neighbours.index].mul(neighbours, axis=0)
    scores = weighted.mean(axis=0)

    # A non-zero cell in the target user's row means they already rated it.
    already_rated = moviemat.loc[target_user] > 0
    scores = scores[~already_rated]

    recos = pd.DataFrame({'title': scores.index, 'mean rating': scores.values})
    recos = pd.merge(movies, recos, on='title')

    # A stable sort keeps equal scores in catalogue order, so the output is
    # deterministic.
    recos = recos.sort_values(by='mean rating', ascending=False, kind='stable')
    recos = recos.reset_index(drop=True)

    return recos.head(top_n)

In [52]:
# Run the user-based recommender with its default arguments and keep the result.
df = user_based_recomm()

In [53]:
# Show the recommendations table.
df

Unnamed: 0,movieId,title,genres,mean rating
0,318,"Shawshank Redemption, The (1994)",Crime|Drama,0.47577
1,68954,Up (2009),Adventure|Animation|Children|Drama,0.424455
2,79132,Inception (2010),Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX,0.423926
3,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,0.407843
4,5952,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy,0.407087
5,527,Schindler's List (1993),Drama|War,0.364495
6,4993,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy,0.334792
7,7153,"Lord of the Rings: The Return of the King, The...",Action|Adventure|Drama|Fantasy,0.334792
8,109487,Interstellar (2014),Sci-Fi|IMAX,0.313876
9,858,"Godfather, The (1972)",Crime|Drama,0.273114
