In [91]:
import numpy as np
import pandas as pd
import re
import math
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [92]:
df_movies  = pd.read_csv('movies.csv')
df_ratings = pd.read_csv('ratings.csv')

In [93]:
dataset = pd.merge(df_movies,df_ratings,on="movieId")

In [94]:
dataset

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,2,5.0,859046895
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,1303501039
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8,5.0,858610933
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11,4.0,850815810
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,14,4.0,851766286
...,...,...,...,...,...,...
105334,148238,A Very Murray Christmas (2015),Comedy,475,3.0,1451213043
105335,148626,The Big Short (2015),Drama,458,4.0,1452014749
105336,148626,The Big Short (2015),Drama,576,4.5,1451687664
105337,148626,The Big Short (2015),Drama,668,4.5,1451148148


In [95]:
df = dataset['movieId'].ne(dataset['movieId'].shift()).cumsum() 
df = dataset.groupby(df).first()
df = df.iloc[:,:3]
df.index.name='index'

In [96]:
df

Unnamed: 0_level_0,movieId,title,genres
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,2,Jumanji (1995),Adventure|Children|Fantasy
3,3,Grumpier Old Men (1995),Comedy|Romance
4,4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
10321,146684,Cosmic Scrat-tastrophe (2015),Animation|Children|Comedy
10322,146878,Le Grand Restaurant (1966),Comedy
10323,148238,A Very Murray Christmas (2015),Comedy
10324,148626,The Big Short (2015),Drama


In [97]:
ratings_mean = dataset.groupby('movieId')['rating'].mean()
ratings_count = dataset.groupby('movieId')['rating'].count()
rating = pd.DataFrame({'Mean_Ratings':ratings_mean,
                'Ratings_count':ratings_count})

In [98]:
df = pd.merge(df,rating,on="movieId")
df['genres'] = [re.sub("\|"," ",str(genre)) for genre in df['genres']]
df['movie_year'] = df['title'].str.split(r" \(",expand=True).iloc[:,1]
df['title'] = df['title'].str.split(r" \(",expand=True).iloc[:,0]
df['movie_year'] = [re.sub("\)","",str(year)) for year in df['movie_year']]
df['vectors'] = df['title'] +" " + df['genres']
df.insert(0,'Index',[0 for i in range(len(df.index))])
orig_rows = len(df.index)

In [99]:
df

Unnamed: 0,Index,movieId,title,genres,Mean_Ratings,Ratings_count,movie_year,vectors
0,0,1,Toy Story,Adventure Animation Children Comedy Fantasy,3.907328,232,1995,Toy Story Adventure Animation Children Comedy ...
1,0,2,Jumanji,Adventure Children Fantasy,3.353261,92,1995,Jumanji Adventure Children Fantasy
2,0,3,Grumpier Old Men,Comedy Romance,3.189655,58,1995,Grumpier Old Men Comedy Romance
3,0,4,Waiting to Exhale,Comedy Drama Romance,2.818182,11,1995,Waiting to Exhale Comedy Drama Romance
4,0,5,Father of the Bride Part II,Comedy,3.250000,62,1995,Father of the Bride Part II Comedy
...,...,...,...,...,...,...,...,...
10320,0,146684,Cosmic Scrat-tastrophe,Animation Children Comedy,4.000000,1,2015,Cosmic Scrat-tastrophe Animation Children Comedy
10321,0,146878,Le Grand Restaurant,Comedy,2.500000,1,1966,Le Grand Restaurant Comedy
10322,0,148238,A Very Murray Christmas,Comedy,3.000000,1,2015,A Very Murray Christmas Comedy
10323,0,148626,The Big Short,Drama,4.333333,3,2015,The Big Short Drama


In [100]:
vec = TfidfVectorizer(stop_words = 'english')

In [101]:
df1 = df.sort_values(by = 'Ratings_count', ascending = False)

In [102]:
df1

Unnamed: 0,Index,movieId,title,genres,Mean_Ratings,Ratings_count,movie_year,vectors
260,0,296,Pulp Fiction,Comedy Crime Drama Thriller,4.160000,325,1994,Pulp Fiction Comedy Crime Drama Thriller
316,0,356,Forrest Gump,Comedy Drama Romance War,4.138264,311,1994,Forrest Gump Comedy Drama Romance War
279,0,318,"Shawshank Redemption, The",Crime Drama,4.454545,308,1994,"Shawshank Redemption, The Crime Drama"
426,0,480,Jurassic Park,Action Adventure Sci-Fi Thriller,3.659864,294,1993,Jurassic Park Action Adventure Sci-Fi Thriller
525,0,593,"Silence of the Lambs, The",Crime Horror Thriller,4.194828,290,1991,"Silence of the Lambs, The Crime Horror Thriller"
...,...,...,...,...,...,...,...,...
10269,0,136654,The Face of an Angel,Drama,1.500000,1,2015,The Face of an Angel Drama
10270,0,136800,Robot Overlords,Action Adventure Sci-Fi,1.500000,1,2014,Robot Overlords Action Adventure Sci-Fi
10271,0,136890,Eastern Boys,Drama,4.000000,1,2014,Eastern Boys Drama
10273,0,138104,Justice League: Gods and Monsters,Action Animation,4.000000,1,2015,Justice League: Gods and Monsters Action Anima...


In [103]:
df.loc[10325] = [1, [], '', '', [], np.nan, [], ''] 

In [104]:
def initial_recommendation():
    count = 0
    movies = []
    mov_ids = []
    sim_score = []

    for ind in df1.index:
        if count<10:
            if df1['Mean_Ratings'][ind]>3.5:
                mov_ids.append(df1['movieId'][ind])
                movies.append(df[df.movieId==df1['movieId'][ind]]['title'].values[0] + ' (' + df[df.movieId==df1['movieId'][ind]]['movie_year'].values[0] + ')')
                sim_score.append(0)
                count+=1
        else:
            break

    recommendation = pd.DataFrame({'Movie_Id':mov_ids, 'Movie':movies, 'Cosine_Similarity_Score':sim_score})
    return(recommendation)

In [105]:
def input_movie(movie,rating):
    if rating > 2.5:
        title = df['title'][df.index[-1]] + ' ' + movie
        movie_id = df['movieId'][df.index[-1]]
        movie_id.append(df[df.title==movie]['movieId'].values[0])
        movie_year = df['movie_year'][df.index[-1]]
        movie_year.append(df[df.title==movie]['movie_year'].values[0])
        genre = df['genres'][df.index[-1]] + ' ' + df[df.title==movie]['genres'].values[0]
        rate = df['Mean_Ratings'][df.index[-1]]
        rate.append(rating)
        #vectors = df['vectors'][df.index[-1]] + " " + movie + " " + genre
        vectors = movie + " " + genre
        df.loc[df.index[-1]] = [1, movie_id, title, genre, rate, np.nan, movie_year, vectors] 
    return

In [106]:
df

Unnamed: 0,Index,movieId,title,genres,Mean_Ratings,Ratings_count,movie_year,vectors
0,0,1,Toy Story,Adventure Animation Children Comedy Fantasy,3.907328,232.0,1995,Toy Story Adventure Animation Children Comedy ...
1,0,2,Jumanji,Adventure Children Fantasy,3.353261,92.0,1995,Jumanji Adventure Children Fantasy
2,0,3,Grumpier Old Men,Comedy Romance,3.189655,58.0,1995,Grumpier Old Men Comedy Romance
3,0,4,Waiting to Exhale,Comedy Drama Romance,2.818182,11.0,1995,Waiting to Exhale Comedy Drama Romance
4,0,5,Father of the Bride Part II,Comedy,3.25,62.0,1995,Father of the Bride Part II Comedy
...,...,...,...,...,...,...,...,...
10321,0,146878,Le Grand Restaurant,Comedy,2.5,1.0,1966,Le Grand Restaurant Comedy
10322,0,148238,A Very Murray Christmas,Comedy,3.0,1.0,2015,A Very Murray Christmas Comedy
10323,0,148626,The Big Short,Drama,4.333333,3.0,2015,The Big Short Drama
10324,0,149532,Marco Polo: One Hundred Eyes,(no genres listed),4.0,1.0,2015,Marco Polo: One Hundred Eyes (no genres listed)


In [107]:
def recommend(movie,rating):
    
    if rating<2.5:
        return
    
    vecs = vec.fit_transform(df['vectors'].apply(lambda x: np.str_(x)))
    cos_similarity = cosine_similarity(vecs)
    
    scores = {}
    movie_id = df[df.title==movie]['movieId'].values[0]
    
    for i in range(df.index[-1]):
        if (movie_id != df['movieId'][i]):
            scores[df['movieId'][i]] = cos_similarity[df.index[-1]][i]
            
    scores=(sorted(scores.items(), key=lambda elem:(elem[1], elem[0]),reverse=True))
    
    movies = []
    sim_score = []
    mov_ids = []
    count=0
    
    for s in scores:
        if count<10:
            if s[0] not in df['movieId'][df.index[-1]]:
                movie_name = df[df.movieId==s[0]]['title'].values[0] + ' (' + df[df.movieId==s[0]]['movie_year'].values[0] + ')'
                if len(dataset[dataset.title==movie_name]['movieId'].values)!=0:
                    movies.append(movie_name)
                    sim_score.append(s[1])
                    mov_ids.append(dataset[dataset.title==movie_name]['movieId'].values[0])
                    count+=1
    recommendation = pd.DataFrame({'Movie Id':mov_ids, 'Movie':movies, 'Cosine Similarity Score':sim_score})
    return(recommendation)

In [108]:
recommendation = initial_recommendation()

ValueError: The truth value of an empty array is ambiguous. Use `array.size > 0` to check that an array is not empty.

In [None]:
recommendation

Unnamed: 0,Movie_Id,Movie,Cosine_Similarity_Score
0,296,Pulp Fiction (1994),0
1,356,Forrest Gump (1994),0
2,318,"Shawshank Redemption, The (1994)",0
3,480,Jurassic Park (1993),0
4,593,"Silence of the Lambs, The (1991)",0
5,260,Star Wars: Episode IV - A New Hope (1977),0
6,2571,"Matrix, The (1999)",0
7,589,Terminator 2: Judgment Day (1991),0
8,527,Schindler's List (1993),0
9,110,Braveheart (1995),0


In [None]:
movie_name = input('Enter the movie: ')
rating = float(input('Enter the rating: '))

Enter the movie: Spider-Man
Enter the rating: 4


In [None]:
input_movie(movie_name,rating)

  return asarray(a).ndim


In [None]:
recommendation = recommend(movie_name,rating)

In [None]:
recommendation

Unnamed: 0,Movie Id,Movie,Cosine Similarity Score
0,52722,Spider-Man 3 (2007),0.775019
1,8636,Spider-Man 2 (2004),0.755724
2,95510,"Amazing Spider-Man, The (2012)",0.638183
3,71057,9 (2009),0.584952
4,9004,D.A.R.Y.L. (1985),0.574398
5,130520,Home (2015),0.567379
6,138036,"Man from U.N.C.L.E., The (2015)",0.562446
7,110553,The Amazing Spider-Man 2 (2014),0.558876
8,68954,Up (2009),0.532973
9,48774,Children of Men (2006),0.490687


In [None]:
df

Unnamed: 0,Index,movieId,title,genres,Mean_Ratings,Ratings_count,movie_year,vectors
0,0,1,Toy Story,Adventure Animation Children Comedy Fantasy,3.907328,232.0,1995,Toy Story Adventure Animation Children Comedy ...
1,0,2,Jumanji,Adventure Children Fantasy,3.353261,92.0,1995,Jumanji Adventure Children Fantasy
2,0,3,Grumpier Old Men,Comedy Romance,3.189655,58.0,1995,Grumpier Old Men Comedy Romance
3,0,4,Waiting to Exhale,Comedy Drama Romance,2.818182,11.0,1995,Waiting to Exhale Comedy Drama Romance
4,0,5,Father of the Bride Part II,Comedy,3.25,62.0,1995,Father of the Bride Part II Comedy
...,...,...,...,...,...,...,...,...
10321,0,146878,Le Grand Restaurant,Comedy,2.5,1.0,1966,Le Grand Restaurant Comedy
10322,0,148238,A Very Murray Christmas,Comedy,3.0,1.0,2015,A Very Murray Christmas Comedy
10323,0,148626,The Big Short,Drama,4.333333,3.0,2015,The Big Short Drama
10324,0,149532,Marco Polo: One Hundred Eyes,(no genres listed),4.0,1.0,2015,Marco Polo: One Hundred Eyes (no genres listed)
