# movie recommendation system

In [6]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
# Columns of the movie dataset used as content features
features = [
    'genres',
    'keywords',
    'cast',
    'director',
]

In [8]:
# Load the movie metadata table; later cells read its title, genres,
# keywords, cast and director columns.
df = pd.read_csv("movie_dataset.csv")

In [9]:
# utility functions to be used
def combine_features(row):
    """Join a movie's keywords, cast, genres and director into one string.

    ``row`` is anything indexable by those four column names (e.g. a
    DataFrame row); the values are separated by single spaces, in that order.
    """
    parts = (row['keywords'], row["cast"], row["genres"], row["director"])
    return " ".join(parts)
def get_index_from_title(title):
    """Return the row position of the first movie in ``df`` titled ``title``.

    The result is used to pick a row of the similarity matrix, so it is the
    0-based position of the row, not the value of any column. For a frame
    with the default integer index this equals the index label the previous
    implementation returned.

    Raises:
        IndexError: if no row of ``df`` has that title.
    """
    positions = np.flatnonzero((df["title"] == title).to_numpy())
    if positions.size == 0:
        # Same exception type as before, but with a message naming the title.
        raise IndexError("no movie titled {!r} in the dataset".format(title))
    return int(positions[0])
def get_title_from_index(index):
    """Return the titles of the rows of ``df`` whose index label equals ``index``.

    The result is a pandas Series, not a bare string; callers take its
    first element to get the title.
    """
    label_matches = df.index == index
    return df.loc[label_matches, 'title']

In [10]:
# Replace missing values with empty strings so the feature columns can be
# concatenated as text.
df[features] = df[features].fillna('')
df["combined_features"] = df.apply(combine_features, axis=1)

# Finding similarity: token counts per movie, then pairwise cosine similarity
# between every pair of movies.
cv = CountVectorizer()
countMatrix = cv.fit_transform(df["combined_features"])
cosine_sim = cosine_similarity(countMatrix)

In [11]:
# Recommend the movies whose combined features are closest to one the user liked.
movieLikedByUser = "Spectre"
movie_index = get_index_from_title(movieLikedByUser)

# Pair every other movie's row position with its similarity to the liked movie.
# The liked movie is excluded by index instead of assuming it sorts first:
# an earlier row with identical features ties with it, and because sorted()
# is stable that row would come first and the liked movie would end up in
# its own recommendations.
similar_movies = [
    (index, score)
    for index, score in enumerate(cosine_sim[movie_index])
    if index != movie_index
]

# Keep the nine most similar movies, best match first.
sorted_movie = sorted(similar_movies, key=lambda x: x[1], reverse=True)[:9]

# get_title_from_index returns a Series; its first element is the title.
ans = [get_title_from_index(index).iloc[0] for index, _ in sorted_movie]

print(ans)
    

['Skyfall', 'Quantum of Solace', 'The Girl with the Dragon Tattoo', 'The Hunger Games: Catching Fire', 'Johnny English Reborn', 'One for the Money', 'The Adventurer: The Curse of the Midas Box', 'Nancy Drew', 'Diamonds Are Forever']


# collaborative filtering on toy_dataset

In [12]:
from scipy import sparse

In [18]:
# Load the toy ratings table, using the first CSV column as the row index.
ratings = pd.read_csv("toy_dataset.csv", index_col=0)

In [21]:
# Replace missing ratings with 0 (presumably movies the user has not rated;
# confirm against the dataset), then display the table.
ratings = ratings.fillna(0)
ratings

Unnamed: 0,action1,action2,action3,romantic1,romantic2,romantic3
user 1,4.0,5.0,3.0,0.0,2.0,1.0
user 2,5.0,3.0,3.0,2.0,2.0,0.0
user 3,1.0,0.0,0.0,4.0,5.0,4.0
user 4,0.0,2.0,1.0,4.0,0.0,3.0
user 5,1.0,0.0,2.0,3.0,3.0,4.0


In [33]:
def standardize(row):
    """Mean-centre a vector of ratings and scale it by its range.

    Each value becomes ``(value - mean) / (max - min)``. Despite the
    parameter name, the notebook applies this with ``DataFrame.apply`` and
    its default axis, i.e. once per column.

    If every value is identical the range is 0; zeros are returned in that
    case instead of the NaNs that 0/0 would produce.
    """
    centered = row - row.mean()
    spread = row.max() - row.min()
    if spread == 0:
        # centered - centered gives exact zeros (and keeps the input's
        # index/shape) even if the mean carried floating-point rounding error.
        return centered - centered
    return centered / spread
# Standardize the ratings one column at a time.
ratings_std = ratings.apply(standardize, axis=0)
# Item-to-item CF: transpose so that each row handed to cosine_similarity
# is one column of the ratings table.
item_similarity = cosine_similarity(ratings_std.transpose())
item_similarity

array([[ 1.        ,  0.70668875,  0.81368151, -0.79941088, -0.02539184,
        -0.91410609],
       [ 0.70668875,  1.        ,  0.72310153, -0.84515425, -0.5189993 ,
        -0.84337386],
       [ 0.81368151,  0.72310153,  1.        , -0.84794611, -0.3799803 ,
        -0.80218063],
       [-0.79941088, -0.84515425, -0.84794611,  1.        ,  0.14803913,
         0.72374686],
       [-0.02539184, -0.5189993 , -0.3799803 ,  0.14803913,  1.        ,
         0.39393939],
       [-0.91410609, -0.84337386, -0.80218063,  0.72374686,  0.39393939,
         1.        ]])

In [37]:
# Wrap the item-to-item similarity matrix in a DataFrame labelled with the
# ratings columns on both axes.
# (For user-to-user similarity, compute the matrix without the transpose.)
movie_labels = ratings.columns
item_similarity_df = pd.DataFrame(data=item_similarity, index=movie_labels, columns=movie_labels)
item_similarity_df


Unnamed: 0,action1,action2,action3,romantic1,romantic2,romantic3
action1,1.0,0.706689,0.813682,-0.799411,-0.025392,-0.914106
action2,0.706689,1.0,0.723102,-0.845154,-0.518999,-0.843374
action3,0.813682,0.723102,1.0,-0.847946,-0.37998,-0.802181
romantic1,-0.799411,-0.845154,-0.847946,1.0,0.148039,0.723747
romantic2,-0.025392,-0.518999,-0.37998,0.148039,1.0,0.393939
romantic3,-0.914106,-0.843374,-0.802181,0.723747,0.393939,1.0


In [64]:

# Making recommendations.
def get_similar_movies(movie_name, user_rating, neutral_rating=2.5):
    """Score every movie by its similarity to ``movie_name``, weighted by the user's rating.

    The rating is shifted by ``neutral_rating`` before weighting, so a rating
    below the neutral point flips the sign of the similarities: movies similar
    to a disliked movie score negative and dissimilar ones score positive.

    Args:
        movie_name: column of ``item_similarity_df`` to look up.
        user_rating: the rating the user gave ``movie_name``.
        neutral_rating: rating treated as neither liked nor disliked.
            Defaults to 2.5, the value this function previously hard-coded.

    Returns:
        A Series named ``movie_name``, indexed by movie, sorted from the
        highest score to the lowest. The rated movie itself is included.
    """
    preference = user_rating - neutral_rating
    similar_score = item_similarity_df[movie_name] * preference
    return similar_score.sort_values(ascending=False)
# Example: a rating of 1 is below the 2.5 neutral point, so the similarities
# to romantic3 are multiplied by a negative weight.
get_similar_movies('romantic3',1)

action1      1.371159
action2      1.265061
action3      1.203271
romantic2   -0.590909
romantic1   -1.085620
romantic3   -1.500000
Name: romantic3, dtype: float64

In [67]:
# What if a user has rated several movies?
# Score all movies once per rated movie, then add the scores up.
action_lover = [('action1', 5), ('romantic2', 1), ('romantic3', 1)]

# Build the frame in one go (one row per rated movie, one column per
# candidate movie). DataFrame.append was removed in pandas 2.0, and growing
# a frame inside a loop re-copies it on every iteration.
similar_movies = pd.DataFrame(
    [get_similar_movies(movie, rating) for movie, rating in action_lover]
)

# Total score per candidate movie, best recommendation first.
similar_movies.sum().sort_values(ascending=False)

action1      3.909247
action2      3.810282
action3      3.807445
romantic2   -2.154389
romantic1   -3.306206
romantic3   -4.376174
dtype: float64