#**IDS 561 Final Project: Recommender System for Movies**
###Group Members (Prajakta Iyer, Kajol Shaikh, Akanksha Gautam) 


##**Importing Libraries and Mounting the Drive**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www.gtlib.gatech.edu/pub/apache/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz

In [3]:
!ls

drive  sample_data  spark-3.1.2-bin-hadoop3.2.tgz


In [None]:
!tar -xvf spark-3.1.2-bin-hadoop3.2.tgz
!pip install -q findspark

In [5]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop3.2"
!java -version
!echo $JAVA_HOME

openjdk version "11.0.11" 2021-04-20
OpenJDK Runtime Environment (build 11.0.11+9-Ubuntu-0ubuntu2.18.04)
OpenJDK 64-Bit Server VM (build 11.0.11+9-Ubuntu-0ubuntu2.18.04, mixed mode, sharing)
/usr/lib/jvm/java-11-openjdk-amd64


In [6]:
#Import findspark and create spark session
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [7]:
#Importing pyspark sql functions
import numpy as np
import pandas as pd
import pyspark.sql.functions as fn
from pyspark.sql.functions import col

#***ALGORITHM 1***

#**Popularity based recommender**
#Displays top 250 popularly watched movies based on IMDb weighted rating formula

#Load data into dataframe
movies = pd.read_csv('/My Drive/movies_metadata.csv', low_memory=False)


In [31]:
movies = pd.read_csv('movies_metadata.csv', low_memory=False)

##Find out average rating of a movie on IMDB(Out of 10)

In [32]:
Avg_vote = movies['vote_average'].mean()
print(Avg_vote)

5.618207215134213


##Find the minimum number of votes required to be in the chart

In [13]:
min_votes = movies['vote_count'].quantile(0.90)
print(min_votes)

160.0


##Filter out movies that satisfy minimum required votes condition

In [14]:
movies_filter = movies.copy().loc[movies['vote_count'] >= min_votes]
movies_filter.shape

(4555, 24)

##Weighted Rating Function using the IMDb formula

Reference: https://help.imdb.com/article/imdb/track-movies-tv/ratings-faq/G67Y87TFYYP6TWAV?ref_=helpms_helpart_inline#

The following formula is used to calculate the Top Rated 250 titles. This formula provides a true 'Bayesian estimate', which takes into account the number of votes each title has received, minimum votes required to be on the list, and the mean vote for all titles:

weighted rating (WR) = (v ÷ (v+m)) × R + (m ÷ (v+m)) × C

Where:

R = average for the movie (mean) = (rating)

v = number of votes for the movie = (votes)

m = minimum votes required to be listed in the Top Rated list (currently 25,000)

C = the mean vote across the whole report

In [15]:
def WR(x, m=min_votes, C=Avg_vote):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+min_votes) * R) + (min_votes/(min_votes+v) * Avg_vote)

In [16]:
movies_filter['score'] = movies_filter.apply(WR, axis=1)

##Sorting movies based on score obtained by applying above function


In [17]:
movies_filter = movies_filter.sort_values('score', ascending=False)
movies_top_250 = movies_filter[['title', 'vote_count', 'vote_average', 'score']].head(250)

In [18]:
movies_top_250.head(10)

Unnamed: 0,title,vote_count,vote_average,score
45425,The Shawshank Redemption,8358.0,8.5,8.445869
45408,The Godfather,6024.0,8.5,8.425439
45380,Dilwale Dulhania Le Jayenge,661.0,9.1,8.421453
45444,The Dark Knight,12269.0,8.3,8.265477
45432,Fight Club,9678.0,8.3,8.256385
45446,Pulp Fiction,8670.0,8.3,8.251406
45411,Schindler's List,4436.0,8.3,8.206639
45433,Whiplash,4376.0,8.3,8.205404
45406,Spirited Away,3968.0,8.3,8.196055
45403,Life Is Beautiful,3643.0,8.3,8.187171


In [20]:
movies_top_250.to_csv('output_top250.csv')

#***ALGORITHM 2***

#**Content Based Filtering**
#Features used: Genre, language, cast, keywords

In [21]:
!pip install numpy
!pip install scikit-surprise

Collecting scikit-surprise
[?25l  Downloading https://files.pythonhosted.org/packages/97/37/5d334adaf5ddd65da99fc65f6507e0e4599d092ba048f4302fe8775619e8/scikit-surprise-1.1.1.tar.gz (11.8MB)
[K     |████████████████████████████████| 11.8MB 253kB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1617620 sha256=ebf7cc986134e93497c4f2dbca78b633bbfece8dd46f14852757eb97c57c17a1
  Stored in directory: /root/.cache/pip/wheels/78/9c/3d/41b419c9d2aff5b6e2b4c0fc8d25c538202834058f9ed110d0
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.1


In [22]:
!pip install sklearn-recommender 

Collecting sklearn-recommender
  Downloading https://files.pythonhosted.org/packages/98/03/db8aca53881d49ec4209c79c93759b1319383ad0f9d86dead377b6e368d5/sklearn-recommender-0.1.5.tar.gz
Building wheels for collected packages: sklearn-recommender
  Building wheel for sklearn-recommender (setup.py) ... [?25l[?25hdone
  Created wheel for sklearn-recommender: filename=sklearn_recommender-0.1.5-cp37-none-any.whl size=17467 sha256=cc597b93cd708532c29a704df08aa6b3ada7ba172bf1a92f7c1e39f42e2d1f86
  Stored in directory: /root/.cache/pip/wheels/03/05/b9/f61552cb6d8789bb155c3ae362063a5834cef924994c939951
Successfully built sklearn-recommender
Installing collected packages: sklearn-recommender
Successfully installed sklearn-recommender-0.1.5


In [38]:
#Data download: https://www.kaggle.com/rounakbanik/the-movies-dataset/data?select=movies_metadata.csv
movies = spark.read.option("header", "true").csv("movies_metadata.csv")
ratings = spark.read.option("header", "true").csv("ratings.csv")
credits = spark.read.option("header", "true").csv("credits.csv")
keywords = spark.read.option("header", "true").csv("keywords.csv")

In [39]:
movies.registerTempTable("movies")
credits.registerTempTable("credits")
keywords.registerTempTable("keywords")

In [40]:
keywords = keywords.selectExpr("id as movieId", "keywords as keywords")
credits = credits.selectExpr("cast as cast","crew as crew","id as movieId")

In [41]:
#Join movies and credits on movieId
df1 = movies.join(credits,credits.movieId == movies.id,how='left') 

In [42]:
df1.show()

+-----+---------------------+------+--------------------+--------------------+------+---------+-----------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+------------+-------+-------+--------------------+--------+--------------------+--------------------+-----+------------+----------+--------------------+--------------------+-------+
|adult|belongs_to_collection|budget|              genres|            homepage|    id|  imdb_id|original_language|      original_title|            overview|popularity|         poster_path|production_companies|production_countries|release_date|revenue|runtime|    spoken_languages|  status|             tagline|               title|video|vote_average|vote_count|                cast|                crew|movieId|
+-----+---------------------+------+--------------------+--------------------+------+---------+-----------------+--------------------+--------------------+----------+------------

In [43]:
#Join the previous resultant table with keywords on the same movieId
left_join = df1.join(keywords,df1.id == keywords.movieId,how='left')

In [44]:
#Convert to Pandas dataframe
movies_pandas = left_join.toPandas()

In [45]:
movies_pandas.head(10)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,cast,crew,movieId,movieId.1,keywords
0,But when Ahmed,the perfect scapegoat for the attack,is charged,the gravitas of his accusation sucks Eddie in...,"marking the start of a dangerous freefall.""",1.574392,/hPObJrJhUw92DU5zeLGA3npkKnV.jpg,"[{'name': 'Thelma Films', 'id': 6844}]","[{'iso_3166_1': 'FR', 'name': 'France'}]",2/24/2016,0,107,"[{'iso_639_1': 'fr', 'name': 'Français'}]",Released,,A Decent Man,FALSE,6.1,7,,,,,,,,,,
1,FALSE,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10752, 'n...",,100010.0,tt0032477,en,Flight Command,"A rookie flyer, Ens. Alan Drake, joins the fam...",0.769266,/zrLmIoNozsKM4CkjXG1abOxOZj9.jpg,"[{'name': 'Metro-Goldwyn-Mayer (MGM)', 'id': 8...","[{'iso_3166_1': 'US', 'name': 'United States o...",12/27/1940,0,116,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"""T-H-R-I-L-L AMERICA! Here come The Flying """"H...",Flight Command,FALSE,6,1,,,,100010.0,"[{'id': 3203, 'name': 'pilot'}, {'id': 3588, '..."
2,FALSE,,37000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 14, 'nam...",,10096.0,tt0337563,en,13 Going on 30,After total humiliation at her thirteenth birt...,12.632595,/iSvz2Nk1vzgm7bEMqMazhQ1F3zA.jpg,"[{'name': 'Columbia Pictures', 'id': 5}, {'nam...","[{'iso_3166_1': 'US', 'name': 'United States o...",4/13/2004,96455697,98,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,"For some, 13 feels like it was just yesterday....",13 Going on 30,FALSE,6.3,1260,,,,10096.0,"[{'id': 242, 'name': 'new york'}, {'id': 1003,..."
3,FALSE,,0,"[{'id': 10770, 'name': 'TV Movie'}, {'id': 53,...",,103432.0,tt0070497,en,Outrage,One man decides to wage war against a gang of ...,0.109861,/3KkQs9mIdTKOw5tdnObuBfEgycj.jpg,"[{'name': 'ABC Circle Films', 'id': 2166}]","[{'iso_3166_1': 'US', 'name': 'United States o...",11/28/1973,0,74,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Outrage,FALSE,5.2,3,,,,103432.0,"[{'id': 387, 'name': 'california'}, {'id': 201..."
4,FALSE,"{'id': 135489, 'name': 'Wishmaster Collection'...",5000000,"[{'id': 27, 'name': 'Horror'}]",,10351.0,tt0120524,en,Wishmaster,The Djinn having been released from his ancien...,5.215107,/t7rep7SCqiy4COUsq2cOcdQwzU0.jpg,"[{'name': 'Live Entertainment', 'id': 285}, {'...","[{'iso_3166_1': 'US', 'name': 'United States o...",9/19/1997,15719109,90,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Be careful what you wish for.,Wishmaster,FALSE,5.6,114,,,,10351.0,"[{'id': 657, 'name': 'fire'}, {'id': 2250, 'na..."
5,FALSE,,34000000,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,10436.0,tt0106226,en,The Age of Innocence,Tale of 19th century New York high society in ...,8.013617,/xdkFwyufgtTR4E9rk4CL2r4l00a.jpg,"[{'name': 'Columbia Pictures', 'id': 5}]","[{'iso_3166_1': 'US', 'name': 'United States o...",9/17/1993,32255440,139,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,In a world of tradition. In an age of innocenc...,The Age of Innocence,FALSE,7,172,"[{'cast_id': 1, 'character': 'Newland Archer',...","[{'credit_id': '52fe43709251416c750106b3', 'de...",10436.0,10436.0,"[{'id': 213, 'name': 'upper class'}, {'id': 24..."
6,FALSE,,16000000,"[{'id': 53, 'name': 'Thriller'}, {'id': 878, '...",,1090.0,tt0139809,en,The Thirteenth Floor,Computer scientist Hannon Fuller has discovere...,9.356587,/19XUd3YoAY2cwWJnutvq5VEtiIO.jpg,"[{'name': 'Columbia Pictures', 'id': 5}, {'nam...","[{'iso_3166_1': 'US', 'name': 'United States o...",4/16/1999,18564088,100,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Question reality. You can go there even though...,The Thirteenth Floor,FALSE,6.8,381,,,,1090.0,"[{'id': 310, 'name': 'artificial intelligence'..."
7,FALSE,,0,"[{'id': 28, 'name': 'Action'}, {'id': 35, 'nam...",,11078.0,tt0271668,en,National Security,Two mismatched security guards are thrown toge...,6.409297,/5YFPkMgy5NCzUSYl5Ep8p0vK9qB.jpg,"[{'name': 'Columbia Pictures', 'id': 5}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1/17/2003,50097949,88,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,They only look like cops.,National Security,FALSE,5.5,244,,,,11078.0,"[{'id': 3016, 'name': 'security camera'}, {'id..."
8,FALSE,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",,11332.0,tt0154443,en,8 ½ Women,"""After the death of his wife, wealthy business...",Simato (Inoh),to waive her pachinko debts). They sign one-y...,another a kabuki performer,etc.). Philip soon becomes dominated by his f...,Palmira,who has no interest in Storey as a lover,despite what their contract might stipulate. ...,the concubines' contracts expire,and Storey is left alone with Giulietta (the ...,played by Fujiwara) and of course the money a...,1.687388,/9HXeaaFjiNYkkEHmbjnBFfgbqwp.jpg,"[{'name': 'Delux Productions', 'id': 1472}, {'...","[{'iso_3166_1': 'DE', 'name': 'Germany'}, {'is...",,,,11332.0,"[{'id': 494, 'name': 'father son relationship'..."
9,FALSE,,0,"[{'id': 36, 'name': 'History'}, {'id': 99, 'na...",,114524.0,tt0097099,en,Common Threads: Stories from the Quilt,"In the late 1970s, a mysterious new disease be...",0.544277,/tAmvWQl1R0uiydLnG0o9PgRyanS.jpg,"[{'name': 'New Yorker Films', 'id': 5448}, {'n...","[{'iso_3166_1': 'US', 'name': 'United States o...",12/1/1989,0,79,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Common Threads: Stories from the Quilt,FALSE,10,2,"[{'cast_id': 3, 'character': 'Narrator', 'cred...","[{'credit_id': '52fe4b4dc3a36847f81fd189', 'de...",114524.0,114524.0,[]


In [46]:
#Function to process the features in the dataset
import json

def process_feature(feature_txt):
  try:
    feature_txt = str(feature_txt).replace("'","\"")
    res = ''
    jdata = json.loads(feature_txt)
    for feat in jdata:
      genre = feat['name']
      res += str(genre) + ' '
    return res
  except Exception as e:
    return ''

In [47]:
#Applying the function to all required columns
movies_pandas['genres'] = movies_pandas['genres'].apply(lambda x: process_feature(x))
movies_pandas['spoken_languages'] = movies_pandas['spoken_languages'].apply(lambda x: process_feature(x))
movies_pandas['cast'] = movies_pandas['cast'].apply(lambda x: process_feature(x))
movies_pandas['keywords'] = movies_pandas['keywords'].apply(lambda x: process_feature(x))

In [48]:
df2 = movies_pandas.head(10000)

In [49]:
df2.to_csv('df2.csv')

##Declare the set of features based on which we will filter content of movies

In [50]:
features = ['genres','spoken_languages','cast','keywords']

In [51]:
def combine_features(row):
  return row['genres']+" "+row['spoken_languages']+" "+row['cast']+" "+row['keywords']

In [52]:
#Cleaning data
for feature in features:
  df2[feature] = df2[feature].fillna('')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [53]:
#Applying function to dataframe
df2['combined_features'] = df2.apply(combine_features,axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


##Combining above selected features

In [54]:
df2['combined_features'][1]

'Drama War  English   pilot navy '

##Importing libraries

In [55]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [56]:
#Using CountVectorizer to transform a corpora of text to term counts
cv = CountVectorizer()
count_matrix = cv.fit_transform(df2['combined_features'])

##Calculating cosine similarity

In [57]:
cosine_sim = cosine_similarity(count_matrix)

##Taking as input,  movie that the user likes

In [59]:
movies_user_likes = "Spider-Man 3"

##Defining functions to extract movie titles and index

In [60]:
def get_title_from_index(index):
  return df2[df2.index == index]["title"].values[0]

In [61]:
def get_index_from_title(title):
  return df2[df2.title == title].index.values[0]

In [62]:
movie_index = get_index_from_title(movies_user_likes)

In [63]:
similar_movies = list(enumerate(cosine_sim[movie_index]))

##Sorting the similar movies

In [64]:
sorted_similar_movies = sorted(similar_movies,key = lambda x:x[1],reverse = True)

In [65]:
df3 = pd.DataFrame(sorted_similar_movies)
df3.to_csv('similar_movies.csv')


##**Displaying the top 50 recommendations for the user that likes Spider-Man**

In [66]:
i=0
suggestion = ''
for movie in sorted_similar_movies:
  print(get_title_from_index(movie[0]))
  suggestion += get_title_from_index(movie[0])+'\n'
  i=i+1
  if i>50:
    break

Spider-Man 3
The Mummy: Tomb of the Dragon Emperor
The Story of Robin Hood and His Merrie Men
Soldiers of Fortune
Indiana Jones and the Temple of Doom
The Hire: Hostage
Blunt Force Trauma
Blunt Force Trauma
The One Warrior
Legendary: Tomb of the Dragon
Quest of the Delta Knights
Pokémon Heroes: Latios and Latias
Bionicle 3: Web of Shadows
The Navigator: A Medieval Odyssey
Maniac Cop 3: Badge of Silence
Steel
Percy Jackson & the Olympians: The Lightning Thief
The Cloth
Harry Potter and the Chamber of Secrets
Dragonlance: Dragons Of Autumn Twilight
24: Redemption
Agent for H.A.R.M.
Rampage
Inkheart
/3oGuv651I03RtLrlBRwn9sClaaN.jpg
Kickboxer 2:  The Road Back
A Dangerous Man
Beatdown
Allan Quatermain and the Lost City of Gold
Assault of the Sasquatch
Sharpe's Company
The Condemned 2
The Search for Santa Paws
Shaolin Temple
Gun Hill
The Golden Voyage of Sinbad
Battle For SkyArk
Freedom Strike
Skate Or Die
The Kill Hole
Rising Fear
Le Jaguar
Caged Heat
April Rain
Only the Strong
2012: Ice A

In [67]:
print(suggestion)

Spider-Man 3
The Mummy: Tomb of the Dragon Emperor
The Story of Robin Hood and His Merrie Men
Soldiers of Fortune
Indiana Jones and the Temple of Doom
The Hire: Hostage
Blunt Force Trauma
Blunt Force Trauma
The One Warrior
Legendary: Tomb of the Dragon
Quest of the Delta Knights
Pokémon Heroes: Latios and Latias
Bionicle 3: Web of Shadows
The Navigator: A Medieval Odyssey
Maniac Cop 3: Badge of Silence
Steel
Percy Jackson & the Olympians: The Lightning Thief
The Cloth
Harry Potter and the Chamber of Secrets
Dragonlance: Dragons Of Autumn Twilight
24: Redemption
Agent for H.A.R.M.
Rampage
Inkheart
/3oGuv651I03RtLrlBRwn9sClaaN.jpg
Kickboxer 2:  The Road Back
A Dangerous Man
Beatdown
Allan Quatermain and the Lost City of Gold
Assault of the Sasquatch
Sharpe's Company
The Condemned 2
The Search for Santa Paws
Shaolin Temple
Gun Hill
The Golden Voyage of Sinbad
Battle For SkyArk
Freedom Strike
Skate Or Die
The Kill Hole
Rising Fear
Le Jaguar
Caged Heat
April Rain
Only the Strong
2012: Ice A

##Export result into csv file

In [68]:
import io

In [69]:
data = io.StringIO(suggestion)
df = pd.read_csv(data, sep=",")

In [70]:
df.to_csv('suggestions.csv')

#***ALGORITHM 3***

#**Collaborative Filtering method**
#Item-Item collaborative filtering that suggests movies based on user history of movie ratings

In [72]:
#Loading Data into dataframe
movies = spark.read.option("header", "true").csv("movies_metadata.csv")
ratings = spark.read.option("header", "true").csv("ratings.csv")
credits = spark.read.option("header", "true").csv("credits.csv")

##Join movies and ratings on movieId

In [73]:
movies.registerTempTable("movies")
ratings.registerTempTable("ratings")
left_join = ratings.join(movies,ratings.movieId == movies.id,how='left') 

In [74]:
left_join = left_join.select('userId','title','rating')

In [75]:
left_join.show()

+------+--------------------+------+
|userId|               title|rating|
+------+--------------------+------+
|     1|   Three Colors: Red|   1.0|
|     1|       The 400 Blows|   4.5|
|     1|Sleepless in Seattle|   5.0|
|     1|                null|   5.0|
|     1|        Rocky Balboa|   5.0|
|     1|       Fools Rush In|   4.0|
|     1|/8teH96d4Hcg1BWwC...|   4.5|
|     1|                null|   5.0|
|     1|First came love.....|   4.0|
|     1|Shriek If You Kno...|   4.0|
|     1|                null|   5.0|
|     1|                null|   5.0|
|     1|                null|   4.0|
|     1|The Mystery of Ch...|   3.5|
|     1|Confession of a C...|   4.0|
|     1|                null|   5.0|
|     1|                null|   5.0|
|     1|                null|   5.0|
|     1|                null|   5.0|
|     1|                null|   5.0|
+------+--------------------+------+
only showing top 20 rows



In [76]:
top_ratings = pd.DataFrame(left_join.head(10000),columns=['userId','title','rating'])

##Installing packages

In [77]:
!pip install numpy
!pip install scikit-surprise
!pip install sklearn-recommender 
!pip install scipy



##Importing libraries

In [78]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

In [79]:
#Find out unique users and movies
unique_users = top_ratings.userId.unique()
unique_movies = top_ratings.title.unique()

In [80]:
print(f'No of unnique user {len(unique_users)}')
print(f'No of unnique movies {len(unique_movies)}')
top_ratings.head()

No of unnique user 120
No of unnique movies 1058


Unnamed: 0,userId,title,rating
0,1,Three Colors: Red,1.0
1,1,The 400 Blows,4.5
2,1,Sleepless in Seattle,5.0
3,1,,5.0
4,1,Rocky Balboa,5.0


In [82]:
am = pd.DataFrame(np.zeros(shape=(1058,120)), columns=top_ratings.userId.unique(), index=top_ratings.title.unique())

In [83]:
for index, row in top_ratings.iterrows():
  userid = row['userId']
  title = row['title']
  rating = row['rating']
  am.loc[title,userid] = float(rating)

In [84]:
#Taking transpose because we need item-item similarity to be calculated
am_trans = am.transpose()

##Obtaining userId versus movies matrix with user ratings for each movie

In [85]:
am_trans.head(3)

Unnamed: 0,Three Colors: Red,The 400 Blows,Sleepless in Seattle,NaN,Rocky Balboa,Fools Rush In,/8teH96d4Hcg1BWwCePXcuHrcYxw.jpg,First came love... then came Reverend Frank.,Shriek If You Know What I Did Last Friday the Thirteenth,The Mystery of Chess Boxing,Confession of a Child of the Century,Caesar Must Die,Four Rooms,Jarhead,Pirates of the Caribbean: Dead Man's Chest,Talk to Her,Hero,Donnie Darko,86,Night on Earth,A Nightmare on Elm Street,The Matrix Revolutions,Interview with the Vampire,Beauty and the Beast,Monty Python and the Holy Grail,The Passion of Joan of Arc,Almost Famous,Mrs. Doubtfire,"[{'iso_639_1': 'en', 'name': 'English'}]",Monsoon Wedding,Reservoir Dogs,Once Were Warriors,596349,I Love You to Death,My Super Ex-Girlfriend,Rebecca,Batman & Robin,The Departed,Meet the Parents,The Vanishing,...,Chicago,Battletruck,98410061,Pufnstuf,The Coast Guard,Dear Brigitte,The Lives of Others,Primal Fear,The Mummy Returns,Memoirs of a Geisha,"[{'iso_639_1': 'de', 'name': 'Deutsch'}]",Tanguy,Reindeer Games,Wet Hot American Summer,A Good Marriage,Extraordinary Illusions,The Pornographers,Zatôichi's Pilgrimage,Dollman vs. Demonic Toys,Monster High,95000,One in the Chamber,Chi lavora è perduto,Before Sunrise,Raiders of the Lost Ark,A Streetcar Named Desire,Faces,The Edukators,Death Proof,The War of the Roses,Broken Flowers,CATch her in IMAX,Trainspotting,Down by Law,Working Girl,Evil Dead II,The Hi-Lo Country,Young Adam,Batman Begins,Saw IV
1,1.0,4.5,5.0,5.0,5.0,4.0,4.5,4.0,4.0,3.5,4.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,3.0,4.0,4.0,3.0,4.0,5.0,4.0,4.0,4.0,4.0,3.0,3.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,4.0,3.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,2.0,4.0,3.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


##Calculating cosine similarities of the standardized ratings

In [86]:
#Standardize ratings of users that have too harsh or too easy ratings
def standardize(row):
  new_row = (row - row.mean())/(row.max() - row.min())
  return new_row

ratings_std = am_trans.apply(standardize)
item_similarity = cosine_similarity(ratings_std.T)
print(item_similarity)

[[ 1.          0.09302903  0.37062245 ...  0.11275382  0.11275382
   0.11275382]
 [ 0.09302903  1.          0.19137283 ... -0.0146567  -0.0146567
  -0.0146567 ]
 [ 0.37062245  0.19137283  1.         ...  0.19310355  0.19310355
   0.19310355]
 ...
 [ 0.11275382 -0.0146567   0.19310355 ...  1.          1.
   1.        ]
 [ 0.11275382 -0.0146567   0.19310355 ...  1.          1.
   1.        ]
 [ 0.11275382 -0.0146567   0.19310355 ...  1.          1.
   1.        ]]


In [87]:
ratings_std.shape

(120, 1058)

In [88]:
item_similarity_df = pd.DataFrame(item_similarity, index = am_trans.columns, columns=am_trans.columns)
item_similarity_df.head(20)

Unnamed: 0,Three Colors: Red,The 400 Blows,Sleepless in Seattle,NaN,Rocky Balboa,Fools Rush In,/8teH96d4Hcg1BWwCePXcuHrcYxw.jpg,First came love... then came Reverend Frank.,Shriek If You Know What I Did Last Friday the Thirteenth,The Mystery of Chess Boxing,Confession of a Child of the Century,Caesar Must Die,Four Rooms,Jarhead,Pirates of the Caribbean: Dead Man's Chest,Talk to Her,Hero,Donnie Darko,86,Night on Earth,A Nightmare on Elm Street,The Matrix Revolutions,Interview with the Vampire,Beauty and the Beast,Monty Python and the Holy Grail,The Passion of Joan of Arc,Almost Famous,Mrs. Doubtfire,"[{'iso_639_1': 'en', 'name': 'English'}]",Monsoon Wedding,Reservoir Dogs,Once Were Warriors,596349,I Love You to Death,My Super Ex-Girlfriend,Rebecca,Batman & Robin,The Departed,Meet the Parents,The Vanishing,...,Chicago,Battletruck,98410061,Pufnstuf,The Coast Guard,Dear Brigitte,The Lives of Others,Primal Fear,The Mummy Returns,Memoirs of a Geisha,"[{'iso_639_1': 'de', 'name': 'Deutsch'}]",Tanguy,Reindeer Games,Wet Hot American Summer,A Good Marriage,Extraordinary Illusions,The Pornographers,Zatôichi's Pilgrimage,Dollman vs. Demonic Toys,Monster High,95000,One in the Chamber,Chi lavora è perduto,Before Sunrise,Raiders of the Lost Ark,A Streetcar Named Desire,Faces,The Edukators,Death Proof,The War of the Roses,Broken Flowers,CATch her in IMAX,Trainspotting,Down by Law,Working Girl,Evil Dead II,The Hi-Lo Country,Young Adam,Batman Begins,Saw IV
Three Colors: Red,1.0,0.093029,0.370622,0.086811,0.310827,0.038245,0.17399,0.171466,0.183669,0.022435,0.084002,-0.040724,-0.058932,0.039404,0.074167,-0.047055,-0.066411,0.037733,0.311513,0.181528,0.310467,-0.047055,-0.007177,0.104636,-0.054224,0.247406,0.060762,0.186163,0.236376,0.502089,0.115114,0.4255,0.037685,0.308655,-0.047055,0.011609,-0.047055,-0.080527,0.094388,-0.094119,...,-0.047055,-0.047055,-0.047055,-0.065508,-0.047055,-0.047055,-0.047055,-0.047055,-0.047055,-0.047055,-0.047055,-0.047055,-0.047055,-0.047055,-0.047055,0.139389,0.192658,0.192658,0.192658,0.192658,0.192658,0.192658,0.192658,-0.047055,-0.065508,-0.047055,-0.047055,0.017728,0.219293,-0.047055,-0.047055,0.062222,-0.047055,-0.047055,-0.047055,0.03013,-0.047055,0.112754,0.112754,0.112754
The 400 Blows,0.093029,1.0,0.191373,0.067143,0.269985,0.248751,0.153082,0.138395,0.058616,0.237231,0.085211,0.426169,-0.044157,0.136375,0.235771,-0.014657,-0.020686,-0.038795,0.006813,-0.05663,0.105093,-0.014657,-0.029622,0.039301,-0.01689,-0.08868,-0.040943,0.176817,0.054291,0.010329,-0.055659,0.037526,-0.02553,0.225544,-0.014657,-0.05318,-0.014657,-0.025083,-0.025425,-0.029316,...,-0.014657,-0.014657,-0.014657,-0.020404,-0.014657,-0.014657,-0.014657,-0.014657,-0.014657,-0.014657,-0.014657,-0.014657,-0.014657,-0.014657,-0.014657,-0.014657,-0.014657,-0.014657,-0.014657,-0.014657,-0.014657,-0.014657,-0.014657,-0.014657,-0.020404,-0.014657,-0.014657,-0.020184,-0.014657,-0.014657,-0.014657,-0.020603,-0.014657,-0.014657,-0.014657,-0.020603,-0.014657,-0.014657,-0.014657,-0.014657
Sleepless in Seattle,0.370622,0.191373,1.0,-0.088621,0.203679,0.231707,0.356855,0.368484,0.223277,0.130377,0.156673,0.097073,-0.069021,0.172528,0.184909,-0.047774,-0.067426,-0.042245,0.369585,-0.127606,0.205707,-0.047774,-0.017039,0.228578,-0.055053,0.302851,0.062802,0.152977,0.513979,0.293488,0.158337,0.359492,0.130988,0.353624,0.144928,0.21748,-0.047774,0.006576,0.076309,-0.007957,...,-0.047774,-0.047774,-0.047774,-0.066509,-0.047774,-0.047774,-0.047774,-0.047774,-0.047774,-0.047774,-0.047774,-0.047774,-0.047774,-0.047774,-0.047774,0.169016,0.169016,0.169016,0.169016,0.169016,0.169016,0.169016,0.169016,-0.047774,-0.066509,-0.047774,-0.047774,0.058603,-0.047774,-0.047774,-0.047774,0.126329,-0.047774,-0.047774,-0.047774,0.077958,-0.047774,0.193104,0.193104,0.193104
,0.086811,0.067143,-0.088621,1.0,0.215807,0.00845,0.107831,0.026817,0.082867,0.064747,0.010998,0.11428,0.065618,-0.061019,-0.201613,-0.036326,-0.051269,-0.006192,0.056868,0.03861,-0.079881,-0.036326,-0.051573,0.014701,-0.029999,-0.015183,0.083024,-0.026516,-0.093652,0.06746,-0.028827,-0.086074,-0.017036,0.058096,-0.036326,0.086464,-0.036326,-0.029872,-0.0154,-0.020253,...,-0.216453,-0.216453,0.071751,-0.020479,-0.072351,-0.072351,-0.108376,-0.108376,-0.108376,-0.108376,-0.108376,-0.108376,-0.108376,-0.108376,-0.108376,0.071751,0.071751,0.071751,0.071751,0.071751,0.071751,0.071751,0.071751,0.035725,0.049735,0.035725,0.035725,0.111211,0.035725,0.035725,0.035725,0.050218,0.035725,0.035725,0.035725,0.050218,0.035725,0.035725,0.035725,0.035725
Rocky Balboa,0.310827,0.269985,0.203679,0.215807,1.0,0.464542,0.404237,0.249561,0.335132,0.230323,0.096497,0.244217,-0.107355,-0.010937,0.041492,-0.035633,-0.050291,-0.094318,0.221937,-0.036273,0.13395,-0.035633,0.034113,0.13164,-0.041062,0.122067,0.087564,0.156389,0.155002,0.219219,0.168457,0.264706,0.082355,0.2228,-0.035633,0.244878,-0.035633,-0.06098,0.065665,0.033956,...,-0.035633,-0.035633,-0.035633,0.165232,0.221568,0.221568,-0.035633,-0.035633,-0.035633,-0.035633,-0.035633,-0.035633,-0.035633,-0.035633,-0.035633,-0.035633,-0.035633,-0.035633,-0.035633,-0.035633,-0.035633,-0.035633,-0.035633,-0.035633,-0.049607,-0.035633,-0.035633,-0.04907,0.285868,-0.035633,-0.035633,-0.050089,-0.035633,-0.035633,-0.035633,-0.050089,-0.035633,-0.035633,-0.035633,-0.035633
Fools Rush In,0.038245,0.248751,0.231707,0.00845,0.464542,1.0,0.31656,0.243549,0.196154,0.261318,0.044815,0.264919,-0.104719,-0.004069,0.05072,-0.034758,-0.049057,-0.092003,0.14871,-0.003276,0.235474,-0.034758,-0.070249,0.225629,-0.040054,0.092619,0.123935,0.32149,0.259139,0.141631,0.283462,0.002805,0.051373,0.250214,0.168705,0.324256,-0.034758,-0.059483,0.29546,0.078465,...,-0.034758,-0.034758,-0.034758,0.178214,0.236527,0.236527,-0.034758,-0.034758,-0.034758,-0.034758,-0.034758,-0.034758,-0.034758,-0.034758,-0.034758,-0.034758,-0.034758,-0.034758,-0.034758,-0.034758,-0.034758,-0.034758,-0.034758,-0.034758,-0.048389,-0.034758,-0.034758,-0.047866,-0.034758,-0.034758,-0.034758,-0.048859,-0.034758,-0.034758,-0.034758,-0.048859,-0.034758,-0.034758,-0.034758,-0.034758
/8teH96d4Hcg1BWwCePXcuHrcYxw.jpg,0.17399,0.153082,0.356855,0.107831,0.404237,0.31656,1.0,0.289038,0.495068,0.102848,0.107975,0.10353,0.013317,0.009185,0.049725,-0.055065,-0.077717,-0.145753,0.149605,-0.071391,0.145894,-0.055065,-0.0509,0.229753,-0.063455,0.1785,0.013116,0.186518,0.229233,0.260207,0.140566,0.239536,-0.013736,0.278646,-0.055065,0.483139,0.201052,0.206315,0.314919,0.220516,...,-0.055065,-0.055065,0.201052,0.215716,0.124217,0.124217,-0.055065,-0.055065,-0.055065,-0.055065,-0.055065,-0.055065,-0.055065,-0.055065,-0.055065,-0.055065,-0.055065,-0.055065,-0.055065,-0.055065,-0.055065,-0.055065,-0.055065,-0.055065,-0.076659,-0.055065,-0.055065,-0.07583,-0.055065,-0.055065,-0.055065,-0.077404,-0.055065,-0.055065,-0.055065,-0.077404,-0.055065,-0.055065,-0.055065,-0.055065
First came love... then came Reverend Frank.,0.171466,0.138395,0.368484,0.026817,0.249561,0.243549,0.289038,1.0,0.413887,0.302529,0.515134,0.171833,-0.1074,-0.072037,-0.02679,-0.051827,-0.073147,-0.137182,0.397401,-0.06776,0.166228,-0.051827,0.001689,0.21022,-0.059724,0.276171,-0.028021,0.173495,0.28352,0.360188,0.286153,0.22905,0.054564,0.085358,-0.051827,0.052305,-0.051827,-0.006948,0.026102,0.096977,...,-0.051827,-0.051827,0.19895,0.214128,0.123717,0.123717,0.048483,0.048483,0.048483,0.048483,0.048483,0.048483,0.048483,0.048483,0.048483,-0.051827,0.173872,0.173872,0.173872,0.173872,0.173872,0.173872,0.173872,-0.051827,0.095427,-0.051827,-0.051827,-0.071371,-0.051827,0.148794,0.148794,0.048009,0.148794,0.148794,0.148794,-0.072853,-0.051827,-0.051827,-0.051827,-0.051827
Shriek If You Know What I Did Last Friday the Thirteenth,0.183669,0.058616,0.223277,0.082867,0.335132,0.196154,0.495068,0.413887,1.0,0.221085,0.213439,0.135079,-0.078836,-0.007854,-0.007316,-0.044323,-0.062556,-0.117319,0.21928,-0.025907,0.222175,-0.044323,0.003318,0.33984,-0.051076,0.224162,0.085818,0.029533,0.416848,0.287431,0.157376,0.245942,0.049212,0.058604,-0.044323,0.339122,0.124526,0.034228,0.285759,0.003454,...,-0.044323,-0.044323,0.237092,0.095005,-0.044323,-0.044323,-0.044323,-0.044323,-0.044323,-0.044323,-0.044323,-0.044323,-0.044323,-0.044323,-0.044323,-0.044323,-0.044323,-0.044323,-0.044323,-0.044323,-0.044323,-0.044323,-0.044323,-0.044323,-0.061705,-0.044323,-0.044323,-0.061037,-0.044323,-0.044323,-0.044323,-0.062304,-0.044323,-0.044323,-0.044323,-0.062304,-0.044323,-0.044323,-0.044323,-0.044323
The Mystery of Chess Boxing,0.022435,0.237231,0.130377,0.064747,0.230323,0.261318,0.102848,0.302529,0.221085,1.0,0.313088,0.298514,-0.050675,-0.044446,-0.041401,-0.01682,-0.023739,-0.044521,0.062419,-0.064988,0.123528,-0.01682,-0.033994,0.112554,-0.019383,0.042244,-0.046986,-0.049578,0.082902,0.104748,0.261201,0.002455,-0.029298,-0.041717,-0.01682,0.133285,-0.01682,-0.028785,-0.029178,-0.033643,...,-0.01682,-0.01682,-0.01682,-0.023416,-0.01682,-0.01682,-0.01682,-0.01682,-0.01682,-0.01682,-0.01682,-0.01682,-0.01682,-0.01682,-0.01682,-0.01682,-0.01682,-0.01682,-0.01682,-0.01682,-0.01682,-0.01682,-0.01682,-0.01682,-0.023416,-0.01682,-0.01682,-0.023163,-0.01682,-0.01682,-0.01682,-0.023644,-0.01682,-0.01682,-0.01682,-0.023644,-0.01682,-0.01682,-0.01682,-0.01682


##Function to get similar movies based on similarity score and previous user ratings

In [89]:
def get_similar_movies(movie_name, user_rating):
  # similar_score = item_similarity_df[movie_name]*(user_rating-item_similarity_df[movie_name].mean())
  similar_score = item_similarity_df[movie_name]*(user_rating-5)
  similar_score = similar_score.sort_values(ascending=False)
  return similar_score

print(get_similar_movies('Men in Black II', 8))

Men in Black II                             3.000000
Point Break                                 1.735708
The Talented Mr. Ripley                     1.674192
[{'iso_639_1': 'en', 'name': 'English'}]    1.641126
Say Anything...                             1.561881
                                              ...   
The Vanishing                              -0.291105
King Kong vs. Godzilla                     -0.291484
Summer in Berlin                           -0.301020
Dave Chappelle's Block Party               -0.327922
Summer Storm                               -0.379649
Name: Men in Black II, Length: 1058, dtype: float64


##Input from previous user ratings (movie,rating)

In [90]:
action_lover = [("Men in Black II",9),("Point Break",8),("The Lord of the Rings",2),("Armageddon",10),("Mr. Bean's Holiday",2),("The Yes Men",1)]

##Obtaining similar movies based on previously rated movies using similarity score 

In [91]:
similar_movies = pd.DataFrame()

for movie,rating in action_lover:
  similar_movies = similar_movies.append(get_similar_movies(movie,rating),ignore_index=True)

similar_movies.head()
op = similar_movies.sum().sort_values(ascending = False).to_frame()

In [92]:
most_rec = op.head(5)
least_rec = op.tail(5)

In [93]:
most_rec.to_csv('most_similar.csv')
least_rec.to_csv('least_similar.csv')