### Goals
- Build a movie recommender system

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Loading dataset
movies_df = pd.read_csv("./ml-latest/movies.csv")
# links_df = pd.read_csv("./ml-latest/links.csv")
ratings_df = pd.read_csv("./ml-latest/ratings.csv")
# tags_df= pd.read_csv("./ml-latest/tags.csv")
movies_df.head()
print(ratings_df.head())

   userId  movieId  rating   timestamp
0       1      169     2.5  1204927694
1       1     2471     3.0  1204927438
2       1    48516     5.0  1204927435
3       2     2571     3.5  1436165433
4       2   109487     4.0  1436165496


In [3]:
movies_df[movies_df["title"] == "Gladiator (2000)"]

Unnamed: 0,movieId,title,genres
3488,3578,Gladiator (2000),Action|Adventure|Drama


In [4]:
print(movies_df.shape)
print(movies_df.size)
# print(movies_df.dtypes)
# print(movies_df.info())
# print(movies_df.describe())
print(ratings_df.shape)
print(ratings_df.size)
# print(ratings_df.dtypes)

(34208, 3)
102624
(22884377, 4)
91537508


In [5]:
temp_df = movies_df.head().copy()
temp_df["title"]
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,169,2.5,1204927694
1,1,2471,3.0,1204927438
2,1,48516,5.0,1204927435
3,2,2571,3.5,1436165433
4,2,109487,4.0,1436165496


In [6]:
# Preprocessing
# Collaborative filtering
# Constest based filtering
# Movie dataset from Kaggle
# Preprocessing
# cleaning the data
# Understand how to do collaborative filtering
# By using similarity score 
# print(movies_df["title"])
# Cleaning movies data
movies_df["year"] = movies_df.title.str.extract("(\(\d{4}\))", expand=False)
movies_df["year"] = movies_df.year.str.extract("(\d{4})", expand=False)
movies_df["title"] = movies_df.title.str.replace(r"(\(\d{4}\))", "", regex=True)
movies_df["title"] = movies_df["title"].apply(lambda x: x.strip())
movies_df = movies_df.drop("genres", axis=1)
movies_df

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995
...,...,...,...
34203,151697,Grand Slam,1967
34204,151701,Bloodmoney,2010
34205,151703,The Butterfly Circus,2009
34206,151709,Zero,2015


In [7]:
# ratings data cleaning
ratings_df= ratings_df.drop("timestamp", axis=1)
ratings_df.head()


Unnamed: 0,userId,movieId,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0
3,2,2571,3.5
4,2,109487,4.0


- Collaborating filtering to predict ratings
- Using Pearson Correlation Function

$$

$$

In [8]:
userInput = [
            {'title':'Breakfast Club, The', 'rating':5},
            {'title':'Toy Story', 'rating':3.5},
            {'title':'Jumanji', 'rating':2},
            {'title':"Pulp Fiction", 'rating':5},
            {'title':'Akira', 'rating':4.5}
         ] 
inputMovies = pd.DataFrame(userInput)
inputMovies

Unnamed: 0,title,rating
0,"Breakfast Club, The",5.0
1,Toy Story,3.5
2,Jumanji,2.0
3,Pulp Fiction,5.0
4,Akira,4.5


In [9]:
# find the row present in the movies based on the title present in the userInput
# then add that data to input
# removed not required field
inputMovies = pd.DataFrame(userInput)
inputMovies = movies_df[movies_df["title"].isin(inputMovies["title"].tolist())]
inputMovies = inputMovies.drop(["year"], axis=1)
inputMovies.shape
print(inputMovies)
movies_df.head()
ratings_df.head()

      movieId                title
0           1            Toy Story
1           2              Jumanji
293       296         Pulp Fiction
1246     1274                Akira
1885     1968  Breakfast Club, The


Unnamed: 0,userId,movieId,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0
3,2,2571,3.5
4,2,109487,4.0


In [10]:
# Users that watched same movies as inputMovies
users = ratings_df[ratings_df["movieId"].isin(inputMovies["movieId"].tolist())]
users.shape
users.head()


Unnamed: 0,userId,movieId,rating
19,4,296,4.0
441,12,1968,3.0
479,13,2,2.0
531,13,1274,5.0
681,14,296,2.0


In [14]:
# group the rows by userId
userSubset = users.groupby(["userId"])
userSubset.head()

Unnamed: 0,userId,movieId,rating
19,4,296,4.0
441,12,1968,3.0
479,13,2,2.0
531,13,1274,5.0
681,14,296,2.0
...,...,...,...
22883679,247738,296,4.0
22884132,247751,1,4.0
22884142,247751,296,4.0
22884164,247751,1274,5.0


In [17]:
# Looking into the users
userSubset.get_group(1130)

  userSubset.get_group(1130)


Unnamed: 0,userId,movieId,rating
104167,1130,1,0.5
104168,1130,2,4.0
104214,1130,296,4.0
104363,1130,1274,4.5
104443,1130,1968,4.5


In [19]:
# sorting the users with most in common
userSubset = sorted(userSubset, key=lambda x: len(x[1]), reverse=True)
userSubset[0:3]

[((75,),
        userId  movieId  rating
  7507      75        1     5.0
  7508      75        2     3.5
  7540      75      296     5.0
  7633      75     1274     4.5
  7673      75     1968     5.0),
 ((106,),
        userId  movieId  rating
  9083     106        1     2.5
  9084     106        2     3.0
  9115     106      296     3.5
  9198     106     1274     3.0
  9238     106     1968     3.5),
 ((686,),
         userId  movieId  rating
  61336     686        1     4.0
  61337     686        2     3.0
  61377     686      296     4.0
  61478     686     1274     4.0
  61569     686     1968     5.0)]

In [22]:
# Similarity of users to input users
userSubset = userSubset[0:100]

In [None]:
# Implementing pearson co-efficient
pearsonCorrelation = {}

for name, group in userSubset: