# Recommendations Engine For Movies

The moving rating data used in this excercise was taken from [Grouplens](https://grouplens.org/datasets/movielens/).

## Importing Data

In [137]:
import pandas as pd
import numpy as np
import os
import re
from collections import Counter

In [138]:
base_dir = "./cornell-data/scale_whole_review/scale_whole_review/{}/txt.parag"

In [139]:
class DataLoader:
    base_dir = "./cornell-data/scale_whole_review/scale_whole_review/{}/txt.parag"
    review_titles = {}
    
    def __init__(self, name):
        self.name = name
        if name == "Dennis+Schwartz":
            self.title_setter = self.title_setter_by_director_token
        else:
            self.title_setter = self.title_setter_by_frequence
    
    def filling_reviews_titles(self):
        directory = self.base_dir.format(self.name)
        #Selecting an title setter:
        review_files = os.listdir(directory)
        for review_file in review_files:
            review_text = self.open_file(directory,review_file)
            review_id = int(re.split("\.",review_file)[0])
            s = self.title_setter(review_text, review_id)
            
    def open_file(self,directory,review_file):
        with open("{}/{}".format(directory,review_file), "rb") as file:
            text_review = file.read().decode('utf-8',errors='ignore')
        return text_review
    
    def title_setter_by_director_token(self,review_text, review_id): 
        try:
            title = re.findall('([A-Z,.\:\-\!\/\'\s\d]+)\s+.*\([dD]irect', review_text)
            self.review_titles[review_id] = re.split("\,",title[0])[0]
        except:
            pass

    def title_setter_by_frequence(self,review_text, review_id):
        try:
            words_array = re.findall('([A-Z][A-Z\:\-\!\/\'\s\d]+) [a-z]',review_text)
            title = Counter(words_array).most_common(1)[0][0]
            self.review_titles[review_id] = title
        except:
            pass

In [140]:
names = ["Dennis+Schwartz","James+Berardinelli","Scott+Renshaw","Steve+Rhodes"]
for name in names:
    data_loader = DataLoader(name)
    data_loader.filling_reviews_titles()

In [148]:
movies_df = pd.DataFrame([DataLoader.review_titles],index=["movieId"]).T
movies_df.reset_index(inplace=True)
movies_df.rename(columns={"index":"reviewId"},inplace=True)
movies_df["title"] = movies_df["movieId"]

In [142]:
id_base_dir = "./cornell-data/scale_data/scaledata/{}/id.{}"
rating_base_dir = "./cornell-data/scale_data/scaledata/{}/rating.{}"
ratings = pd.DataFrame([],columns=["reviewId","rating","userId"])
for name in names:
    rates = pd.read_table(rating_base_dir.format(name,name),names=["rating"])
    ids = pd.read_table(id_base_dir.format(name,name),names=["reviewId"])
    user_ratings = pd.concat([ids,rates],axis=1)
    user_ratings["userId"] = name
    ratings = pd.concat([ratings,user_ratings])
ratings_df = pd.merge(ratings,movies_df, on="reviewId")
ratings_df = ratings_df.groupby(["userId","movieId"]).head(1)

In [143]:
ratings_df_1 = pd.read_csv("./ml-latest-small/ratings.csv")
movies_df_1 = pd.read_csv("./ml-latest-small/movies.csv")

In [145]:
movies_df.head(2)

Unnamed: 0,reviewId,movieId
0,1858,THE SANDLOT
1,1859,RIFF-RAFF


In [147]:
movies_df_1.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [146]:
ratings_df_1.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179


## Creating the User Item Table

To create the User - Item Matrix lets use a pivot table.

In [130]:
R_df = ratings_df.pivot(index="userId", columns="movieId", values="rating").fillna(0)
R_df

movieId,UNMADE BEDS,13TH LETTER,15 MINUTES,200 CIGARETTES,2001: A SPACE ODYSSEY,3000 MILES TO GRACELAND,4D MAN,711 OCEAN DRIVE,8 1/2 WOMEN,8MM,...,YOU ONLY LIVE TWICE,YOU'VE GOT MAIL,YOUNG GIRLS OF ROCHEFORT,YOUR FRIENDS AND NEIGHBORS,Z,ZABRISKIE POINT,ZERO EFFECT,ZIP,ZOOLANDER,ZULU
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Dennis+Schwartz,0.7,0.4,0.4,0.3,0.9,0.3,0.5,0.5,0.3,0.4,...,0.0,0.0,0.8,0.0,0.5,0.8,0.6,0.0,0.0,0.8
James+Berardinelli,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.63,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0.0,0.0
Scott+Renshaw,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Steve+Rhodes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.7,0.8,0.0,0.8,0.0,0.0,0.0,0.7,0.8,0.0


## Singular Value Decomposition

In [127]:
R = R_df.as_matrix()
from scipy.sparse.linalg import svds
U,sigma,Vt = svds(R,k=np.min([(np.min(R.shape)-1),2]))

## Making Predictions

In [128]:
all_users_predicted_ratings = np.dot(np.dot(U,np.diag(sigma)),Vt)

In [129]:
all_users_predicted_ratings

array([[ 0.08573165,  0.04898951,  0.04898951, ...,  0.19458933,
         0.22238781,  0.09797903],
       [-0.11150762, -0.06371864, -0.06371864, ..., -0.02265939,
        -0.02589645, -0.12743728],
       [ 0.04861361,  0.0277792 ,  0.0277792 , ...,  0.2115271 ,
         0.24174525,  0.05555841],
       [ 0.19458933,  0.1111939 ,  0.1111939 , ...,  0.54843848,
         0.62678684,  0.22238781]])

## How to execute it

The below command executes the movies recommendation engine for the user with id 2, it making 6 recommendations, and showing the top 8 historical ratings.
```bash
python recommender.py '{"user id":2, "Recommendation limit": 3, "Historical limit":8}'
```

## References

1. [Generals on movies recommendation systems.](https://blog.statsbot.co/recommendation-system-algorithms-ba67f39ac9a3)
1. [Matrix factorization recommender.](https://beckernick.github.io/matrix-factorization-recommender/)
2. [A movie recommendation system inplemented on Spark.](https://www.packtpub.com/books/content/building-recommendation-engine-spark)
3. [About the Netflix recommendation system.](https://medium.com/netflix-techblog/netflix-recommendations-beyond-the-5-stars-part-1-55838468f429)
4. [Performance metrics.](https://en.wikipedia.org/wiki/Information_retrieval#Precision_at_K)
5. [Movie ratings dataset.](https://grouplens.org/datasets/movielens/)