<a href="https://colab.research.google.com/github/lalesafarzade/Recommendation_system_Project/blob/main/Notebooks/5.best_movies_recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from ast import literal_eval
import requests
import matplotlib.pyplot as plt
import seaborn as sns
import gc
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
sns.set_style('white')
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import pickle

In [2]:
def memory_decreaser(df):
    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
                else:
                    if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                        df[col] = df[col].astype(np.float16)
                    elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        df[col] = df[col].astype(np.float32)
                    else:
                        df[col] = df[col].astype(np.float64)
    #else:
       # df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    
    return df

In [3]:
link_url="https://movielens1.s3.amazonaws.com/links.csv"
movies_ml_25_url="https://movielens1.s3.amazonaws.com/movies.csv"
movie_ml_25_rating_url="https://movielens1.s3.amazonaws.com/ratings.csv"
metadata_updated_url="https://movielens1.s3.amazonaws.com/metadata_updated.json"
metadata_rating_url="https://movielens1.s3.amazonaws.com/ratings.json"

In [4]:
rating_df=pd.read_json(metadata_rating_url, lines=True)
json_metadata_updated=pd.read_json(metadata_updated_url, lines=True)

In [5]:
json_metadata_updated=memory_decreaser(json_metadata_updated)
rating_df=memory_decreaser(rating_df)

Memory usage after optimization is: 3.23 MB
Memory usage after optimization is: 434.72 MB


In [6]:
json_metadata_updated.head(2)

Unnamed: 0,title,directedBy,starring,avgRating,imdbId,item_id
0,Toy Story (1995),John Lasseter,"Tim Allen, Tom Hanks, Don Rickles, Jim Varney,...",3.89146,114709,1
1,Jumanji (1995),Joe Johnston,"Jonathan Hyde, Bradley Pierce, Robin Williams,...",3.26605,113497,2


In [7]:
json_metadata_updated.head(2)
len(json_metadata_updated)

84661

In [8]:
df = pd.merge(rating_df,json_metadata_updated,on='item_id')
df.head(1)

Unnamed: 0,item_id,user_id,rating,title,directedBy,starring,avgRating,imdbId
0,5,997206,3.0,Father of the Bride Part II (1995),Charles Shyer,"Steve Martin, Martin Short, Diane Keaton, Kimb...",3.0762,113041


In [9]:
len(df)

28454882

In [10]:
df=df[(df['starring']!="")&(df['directedBy']!="")]

In [11]:
len(df)

28351332

In [12]:
del rating_df
del json_metadata_updated
gc.collect()

0

## Making a Ratings dataframe with average rating and number of ratings:

In [13]:
ratings = pd.DataFrame(df.groupby('title').mean())[['rating','avgRating']]
ratings['num of ratings'] = pd.DataFrame(df.groupby('title')['rating'].count())
ratings.head(2)

Unnamed: 0_level_0,rating,avgRating,num of ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
(2016),5.0,5.0,1
(2019),2.794118,2.66,17


## Filtering Qualified movies by IMDB formula


In [35]:
mean_vote_report= df['avgRating'].mean()
minimum_votes= ratings['num of ratings'].quantile(0.9)
def weighted_rating(x, m=minimum_votes, C=mean_vote_report):
    v = x['num of ratings']
    R = x['avgRating']
    # Calculation based on the IMDB formula
    return (v/(v+m) * R) + (m/(m+v) * C)

q_movies =ratings.copy().loc[ratings['num of ratings']>=minimum_votes]
q_movies['score'] = q_movies.apply(weighted_rating, axis= 1)
q_movies.sort_values(by= 'score', ascending= False, inplace= True)

q_movies

Unnamed: 0_level_0,rating,avgRating,num of ratings,score
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"Shawshank Redemption, The (1994)",4.423161,4.41985,98967,4.415954
"Godfather, The (1972)",4.332356,4.33078,61565,4.325158
"Usual Suspects, The (1995)",4.290029,4.27727,62749,4.272122
Planet Earth (2006),4.464010,4.45460,1681,4.264430
"Godfather: Part II, The (1974)",4.262693,4.26899,39373,4.260910
...,...,...,...,...
Spice World (1997),1.823346,1.82654,3204,2.030114
Glitter (2001),1.141522,1.15201,749,2.025512
Gigli (2003),1.203771,1.21653,822,2.016975
Epic Movie (2007),1.471042,1.45256,1295,1.974807


## Adding more features

In [36]:
df1=df[['directedBy','starring','imdbId','title']].drop_duplicates()
df2=df1.drop_duplicates(subset=['title'])
df2=df2.set_index('title')

In [37]:
q_movies=pd.concat([q_movies, df2], axis=1).dropna()
q_movies.head(2)

Unnamed: 0_level_0,rating,avgRating,num of ratings,score,directedBy,starring,imdbId
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"Shawshank Redemption, The (1994)",4.423161,4.41985,98967.0,4.415954,Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",111161
"Godfather, The (1972)",4.332356,4.33078,61565.0,4.325158,Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Richard ...",68646


## Filtering movies with score more than 3.5

In [38]:
q_movies=q_movies[q_movies["score"]>4]

In [39]:
q_movies

Unnamed: 0_level_0,rating,avgRating,num of ratings,score,directedBy,starring,imdbId
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"Shawshank Redemption, The (1994)",4.423161,4.41985,98967.0,4.415954,Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",111161
"Godfather, The (1972)",4.332356,4.33078,61565.0,4.325158,Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Richard ...",68646
"Usual Suspects, The (1995)",4.290029,4.27727,62749.0,4.272122,Bryan Singer,"Stephen Baldwin, Gabriel Byrne, Benicio Del To...",114814
Planet Earth (2006),4.464010,4.45460,1681.0,4.264430,Alastair Fothergill,David Attenborough,795176
"Godfather: Part II, The (1974)",4.262693,4.26899,39373.0,4.260910,Francis Ford Coppola,"Al Pacino, Robert Duvall, Diane Keaton, Robert...",71562
...,...,...,...,...,...,...,...
Singin' in the Rain (1952),4.022798,4.02005,12918.0,4.004071,"Stanley Donen, Gene Kelly","Gene Kelly, Donald O'Connor, Jean Hagen, Debbi...",45152
Manon of the Spring (Manon des sources) (1986),4.067402,4.06637,3264.0,4.003239,Claude Berri,"Yves Montand, Daniel Auteuil, Emmanuelle Béart...",91480
WALL·E (2008),4.007262,4.01027,28779.0,4.003112,Andrew Stanton,"Ben Burtt, Elissa Knight, Jeff Garlin",910970
The Imitation Game (2014),4.013223,4.01275,16600.0,4.000411,Morten Tyldum,"Benedict Cumberbatch,Keira Knightley,Matthew G...",2084970


## adding genres features from another dataset

In [40]:
link=pd.read_csv(link_url)
movies_ml_25=pd.read_csv(movies_ml_25_url)
movie_25=pd.merge(movies_ml_25,link,on='movieId')
movie_25.head()

Unnamed: 0,movieId,title,genres,imdbId,tmdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,15602.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357.0
4,5,Father of the Bride Part II (1995),Comedy,113041,11862.0


In [41]:
movie_25=movie_25.drop_duplicates()
movie_25=movie_25.drop_duplicates(subset=['title'])
movie_25=movie_25.set_index('title')
movie_25=movie_25.drop(['movieId','imdbId'],axis=1)
movie_25.head(2)

Unnamed: 0_level_0,genres,tmdbId
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,862.0
Jumanji (1995),Adventure|Children|Fantasy,8844.0


In [42]:
res=pd.concat([q_movies, movie_25], axis=1)

In [43]:
res=res.dropna()

In [44]:
res.head(2)

Unnamed: 0_level_0,rating,avgRating,num of ratings,score,directedBy,starring,imdbId,genres,tmdbId
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
"Shawshank Redemption, The (1994)",4.423161,4.41985,98967.0,4.415954,Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",111161.0,Crime|Drama,278.0
"Godfather, The (1972)",4.332356,4.33078,61565.0,4.325158,Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Richard ...",68646.0,Crime|Drama,238.0


In [46]:
res.isna().sum()

rating            0
avgRating         0
num of ratings    0
score             0
directedBy        0
starring          0
imdbId            0
genres            0
tmdbId            0
dtype: int64

In [47]:
best_movies=res.head(150)

In [51]:
#For Website
from google.colab import files
best_movies.to_csv('best_movies.csv',index=False)
files.download('best_movies.csv')
html=best_movies.to_html()
# write html to file
text_file = open("../templates/best_movies.html", "w")
text_file.write(html)
text_file.close()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>