<a href="https://colab.research.google.com/github/lalesafarzade/Recommendation_system_Project/blob/lale/Notebooks/6.Recommendation_system_Content_Based.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from ast import literal_eval
import requests
import matplotlib.pyplot as plt
import seaborn as sns
import gc
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
sns.set_style('white')
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import pickle

In [2]:
def memory_decreaser(df):
    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
                else:
                    if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                        df[col] = df[col].astype(np.float16)
                    elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        df[col] = df[col].astype(np.float32)
                    else:
                        df[col] = df[col].astype(np.float64)
    #else:
       # df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    
    return df

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
rating_df=pd.read_json('/content/drive/MyDrive/genum_2021/ratings.json', lines=True)
json_metadata_updated=pd.read_json('/content/drive/MyDrive/genum_2021/metadata_updated.json', lines=True)

In [5]:
json_metadata_updated=memory_decreaser(json_metadata_updated)
rating_df=memory_decreaser(rating_df)

Memory usage after optimization is: 3.23 MB
Memory usage after optimization is: 434.72 MB


In [6]:
json_metadata_updated.head(2)

Unnamed: 0,title,directedBy,starring,avgRating,imdbId,item_id
0,Toy Story (1995),John Lasseter,"Tim Allen, Tom Hanks, Don Rickles, Jim Varney,...",3.89146,114709,1
1,Jumanji (1995),Joe Johnston,"Jonathan Hyde, Bradley Pierce, Robin Williams,...",3.26605,113497,2


In [7]:
json_metadata_updated.head(2)
len(json_metadata_updated)

84661

In [8]:
df = pd.merge(rating_df,json_metadata_updated,on='item_id')
df.head(1)

Unnamed: 0,item_id,user_id,rating,title,directedBy,starring,avgRating,imdbId
0,5,997206,3.0,Father of the Bride Part II (1995),Charles Shyer,"Steve Martin, Martin Short, Diane Keaton, Kimb...",3.0762,113041


In [9]:
len(df)

28454882

In [10]:
df=df[(df['starring']!="")&(df['directedBy']!="")]

In [11]:
len(df)

28351332

In [15]:
del rating_df
del json_metadata_updated
gc.collect()

587

## Making a Ratings dataframe with average rating and number of ratings:

In [16]:
ratings = pd.DataFrame(df.groupby('title').mean())[['rating','avgRating']]
ratings['num of ratings'] = pd.DataFrame(df.groupby('title')['rating'].count())
ratings.head(2)

Unnamed: 0_level_0,rating,avgRating,num of ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
(2016),5.0,5.0,1
(2019),2.794118,2.66,17


## Filtering Qualified movies by IMDB formula


In [207]:
mean_vote_report= df['avgRating'].mean()
minimum_votes= ratings['num of ratings'].quantile(0.7)
def weighted_rating(x, m=minimum_votes, C=mean_vote_report):
    v = x['num of ratings']
    R = x['avgRating']
    # Calculation based on the IMDB formula
    return (v/(v+m) * R) + (m/(m+v) * C)

q_movies =ratings.copy().loc[ratings['num of ratings']>=minimum_votes]
q_movies['score'] = q_movies.apply(weighted_rating, axis= 1)
q_movies.sort_values(by= 'score', ascending= False, inplace= True)

q_movies

Unnamed: 0_level_0,rating,avgRating,num of ratings,score
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Planet Earth (2006),4.464010,4.45460,1681,4.442114
"Shawshank Redemption, The (1994)",4.423161,4.41985,98967,4.419643
Band of Brothers (2001),4.389901,4.41643,1317,4.401207
"Godfather, The (1972)",4.332356,4.33078,61565,4.330481
Parasite (2019),4.226323,4.33892,2059,4.329979
...,...,...,...,...
Son of the Mask (2005),1.242120,1.24212,698,1.315089
Gigli (2003),1.203771,1.21653,822,1.279488
Glitter (2001),1.141522,1.15201,749,1.222843
From Justin to Kelly (2003),1.010684,1.02160,468,1.139080


## Adding more features

In [208]:
df1=df[['directedBy','starring','imdbId','title']].drop_duplicates()
df2=df1.drop_duplicates(subset=['title'])
df2=df2.set_index('title')

In [209]:
q_movies=pd.concat([q_movies, df2], axis=1).dropna()
q_movies.head(2)

Unnamed: 0_level_0,rating,avgRating,num of ratings,score,directedBy,starring,imdbId
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Planet Earth (2006),4.46401,4.4546,1681.0,4.442114,Alastair Fothergill,David Attenborough,795176
"Shawshank Redemption, The (1994)",4.423161,4.41985,98967.0,4.419643,Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",111161


## Filtering movies with score more than 3.5

In [219]:
q_movies=q_movies[q_movies["score"]>3.2]

In [220]:
q_movies

Unnamed: 0_level_0,rating,avgRating,num of ratings,score,directedBy,starring,imdbId
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Planet Earth (2006),4.464010,4.45460,1681.0,4.442114,Alastair Fothergill,David Attenborough,795176
"Shawshank Redemption, The (1994)",4.423161,4.41985,98967.0,4.419643,Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",111161
Band of Brothers (2001),4.389901,4.41643,1317.0,4.401207,Phil Alden Robinson,"Damian Lewis,Ron Livingston,Frank John Hughes,...",185906
"Godfather, The (1972)",4.332356,4.33078,61565.0,4.330481,Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Richard ...",68646
Parasite (2019),4.226323,4.33892,2059.0,4.329979,Bong Joon-ho,"Song Kang-ho,Lee Sun-kyun,Cho Yeo-jeong,Choi W...",6751668
...,...,...,...,...,...,...,...
Foxfire (1996),3.194338,3.19286,1042.0,3.200131,Annette Haywood-Carter,"Hedy Burress, Angelina Jolie, Jenny Lewis, Jen...",116353
Scouts Guide to the Zombie Apocalypse (2015),3.160000,3.16981,250.0,3.200117,Christopher Landon,"Tye Sheridan,Logan Miller,Joey Morgan,Sarah Du...",1727776
Berlin Syndrome (2017),3.080000,3.09906,75.0,3.200091,Cate Shortland,"Teresa Palmer,Max Riemelt,Matthias Habich,Emma...",3335606
Kevin Smith: Too Fat For 40 (2010),2.925926,2.91935,27.0,3.200038,Joey Figueroa,Kevin Smith,1705115


## adding genres features from another dataset

In [221]:
link=pd.read_csv('/content/drive/MyDrive/links.csv')
movies_ml_25=pd.read_csv('/content/drive/MyDrive/movies.csv')
movie_25=pd.merge(movies_ml_25,link,on='movieId')
movie_25.head()

Unnamed: 0,movieId,title,genres,imdbId,tmdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,15602.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357.0
4,5,Father of the Bride Part II (1995),Comedy,113041,11862.0


In [222]:
movie_25=movie_25.drop_duplicates()
movie_25=movie_25.drop_duplicates(subset=['title'])
movie_25=movie_25.set_index('title')
movie_25=movie_25.drop(['movieId','imdbId'],axis=1)
movie_25.head(2)

Unnamed: 0_level_0,genres,tmdbId
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,862.0
Jumanji (1995),Adventure|Children|Fantasy,8844.0


In [261]:
res=pd.concat([q_movies, movie_25], axis=1)

In [262]:
res=res.dropna()

In [263]:
res.head(2)

Unnamed: 0_level_0,rating,avgRating,num of ratings,score,directedBy,starring,imdbId,genres,tmdbId
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Planet Earth (2006),4.46401,4.4546,1681.0,4.442114,Alastair Fothergill,David Attenborough,795176.0,Documentary,192040.0
"Shawshank Redemption, The (1994)",4.423161,4.41985,98967.0,4.419643,Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",111161.0,Crime|Drama,278.0


## String Preprocessing

In [248]:
def text_preprocessing(x):
  if isinstance(x, str):
    return x.replace(" ", "").lower()
  else:
    return ''

In [249]:
res1=res.copy()
features = ['directedBy', 'starring', 'genres']

for feature in features:
    res1[feature] = res1[feature].apply(text_preprocessing)

In [250]:

res1['genres'] = res1['genres'].str.split('|').str.join(' ')
res1['directedBy'] = res1['directedBy'].str.split(',').str.join(' ')
res1['starring'] = res1['starring'].str.split(',').str.join(' ')
def keyword_creater(x):
    
    return x['starring']+ ' ' +x['directedBy'] + ' '+ x['genres']

res['keyword'] = res1.apply(keyword_creater, axis=1)
keyword_df=res[['keyword']]
keyword_df.head(2)

Unnamed: 0_level_0,keyword
title,Unnamed: 1_level_1
Planet Earth (2006),davidattenborough alastairfothergill documentary
"Shawshank Redemption, The (1994)",timrobbins morganfreeman bobgunton williamsadl...


In [251]:
res

Unnamed: 0_level_0,rating,avgRating,num of ratings,score,directedBy,starring,imdbId,genres,tmdbId,keyword
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Planet Earth (2006),4.464010,4.45460,1681.0,4.442114,Alastair Fothergill,David Attenborough,795176.0,Documentary,192040.0,davidattenborough alastairfothergill documentary
"Shawshank Redemption, The (1994)",4.423161,4.41985,98967.0,4.419643,Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",111161.0,Crime|Drama,278.0,timrobbins morganfreeman bobgunton williamsadl...
Band of Brothers (2001),4.389901,4.41643,1317.0,4.401207,Phil Alden Robinson,"Damian Lewis,Ron Livingston,Frank John Hughes,...",185906.0,Action|Drama|War,331214.0,damianlewis ronlivingston frankjohnhughes scot...
"Godfather, The (1972)",4.332356,4.33078,61565.0,4.330481,Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Richard ...",68646.0,Crime|Drama,238.0,marlonbrando alpacino jamescaan richards.caste...
Parasite (2019),4.226323,4.33892,2059.0,4.329979,Bong Joon-ho,"Song Kang-ho,Lee Sun-kyun,Cho Yeo-jeong,Choi W...",6751668.0,Comedy|Drama,496243.0,songkang-ho leesun-kyun choyeo-jeong choiwoo-s...
...,...,...,...,...,...,...,...,...,...,...
Foxfire (1996),3.194338,3.19286,1042.0,3.200131,Annette Haywood-Carter,"Hedy Burress, Angelina Jolie, Jenny Lewis, Jen...",116353.0,Drama,18555.0,hedyburress angelinajolie jennylewis jennyshim...
Scouts Guide to the Zombie Apocalypse (2015),3.160000,3.16981,250.0,3.200117,Christopher Landon,"Tye Sheridan,Logan Miller,Joey Morgan,Sarah Du...",1727776.0,Action|Comedy|Horror,273477.0,tyesheridan loganmiller joeymorgan sarahdumont...
Berlin Syndrome (2017),3.080000,3.09906,75.0,3.200091,Cate Shortland,"Teresa Palmer,Max Riemelt,Matthias Habich,Emma...",3335606.0,Drama|Thriller,363126.0,teresapalmer maxriemelt matthiashabich emmabad...
Kevin Smith: Too Fat For 40 (2010),2.925926,2.91935,27.0,3.200038,Joey Figueroa,Kevin Smith,1705115.0,Comedy,48132.0,kevinsmith joeyfigueroa comedy


## CountVectorizer

In [252]:
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(keyword_df['keyword'])
count_matrix.shape

(11428, 29882)

## Cosine_similarity

In [253]:
my_cosine= cosine_similarity(count_matrix, count_matrix)
my_cosine.shape

(11428, 11428)

## Recommending

In [267]:
res=res.reset_index()
res2=res.sort_values(by='title')
indices = pd.Series(index=res2['title'], data = res2.index).drop_duplicates()
indices

title
$9.99 (2008)                                     10775
'71 (2014)                                        3078
'Round Midnight (1986)                            3556
'Twas the Night Before Christmas (1974)           6748
'night Mother (1986)                              6383
                                                 ...  
eXistenZ (1999)                                   8576
loudQUIETloud: A Film About the Pixies (2006)     2380
À nos amours (1983)                               6024
À nous la liberté (Freedom for Us) (1931)         4271
Я худею (2018)                                    8903
Length: 11428, dtype: int64

In [268]:
def recommendations(title, cosine_sim= my_cosine,k=10):
    
    sim_scores = list(enumerate(cosine_sim[indices[title]]))
    sim_scores = sorted(sim_scores, key= lambda x : x[1], reverse= True)
    sim_scores = sim_scores[1:k+1]
    movie_indices = [i[0] for i in sim_scores]
    return res.iloc[movie_indices][['title','score']],

In [269]:
recommendations('Sabrina (1995)',my_cosine,4)

(                          title     score
 8109          Ghost Town (2008)  3.383095
 3655             Tootsie (1982)  3.619436
 1618  As Good as It Gets (1997)  3.775847
 9777       Morning Glory (2010)  3.295055,)

In [None]:
recommendations('Godfather, The (1972)',my_cosine,6)

## checking the recommendation

In [270]:
title=['Sabrina (1995)','Ghost Town (2008)','Tootsie (1982)','As Good as It Gets (1997)','Morning Glory (2010)']

In [272]:
res[res['title'].isin(title)][['title','directedBy','starring','genres']]

Unnamed: 0,title,directedBy,starring,genres
1618,As Good as It Gets (1997),James L. Brooks,"Cuba Gooding Jr., Greg Kinnear, Jack Nicholson...",Comedy|Drama|Romance
3655,Tootsie (1982),Sydney Pollack,"Dustin Hoffman, Jessica Lange, Teri Garr, Dabn...",Comedy|Romance
8109,Ghost Town (2008),David Koepp,"Greg Kinnear, Ricky Gervais, Téa Leoni",Comedy|Fantasy|Romance
8310,Sabrina (1995),Sydney Pollack,"Harrison Ford, Greg Kinnear, Nancy Marchand, J...",Comedy|Romance
9777,Morning Glory (2010),Roger Michell,"Rachel McAdams, Harrison Ford, Diane Keaton, J...",Comedy|Drama|Romance


## Pickling

In [273]:
res_dict=res.to_dict('records')

In [274]:
from google.colab import files
pickle.dump(res_dict,open('res_dict.pkl',"wb"), protocol=4)

files.download('res_dict.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [275]:
indices_dict=indices.to_dict()
pickle.dump(indices_dict,open('indices.pkl',"wb"))

files.download('indices.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [276]:
import bz2
import pickle
import _pickle as cPickle
def compressed_pickle(title, data):
  with bz2.BZ2File(title + '.pbz2',"wb") as f: 
    cPickle.dump(data, f)

compressed_pickle('my_cosine', my_cosine) 

files.download('my_cosine.pbz2')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Load any compressed pickle file
#def decompress_pickle(file):
  #data = bz2.BZ2File(file, "rb")
  #data = cPickle.load(data)
  #return data


#data = decompress_pickle('/content/my_cosinemy_cosine.pbz2') 