<a href="https://colab.research.google.com/github/lalesafarzade/Recommendation_system_Project/blob/lale/Notebooks/6.Recommendation_system_Content_Based.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from ast import literal_eval
import requests
import matplotlib.pyplot as plt
import seaborn as sns
import gc
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
sns.set_style('white')
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import pickle
#from configs import api_key

In [2]:
def memory_decreaser(df):
    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
                else:
                    if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                        df[col] = df[col].astype(np.float16)
                    elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        df[col] = df[col].astype(np.float32)
                    else:
                        df[col] = df[col].astype(np.float64)
    #else:
       # df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    
    return df

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Reading files

In [4]:
movie_df=pd.read_csv('/content/drive/MyDrive/movies.csv')
link_df=pd.read_csv('/content/drive/MyDrive/links.csv')
rating_df=pd.read_csv('/content/drive/MyDrive/ratings.csv')
json_metadata=pd.read_json('/content/drive/MyDrive/metadata_updated.json', lines=True)

In [5]:
movie=pd.merge(link_df, movie_df, on="movieId")
movie.drop(['tmdbId'],axis=1,inplace=True)
df1=json_metadata[['imdbId','directedBy','starring','avgRating']]
mvielense_df = pd.merge(movie, df1, on=["imdbId"],how='inner')
mvielense_df.head(2)

Unnamed: 0,movieId,imdbId,title,genres,directedBy,starring,avgRating
0,1,114709,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,John Lasseter,"Tim Allen, Tom Hanks, Don Rickles, Jim Varney,...",3.89146
1,2,113497,Jumanji (1995),Adventure|Children|Fantasy,Joe Johnston,"Jonathan Hyde, Bradley Pierce, Robin Williams,...",3.26605


In [6]:
del movie_df
del link_df
del json_metadata
del movie

gc.collect()


50

In [7]:
mvielense_df=memory_decreaser(mvielense_df)

Memory usage after optimization is: 3.33 MB


In [8]:
df = pd.merge(rating_df,mvielense_df,on='movieId')
df.head(1)

Unnamed: 0,userId,movieId,rating,timestamp,imdbId,title,genres,directedBy,starring,avgRating
0,1,296,5.0,1147880044,110912,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,Quentin Tarantino,"John Travolta, Samuel L. Jackson, Tim Roth, Am...",4.1876


In [9]:
df=memory_decreaser(df)

Memory usage after optimization is: 1716.58 MB


#### Making a Ratings dataframe with average rating and number of ratings:

In [10]:
ratings = pd.DataFrame(df.groupby('title').mean())[['rating','avgRating']]
ratings['num of ratings'] = pd.DataFrame(df.groupby('title')['rating'].count())
ratings.head(2)

Unnamed: 0_level_0,rating,avgRating,num of ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"""BLOW THE NIGHT!"" Let's Spend the Night Together (1983)",3.0,3.0,1
"""Great Performances"" Cats (1998)",2.896648,2.87205,179


## Filtering Qualified movies by IMDB formula

In [12]:
mean_vote_report= df['avgRating'].mean()
minimum_votes= ratings['num of ratings'].quantile(0.6)
def weighted_rating(x, m=minimum_votes, C=mean_vote_report):
    v = x['num of ratings']
    R = x['avgRating']
    # Calculation based on the IMDB formula
    return (v/(v+m) * R) + (m/(m+v) * C)

q_movies =ratings.copy().loc[ratings['num of ratings']>=minimum_votes]
q_movies['score'] = q_movies.apply(weighted_rating, axis= 1)
q_movies.sort_values(by= 'score', ascending= False, inplace= True)
q_movies=q_movies.reset_index()
q_movies


Unnamed: 0,title,rating,avgRating,num of ratings,score
0,Planet Earth II (2016),4.483096,4.46924,1124,4.460998
1,Planet Earth (2006),4.464797,4.45460,1747,4.449364
2,"Shawshank Redemption, The (1994)",4.413576,4.41985,81482,4.419741
3,Band of Brothers (2001),4.398599,4.41643,1356,4.409974
4,Cosmos,4.326715,4.37227,277,4.343081
...,...,...,...,...,...
24309,Son of the Mask (2005),1.232227,1.24212,633,1.277772
24310,Gigli (2003),1.214380,1.21653,758,1.246713
24311,Glitter (2001),1.125561,1.15201,669,1.187099
24312,From Justin to Kelly (2003),1.027578,1.02160,417,1.080451


## Merging dataframes to have more features

In [13]:
df2=mvielense_df[['title','genres','imdbId','directedBy','starring','avgRating']]
df2[df2['title'].isin(q_movies['title'].to_list())]
movies_df = pd.merge(q_movies,df2[['title','genres','directedBy','starring']],on='title')


In [16]:
movies_df=movies_df[(movies_df["score"]>3)&((movies_df['starring']!="")&(movies_df['directedBy']!="")&(movies_df['genres']!="(nogenreslisted)"))]
res=movies_df[['title','genres','directedBy',"starring","score"]]
res.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18474 entries, 1 to 19257
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   title       18474 non-null  object 
 1   genres      18474 non-null  object 
 2   directedBy  18474 non-null  object 
 3   starring    18474 non-null  object 
 4   score       18474 non-null  float64
dtypes: float64(1), object(4)
memory usage: 866.0+ KB


In [17]:
def text_preprocessing(x):
  if isinstance(x, str):
    return x.replace(" ", "").lower()
  else:
    return ''


In [18]:
features = ['directedBy', 'starring', 'genres']

for feature in features:
    res[feature] = res[feature].apply(text_preprocessing)



In [19]:
res.head(4)

Unnamed: 0,title,genres,directedBy,starring,score
1,Planet Earth (2006),documentary,alastairfothergill,davidattenborough,4.449364
2,"Shawshank Redemption, The (1994)",crime|drama,frankdarabont,"timrobbins,morganfreeman,bobgunton,williamsadl...",4.419741
3,Band of Brothers (2001),action|drama|war,philaldenrobinson,"damianlewis,ronlivingston,frankjohnhughes,scot...",4.409974
5,"Godfather, The (1972)",crime|drama,francisfordcoppola,"marlonbrando,alpacino,jamescaan,richards.caste...",4.330628


In [20]:
res['genres'] = res['genres'].str.split('|').str.join(' ')
res['directedBy'] = res['directedBy'].str.split(',').str.join(' ')
res['starring'] = res['starring'].str.split(',').str.join(' ')
def keyword_creater(x):
    
    return x['starring']+ ' ' +x['directedBy'] + ' '+ x['genres']

res['keyword'] = res.apply(keyword_creater, axis=1)
keyword_df=res[['keyword']]
keyword_df.head(2)

Unnamed: 0,keyword
1,davidattenborough alastairfothergill documentary
2,timrobbins morganfreeman bobgunton williamsadl...


## CountVectorizer

In [21]:
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(keyword_df['keyword'])
count_matrix.shape

(18474, 44290)

## Cosine_similarity

In [22]:

my_cosine= cosine_similarity(count_matrix, count_matrix)
my_cosine.shape

(18474, 18474)

In [23]:
indices = pd.Series(index=mvielense_df['title'], data = mvielense_df.index).drop_duplicates()
indices

title
Toy Story (1995)                          0
Jumanji (1995)                            1
Grumpier Old Men (1995)                   2
Waiting to Exhale (1995)                  3
Father of the Bride Part II (1995)        4
                                      ...  
We (2018)                             62412
Window of the Soul (2001)             62413
Bad Poems (2018)                      62414
A Girl Thing (2001)                   62415
Women of Devil's Island (1962)        62416
Length: 62417, dtype: int64

In [24]:
indices_dict=indices.to_dict()

In [25]:
def get_recommendations(title, cosine_sim= my_cosine,k=10):
    
    sim_scores = list(enumerate(cosine_sim[indices[title]]))
    sim_scores = sorted(sim_scores, key= lambda x : x[1], reverse= True)
    sim_scores = sim_scores[1:k+1]
    movie_indices = [i[0] for i in sim_scores]
    return res.iloc[movie_indices][['title','score']]

In [26]:
get_recommendations('Love and Honor (2006)',my_cosine,20)

Unnamed: 0,title,score
10725,Pal Joey (1957),3.382752
13577,Show Boat (1951),3.272216
973,Once (2006),3.876306
11566,Paris 36 (Faubourg 36) (2008),3.350385
4764,God Help the Girl (2014),3.609403
5226,Jodhaa Akbar (2008),3.591112
6276,Listen to Your Heart (2010),3.547979
8204,San Francisco (1936),3.47477
9999,Hipsters (Stilyagi) (2008),3.409883
14433,Alexander's Ragtime Band (1938),3.238847


## Pickling

In [27]:
rse_dict=res.to_dict('records')

In [28]:
indices_dict=indices.to_dict()

In [29]:
my_cosine.shape

(18474, 18474)

In [30]:
pickle.dump(indices_dict,open('indices.pkl',"wb"))
from google.colab import files
files.download('indices.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [31]:
import bz2
import pickle
import _pickle as cPickle

In [32]:
# Pickle a file and then compress it into a file with extension 
def compressed_pickle(title, data):
  with bz2.BZ2File(title + '.pbz2',"wb") as f: 
    cPickle.dump(data, f)

compressed_pickle('my_cosine', my_cosine) 


pickle.dump(my_cosine,open('my_cosine.pkl',"wb"), protocol=4)

files.download('my_cosine.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [35]:
# Load any compressed pickle file
def decompress_pickle(file):
  data = bz2.BZ2File(file, "rb")
  data = cPickle.load(data)
  return data


data = decompress_pickle('/content/my_cosinemy_cosine.pbz2') 

In [38]:
pickle.dump(rse_dict,open('rse_dict.pkl',"wb"), protocol=4)

files.download('rse_dict.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>