### Creating a recommendation system based on the content simalirity (Content Based)

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import warnings
warnings.filterwarnings("ignore")

In [2]:
df=pd.read_csv("updated_webseries_dataset.csv")

In [3]:
df.head()

Unnamed: 0,Series Title,Year Released,Content Rating,IMDB Rating,R Rating,Genre,Description,No of Seasons,Streaming Platform,Series Id
0,Breaking Bad,2008,18+,9.5,100,"Crime,Drama","When Walter White, a New Mexico chemistry teac...",5Seasons,Netflix,1396
1,Game of Thrones,2011,18+,9.3,99,"Action & Adventure,Drama",Seven noble families fight for control of the ...,8Seasons,"HBO MAX,HBO",1399
2,Rick and Morty,2013,18+,9.2,97,"Animation,Comedy",Rick is a mentally-unbalanced but scientifical...,4Seasons,"Free Services,HBO MAX,Hulu",60625
3,Stranger Things,2016,16+,8.8,96,"Drama,Fantasy","When a young boy vanishes, a small town uncove...",3Seasons,Netflix,66732
4,The Boys,2019,18+,8.7,95,"Action & Adventure,Comedy",A group of vigilantes known informally as “The...,2Seasons,Prime Video,76479


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11025 entries, 0 to 11024
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Series Title        11025 non-null  object 
 1   Year Released       11025 non-null  int64  
 2   Content Rating      7083 non-null   object 
 3   IMDB Rating         9801 non-null   float64
 4   R Rating            11025 non-null  int64  
 5   Genre               11025 non-null  object 
 6   Description         11025 non-null  object 
 7   No of Seasons       11025 non-null  object 
 8   Streaming Platform  9146 non-null   object 
 9   Series Id           11025 non-null  int64  
dtypes: float64(1), int64(3), object(6)
memory usage: 861.5+ KB


In [5]:
# Columns we are going to use
# Series Title R-Rating Genre Description Series_id

In [6]:
df.columns

Index(['Series Title', 'Year Released', 'Content Rating', 'IMDB Rating',
       'R Rating', 'Genre', 'Description', 'No of Seasons',
       'Streaming Platform', 'Series Id'],
      dtype='object')

In [7]:
series=df[['Series Title','Series Id','R Rating', 'Genre', 'Description']]

In [8]:
series=series[series["Genre"]!="-1"]

In [9]:
series.head()

Unnamed: 0,Series Title,Series Id,R Rating,Genre,Description
0,Breaking Bad,1396,100,"Crime,Drama","When Walter White, a New Mexico chemistry teac..."
1,Game of Thrones,1399,99,"Action & Adventure,Drama",Seven noble families fight for control of the ...
2,Rick and Morty,60625,97,"Animation,Comedy",Rick is a mentally-unbalanced but scientifical...
3,Stranger Things,66732,96,"Drama,Fantasy","When a young boy vanishes, a small town uncove..."
4,The Boys,76479,95,"Action & Adventure,Comedy",A group of vigilantes known informally as “The...


In [10]:
# converting into a list

In [11]:
series["Genre"]=series["Genre"].apply(lambda i:i.split())

In [12]:
series["Description"]=series["Description"].apply(lambda i:i.split())

In [13]:
series.head()

Unnamed: 0,Series Title,Series Id,R Rating,Genre,Description
0,Breaking Bad,1396,100,"[Crime,Drama]","[When, Walter, White,, a, New, Mexico, chemist..."
1,Game of Thrones,1399,99,"[Action, &, Adventure,Drama]","[Seven, noble, families, fight, for, control, ..."
2,Rick and Morty,60625,97,"[Animation,Comedy]","[Rick, is, a, mentally-unbalanced, but, scient..."
3,Stranger Things,66732,96,"[Drama,Fantasy]","[When, a, young, boy, vanishes,, a, small, tow..."
4,The Boys,76479,95,"[Action, &, Adventure,Comedy]","[A, group, of, vigilantes, known, informally, ..."


In [14]:
# Removing spaces because eg:- Sam is a different entity and Sam Worthington is a different entity 
# we need a unique name there mightbe two sam so it will create confusion

In [15]:
def remove_space(obj):
    return [i.replace(" ","") for i in obj]

In [16]:
series["Genre"]=series["Genre"].apply(remove_space)

In [17]:
series["Description"]=series["Description"].apply(remove_space)

In [18]:
series.head()

Unnamed: 0,Series Title,Series Id,R Rating,Genre,Description
0,Breaking Bad,1396,100,"[Crime,Drama]","[When, Walter, White,, a, New, Mexico, chemist..."
1,Game of Thrones,1399,99,"[Action, &, Adventure,Drama]","[Seven, noble, families, fight, for, control, ..."
2,Rick and Morty,60625,97,"[Animation,Comedy]","[Rick, is, a, mentally-unbalanced, but, scient..."
3,Stranger Things,66732,96,"[Drama,Fantasy]","[When, a, young, boy, vanishes,, a, small, tow..."
4,The Boys,76479,95,"[Action, &, Adventure,Comedy]","[A, group, of, vigilantes, known, informally, ..."


In [19]:
series["tags"]=series.Genre + series.Description

In [20]:
series.head()

Unnamed: 0,Series Title,Series Id,R Rating,Genre,Description,tags
0,Breaking Bad,1396,100,"[Crime,Drama]","[When, Walter, White,, a, New, Mexico, chemist...","[Crime,Drama, When, Walter, White,, a, New, Me..."
1,Game of Thrones,1399,99,"[Action, &, Adventure,Drama]","[Seven, noble, families, fight, for, control, ...","[Action, &, Adventure,Drama, Seven, noble, fam..."
2,Rick and Morty,60625,97,"[Animation,Comedy]","[Rick, is, a, mentally-unbalanced, but, scient...","[Animation,Comedy, Rick, is, a, mentally-unbal..."
3,Stranger Things,66732,96,"[Drama,Fantasy]","[When, a, young, boy, vanishes,, a, small, tow...","[Drama,Fantasy, When, a, young, boy, vanishes,..."
4,The Boys,76479,95,"[Action, &, Adventure,Comedy]","[A, group, of, vigilantes, known, informally, ...","[Action, &, Adventure,Comedy, A, group, of, vi..."


In [21]:
df1=series[["Series Title","Series Id","tags"]]

In [22]:
df1.columns=["Series_Title","Series_Id","Tags"]

In [23]:
df1["Tags"]=df1["Tags"].apply(lambda i:" ".join(i))

In [24]:
df1.head()

Unnamed: 0,Series_Title,Series_Id,Tags
0,Breaking Bad,1396,"Crime,Drama When Walter White, a New Mexico ch..."
1,Game of Thrones,1399,"Action & Adventure,Drama Seven noble families ..."
2,Rick and Morty,60625,"Animation,Comedy Rick is a mentally-unbalanced..."
3,Stranger Things,66732,"Drama,Fantasy When a young boy vanishes, a sma..."
4,The Boys,76479,"Action & Adventure,Comedy A group of vigilante..."


In [25]:
# Converting everything into lowercase

In [27]:
df1["Tags"]=df1["Tags"].apply(lambda i:i.lower())

In [28]:
df1=df1[df1["Series_Title"].duplicated()==False]

In [29]:
df1["Series_Title"].duplicated().sum()

0

In [30]:

def stemmer(text):
    poter_stem=PorterStemmer()
    y = []
    for i in text.split():
        y.append(poter_stem.stem(i))
    return " ".join(y)

In [32]:
df1["Tags"]=df1.Tags.apply(stemmer)

In [33]:
# We are creating names vector and we will give recommendations base on the closest vector(Distance based i.e Cosine dsiatance)
# OR TFIDF method

In [34]:
count_vector=CountVectorizer(max_features=5000,stop_words="english")

In [35]:
vector=count_vector.fit_transform(df1["Tags"]).toarray()

In [36]:
vector[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [37]:
len(count_vector.get_feature_names())

5000

In [38]:
# Using cosine similarity

In [39]:
similarity=cosine_similarity(vector)

In [40]:
similarity.shape

(10544, 10544)

In [41]:
similarity[0]

array([1.        , 0.51923775, 0.39269816, ..., 0.33739293, 0.1474242 ,
       0.30276504])

In [42]:
df1.shape

(10544, 3)

In [119]:
# Creating a function which will give 5 series based on similarity

In [43]:
def recommend(series_name):
    try:
        series_index = df1[df1["Series_Title"] == series_name].index[0]
        distances = similarity[series_index]
        series_list=sorted(list(enumerate(distances)),reverse=True, key=lambda i:i[1])[1:6]
        for i in series_list:
            print(df1.iloc[i[0]].Series_Title)
    except Exception as e:
        print("Check Series name again, Series not found!!!")

In [44]:
sorted(list(enumerate(similarity[0])),reverse=True, key=lambda i:i[1])[1:6]

[(116, 0.6936071720296073),
 (685, 0.6696652510408083),
 (2379, 0.6542654209926725),
 (243, 0.6528124384711252),
 (137, 0.6502621848846118)]

In [45]:
recommend("Breaking Bad")

Broadchurch
American Crime
Maigret
Power
The Shield


In [46]:
df1[df1["Series_Title"]=="Maigret"]

Unnamed: 0,Series_Title,Series_Id,Tags
2510,Maigret,752564,"crime,drama adapt of the novel written by geor..."


In [47]:
pickle.dump(df1.to_dict(),open("series_dict.pkl","wb"))

In [48]:
pickle.dump(similarity,open("similarity.pkl","wb"))