In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [2]:
# Load the Netflix catalogue into a DataFrame.
# NOTE(review): the path is an absolute location on one Windows machine; the
# notebook will not run elsewhere until it is pointed at a local copy.
data = pd.read_csv("C:\\Users\\LEGION\\AI CW\\Netflix.csv")
# Preview the first rows (the last expression of the cell is displayed).
data.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,14-Aug-20,2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,s2,Movie,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,23-Dec-16,2016,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,20-Dec-18,2011,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,s4,Movie,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,16-Nov-17,2009,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,s5,Movie,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,1-Jan-20,2008,PG-13,123 min,Dramas,A brilliant group of students become card-coun...


In [3]:
# Summarise column dtypes and non-null counts to spot missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7787 entries, 0 to 7786
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       7787 non-null   object
 1   type          7787 non-null   object
 2   title         7787 non-null   object
 3   director      5398 non-null   object
 4   cast          7069 non-null   object
 5   country       7280 non-null   object
 6   date_added    7777 non-null   object
 7   release_year  7787 non-null   int64 
 8   rating        7780 non-null   object
 9   duration      7787 non-null   object
 10  listed_in     7787 non-null   object
 11  description   7787 non-null   object
dtypes: int64(1), object(11)
memory usage: 730.2+ KB


In [20]:
class ContentAnalysis():
    """TF-IDF based text search and item-to-item recommendation over a DataFrame.

    Typical use::

        engine = ContentAnalysis(frame)
        engine.generate_vector(frame["description"])
        engine.recommend_movie(1)

    The text passed to ``generate_vector`` must be row-aligned with
    ``data_frame``: the i-th document has to describe the i-th row, because
    results are mapped back to titles by row position.
    """

    def __init__(self, data_frame, threshold = 0.1, stop_words = 'english', lowercase = True, use_idf = True, norm=u'l2', smooth_idf = True):
        """Store the frame and configure (but do not fit) the TF-IDF model.

        Args:
            data_frame: DataFrame with at least a ``title`` column.
            threshold: forwarded to ``TfidfVectorizer`` as ``max_df``.
            stop_words, lowercase, use_idf, norm, smooth_idf: forwarded
                unchanged to ``TfidfVectorizer``.
        """
        self.data_frame = data_frame
        self.model = TfidfVectorizer(
            max_df=threshold,
            stop_words=stop_words,
            lowercase=lowercase,
            use_idf=use_idf,
            norm=norm,
            smooth_idf=smooth_idf,
        )
        # ``False`` means "not fitted yet"; the sentinel is kept as-is because
        # it is a public attribute that outside code may already test.
        self.vector = False

    def generate_vector(self, data):
        """Fit the TF-IDF model on ``data`` and keep the document-term matrix.

        Args:
            data: iterable of strings, one per row of ``data_frame``.
        """
        self.vector = self.model.fit_transform(data)

    def find_movies(self, request, top = 5):
        """Print the ``top`` rows whose fitted text best matches ``request``.

        Does nothing if ``generate_vector`` has not been called yet.

        Args:
            request: free-text query string.
            top: maximum number of results to print; values <= 0 print none.
        """
        if self.vector is False:
            return
        query_vector = self.model.transform([request])
        # Same similarity routine as recommend_movie; it returns a dense
        # (1, n_documents) array regardless of the sparse container type.
        scores = linear_kernel(query_vector, self.vector).ravel()
        # Reverse first, then truncate: a plain ``[-top:]`` would return the
        # whole catalogue for top == 0.
        ranked = np.argsort(scores)[::-1][:max(top, 0)]
        result = [(position, scores[position]) for position in ranked]
        self.render_result(request, result)

    def recommend_movie(self, request_index , top = 10):
        """Print the ``top`` rows most similar to the row at ``request_index``.

        Does nothing if ``generate_vector`` has not been called yet.

        Args:
            request_index: row position of the reference item.
            top: maximum number of recommendations; values <= 0 print none.
        """
        if self.vector is False:
            return
        # With the default norm='l2' the rows are unit length, so this dot
        # product is the cosine similarity.
        scores = linear_kernel(self.vector[request_index:request_index + 1], self.vector).flatten()
        ranked = scores.argsort()[::-1]
        # Drop the reference row explicitly. It is not guaranteed to rank
        # first: an all-zero vector or an exact duplicate can tie with it, in
        # which case slicing off the top position would discard a genuine
        # neighbour and recommend the item to itself.
        ranked = ranked[ranked != request_index][:max(top, 0)]
        result = [(position, scores[position]) for position in ranked]
        self.render_result(str(self.data_frame.iloc[request_index:request_index + 1]), result)

    def render_result(self, request_content,indices):
        """Print the request followed by the title of every result.

        Args:
            request_content: string describing what was asked for.
            indices: iterable of ``(row_position, score)`` pairs; only the
                position is used for display.
        """
        print('Your request : ' + request_content)
        print('++++++++++++++++++++++++++++++++++++++++++++')
        print('Best Results :')
        titles = self.data_frame['title']
        for index, rate in indices:
            # Positions come from argsort over the matrix rows, so they must
            # be resolved positionally, not by index label.
            print(titles.iloc[index])

In [21]:
# Fit TF-IDF on the description text, then print the default 10 titles most
# similar to the row at position 1.
vector = ContentAnalysis(data)
vector.generate_vector(data["description"])
vector.recommend_movie(1)

Your request :   show_id   type title           director  \
1      s2  Movie  7:19  Jorge Michel Grau   

                                                cast country date_added  \
1  Demián Bichir, Héctor Bonilla, Oscar Serrano, ...  Mexico  23-Dec-16   

   release_year rating duration                     listed_in  \
1          2016  TV-MA   93 min  Dramas, International Movies   

                                         description  
1  After a devastating earthquake hits Mexico Cit...  
++++++++++++++++++++++++++++++++++++++++++++
Best Results :
The Darkest Dawn
Van Helsing
Héroes
Pandora
Term Life
Slasher
The CEO
Unbreakable Kimmy Schmidt
Bullet Head
Surviving Escobar - Alias JJ


In [22]:
# Build and fit a description-based model again (same inputs as the previous
# cell), then print the default 10 titles most similar to the row at position 10.
vector = ContentAnalysis(data)
vector.generate_vector(data['description'])
vector.recommend_movie(10)

Your request :    show_id   type title      director  \
10     s11  Movie  1922  Zak Hilditch   

                                                 cast        country  \
10  Thomas Jane, Molly Parker, Dylan Schmid, Kaitl...  United States   

   date_added  release_year rating duration          listed_in  \
10  20-Oct-17          2017  TV-MA  103 min  Dramas, Thrillers   

                                          description  
10  A farmer pens a confession admitting to his wi...  
++++++++++++++++++++++++++++++++++++++++++++
Best Results :
The Mist
I AM A KILLER: RELEASED
Waarrior Savitri
The Blue Umbrella
The Staircase
God's Not Dead
The Water Diviner
The Theory of Everything
72 Dangerous Animals: Latin America
Just Love


In [23]:
# Fit TF-IDF on the titles instead of the descriptions, then print the default
# 5 titles that best match a free-text query.
# NOTE: this rebinds `vector` to a title-based model.
vector = ContentAnalysis(data)
vector.generate_vector(data["title"])
vector.find_movies('Sex Education')

Your request : Sex Education
++++++++++++++++++++++++++++++++++++++++++++
Best Results :
Sex Education
Bad Education
The Bad Education Movie
Sex and the City 2
Sex Doll
