In [1]:
# Importing essential tools
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load movie dataset
movies = pd.read_csv("filter_data.csv")

In [3]:
movies

Unnamed: 0,Id,Title,Director,Cast,Genre,Overview,Reviews,0
0,391629,Baaghi,Sabbir Khan,"['Tiger Shroff', 'Shraddha Kapoor', 'Sunil Gro...","Action, Thriller, Romance","Ronny is a rebellious man, who falls in love w...",,
1,25918,Champion,Mark Robson,"['Kirk Douglas', 'Marilyn Maxwell', 'Arthur Ke...",Drama,An unscrupulous boxer fights his way to the to...,,
2,1104040,Gangs of Lagos,Jadesola Osiberu,"['Demi Banwo', 'Adesua Etomi-Wellington', 'Tob...",Crime,A group of friends who each have to navigate t...,,
3,157800,Har Dil Jo Pyar Karega,Raj Kanwar,"['Salman Khan', 'Rani Mukerji', 'Preity Zinta'...","Comedy, Drama",Raj is a struggling singer chasing his dreams ...,,
4,60579,Hey Ram,Kamal Haasan,"['Kamal Haasan', 'Shah Rukh Khan', 'Hema Malin...","History, Drama, Crime",Saketh Ram's wife is raped and killed during d...,,
...,...,...,...,...,...,...,...,...
5553,560204,Arkansas,Clark Duke,"['Liam Hemsworth', 'Clark Duke', 'Vince Vaughn...","Crime, Thriller",Kyle and Swin live by the orders of an Arkansa...,A good but not great film that could've been b...,
5554,19053,Valley Girl,Martha Coolidge,"['Nicolas Cage', 'Deborah Foreman', 'Elizabeth...","Comedy, Romance","Julie, a girl from the valley, meets Randy, a ...",,
5555,429422,Capone,Josh Trank,"['Tom Hardy', 'Linda Cardellini', 'Matt Dillon...","Crime, Drama","The 47-year old Al Capone, after 10 years in p...",<em>'Capone'</em> disappoints.\r\n\r\nIt's not...,
5556,582596,The Wrong Missy,Tyler Spindel,"['David Spade', 'Lauren Lapkus', 'Candace Smit...","Comedy, Romance",A guy meets the woman of his dreams and invite...,,


In [4]:
# Check how many rows and columns 
movies.shape

(5558, 8)

In [5]:
movies.shape

(5558, 8)

In [6]:
# Check null values
movies.isna().sum()

Id             0
Title          0
Director      45
Cast           0
Genre         66
Overview       5
Reviews     3172
0           5558
dtype: int64

In [7]:
# Select a subset of columns from the 'movies' DataFrame
movies = movies[['Id','Title','Director','Cast','Genre','Overview','Reviews']]

In [8]:
# Check number of duplicated entries in the 'Title' column 
movies.duplicated(subset=['Id']).sum()

4

In [9]:
# Drops the duplicated rows in the 'Title' column and modifies the 'movies' dataframe in place.
movies.drop_duplicates(subset=['Id'], inplace=True)

In [10]:
movies.shape

(5554, 7)

In [11]:
# Reset index
movies = movies.reset_index(drop=True)

In [12]:
# Clean the 'Cast' column of the 'movies' DataFrame by removing square brackets and single quotes
movies['Cast'] = movies['Cast'].str.replace("[\[\]']", '', regex=True)

In [13]:
# Fill missing values in Director and Genre columns with mode
movies['Director'] = movies['Director'].fillna(movies['Director'].mode()[0])
movies['Genre'] = movies['Genre'].fillna(movies['Genre'].mode()[0])

# Fill missing values in Overview column with empty string
movies['Overview'] = movies['Overview'].fillna('')
movies['Reviews'] = movies['Reviews'].fillna('')

In [14]:
# movies.isna().sum()

In [15]:
# Import the stopwords module from NLTK for text preprocessing and the pickle module for object serialization
import nltk
from nltk.corpus import stopwords
import pickle

In [16]:
# nltk.download("stopwords")

In [17]:
movies['Overview']

0       Ronny is a rebellious man, who falls in love w...
1       An unscrupulous boxer fights his way to the to...
2       A group of friends who each have to navigate t...
3       Raj is a struggling singer chasing his dreams ...
4       Saketh Ram's wife is raped and killed during d...
                              ...                        
5549    Kyle and Swin live by the orders of an Arkansa...
5550    Julie, a girl from the valley, meets Randy, a ...
5551    The 47-year old Al Capone, after 10 years in p...
5552    A guy meets the woman of his dreams and invite...
5553    In Scooby-Doo’s greatest adventure yet, see th...
Name: Overview, Length: 5554, dtype: object

In [18]:
movies['Cast']

0       Tiger Shroff, Shraddha Kapoor, Sunil Grover, S...
1       Kirk Douglas, Marilyn Maxwell, Arthur Kennedy,...
2       Demi Banwo, Adesua Etomi-Wellington, Tobi Bakr...
3       Salman Khan, Rani Mukerji, Preity Zinta, Neera...
4       Kamal Haasan, Shah Rukh Khan, Hema Malini, Ran...
                              ...                        
5549    Liam Hemsworth, Clark Duke, Vince Vaughn, John...
5550    Nicolas Cage, Deborah Foreman, Elizabeth Daily...
5551    Tom Hardy, Linda Cardellini, Matt Dillon, Kyle...
5552    David Spade, Lauren Lapkus, Candace Smith, Sar...
5553    Amanda Seyfried, Christina Hendricks, Frank We...
Name: Cast, Length: 5554, dtype: object

In [19]:
import re
movies['Reviews'] = movies['Reviews'].apply(lambda x: re.sub(r'[^a-zA-Z\s,]', '', x))

In [20]:
movies['Reviews']

0                                                        
1                                                        
2                                                        
3                                                        
4                                                        
                              ...                        
5549    A good but not great film that couldve been be...
5550                                                     
5551    emCaponeem disappoints\r\n\r\nIts not what I w...
5552                                                     
5553    I guess I can commend Warner Bros for attempti...
Name: Reviews, Length: 5554, dtype: object

In [21]:
# Replace the space between two names for consistent formatting
movies['Cast'] = movies['Cast'].str.replace(' ', '')
movies['Director'] = movies['Director'].str.replace(' ', '')
movies['Genre'] = movies['Genre'].str.replace(' ', '')
movies['Reviews'] = movies['Reviews'].apply(lambda x: ''.join(x))

In [22]:
# movies['Cast']

In [23]:
# Separate the features with commas using split for input to the Naive Bayes algorithm
movies['Overview'] = movies['Overview'].apply(lambda x: x)
movies['Cast'] = movies['Cast'].apply(lambda x: x)
movies['Genre'] = movies['Genre'].apply(lambda x: x)
movies['Director'] = movies['Director'].apply(lambda x: x)
movies['Reviews'] = movies['Reviews'].apply(lambda x: x)

In [24]:
# movies["Overview"]
# movies.columns

In [25]:
# Combine multiple sources of data to create tags for input to the algorithm
movies['Tags'] = movies['Cast'] + movies['Overview'] + movies["Reviews"] + movies['Genre'] + movies['Director'] 

In [26]:
# movies['Tags']

In [27]:
# Filter the DataFrame to select only the relevant columns for our model
new_df = movies[['Id','Title','Tags']]

In [28]:
new_df.rename(columns= {'Id':'movie_id','Title':'movie_title'}, inplace = True)

In [29]:
# new_df.rename(columns= {'Id':'movie_id','Title':'title','Tags':'tags'}, inplace = True)

In [30]:
movies.columns

Index(['Id', 'Title', 'Director', 'Cast', 'Genre', 'Overview', 'Reviews',
       'Tags'],
      dtype='object')

In [31]:
new_df

Unnamed: 0,movie_id,movie_title,Tags
0,391629,Baaghi,"TigerShroff,ShraddhaKapoor,SunilGrover,Sudheer..."
1,25918,Champion,"KirkDouglas,MarilynMaxwell,ArthurKennedy,PaulS..."
2,1104040,Gangs of Lagos,"DemiBanwo,AdesuaEtomi-Wellington,TobiBakre,Ade..."
3,157800,Har Dil Jo Pyar Karega,"SalmanKhan,RaniMukerji,PreityZinta,NeerajVora,..."
4,60579,Hey Ram,"KamalHaasan,ShahRukhKhan,HemaMalini,RaniMukerj..."
...,...,...,...
5549,560204,Arkansas,"LiamHemsworth,ClarkDuke,VinceVaughn,JohnMalkov..."
5550,19053,Valley Girl,"NicolasCage,DeborahForeman,ElizabethDaily,Mich..."
5551,429422,Capone,"TomHardy,LindaCardellini,MattDillon,KyleMacLac..."
5552,582596,The Wrong Missy,"DavidSpade,LaurenLapkus,CandaceSmith,SarahChal..."


In [32]:
# Convert a list of data to a string using the join() method for further processing
new_df['Tags'] = new_df['Tags'].apply(lambda x:''.join(x))

In [33]:
# Convert all characters to lowercase for compatibility with the NLTK library
new_df['Tags'] = new_df['Tags'].apply(lambda x:x.lower())

In [34]:
new_df["Tags"]

0       tigershroff,shraddhakapoor,sunilgrover,sudheer...
1       kirkdouglas,marilynmaxwell,arthurkennedy,pauls...
2       demibanwo,adesuaetomi-wellington,tobibakre,ade...
3       salmankhan,ranimukerji,preityzinta,neerajvora,...
4       kamalhaasan,shahrukhkhan,hemamalini,ranimukerj...
                              ...                        
5549    liamhemsworth,clarkduke,vincevaughn,johnmalkov...
5550    nicolascage,deborahforeman,elizabethdaily,mich...
5551    tomhardy,lindacardellini,mattdillon,kylemaclac...
5552    davidspade,laurenlapkus,candacesmith,sarahchal...
5553    amandaseyfried,christinahendricks,frankwelker,...
Name: Tags, Length: 5554, dtype: object

In [35]:
# Define a function that uses the Porter stemming algorithm from the NLTK library to normalize text
# The function splits the input text into individual words, applies stemming to each word using a `PorterStemmer` instance,
# and returns the normalized text as a string with stemmed words joined by spaces.

import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [36]:
# Import necessary libraries and prepare data for modeling and evaluation of classification model using CountVectorizer feature extraction.
from sklearn.feature_extraction.text import CountVectorizer

# Import a Multinomial NaiveBayes, Gaussian NaiveBayes, RandomForest, Support Vector Machine classifier
from sklearn.naive_bayes import MultinomialNB,GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Convert the Tags column into numerical features using CountVectorizer
cv = CountVectorizer(max_features=20000,strip_accents='ascii', stop_words='english')
from nltk.corpus import stopwords
StopWord = stopwords.words('english')
# cv = CountVectorizer(lowercase=True, strip_accents='ascii',stop_words=StopWord)

vectors = cv.fit_transform(new_df['Tags']).toarray()   
x = vectors
print(x.shape)
print(x[0])

# Convert movie titles to numeric labels
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
y = encoder.fit_transform(new_df['movie_title'])

# Train and test split the data using the 'Title' column as the target variable for classification.
# Split the data into training and testing sets

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=4)

(5554, 20000)
[0 0 0 ... 0 0 0]


### # The cosine similarity matrix for the input data `vectors`.


This code computes the cosine similarity matrix between the rows of the input matrix x using the cosine_similarity() function from scikit-learn's metrics.pairwise module, and assigns it to the variable similarity. The resulting matrix can be used to measure the similarity between different rows or documents represented in the input matrix x.

In [37]:
# Calculates the cosine similarity matrix for the input data vectors.
from sklearn.metrics.pairwise import cosine_similarity
similarty = cosine_similarity(vectors)

In [38]:
def recommend(movie):
    movie_index = new_df[new_df['movie_title'] == movie].index[0]
    distances = similarty[movie_index]
    movie_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[0:6]
    print(f"Movies similar to '{movie}':")
    print("------------------------")
    # print(movie) # print the input movie first
    for i in movie_list:
        print(new_df.iloc[i[0]].movie_title)


In [39]:
recommend("Spider-Man")

Movies similar to 'Spider-Man':
------------------------
Spider-Man
Spider-Man 2
Spider-Man: Into the Spider-Verse
The Avengers
Spider-Man: Homecoming
The Amazing Spider-Man


In [40]:
# get model in pickel
import pickle
import os

pickle.dump(new_df, open('model/movies_list.pkl','wb'))
print(os.path.getsize("model/movies_list.pkl"))

12438880


In [41]:
pickle.dump(similarty,open("model/similarity.pkl",'wb'))
print(os.path.getsize("model/similarity.pkl"))

246775491


In [42]:
# Compressing Data of pickel model
import bz2file as bz2 
pickle.dump(similarty,bz2.BZ2File("model/similarity.pkl",'wb'))
 
print(os.path.getsize("model/similarity.pkl"))

100211638


In [43]:
# pip install bz2file, patsy

In [44]:
movies = pickle.load(open('model/movies_list.pkl', 'rb'))
similarity = pickle.load(bz2.BZ2File('model/similarity.pkl', 'rb'))   #filename

In [45]:
movies

Unnamed: 0,movie_id,movie_title,Tags
0,391629,Baaghi,"tigershroff,shraddhakapoor,sunilgrover,sudheer..."
1,25918,Champion,"kirkdouglas,marilynmaxwell,arthurkennedy,pauls..."
2,1104040,Gangs of Lagos,"demibanwo,adesuaetomi-wellington,tobibakre,ade..."
3,157800,Har Dil Jo Pyar Karega,"salmankhan,ranimukerji,preityzinta,neerajvora,..."
4,60579,Hey Ram,"kamalhaasan,shahrukhkhan,hemamalini,ranimukerj..."
...,...,...,...
5549,560204,Arkansas,"liamhemsworth,clarkduke,vincevaughn,johnmalkov..."
5550,19053,Valley Girl,"nicolascage,deborahforeman,elizabethdaily,mich..."
5551,429422,Capone,"tomhardy,lindacardellini,mattdillon,kylemaclac..."
5552,582596,The Wrong Missy,"davidspade,laurenlapkus,candacesmith,sarahchal..."


In [46]:
similarity

array([[1.        , 0.        , 0.        , ..., 0.00996024, 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.1069045 , 0.        ,
        0.0907592 ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.03044156],
       ...,
       [0.00996024, 0.1069045 , 0.        , ..., 1.        , 0.00828742,
        0.14917696],
       [0.        , 0.        , 0.        , ..., 0.00828742, 1.        ,
        0.02251459],
       [0.        , 0.0907592 , 0.03044156, ..., 0.14917696, 0.02251459,
        1.        ]])