In [None]:
# importing the libraries
import pandas as pd
import ast
import time
from tqdm import tqdm

In [None]:
# Location of the raw movie table (Colab-style path, unchanged).
MOVIES_CSV = '/content/All_Movies1.csv'
df = pd.read_csv(MOVIES_CSV, lineterminator='\n')

In [None]:
# Inspect the column names of the loaded frame.
df.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'Title', 'Director', 'Cast', 'id',
       'genres', 'imdb_id', 'original_language', 'overview', 'popularity',
       'poster_path', 'release_date', 'runtime', 'title', 'vote_average',
       'vote_count'],
      dtype='object')

In [None]:
# removing useless columns
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps
# this cell re-runnable: a second execution no longer raises KeyError because
# the columns are already gone.
df = df.drop(
    columns=['Unnamed: 0.1', 'Unnamed: 0', 'original_language'],
    errors='ignore',
)

In [None]:
# (rows, columns) of the frame at this point in the notebook.
df.shape

(3359, 14)

## Now selecting the features
    'Title', 'Director', 'Cast', 'id', 'genres', 'overview'
## *to build the recommendation engine*

In [None]:
# Keep only the columns used by the recommender.
# .copy() makes `df` an independent frame, so the column assignments in later
# cells do not operate on a slice of the original (SettingWithCopyWarning).
df = df[['Title','Director', 'Cast', 'id','genres', 'overview']].copy()
df.head()

Unnamed: 0,Title,Director,Cast,id,genres,overview
0,Kaun Pravin Tambe?,Jayprad Desai,Shreyas Talpade,946652.0,Drama,Mumbai-born leg-spinner Pravin Tambe made his ...
1,Cobalt Blue,Sachin Kundalkar,"Prateik Babbar,Neelay Mehendale,Anjali Sivaram...",897097.0,"Romance,Drama",When an aspiring author and his free-spirited ...
2,Dasvi,Tushar Jalota,"Abhishek Bachchan,Yami Gautam,Nimrat Kaur",799713.0,"Drama,Comedy","Jailed under a tough cop, an uneducated politi..."
3,Hurdang,Nikhil Nagesh Bhat,"Sunny Kaushal,Nushrratt Bharuccha,Vijay Varma",809309.0,"Drama,Romance","In the 1990s' Allahabad, a brash college stude..."
4,Jersey,Gowtam Tinnanuri,"Shahid Kapoor,Mrunal Thakur,Pankaj Kapur",679018.0,Drama,"An ex-cricketer struggling to make ends meet, ..."


In [None]:
# Applying Transformation in "Director","Cast","genres"
# Transformation :   "FirstName LastName" -> "FirstNameLastName"
def in_format(text):
    """Collapse each comma-separated name in `text` into a single token.

    "First Last,Other Name" -> "FirstLast OtherName": the string is split on
    commas, spaces are removed inside each piece, and the pieces are joined
    with single spaces.

    Args:
        text: Comma-separated string. Any non-string value (for example the
            float NaN pandas uses for a missing cell, or None) is treated as
            missing.

    Returns:
        str: The space-joined tokens, or "" when `text` is not a string.
    """
    # A missing cell has no .split(); returning "" keeps the later string
    # concatenation of the feature columns working for that row.
    if not isinstance(text, str):
        return ""
    return " ".join(piece.replace(' ', '') for piece in text.split(','))


# Apply the same name-collapsing transformation to each multi-valued column.
for column_name in ('Director', 'Cast', 'genres'):
    df[column_name] = df[column_name].apply(in_format)

In [None]:
# removing stop words from the overview feature
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
# Stored as a set: rem_stopwords tests every word of every overview for
# membership, and a set lookup avoids scanning the whole list each time.
exclude = set(stopwords.words('english'))


def rem_stopwords(text, stop_words=None):
    """Remove stop words from a whitespace-separated string.

    Words are compared case-insensitively (via `word.lower()`) but kept in
    their original casing in the output.

    Args:
        text: Input string. Any non-string value (for example the float NaN
            pandas uses for a missing cell, or None) is treated as missing.
        stop_words: Optional container of lowercase words to drop. Defaults
            to the module-level `exclude` collection.

    Returns:
        str: The remaining words joined by single spaces, or "" when `text`
        is not a string.
    """
    # A missing overview has no .split(); treat it as empty text.
    if not isinstance(text, str):
        return ""
    if stop_words is None:
        stop_words = exclude
    return " ".join(word for word in text.split() if word.lower() not in stop_words)
# Register tqdm's progress_apply on pandas objects, then strip stop words
# from every overview.
tqdm.pandas()
overview_without_stopwords = df.loc[:, 'overview'].progress_apply(rem_stopwords)
df.loc[:, 'overview'] = overview_without_stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
100%|██████████| 3359/3359 [00:00<00:00, 12540.01it/s]


In [None]:
# performing Lemmatization over the Overview Column
import spacy
# Only token lemmas are read from the pipeline in this notebook, so the
# dependency parser and named-entity recognizer are disabled to avoid running
# them on every overview.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

def lemmatizing(text):
    """Return `text` with every token replaced by its lemma.

    The module-level `nlp` pipeline is run on `text`; the `lemma_` of each
    resulting token is collected and the lemmas are joined with single spaces.
    """
    return " ".join(token.lemma_ for token in nlp(text))

In [None]:
# Replace each overview with its lemmatized form (with a tqdm progress bar).
lemmatized_overview = df['overview'].progress_apply(lemmatizing)
df['overview'] = lemmatized_overview

100%|██████████| 3359/3359 [00:43<00:00, 76.85it/s]


In [None]:
# Combining "Director","Cast","Genre","overview" features to form a "Tag" Feature
# The columns are concatenated left to right with a single space between them.
tag_columns = ['Director', 'Cast', 'genres', 'overview']
combined_tags = df[tag_columns[0]]
for column_name in tag_columns[1:]:
    combined_tags = combined_tags + ' ' + df[column_name]
df['Tags'] = combined_tags

In [None]:
# Keep only what the recommender needs. .copy() detaches the result from the
# wider frame so the following .loc assignment on 'Tags' does not act on a
# slice (SettingWithCopyWarning).
df = df[['Title','id','Tags']].copy()
df

Unnamed: 0,Title,id,Tags
0,Kaun Pravin Tambe?,946652.0,JaypradDesai ShreyasTalpade Drama Mumbai - bea...
1,Cobalt Blue,897097.0,SachinKundalkar PrateikBabbar NeelayMehendale ...
2,Dasvi,799713.0,TusharJalota AbhishekBachchan YamiGautam Nimra...
3,Hurdang,809309.0,NikhilNageshBhat SunnyKaushal NushrrattBharucc...
4,Jersey,679018.0,GowtamTinnanuri ShahidKapoor MrunalThakur Pank...
...,...,...,...
3354,Les Misérables,82695.0,TomHooper HughJackman RussellCrowe EddieRedmay...
3355,Parental Guidance,88042.0,AndyFickman BillyCrystal BetteMidler MarisaTom...
3356,West of Memphis,84351.0,AmyJ.Berg WestMemphisThree Documentary documen...
3357,Quartet,121826.0,DustinHoffman MaggieSmith TomCourtenay BillyCo...


In [None]:
# Converting the content within the "Tag" feature to lowercase
lowercase_tags = df.loc[:, 'Tags'].str.lower()
df.loc[:, 'Tags'] = lowercase_tags
df

Unnamed: 0,Title,id,Tags
0,Kaun Pravin Tambe?,946652.0,jaypraddesai shreyastalpade drama mumbai - bea...
1,Cobalt Blue,897097.0,sachinkundalkar prateikbabbar neelaymehendale ...
2,Dasvi,799713.0,tusharjalota abhishekbachchan yamigautam nimra...
3,Hurdang,809309.0,nikhilnageshbhat sunnykaushal nushrrattbharucc...
4,Jersey,679018.0,gowtamtinnanuri shahidkapoor mrunalthakur pank...
...,...,...,...
3354,Les Misérables,82695.0,tomhooper hughjackman russellcrowe eddieredmay...
3355,Parental Guidance,88042.0,andyfickman billycrystal bettemidler marisatom...
3356,West of Memphis,84351.0,amyj.berg westmemphisthree documentary documen...
3357,Quartet,121826.0,dustinhoffman maggiesmith tomcourtenay billyco...


# Text Vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
# Vectorizer settings: English stop-word list, unigrams only, and a minimum
# document frequency of 3.
tfidf_options = {
    'stop_words': 'english',
    'ngram_range': (1, 1),
    'min_df': 3,
}
tfidf = TfidfVectorizer(**tfidf_options)

In [None]:
# Keep the TF-IDF matrix sparse. cosine_similarity accepts sparse input and
# returns a dense array by default, so materialising a dense
# (documents x vocabulary) copy with .toarray() only costs memory.
vectors = tfidf.fit_transform(df['Tags'])

In [None]:
# (number of movies, number of vocabulary terms kept by the vectorizer)
vectors.shape

(3359, 6711)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between every pair of rows of `vectors`.
simi = cosine_similarity(vectors)

In [None]:
# Square matrix: one row and one column per movie.
simi.shape

(3359, 3359)

In [None]:
# Confirm the container type of the similarity matrix before saving it.
type(simi)

numpy.ndarray

In [None]:
import numpy as np
# Write the similarity matrix to disk in NumPy's .npy format.
SIMILARITY_PATH = 'similarity_scores.npy'
np.save(SIMILARITY_PATH, simi)

In [None]:
def recommend2(movie, top_n=5):
    """Return the titles of the movies most similar to `movie`.

    Looks `movie` up in the module-level `df['Title']`, reads the matching row
    of the module-level similarity matrix `simi`, and returns the
    highest-scoring other titles.

    Args:
        movie: Title to search for, compared by equality against
            `df['Title']`. If several rows share the title, the first is used.
        top_n: Maximum number of titles to return (non-negative, default 5).

    Returns:
        list: Up to `top_n` titles in descending order of similarity score.
        Rows with equal scores keep their original row order.

    Raises:
        IndexError: If `movie` does not appear in `df['Title']`.
    """
    titles = df['Title'].tolist()

    # Work with row positions throughout: `simi` is addressed by position,
    # which matches the frame's index labels only for a default RangeIndex.
    try:
        index = titles.index(movie)
    except ValueError:
        raise IndexError(f"movie {movie!r} not found in df['Title']") from None

    # Rank every row position by its similarity score, highest first.
    ranked = sorted(enumerate(simi[index]), key=lambda pair: pair[1], reverse=True)

    # Exclude the searched movie explicitly rather than assuming it is ranked
    # first: another row with an equal top score could otherwise push the
    # searched movie into its own recommendations.
    similar_positions = [pos for pos, _ in ranked if pos != index][:top_n]
    return [titles[pos] for pos in similar_positions]

In [None]:
# Spot-check the recommender on a Hindi-language title.
recommend2('Chhichhore')

['Dil Jo Na Keh Saka',
 'Cirkus',
 'Batti Gul Meter Chalu',
 'Shuddh Desi Romance',
 'Everybody Wants Some!!']

In [None]:
# Spot-check the recommender on 'Logan'.
recommend2('Logan')

['Sundown',
 'The Wolverine',
 'X-Men: Days of Future Past',
 'Krrish 3',
 'Passing']

In [None]:
# Spot-check the recommender on a franchise title.
recommend2("Spider-Man: No Way Home")

['Spider-Man: Homecoming',
 'Spider-Man: Far From Home',
 'The Amazing Spider-Man 2',
 'The Amazing Spider-Man',
 'Doctor Strange in the Multiverse of Madness']

In [None]:
# Spot-check the recommender on 'Golmaal Again'.
recommend2("Golmaal Again")

['Followers',
 'Poster Boys',
 'Mr Joe B. Carvalho',
 'Fitoor',
 'Dick Figures: The Movie']

In [None]:
# Spot-check the recommender on 'Pink'.
recommend2("Pink")

['Guilty', 'Mission Mangal', 'Soorma', 'Jazbaa', 'It Follows']

In [None]:
# Spot-check the recommender on 'War'.
recommend2('War')

['Notebook', 'Chakravyuh', 'Naam Shabana', 'Dhaakad', 'Baaghi 3']

In [None]:
# Spot-check the recommender on 'The Conjuring 2'.
recommend2('The Conjuring 2')

['The Conjuring',
 'The Conjuring: The Devil Made Me Do It',
 'Annabelle Comes Home',
 'The Possession',
 'Insidious: Chapter 2']