### Importing libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

In [2]:
# download the dataset to pandas df
films_df = pd.read_csv(r'C:\Users\Chaitra.b.c\Downloads/tmdb_5000_movies.csv')
films_df.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [3]:
# Creating a dataframe to work with.
movies_df = films_df.copy(deep = True)

In [4]:
# Dataframe information
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [5]:
#  Missing Values
movies_df.isnull().sum() 

# We have to either convert the null values as empty string or replace/ fill them appropriately.

budget                     0
genres                     0
homepage                3091
id                         0
keywords                   0
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
dtype: int64

In [6]:
# Fill null values as empty string for Homepage, Tagline and overview.
# We can create a Web Scraper that can collect information regarding the movies and their corresponding null cols 
# and update them. For now we will stick with empty string.
# keep in mind that these are text values and later we may use tokenizers that expect strings.

movies_df['homepage'].fillna('', inplace = True)
movies_df['tagline'].fillna('', inplace = True)
movies_df['overview'].fillna('', inplace = True)

In [7]:
# validating
movies_df.isnull().sum()

budget                  0
genres                  0
homepage                0
id                      0
keywords                0
original_language       0
original_title          0
overview                0
popularity              0
production_companies    0
production_countries    0
release_date            1
revenue                 0
runtime                 2
spoken_languages        0
status                  0
tagline                 0
title                   0
vote_average            0
vote_count              0
dtype: int64

In [8]:
# display the truncated values for better understanding
# Keywords column is very important for our Recommendation system.
a = movies_df.iloc[:1]['keywords']
print('Before set option : ' ,a)

# not showing the full value? Use pd.set_option('display.max_colwidth',50). 
# The variable will store the complete value.

pd.set_option('display.max_colwidth',5000)
print('After set option : ', a)

Before set option :  0    [{"id": 1463, "name": "culture clash"}, {"id":...
Name: keywords, dtype: object
After set option :  0    [{"id": 1463, "name": "culture clash"}, {"id": 2964, "name": "future"}, {"id": 3386, "name": "space war"}, {"id": 3388, "name": "space colony"}, {"id": 3679, "name": "society"}, {"id": 3801, "name": "space travel"}, {"id": 9685, "name": "futuristic"}, {"id": 9840, "name": "romance"}, {"id": 9882, "name": "space"}, {"id": 9951, "name": "alien"}, {"id": 10148, "name": "tribe"}, {"id": 10158, "name": "alien planet"}, {"id": 10987, "name": "cgi"}, {"id": 11399, "name": "marine"}, {"id": 13065, "name": "soldier"}, {"id": 14643, "name": "battle"}, {"id": 14720, "name": "love affair"}, {"id": 165431, "name": "anti war"}, {"id": 193554, "name": "power relations"}, {"id": 206690, "name": "mind and soul"}, {"id": 209714, "name": "3d"}]
Name: keywords, dtype: object


In [9]:
# Displaying all the columns that seem to have complex data structures.
list_of_comp_cols = ['genres','production_companies','production_countries', 'keywords','spoken_languages']

for cols in list_of_comp_cols:
    print(f'{cols} : {movies_df.loc[1,cols]}')
    print('*'*100)

genres : [{"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 28, "name": "Action"}]
****************************************************************************************************
production_companies : [{"name": "Walt Disney Pictures", "id": 2}, {"name": "Jerry Bruckheimer Films", "id": 130}, {"name": "Second Mate Productions", "id": 19936}]
****************************************************************************************************
production_countries : [{"iso_3166_1": "US", "name": "United States of America"}]
****************************************************************************************************
keywords : [{"id": 270, "name": "ocean"}, {"id": 726, "name": "drug abuse"}, {"id": 911, "name": "exotic island"}, {"id": 1319, "name": "east india trading company"}, {"id": 2038, "name": "love of one's life"}, {"id": 2052, "name": "traitor"}, {"id": 2580, "name": "shipwreck"}, {"id": 2660, "name": "strong woman"}, {"id": 3799, "name": "ship"},

In [10]:
# function that outputs string value containing the joined string values of all the keywords.
# using eval function to evaluate the string to python recognizable object.

def list_of_items(a):
    # intialization
    if len(eval(a)) == 0:
        return ''
    else:
        list_of_items = []
        list_of_keys = []
        str_arg = eval(a)
    
        for dic in str_arg:
            #print(dic.values())
            list_of_items = list_of_items + [''.join(dic['name'].split(' '))]
            list_of_keys = list_of_keys + list(dic.keys())
    
        return ' '.join(list_of_items)

In [11]:
list_of_items(movies_df['keywords'][0])

'cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d'

In [12]:
# To extract all the values from the complex datastructure.
list_of_comp_cols = ['genres','production_companies','production_countries', 'keywords','spoken_languages']

for cols in list_of_comp_cols:
    print(f'{cols} : {list_of_items(movies_df.loc[1,cols])}')
    print('-'*50)

genres : Adventure Fantasy Action
--------------------------------------------------
production_companies : WaltDisneyPictures JerryBruckheimerFilms SecondMateProductions
--------------------------------------------------
production_countries : UnitedStatesofAmerica
--------------------------------------------------
keywords : ocean drugabuse exoticisland eastindiatradingcompany loveofone'slife traitor shipwreck strongwoman ship alliance calypso afterlife fighter pirate swashbuckler aftercreditsstinger
--------------------------------------------------
spoken_languages : English
--------------------------------------------------


In [13]:
# Function to process URLs.
def clean_URL(url):
    if url is not np.nan:
        replace_with_empty = ['http:','http://', 'https://', '.com','www.']
        replace_with_space = [ '/' , '.','-']
        cleaned_url = ''
    
        for ind in range(len(replace_with_empty)):
            if ind == 1:
                cleaned_url = url.replace(replace_with_empty[ind], '')
            else:
                cleaned_url = cleaned_url.replace(replace_with_empty[ind], '')
            #print(cleaned_url)
    
        for ind in range(len(replace_with_space)):
            cleaned_url = cleaned_url.replace(replace_with_space[ind], ' ')
            #print(cleaned_url)
    
        return cleaned_url.strip()
    else:
        return ''


#[movies_df['homepage'][i] for i in range(len(movies_df['homepage'])) if movies_df['homepage'][i].startswith('http:','http://', 'https://')]
len([movies_df['homepage'][i] for i in range(len(movies_df['homepage'])) if movies_df['homepage'][i] is np.nan])
# shoudlve used .isna()

In [14]:
type(movies_df['homepage'][15])

str

In [15]:
# validating the function.
for i in range(4):
    print(clean_URL(movies_df.loc[i,'homepage']))

avatarmovie
disney go disneypictures pirates
sonypictures movies spectre
thedarkknightrises


In [16]:
# combined column values in one string.
pd.set_option('display.max_colwidth',50000)
movies_df['Movie_detes_in_str'] = movies_df['homepage'].apply(clean_URL)

list_of_comp_cols = ['genres','production_companies','production_countries', 'keywords','spoken_languages']

for col in list_of_comp_cols:
    movies_df['Movie_detes_in_str'] = movies_df['Movie_detes_in_str'] + movies_df[col].apply(list_of_items)

movies_df.Movie_detes_in_str[14:20]

14                                          manofsteelAction Adventure Fantasy ScienceFictionLegendaryPictures WarnerBros. DCEntertainment Syncopy CruelandUnusualFilmsUnitedKingdom UnitedStatesofAmericasavingtheworld dccomics superhero basedoncomicbook superhuman alieninvasion reboot superpowers dcextendeduniverseEnglish
15    Adventure Family FantasyWaltDisney WaldenMedia StillkingFilms OzumiFilms Propeler SilverbellFilms RevolutionSunStudiosCzechRepublic Poland Slovenia UnitedStatesofAmericabasedonnovel fictionalplace brothersisterrelationship lion humanbeing wretch leapintime matteroflifeanddeath faith uncle narnia fantasyworldEnglish
16                                                               marvel avengers_movieScienceFiction Action AdventureParamountPictures MarvelStudiosUnitedStatesofAmericanewyork shield marvelcomic superhero basedoncomicbook alieninvasion superheroteam aftercreditsstinger duringcreditsstinger marvelcinematicuniverseEnglish
17            disney go pirates

In [17]:
# creating a new dataframe, to store the processed columns
processed_df = pd.DataFrame(movies_df[['id','original_title']])
processed_df.head()

Unnamed: 0,id,original_title
0,19995,Avatar
1,285,Pirates of the Caribbean: At World's End
2,206647,Spectre
3,49026,The Dark Knight Rises
4,49529,John Carter


In [18]:
# Sequence generator
def number_generator(start = 0, step = 1):
    num = start
    while True:
        yield num
        num += step

In [19]:
# creating a generator object
num_gen = number_generator(start = 2, step = 1)

# Budget Column

In [20]:
# processing budget column
# We creating bins so there wont be a vector component for each unique value.
movies_df.budget.fillna(0, inplace = True)
bin_edges = [0,1000000,50000000,100000000,float('inf')]
bin_labels = ['Low Budget', 'Medium Budget', 'High Budget', 'Very High Budget']
budget_categories = pd.cut(movies_df['budget'], bins = bin_edges, labels = bin_labels, right = True, include_lowest = True)

In [21]:
# Counting NaN values.
budget_categories.isna().value_counts()

False    4803
Name: budget, dtype: int64

In [22]:
# inserting the series into the dataframe
processed_df.insert(next(num_gen),'budget_categories',budget_categories)
processed_df

Unnamed: 0,id,original_title,budget_categories
0,19995,Avatar,Very High Budget
1,285,Pirates of the Caribbean: At World's End,Very High Budget
2,206647,Spectre,Very High Budget
3,49026,The Dark Knight Rises,Very High Budget
4,49529,John Carter,Very High Budget
...,...,...,...
4798,9367,El Mariachi,Low Budget
4799,72766,Newlyweds,Low Budget
4800,231617,"Signed, Sealed, Delivered",Low Budget
4801,126186,Shanghai Calling,Low Budget


# Runtime

In [23]:
movies_df.runtime.fillna(0, inplace = True)
#movies_df.runtime[movies_df.runtime.isna()] #empty
runtime_bins_edges = [0,90,120,150,float('inf')]
runtime_labels = ['short','medium','long','Very Long']
runtime_categories = pd.cut(movies_df['runtime'], bins = runtime_bins_edges, right = True, labels = runtime_labels, include_lowest = True)
runtime_categories


0       Very Long
1       Very Long
2            long
3       Very Long
4            long
          ...    
4798        short
4799        short
4800       medium
4801       medium
4802        short
Name: runtime, Length: 4803, dtype: category
Categories (4, object): ['short' < 'medium' < 'long' < 'Very Long']

In [24]:
# checking the copy dataframe(movies_df) for NaN values based on NaN values in the series(runtime_categories).
movies_df.iloc[runtime_categories[runtime_categories.isna()].index]

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,Movie_detes_in_str


In [25]:
# inserting runtime_categories
processed_df.insert(next(num_gen), 'Duration', runtime_categories)
processed_df.head()

Unnamed: 0,id,original_title,budget_categories,Duration
0,19995,Avatar,Very High Budget,Very Long
1,285,Pirates of the Caribbean: At World's End,Very High Budget,Very Long
2,206647,Spectre,Very High Budget,long
3,49026,The Dark Knight Rises,Very High Budget,Very Long
4,49529,John Carter,Very High Budget,long


# Popularity

In [26]:
# Stats
movies_df['popularity'].describe()

count    4803.000000
mean       21.492301
std        31.816650
min         0.000000
25%         4.668070
50%        12.921594
75%        28.313505
max       875.581305
Name: popularity, dtype: float64

In [27]:
# Categorizing the values into bins
movies_df['popularity'].fillna(0,inplace = True)
pop_edges = [0, 50, 75, 150, 250, 400, 600, float('inf')]
pop_labels = ['Very low', 'low', 'average', 'Medium', 'High', 'Very High', 'Stellar']
pop_categories = pd.cut(movies_df['popularity'], bins = pop_edges, labels =pop_labels, right = True,
                        include_lowest = True)
pop_categories

0         Medium
1        average
2        average
3        average
4       Very low
          ...   
4798    Very low
4799    Very low
4800    Very low
4801    Very low
4802    Very low
Name: popularity, Length: 4803, dtype: category
Categories (7, object): ['Very low' < 'low' < 'average' < 'Medium' < 'High' < 'Very High' < 'Stellar']

In [28]:
# validating the NaN
movies_df.iloc[pop_categories[pop_categories.isna()].index]

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,Movie_detes_in_str


In [29]:
# inserting popularity
processed_df.insert(next(num_gen), 'Popularity', pop_categories)
processed_df.head()

Unnamed: 0,id,original_title,budget_categories,Duration,Popularity
0,19995,Avatar,Very High Budget,Very Long,Medium
1,285,Pirates of the Caribbean: At World's End,Very High Budget,Very Long,average
2,206647,Spectre,Very High Budget,long,average
3,49026,The Dark Knight Rises,Very High Budget,Very Long,average
4,49529,John Carter,Very High Budget,long,Very low


# Release Date

In [30]:
from datetime import datetime as dt

In [31]:
# Extracting the series as datetime. 
release_dt = movies_df['release_date'].astype('datetime64', copy = True) # here [ns] means nanoseconds

In [32]:
# The movie America is still the place was released in 10 june 2022
movies_df.at[4553,'release_date'] = '2022-06-10'

In [33]:
# categorizing the release date
release_month = release_dt.dt.strftime('%B')
release_year = release_dt.dt.strftime('%Y')
release_day = release_dt.dt.strftime('%A')

print('Day : ',release_day, '  month : ',release_month, '  year : ', release_year)

Day :  0        Thursday
1        Saturday
2          Monday
3          Monday
4       Wednesday
          ...    
4798       Friday
4799       Monday
4800       Sunday
4801     Thursday
4802       Friday
Name: release_date, Length: 4803, dtype: object   month :  0        December
1             May
2         October
3            July
4           March
          ...    
4798    September
4799     December
4800      October
4801          May
4802       August
Name: release_date, Length: 4803, dtype: object   year :  0       2009
1       2007
2       2015
3       2012
4       2012
        ... 
4798    1992
4799    2011
4800    2013
4801    2012
4802    2005
Name: release_date, Length: 4803, dtype: object


In [34]:
#pd.set_option('display.max_columns', None)
#movies_df.iloc[release_day[release_day.isna()].index]

In [35]:
# inserting release day, month  and year
# The release day is off less importance when it comes to similarity execpt when querying all the movies that 
# were released on a particular day. Ex: Display all the movies that were released on Christmas Eve.
processed_df.insert(next(num_gen), 'Day', release_day)

# month plays important role too.
processed_df.insert(next(num_gen), 'Month', release_month)

# Year is very important when we categorize them as 90's, 80's movies etc.
processed_df.insert(next(num_gen), 'Year', release_year)

processed_df.head()

Unnamed: 0,id,original_title,budget_categories,Duration,Popularity,Day,Month,Year
0,19995,Avatar,Very High Budget,Very Long,Medium,Thursday,December,2009
1,285,Pirates of the Caribbean: At World's End,Very High Budget,Very Long,average,Saturday,May,2007
2,206647,Spectre,Very High Budget,long,average,Monday,October,2015
3,49026,The Dark Knight Rises,Very High Budget,Very Long,average,Monday,July,2012
4,49529,John Carter,Very High Budget,long,Very low,Wednesday,March,2012


# REVENUE

In [36]:
# Categoizing Revenue
rev_bins = [0,50,100,150,200,float('inf')]
rev_labels = ['Very Low revenue','Low revenue', 'medium revenue', 'high revenue','Blockbuster']
revenue_categories = pd.cut(movies_df['revenue'], bins = rev_bins, labels = rev_labels, right = True, include_lowest = True)

In [37]:
# validating
revenue_categories[revenue_categories.isna()].index

Int64Index([], dtype='int64')

In [38]:
revenue_categories

0            Blockbuster
1            Blockbuster
2            Blockbuster
3            Blockbuster
4            Blockbuster
              ...       
4798         Blockbuster
4799    Very Low revenue
4800    Very Low revenue
4801    Very Low revenue
4802    Very Low revenue
Name: revenue, Length: 4803, dtype: category
Categories (5, object): ['Very Low revenue' < 'Low revenue' < 'medium revenue' < 'high revenue' < 'Blockbuster']

In [39]:
# Inserting Revenue categorization
processed_df.insert(next(num_gen),'Revenue', revenue_categories)
processed_df.head()

Unnamed: 0,id,original_title,budget_categories,Duration,Popularity,Day,Month,Year,Revenue
0,19995,Avatar,Very High Budget,Very Long,Medium,Thursday,December,2009,Blockbuster
1,285,Pirates of the Caribbean: At World's End,Very High Budget,Very Long,average,Saturday,May,2007,Blockbuster
2,206647,Spectre,Very High Budget,long,average,Monday,October,2015,Blockbuster
3,49026,The Dark Knight Rises,Very High Budget,Very Long,average,Monday,July,2012,Blockbuster
4,49529,John Carter,Very High Budget,long,Very low,Wednesday,March,2012,Blockbuster


# Vote Average

In [40]:
# Creating bins
vote_avg_bins = [0,4,6,8,10]
vote_avg_labels = ['Poor','Average','Good', 'Excellent']
vote_avg_cats = pd.cut(movies_df['vote_average'], bins = vote_avg_bins, labels = vote_avg_labels, 
                       right = True, include_lowest = True)

In [41]:
# Validating.
vote_avg_cats[vote_avg_cats.isna()].index # No NaN

Int64Index([], dtype='int64')

In [42]:
vote_avg_cats

0          Good
1          Good
2          Good
3          Good
4          Good
         ...   
4798       Good
4799    Average
4800       Good
4801    Average
4802       Good
Name: vote_average, Length: 4803, dtype: category
Categories (4, object): ['Poor' < 'Average' < 'Good' < 'Excellent']

In [43]:
# Inserting Vote Averages categories
processed_df.insert(next(num_gen), 'Vote Average Category', vote_avg_cats)
processed_df.head()

Unnamed: 0,id,original_title,budget_categories,Duration,Popularity,Day,Month,Year,Revenue,Vote Average Category
0,19995,Avatar,Very High Budget,Very Long,Medium,Thursday,December,2009,Blockbuster,Good
1,285,Pirates of the Caribbean: At World's End,Very High Budget,Very Long,average,Saturday,May,2007,Blockbuster,Good
2,206647,Spectre,Very High Budget,long,average,Monday,October,2015,Blockbuster,Good
3,49026,The Dark Knight Rises,Very High Budget,Very Long,average,Monday,July,2012,Blockbuster,Good
4,49529,John Carter,Very High Budget,long,Very low,Wednesday,March,2012,Blockbuster,Good


# Vote Count

In [44]:
# Stats
movies_df['vote_count'].describe()

count     4803.000000
mean       690.217989
std       1234.585891
min          0.000000
25%         54.000000
50%        235.000000
75%        737.000000
max      13752.000000
Name: vote_count, dtype: float64

In [45]:
# Creating Bins
vote_count_bins = [0,500,1000,5000,float('inf')]
vote_count_labels = ['Low','Medium','High',' Super High']
vote_count_cats = pd.cut(movies_df['vote_count'], bins = vote_count_bins, labels = vote_count_labels, 
                         right = True, include_lowest = True)

In [46]:
# validating.
vote_count_cats[vote_count_cats.isna()].index # No NaN

Int64Index([], dtype='int64')

In [47]:
vote_count_cats

0        Super High
1              High
2              High
3        Super High
4              High
           ...     
4798            Low
4799            Low
4800            Low
4801            Low
4802            Low
Name: vote_count, Length: 4803, dtype: category
Categories (4, object): ['Low' < 'Medium' < 'High' < ' Super High']

In [48]:
# inserting the vote count category
processed_df.insert(next(num_gen), 'Vote Count Category', vote_count_cats)
processed_df.head()

Unnamed: 0,id,original_title,budget_categories,Duration,Popularity,Day,Month,Year,Revenue,Vote Average Category,Vote Count Category
0,19995,Avatar,Very High Budget,Very Long,Medium,Thursday,December,2009,Blockbuster,Good,Super High
1,285,Pirates of the Caribbean: At World's End,Very High Budget,Very Long,average,Saturday,May,2007,Blockbuster,Good,High
2,206647,Spectre,Very High Budget,long,average,Monday,October,2015,Blockbuster,Good,High
3,49026,The Dark Knight Rises,Very High Budget,Very Long,average,Monday,July,2012,Blockbuster,Good,Super High
4,49529,John Carter,Very High Budget,long,Very low,Wednesday,March,2012,Blockbuster,Good,High


# Final Dataframe with other unprocessed text columns

In [49]:
processed_df

Unnamed: 0,id,original_title,budget_categories,Duration,Popularity,Day,Month,Year,Revenue,Vote Average Category,Vote Count Category
0,19995,Avatar,Very High Budget,Very Long,Medium,Thursday,December,2009,Blockbuster,Good,Super High
1,285,Pirates of the Caribbean: At World's End,Very High Budget,Very Long,average,Saturday,May,2007,Blockbuster,Good,High
2,206647,Spectre,Very High Budget,long,average,Monday,October,2015,Blockbuster,Good,High
3,49026,The Dark Knight Rises,Very High Budget,Very Long,average,Monday,July,2012,Blockbuster,Good,Super High
4,49529,John Carter,Very High Budget,long,Very low,Wednesday,March,2012,Blockbuster,Good,High
...,...,...,...,...,...,...,...,...,...,...,...
4798,9367,El Mariachi,Low Budget,short,Very low,Friday,September,1992,Blockbuster,Good,Low
4799,72766,Newlyweds,Low Budget,short,Very low,Monday,December,2011,Very Low revenue,Average,Low
4800,231617,"Signed, Sealed, Delivered",Low Budget,medium,Very low,Sunday,October,2013,Very Low revenue,Good,Low
4801,126186,Shanghai Calling,Low Budget,medium,Very low,Thursday,May,2012,Very Low revenue,Average,Low


In [50]:
# define normal, processed and complex_columns

processed_columns = list(processed_df.columns)[1:]
complex_columns = ['genres','production_companies','production_countries', 'keywords','spoken_languages']
normal_columns = ['original_language', 'overview','status','tagline','title']
print(processed_columns,complex_columns,normal_columns)

['original_title', 'budget_categories', 'Duration', 'Popularity', 'Day', 'Month', 'Year', 'Revenue', 'Vote Average Category', 'Vote Count Category'] ['genres', 'production_companies', 'production_countries', 'keywords', 'spoken_languages'] ['original_language', 'overview', 'status', 'tagline', 'title']


In [51]:
final_df = pd.DataFrame(data = ['' for i in range(len(movies_df))], columns = ['movie_detes_str'])

final_df['movie_detes_str'].str.cat(movies_df['status'])

0       Released
1       Released
2       Released
3       Released
4       Released
          ...   
4798    Released
4799    Released
4800    Released
4801    Released
4802    Released
Name: movie_detes_str, Length: 4803, dtype: object

In [52]:
# Normal columns
for col in normal_columns:
    print(col)
    #gamma = movies_df[col].apply(nltk.word_tokenize)
    #print(gamma)
    final_df['movie_detes_str'] = final_df['movie_detes_str'] + ' ' + ' '.join(movies_df[col].apply(nltk.word_tokenize)[0])
final_df['movie_detes_str']

original_language
overview
status
tagline
title


0        en In the 22nd century , a paraplegic Marine is dispatched to the moon Pandora on a unique mission , but becomes torn between following orders and protecting an alien civilization . Released Enter the World of Pandora . Avatar
1        en In the 22nd century , a paraplegic Marine is dispatched to the moon Pandora on a unique mission , but becomes torn between following orders and protecting an alien civilization . Released Enter the World of Pandora . Avatar
2        en In the 22nd century , a paraplegic Marine is dispatched to the moon Pandora on a unique mission , but becomes torn between following orders and protecting an alien civilization . Released Enter the World of Pandora . Avatar
3        en In the 22nd century , a paraplegic Marine is dispatched to the moon Pandora on a unique mission , but becomes torn between following orders and protecting an alien civilization . Released Enter the World of Pandora . Avatar
4        en In the 22nd century , a paraplegic Marine is

In [53]:
# complex columns
for col in complex_columns:
    final_df['movie_detes_str'] = final_df['movie_detes_str'] + ' ' + movies_df[col].apply(list_of_items)
final_df['movie_detes_str']

0        en In the 22nd century , a paraplegic Marine is dispatched to the moon Pandora on a unique mission , but becomes torn between following orders and protecting an alien civilization . Released Enter the World of Pandora . Avatar Action Adventure Fantasy ScienceFiction IngeniousFilmPartners TwentiethCenturyFoxFilmCorporation DuneEntertainment LightstormEntertainment UnitedStatesofAmerica UnitedKingdom cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d English Español
1                                                                                       en In the 22nd century , a paraplegic Marine is dispatched to the moon Pandora on a unique mission , but becomes torn between following orders and protecting an alien civilization . Released Enter the World of Pandora . Avatar Adventure Fantasy Action WaltDisneyPictures JerryBruckheimerFilms SecondM

In [54]:
# processed columns
for col in processed_columns:
    print(col)
    final_df['movie_detes_str'] = final_df['movie_detes_str'] + ' ' + processed_df[col].astype(str)
final_df['movie_detes_str']

original_title
budget_categories
Duration
Popularity
Day
Month
Year
Revenue
Vote Average Category
Vote Count Category


0        en In the 22nd century , a paraplegic Marine is dispatched to the moon Pandora on a unique mission , but becomes torn between following orders and protecting an alien civilization . Released Enter the World of Pandora . Avatar Action Adventure Fantasy ScienceFiction IngeniousFilmPartners TwentiethCenturyFoxFilmCorporation DuneEntertainment LightstormEntertainment UnitedStatesofAmerica UnitedKingdom cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d English Español Avatar Very High Budget Very Long Medium Thursday December 2009 Blockbuster Good  Super High
1                                                                en In the 22nd century , a paraplegic Marine is dispatched to the moon Pandora on a unique mission , but becomes torn between following orders and protecting an alien civilization . Released Enter the World of Pandora . Avatar Adv

In [55]:
# homepage column
final_df['movie_detes_str'] = final_df['movie_detes_str'] + ' ' + movies_df['homepage'].apply(clean_URL)
final_df['movie_detes_str']

0        en In the 22nd century , a paraplegic Marine is dispatched to the moon Pandora on a unique mission , but becomes torn between following orders and protecting an alien civilization . Released Enter the World of Pandora . Avatar Action Adventure Fantasy ScienceFiction IngeniousFilmPartners TwentiethCenturyFoxFilmCorporation DuneEntertainment LightstormEntertainment UnitedStatesofAmerica UnitedKingdom cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d English Español Avatar Very High Budget Very Long Medium Thursday December 2009 Blockbuster Good  Super High avatarmovie
1                                           en In the 22nd century , a paraplegic Marine is dispatched to the moon Pandora on a unique mission , but becomes torn between following orders and protecting an alien civilization . Released Enter the World of Pandora . Avatar Adventure Fa

In [56]:
pd.set_option('display.max_colwidth',500000)
final_df

Unnamed: 0,movie_detes_str
0,"en In the 22nd century , a paraplegic Marine is dispatched to the moon Pandora on a unique mission , but becomes torn between following orders and protecting an alien civilization . Released Enter the World of Pandora . Avatar Action Adventure Fantasy ScienceFiction IngeniousFilmPartners TwentiethCenturyFoxFilmCorporation DuneEntertainment LightstormEntertainment UnitedStatesofAmerica UnitedKingdom cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d English Español Avatar Very High Budget Very Long Medium Thursday December 2009 Blockbuster Good Super High avatarmovie"
1,"en In the 22nd century , a paraplegic Marine is dispatched to the moon Pandora on a unique mission , but becomes torn between following orders and protecting an alien civilization . Released Enter the World of Pandora . Avatar Adventure Fantasy Action WaltDisneyPictures JerryBruckheimerFilms SecondMateProductions UnitedStatesofAmerica ocean drugabuse exoticisland eastindiatradingcompany loveofone'slife traitor shipwreck strongwoman ship alliance calypso afterlife fighter pirate swashbuckler aftercreditsstinger English Pirates of the Caribbean: At World's End Very High Budget Very Long average Saturday May 2007 Blockbuster Good High disney go disneypictures pirates"
2,"en In the 22nd century , a paraplegic Marine is dispatched to the moon Pandora on a unique mission , but becomes torn between following orders and protecting an alien civilization . Released Enter the World of Pandora . Avatar Action Adventure Crime ColumbiaPictures Danjaq B24 UnitedKingdom UnitedStatesofAmerica spy basedonnovel secretagent sequel mi6 britishsecretservice unitedkingdom Français English Español Italiano Deutsch Spectre Very High Budget long average Monday October 2015 Blockbuster Good High sonypictures movies spectre"
3,"en In the 22nd century , a paraplegic Marine is dispatched to the moon Pandora on a unique mission , but becomes torn between following orders and protecting an alien civilization . Released Enter the World of Pandora . Avatar Action Crime Drama Thriller LegendaryPictures WarnerBros. DCEntertainment Syncopy UnitedStatesofAmerica dccomics crimefighter terrorist secretidentity burglar hostagedrama timebomb gothamcity vigilante cover-up superhero villainess tragichero terrorism destruction catwoman catburglar imax flood criminalunderworld batman English The Dark Knight Rises Very High Budget Very Long average Monday July 2012 Blockbuster Good Super High thedarkknightrises"
4,"en In the 22nd century , a paraplegic Marine is dispatched to the moon Pandora on a unique mission , but becomes torn between following orders and protecting an alien civilization . Released Enter the World of Pandora . Avatar Action Adventure ScienceFiction WaltDisneyPictures UnitedStatesofAmerica basedonnovel mars medallion spacetravel princess alien steampunk martian escape edgarriceburroughs alienrace superhumanstrength marscivilization swordandplanet 19thcentury 3d English John Carter Very High Budget long Very low Wednesday March 2012 Blockbuster Good High movies disney john carter"
...,...
4798,"en In the 22nd century , a paraplegic Marine is dispatched to the moon Pandora on a unique mission , but becomes torn between following orders and protecting an alien civilization . Released Enter the World of Pandora . Avatar Action Crime Thriller ColumbiaPictures Mexico UnitedStatesofAmerica unitedstates–mexicobarrier legs arms paperknife guitarcase Español El Mariachi Low Budget short Very low Friday September 1992 Blockbuster Good Low"
4799,"en In the 22nd century , a paraplegic Marine is dispatched to the moon Pandora on a unique mission , but becomes torn between following orders and protecting an alien civilization . Released Enter the World of Pandora . Avatar Comedy Romance Newlyweds Low Budget short Very low Monday December 2011 Very Low revenue Average Low"
4800,"en In the 22nd century , a paraplegic Marine is dispatched to the moon Pandora on a unique mission , but becomes torn between following orders and protecting an alien civilization . Released Enter the World of Pandora . Avatar Comedy Drama Romance TVMovie FrontStreetPictures MuseEntertainmentEnterprises UnitedStatesofAmerica date loveatfirstsight narration investigation team postalworker English Signed, Sealed, Delivered Low Budget medium Very low Sunday October 2013 Very Low revenue Good Low hallmarkchannel signedsealeddelivered"
4801,"en In the 22nd century , a paraplegic Marine is dispatched to the moon Pandora on a unique mission , but becomes torn between following orders and protecting an alien civilization . Released Enter the World of Pandora . Avatar UnitedStatesofAmerica China English Shanghai Calling Low Budget medium Very low Thursday May 2012 Very Low revenue Average Low shanghaicalling"


In [57]:
final_df['movie_name'] = movies_df['original_title'].copy(deep = True)
final_df

Unnamed: 0,movie_detes_str,movie_name
0,"en In the 22nd century , a paraplegic Marine is dispatched to the moon Pandora on a unique mission , but becomes torn between following orders and protecting an alien civilization . Released Enter the World of Pandora . Avatar Action Adventure Fantasy ScienceFiction IngeniousFilmPartners TwentiethCenturyFoxFilmCorporation DuneEntertainment LightstormEntertainment UnitedStatesofAmerica UnitedKingdom cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d English Español Avatar Very High Budget Very Long Medium Thursday December 2009 Blockbuster Good Super High avatarmovie",Avatar
1,"en In the 22nd century , a paraplegic Marine is dispatched to the moon Pandora on a unique mission , but becomes torn between following orders and protecting an alien civilization . Released Enter the World of Pandora . Avatar Adventure Fantasy Action WaltDisneyPictures JerryBruckheimerFilms SecondMateProductions UnitedStatesofAmerica ocean drugabuse exoticisland eastindiatradingcompany loveofone'slife traitor shipwreck strongwoman ship alliance calypso afterlife fighter pirate swashbuckler aftercreditsstinger English Pirates of the Caribbean: At World's End Very High Budget Very Long average Saturday May 2007 Blockbuster Good High disney go disneypictures pirates",Pirates of the Caribbean: At World's End
2,"en In the 22nd century , a paraplegic Marine is dispatched to the moon Pandora on a unique mission , but becomes torn between following orders and protecting an alien civilization . Released Enter the World of Pandora . Avatar Action Adventure Crime ColumbiaPictures Danjaq B24 UnitedKingdom UnitedStatesofAmerica spy basedonnovel secretagent sequel mi6 britishsecretservice unitedkingdom Français English Español Italiano Deutsch Spectre Very High Budget long average Monday October 2015 Blockbuster Good High sonypictures movies spectre",Spectre
3,"en In the 22nd century , a paraplegic Marine is dispatched to the moon Pandora on a unique mission , but becomes torn between following orders and protecting an alien civilization . Released Enter the World of Pandora . Avatar Action Crime Drama Thriller LegendaryPictures WarnerBros. DCEntertainment Syncopy UnitedStatesofAmerica dccomics crimefighter terrorist secretidentity burglar hostagedrama timebomb gothamcity vigilante cover-up superhero villainess tragichero terrorism destruction catwoman catburglar imax flood criminalunderworld batman English The Dark Knight Rises Very High Budget Very Long average Monday July 2012 Blockbuster Good Super High thedarkknightrises",The Dark Knight Rises
4,"en In the 22nd century , a paraplegic Marine is dispatched to the moon Pandora on a unique mission , but becomes torn between following orders and protecting an alien civilization . Released Enter the World of Pandora . Avatar Action Adventure ScienceFiction WaltDisneyPictures UnitedStatesofAmerica basedonnovel mars medallion spacetravel princess alien steampunk martian escape edgarriceburroughs alienrace superhumanstrength marscivilization swordandplanet 19thcentury 3d English John Carter Very High Budget long Very low Wednesday March 2012 Blockbuster Good High movies disney john carter",John Carter
...,...,...
4798,"en In the 22nd century , a paraplegic Marine is dispatched to the moon Pandora on a unique mission , but becomes torn between following orders and protecting an alien civilization . Released Enter the World of Pandora . Avatar Action Crime Thriller ColumbiaPictures Mexico UnitedStatesofAmerica unitedstates–mexicobarrier legs arms paperknife guitarcase Español El Mariachi Low Budget short Very low Friday September 1992 Blockbuster Good Low",El Mariachi
4799,"en In the 22nd century , a paraplegic Marine is dispatched to the moon Pandora on a unique mission , but becomes torn between following orders and protecting an alien civilization . Released Enter the World of Pandora . Avatar Comedy Romance Newlyweds Low Budget short Very low Monday December 2011 Very Low revenue Average Low",Newlyweds
4800,"en In the 22nd century , a paraplegic Marine is dispatched to the moon Pandora on a unique mission , but becomes torn between following orders and protecting an alien civilization . Released Enter the World of Pandora . Avatar Comedy Drama Romance TVMovie FrontStreetPictures MuseEntertainmentEnterprises UnitedStatesofAmerica date loveatfirstsight narration investigation team postalworker English Signed, Sealed, Delivered Low Budget medium Very low Sunday October 2013 Very Low revenue Good Low hallmarkchannel signedsealeddelivered","Signed, Sealed, Delivered"
4801,"en In the 22nd century , a paraplegic Marine is dispatched to the moon Pandora on a unique mission , but becomes torn between following orders and protecting an alien civilization . Released Enter the World of Pandora . Avatar UnitedStatesofAmerica China English Shanghai Calling Low Budget medium Very low Thursday May 2012 Very Low revenue Average Low shanghaicalling",Shanghai Calling


In [58]:
final_df[final_df['movie_detes_str'].isna()]

Unnamed: 0,movie_detes_str,movie_name


In [59]:
movies_df.iloc[4140]

budget                                                                                                                                                                                                                                                         2
genres                                                                                                                                                                                                                       [{"id": 99, "name": "Documentary"}]
homepage                                                                                                                                                                                                                                                        
id                                                                                                                                                                                                                                   

In [60]:
processed_df.iloc[4140]

id                                            459488
original_title           To Be Frank, Sinatra at 100
budget_categories                         Low Budget
Duration                                       short
Popularity                                  Very low
Day                                         Saturday
Month                                       December
Year                                            2015
Revenue                             Very Low revenue
Vote Average Category                           Poor
Vote Count Category                              Low
Name: 4140, dtype: object

In [61]:
# using the TfidfVEctorizer to get our vector matrix
tfidf_vec = TfidfVectorizer()
vector_matrix = tfidf_vec.fit_transform(final_df['movie_detes_str'])
vector_matrix

<4803x20324 sparse matrix of type '<class 'numpy.float64'>'
	with 283794 stored elements in Compressed Sparse Row format>

In [63]:
# Get the user input.
main_id= ''
main_index = ''
movie_name = str(input('Enter movie name : '))
#movie_index = films_df['original_title'][films_df['original_title'] == movie_name].index
movie_index = films_df[films_df['original_title'].str.contains(movie_name,case = False)].index
print(movie_index)

if len(movie_index) > 1:
    print(' Following are the movies related to search : \n',films_df['original_title'][movie_index])
    actual_movie_index = int(input('Enter the desired movie number: '))
    print('movie selected : ',films_df['original_title'][films_df.index == actual_movie_index])
    print(vector_matrix[actual_movie_index])
    main_index = actual_movie_index
    main_id = films_df['id'][films_df.index == actual_movie_index]
    actual_movie_index = ''
else:
    print('movie selected : ',films_df['original_title'][movie_index])
    print(vector_matrix[movie_index])
    main_index = movie_index
    main_id = films_df['id'][movie_index]
    movie_index = ''
print(main_id)

Enter movie name : Scream 3
Int64Index([1164], dtype='int64')
movie selected :  1164    Scream 3
Name: original_title, dtype: object
  (0, 10678)	0.2545841549679313
  (0, 4234)	0.2545841549679313
  (0, 15753)	0.23162331199804395
  (0, 11302)	0.27754499793781867
  (0, 9819)	0.24947783750393213
  (0, 16253)	0.1961644111298537
  (0, 4980)	0.18059530859427003
  (0, 4076)	0.2023937342502158
  (0, 15550)	0.4901090882407501
  (0, 5981)	0.18783949647152182
  (0, 149)	0.14400664245449343
  (0, 14749)	0.2086624690281566
  (0, 11014)	0.22008548058683847
  (0, 12055)	0.1191415111828197
  (0, 6505)	0.23766280368607134
  (0, 6341)	0.12214178046536259
  (0, 8443)	0.10652206601116722
  (0, 10577)	0.0344191839372663
  (0, 1434)	0.06200469169678607
  (0, 2200)	0.04499771073396878
  (0, 18206)	0.08798451642483693
  (0, 11192)	0.11925639157312934
  (0, 2681)	0.03312549428728671
  (0, 19178)	0.03503431182906103
  (0, 5752)	0.03539417552123724
  :	:
  (0, 627)	0.03312549428728671
  (0, 838)	0.03312549428728

In [64]:
# We can use index or 'id' column to match. here we use index. 

#movie_id_list = [ind for ind in films_df['id'] if ind != main_id.values]
#movie_id_list

movie_index_list = [i for i in range(len(movies_df)) if i != main_index]
movie_index_list


[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [65]:
# create a list of combinations movies for our queried movie.
movie_vec_comb_list = []
for idx in movie_index_list:
    movie_vec_comb_list.append([films_df['original_title'][main_index].values[0], 
                                vector_matrix[main_index], 
                               films_df['original_title'][idx], 
                               vector_matrix[idx]])
movie_vec_comb_list

[['Scream 3',
  <1x20324 sparse matrix of type '<class 'numpy.float64'>'
  	with 57 stored elements in Compressed Sparse Row format>,
  'Avatar',
  <1x20324 sparse matrix of type '<class 'numpy.float64'>'
  	with 74 stored elements in Compressed Sparse Row format>],
 ['Scream 3',
  <1x20324 sparse matrix of type '<class 'numpy.float64'>'
  	with 57 stored elements in Compressed Sparse Row format>,
  "Pirates of the Caribbean: At World's End",
  <1x20324 sparse matrix of type '<class 'numpy.float64'>'
  	with 73 stored elements in Compressed Sparse Row format>],
 ['Scream 3',
  <1x20324 sparse matrix of type '<class 'numpy.float64'>'
  	with 57 stored elements in Compressed Sparse Row format>,
  'Spectre',
  <1x20324 sparse matrix of type '<class 'numpy.float64'>'
  	with 63 stored elements in Compressed Sparse Row format>],
 ['Scream 3',
  <1x20324 sparse matrix of type '<class 'numpy.float64'>'
  	with 57 stored elements in Compressed Sparse Row format>,
  'The Dark Knight Rises',
  <

In [68]:
# creating a Dataframe.
vector_df = pd.DataFrame(data = movie_vec_comb_list, columns = ['Searched Movie', 'Searched_Movie_Vector',
                                                               'Comparison Movie', 'Comparison_Movie_Vector'])
vector_df.head()
#vector_df['Searched_Movie_Vector'][0]

Unnamed: 0,Searched Movie,Searched_Movie_Vector,Comparison Movie,Comparison_Movie_Vector
0,Scream 3,"(0, 10678)\t0.2545841549679313\n (0, 4234)\t0.2545841549679313\n (0, 15753)\t0.23162331199804395\n (0, 11302)\t0.27754499793781867\n (0, 9819)\t0.24947783750393213\n (0, 16253)\t0.1961644111298537\n (0, 4980)\t0.18059530859427003\n (0, 4076)\t0.2023937342502158\n (0, 15550)\t0.4901090882407501\n (0, 5981)\t0.18783949647152182\n (0, 149)\t0.14400664245449343\n (0, 14749)\t0.2086624690281566\n (0, 11014)\t0.22008548058683847\n (0, 12055)\t0.1191415111828197\n (0, 6505)\t0.23766280368607134\n (0, 6341)\t0.12214178046536259\n (0, 8443)\t0.10652206601116722\n (0, 10577)\t0.0344191839372663\n (0, 1434)\t0.06200469169678607\n (0, 2200)\t0.04499771073396878\n (0, 18206)\t0.08798451642483693\n (0, 11192)\t0.11925639157312934\n (0, 2681)\t0.03312549428728671\n (0, 19178)\t0.03503431182906103\n (0, 5752)\t0.03539417552123724\n :\t:\n (0, 627)\t0.03312549428728671\n (0, 838)\t0.03312549428728671\n (0, 14164)\t0.03312549428728671\n (0, 868)\t0.03312549428728671\n (0, 12866)\t0.03312549428728671\n (0, 6785)\t0.03312549428728671\n (0, 1984)\t0.03312549428728671\n (0, 18381)\t0.03312549428728671\n (0, 1834)\t0.03312549428728671\n (0, 2785)\t0.03312549428728671\n (0, 11583)\t0.03312549428728671\n (0, 18950)\t0.03312549428728671\n (0, 12772)\t0.03312549428728671\n (0, 13083)\t0.06625098857457341\n (0, 11735)\t0.03312549428728671\n (0, 18294)\t0.03312549428728671\n (0, 5052)\t0.03312549428728671\n (0, 9171)\t0.03312549428728671\n (0, 10927)\t0.03312549428728671\n (0, 13150)\t0.03312549428728671\n (0, 3213)\t0.03312549428728671\n (0, 185)\t0.03312549428728671\n (0, 17781)\t0.09937648286186013\n (0, 8814)\t0.03312549428728671\n (0, 5712)\t0.03312549428728671",Avatar,"(0, 1425)\t0.2399407294771894\n (0, 17247)\t0.1350642483250686\n (0, 7524)\t0.04283291181332338\n (0, 2200)\t0.037105379831473315\n (0, 160)\t0.10805302429439348\n (0, 4726)\t0.09157571316437424\n (0, 18206)\t0.07255255540739221\n (0, 11192)\t0.0327798431999742\n (0, 10446)\t0.07037760973987855\n (0, 2681)\t0.027315479556323283\n (0, 8224)\t0.12385518886907104\n (0, 19178)\t0.057779003702526005\n (0, 5862)\t0.09870647961442798\n (0, 5752)\t0.029186247591611852\n (0, 236)\t0.1328101469895521\n (0, 11475)\t0.20993160796575006\n (0, 13918)\t0.221007081837081\n (0, 997)\t0.22886525560585838\n (0, 10549)\t0.221007081837081\n (0, 1750)\t0.16098883881420248\n (0, 16464)\t0.15313066504542514\n (0, 3224)\t0.20993160796575006\n (0, 642)\t0.20207343419697274\n (0, 18586)\t0.21491180872303237\n (0, 16578)\t0.16098883881420248\n :\t:\n (0, 627)\t0.054630959112646565\n (0, 838)\t0.027315479556323283\n (0, 14164)\t0.027315479556323283\n (0, 868)\t0.027315479556323283\n (0, 12866)\t0.027315479556323283\n (0, 6785)\t0.027315479556323283\n (0, 1984)\t0.027315479556323283\n (0, 18381)\t0.027315479556323283\n (0, 1834)\t0.027315479556323283\n (0, 2785)\t0.027315479556323283\n (0, 11583)\t0.027315479556323283\n (0, 18950)\t0.027315479556323283\n (0, 12772)\t0.027315479556323283\n (0, 13083)\t0.054630959112646565\n (0, 11735)\t0.027315479556323283\n (0, 18294)\t0.027315479556323283\n (0, 5052)\t0.027315479556323283\n (0, 9171)\t0.027315479556323283\n (0, 10927)\t0.054630959112646565\n (0, 13150)\t0.027315479556323283\n (0, 3213)\t0.027315479556323283\n (0, 185)\t0.027315479556323283\n (0, 17781)\t0.08194643866896985\n (0, 8814)\t0.027315479556323283\n (0, 5712)\t0.027315479556323283"
1,Scream 3,"(0, 10678)\t0.2545841549679313\n (0, 4234)\t0.2545841549679313\n (0, 15753)\t0.23162331199804395\n (0, 11302)\t0.27754499793781867\n (0, 9819)\t0.24947783750393213\n (0, 16253)\t0.1961644111298537\n (0, 4980)\t0.18059530859427003\n (0, 4076)\t0.2023937342502158\n (0, 15550)\t0.4901090882407501\n (0, 5981)\t0.18783949647152182\n (0, 149)\t0.14400664245449343\n (0, 14749)\t0.2086624690281566\n (0, 11014)\t0.22008548058683847\n (0, 12055)\t0.1191415111828197\n (0, 6505)\t0.23766280368607134\n (0, 6341)\t0.12214178046536259\n (0, 8443)\t0.10652206601116722\n (0, 10577)\t0.0344191839372663\n (0, 1434)\t0.06200469169678607\n (0, 2200)\t0.04499771073396878\n (0, 18206)\t0.08798451642483693\n (0, 11192)\t0.11925639157312934\n (0, 2681)\t0.03312549428728671\n (0, 19178)\t0.03503431182906103\n (0, 5752)\t0.03539417552123724\n :\t:\n (0, 627)\t0.03312549428728671\n (0, 838)\t0.03312549428728671\n (0, 14164)\t0.03312549428728671\n (0, 868)\t0.03312549428728671\n (0, 12866)\t0.03312549428728671\n (0, 6785)\t0.03312549428728671\n (0, 1984)\t0.03312549428728671\n (0, 18381)\t0.03312549428728671\n (0, 1834)\t0.03312549428728671\n (0, 2785)\t0.03312549428728671\n (0, 11583)\t0.03312549428728671\n (0, 18950)\t0.03312549428728671\n (0, 12772)\t0.03312549428728671\n (0, 13083)\t0.06625098857457341\n (0, 11735)\t0.03312549428728671\n (0, 18294)\t0.03312549428728671\n (0, 5052)\t0.03312549428728671\n (0, 9171)\t0.03312549428728671\n (0, 10927)\t0.03312549428728671\n (0, 13150)\t0.03312549428728671\n (0, 3213)\t0.03312549428728671\n (0, 185)\t0.03312549428728671\n (0, 17781)\t0.09937648286186013\n (0, 8814)\t0.03312549428728671\n (0, 5712)\t0.03312549428728671",Pirates of the Caribbean: At World's End,"(0, 5045)\t0.1643118981292195\n (0, 7447)\t0.1410829971185724\n (0, 5041)\t0.130136874168303\n (0, 158)\t0.10645118573623844\n (0, 11083)\t0.09054983221212384\n (0, 15391)\t0.10152913350632435\n (0, 1434)\t0.04745233136964085\n (0, 5718)\t0.1699688138704058\n (0, 1315)\t0.1618956882353867\n (0, 3000)\t0.1794676735048475\n (0, 13582)\t0.37508159827973325\n (0, 492)\t0.10991035982564125\n (0, 17361)\t0.19483383196346832\n (0, 13579)\t0.1716049309528212\n (0, 6457)\t0.1818838833986803\n (0, 495)\t0.1669828942481484\n (0, 2857)\t0.21240581723292912\n (0, 671)\t0.21240581723292912\n (0, 15946)\t0.15161673578952758\n (0, 17096)\t0.19483383196346832\n (0, 15950)\t0.1716049309528212\n (0, 18491)\t0.1656122352562811\n (0, 16280)\t0.1431968056214611\n (0, 10567)\t0.1431968056214611\n (0, 5487)\t0.1994558686681411\n :\t:\n (0, 627)\t0.025351016006825017\n (0, 838)\t0.025351016006825017\n (0, 14164)\t0.025351016006825017\n (0, 868)\t0.025351016006825017\n (0, 12866)\t0.025351016006825017\n (0, 6785)\t0.025351016006825017\n (0, 1984)\t0.025351016006825017\n (0, 18381)\t0.025351016006825017\n (0, 1834)\t0.025351016006825017\n (0, 2785)\t0.025351016006825017\n (0, 11583)\t0.025351016006825017\n (0, 18950)\t0.025351016006825017\n (0, 12772)\t0.025351016006825017\n (0, 13083)\t0.050702032013650034\n (0, 11735)\t0.025351016006825017\n (0, 18294)\t0.025351016006825017\n (0, 5052)\t0.025351016006825017\n (0, 9171)\t0.025351016006825017\n (0, 10927)\t0.025351016006825017\n (0, 13150)\t0.025351016006825017\n (0, 3213)\t0.025351016006825017\n (0, 185)\t0.025351016006825017\n (0, 17781)\t0.10140406402730007\n (0, 8814)\t0.025351016006825017\n (0, 5712)\t0.025351016006825017"
2,Scream 3,"(0, 10678)\t0.2545841549679313\n (0, 4234)\t0.2545841549679313\n (0, 15753)\t0.23162331199804395\n (0, 11302)\t0.27754499793781867\n (0, 9819)\t0.24947783750393213\n (0, 16253)\t0.1961644111298537\n (0, 4980)\t0.18059530859427003\n (0, 4076)\t0.2023937342502158\n (0, 15550)\t0.4901090882407501\n (0, 5981)\t0.18783949647152182\n (0, 149)\t0.14400664245449343\n (0, 14749)\t0.2086624690281566\n (0, 11014)\t0.22008548058683847\n (0, 12055)\t0.1191415111828197\n (0, 6505)\t0.23766280368607134\n (0, 6341)\t0.12214178046536259\n (0, 8443)\t0.10652206601116722\n (0, 10577)\t0.0344191839372663\n (0, 1434)\t0.06200469169678607\n (0, 2200)\t0.04499771073396878\n (0, 18206)\t0.08798451642483693\n (0, 11192)\t0.11925639157312934\n (0, 2681)\t0.03312549428728671\n (0, 19178)\t0.03503431182906103\n (0, 5752)\t0.03539417552123724\n :\t:\n (0, 627)\t0.03312549428728671\n (0, 838)\t0.03312549428728671\n (0, 14164)\t0.03312549428728671\n (0, 868)\t0.03312549428728671\n (0, 12866)\t0.03312549428728671\n (0, 6785)\t0.03312549428728671\n (0, 1984)\t0.03312549428728671\n (0, 18381)\t0.03312549428728671\n (0, 1834)\t0.03312549428728671\n (0, 2785)\t0.03312549428728671\n (0, 11583)\t0.03312549428728671\n (0, 18950)\t0.03312549428728671\n (0, 12772)\t0.03312549428728671\n (0, 13083)\t0.06625098857457341\n (0, 11735)\t0.03312549428728671\n (0, 18294)\t0.03312549428728671\n (0, 5052)\t0.03312549428728671\n (0, 9171)\t0.03312549428728671\n (0, 10927)\t0.03312549428728671\n (0, 13150)\t0.03312549428728671\n (0, 3213)\t0.03312549428728671\n (0, 185)\t0.03312549428728671\n (0, 17781)\t0.09937648286186013\n (0, 8814)\t0.03312549428728671\n (0, 5712)\t0.03312549428728671",Spectre,"(0, 11881)\t0.14632628847550494\n (0, 16512)\t0.16203268842886784\n (0, 166)\t0.13155899267379104\n (0, 12672)\t0.10633698951614493\n (0, 11682)\t0.129137237471931\n (0, 16645)\t0.5640884312061782\n (0, 4908)\t0.12538589018772825\n (0, 9202)\t0.135994813598042\n (0, 6946)\t0.10900839532003818\n (0, 2569)\t0.21149429974617057\n (0, 11355)\t0.26902527818767613\n (0, 15736)\t0.15808139492936935\n (0, 15628)\t0.1984753623307576\n (0, 1701)\t0.13450111867829778\n (0, 16757)\t0.1813677279170482\n (0, 1467)\t0.2820442156030891\n (0, 4545)\t0.23375032025921688\n (0, 3835)\t0.1338589242627645\n (0, 4273)\t0.09372535545356846\n (0, 1434)\t0.0601013513362116\n (0, 7524)\t0.05034899677394526\n (0, 2200)\t0.04361642882400877\n (0, 10446)\t0.0827270875534761\n (0, 2681)\t0.032108650424094984\n (0, 8224)\t0.14558861961062164\n :\t:\n (0, 627)\t0.032108650424094984\n (0, 838)\t0.032108650424094984\n (0, 14164)\t0.032108650424094984\n (0, 868)\t0.032108650424094984\n (0, 12866)\t0.032108650424094984\n (0, 6785)\t0.032108650424094984\n (0, 1984)\t0.032108650424094984\n (0, 18381)\t0.032108650424094984\n (0, 1834)\t0.032108650424094984\n (0, 2785)\t0.032108650424094984\n (0, 11583)\t0.032108650424094984\n (0, 18950)\t0.032108650424094984\n (0, 12772)\t0.032108650424094984\n (0, 13083)\t0.06421730084818997\n (0, 11735)\t0.032108650424094984\n (0, 18294)\t0.032108650424094984\n (0, 5052)\t0.032108650424094984\n (0, 9171)\t0.032108650424094984\n (0, 10927)\t0.032108650424094984\n (0, 13150)\t0.032108650424094984\n (0, 3213)\t0.032108650424094984\n (0, 185)\t0.032108650424094984\n (0, 17781)\t0.09632595127228495\n (0, 8814)\t0.032108650424094984\n (0, 5712)\t0.032108650424094984"
3,Scream 3,"(0, 10678)\t0.2545841549679313\n (0, 4234)\t0.2545841549679313\n (0, 15753)\t0.23162331199804395\n (0, 11302)\t0.27754499793781867\n (0, 9819)\t0.24947783750393213\n (0, 16253)\t0.1961644111298537\n (0, 4980)\t0.18059530859427003\n (0, 4076)\t0.2023937342502158\n (0, 15550)\t0.4901090882407501\n (0, 5981)\t0.18783949647152182\n (0, 149)\t0.14400664245449343\n (0, 14749)\t0.2086624690281566\n (0, 11014)\t0.22008548058683847\n (0, 12055)\t0.1191415111828197\n (0, 6505)\t0.23766280368607134\n (0, 6341)\t0.12214178046536259\n (0, 8443)\t0.10652206601116722\n (0, 10577)\t0.0344191839372663\n (0, 1434)\t0.06200469169678607\n (0, 2200)\t0.04499771073396878\n (0, 18206)\t0.08798451642483693\n (0, 11192)\t0.11925639157312934\n (0, 2681)\t0.03312549428728671\n (0, 19178)\t0.03503431182906103\n (0, 5752)\t0.03539417552123724\n :\t:\n (0, 627)\t0.03312549428728671\n (0, 838)\t0.03312549428728671\n (0, 14164)\t0.03312549428728671\n (0, 868)\t0.03312549428728671\n (0, 12866)\t0.03312549428728671\n (0, 6785)\t0.03312549428728671\n (0, 1984)\t0.03312549428728671\n (0, 18381)\t0.03312549428728671\n (0, 1834)\t0.03312549428728671\n (0, 2785)\t0.03312549428728671\n (0, 11583)\t0.03312549428728671\n (0, 18950)\t0.03312549428728671\n (0, 12772)\t0.03312549428728671\n (0, 13083)\t0.06625098857457341\n (0, 11735)\t0.03312549428728671\n (0, 18294)\t0.03312549428728671\n (0, 5052)\t0.03312549428728671\n (0, 9171)\t0.03312549428728671\n (0, 10927)\t0.03312549428728671\n (0, 13150)\t0.03312549428728671\n (0, 3213)\t0.03312549428728671\n (0, 185)\t0.03312549428728671\n (0, 17781)\t0.09937648286186013\n (0, 8814)\t0.03312549428728671\n (0, 5712)\t0.03312549428728671",The Dark Knight Rises,"(0, 17859)\t0.21353922480906345\n (0, 163)\t0.1002865984661079\n (0, 9520)\t0.08696383995129879\n (0, 14946)\t0.21353922480906345\n (0, 9788)\t0.1615145089567234\n (0, 4557)\t0.15313148388033945\n (0, 1744)\t0.16998179707512803\n (0, 4296)\t0.2036824234647052\n (0, 6732)\t0.19126432238922117\n (0, 8767)\t0.15213910677459033\n (0, 3102)\t0.21353922480906345\n (0, 3138)\t0.2036824234647052\n (0, 4881)\t0.1769753089255584\n (0, 17742)\t0.1615145089567234\n (0, 18471)\t0.1868321102699166\n (0, 19251)\t0.19668891161427482\n (0, 17260)\t0.12710919950883068\n (0, 19012)\t0.13785009331092757\n (0, 4200)\t0.16623441512346152\n (0, 19238)\t0.16455720785007436\n (0, 7569)\t0.17983859841948627\n (0, 18244)\t0.1868321102699166\n (0, 8472)\t0.1868321102699166\n (0, 2731)\t0.17983859841948627\n (0, 15638)\t0.14395951270361926\n :\t:\n (0, 627)\t0.024309863283547353\n (0, 838)\t0.024309863283547353\n (0, 14164)\t0.024309863283547353\n (0, 868)\t0.024309863283547353\n (0, 12866)\t0.024309863283547353\n (0, 6785)\t0.024309863283547353\n (0, 1984)\t0.024309863283547353\n (0, 18381)\t0.024309863283547353\n (0, 1834)\t0.024309863283547353\n (0, 2785)\t0.024309863283547353\n (0, 11583)\t0.024309863283547353\n (0, 18950)\t0.024309863283547353\n (0, 12772)\t0.024309863283547353\n (0, 13083)\t0.048619726567094707\n (0, 11735)\t0.024309863283547353\n (0, 18294)\t0.024309863283547353\n (0, 5052)\t0.024309863283547353\n (0, 9171)\t0.024309863283547353\n (0, 10927)\t0.024309863283547353\n (0, 13150)\t0.024309863283547353\n (0, 3213)\t0.024309863283547353\n (0, 185)\t0.024309863283547353\n (0, 17781)\t0.09723945313418941\n (0, 8814)\t0.024309863283547353\n (0, 5712)\t0.024309863283547353"
4,Scream 3,"(0, 10678)\t0.2545841549679313\n (0, 4234)\t0.2545841549679313\n (0, 15753)\t0.23162331199804395\n (0, 11302)\t0.27754499793781867\n (0, 9819)\t0.24947783750393213\n (0, 16253)\t0.1961644111298537\n (0, 4980)\t0.18059530859427003\n (0, 4076)\t0.2023937342502158\n (0, 15550)\t0.4901090882407501\n (0, 5981)\t0.18783949647152182\n (0, 149)\t0.14400664245449343\n (0, 14749)\t0.2086624690281566\n (0, 11014)\t0.22008548058683847\n (0, 12055)\t0.1191415111828197\n (0, 6505)\t0.23766280368607134\n (0, 6341)\t0.12214178046536259\n (0, 8443)\t0.10652206601116722\n (0, 10577)\t0.0344191839372663\n (0, 1434)\t0.06200469169678607\n (0, 2200)\t0.04499771073396878\n (0, 18206)\t0.08798451642483693\n (0, 11192)\t0.11925639157312934\n (0, 2681)\t0.03312549428728671\n (0, 19178)\t0.03503431182906103\n (0, 5752)\t0.03539417552123724\n :\t:\n (0, 627)\t0.03312549428728671\n (0, 838)\t0.03312549428728671\n (0, 14164)\t0.03312549428728671\n (0, 868)\t0.03312549428728671\n (0, 12866)\t0.03312549428728671\n (0, 6785)\t0.03312549428728671\n (0, 1984)\t0.03312549428728671\n (0, 18381)\t0.03312549428728671\n (0, 1834)\t0.03312549428728671\n (0, 2785)\t0.03312549428728671\n (0, 11583)\t0.03312549428728671\n (0, 18950)\t0.03312549428728671\n (0, 12772)\t0.03312549428728671\n (0, 13083)\t0.06625098857457341\n (0, 11735)\t0.03312549428728671\n (0, 18294)\t0.03312549428728671\n (0, 5052)\t0.03312549428728671\n (0, 9171)\t0.03312549428728671\n (0, 10927)\t0.03312549428728671\n (0, 13150)\t0.03312549428728671\n (0, 3213)\t0.03312549428728671\n (0, 185)\t0.03312549428728671\n (0, 17781)\t0.09937648286186013\n (0, 8814)\t0.03312549428728671\n (0, 5712)\t0.03312549428728671",John Carter,"(0, 10899)\t0.09315287483181874\n (0, 19589)\t0.07351423485005755\n (0, 10577)\t0.02730380968312251\n (0, 3040)\t0.42521874630140605\n (0, 9442)\t0.3887903122190344\n (0, 143)\t0.16781275185088831\n (0, 17406)\t0.22016895624279892\n (0, 10969)\t0.23082359019188892\n (0, 17267)\t0.1913001052525231\n (0, 645)\t0.22016895624279892\n (0, 5541)\t0.23082359019188892\n (0, 5844)\t0.14101407015243966\n (0, 10988)\t0.19790403537107615\n (0, 16896)\t0.1885314879175833\n (0, 14035)\t0.16055348700586514\n (0, 11148)\t0.19790403537107615\n (0, 10967)\t0.18602696889207415\n (0, 163)\t0.10840403081343813\n (0, 11881)\t0.11975271030873126\n (0, 1701)\t0.11007504987033669\n (0, 5041)\t0.13489319703007557\n (0, 19431)\t0.12435256569189555\n (0, 7524)\t0.04120536977888864\n (0, 2200)\t0.03569546948863725\n (0, 10446)\t0.06770343903129115\n :\t:\n (0, 627)\t0.05255512119798813\n (0, 838)\t0.026277560598994065\n (0, 14164)\t0.026277560598994065\n (0, 868)\t0.026277560598994065\n (0, 12866)\t0.026277560598994065\n (0, 6785)\t0.026277560598994065\n (0, 1984)\t0.026277560598994065\n (0, 18381)\t0.026277560598994065\n (0, 1834)\t0.026277560598994065\n (0, 2785)\t0.026277560598994065\n (0, 11583)\t0.026277560598994065\n (0, 18950)\t0.026277560598994065\n (0, 12772)\t0.026277560598994065\n (0, 13083)\t0.05255512119798813\n (0, 11735)\t0.026277560598994065\n (0, 18294)\t0.026277560598994065\n (0, 5052)\t0.026277560598994065\n (0, 9171)\t0.026277560598994065\n (0, 10927)\t0.026277560598994065\n (0, 13150)\t0.026277560598994065\n (0, 3213)\t0.026277560598994065\n (0, 185)\t0.026277560598994065\n (0, 17781)\t0.07883268179698219\n (0, 8814)\t0.026277560598994065\n (0, 5712)\t0.026277560598994065"


# Start from scratch to build a Vector norm and Dot product

In [66]:
# Importing csr_matrix for later use.
from scipy.sparse import csr_matrix

In [69]:
csr_mat = vector_df['Searched_Movie_Vector'][0]

In [70]:
dense_array = csr_mat.toarray()

In [71]:
contains_nan = np.isnan(dense_array).any()

In [72]:
# checking for nans
if contains_nan:
    print('It contains nan')
else:
    print('doesnt contain nan')

doesnt contain nan


In [73]:
dense_array.shape

(1, 20324)

In [74]:
# Squared matrix 
squared_mat = csr_mat.data**2
squared_mat

array([0.06481309, 0.06481309, 0.05364936, 0.07703123, 0.06223919,
       0.03848048, 0.03261467, 0.04096322, 0.24020692, 0.03528368,
       0.02073791, 0.04354003, 0.04843762, 0.0141947 , 0.05648361,
       0.01491861, 0.01134695, 0.00118468, 0.00384458, 0.00202479,
       0.00774128, 0.01422209, 0.0010973 , 0.0012274 , 0.00125275,
       0.00156425, 0.0010973 , 0.0010973 , 0.0010973 , 0.0010973 ,
       0.0010973 , 0.0010973 , 0.0010973 , 0.0010973 , 0.0010973 ,
       0.0010973 , 0.0010973 , 0.0010973 , 0.0010973 , 0.0010973 ,
       0.0010973 , 0.0010973 , 0.0010973 , 0.0010973 , 0.0010973 ,
       0.00438919, 0.0010973 , 0.0010973 , 0.0010973 , 0.0010973 ,
       0.0010973 , 0.0010973 , 0.0010973 , 0.0010973 , 0.00987569,
       0.0010973 , 0.0010973 ])

In [75]:
# rebuilding Squared matrix as CSR matrix from the information of csr_mat
squared_matrix = csr_matrix((squared_mat, csr_mat.indices, csr_mat.indptr), shape = csr_mat.shape)
squared_matrix

<1x20324 sparse matrix of type '<class 'numpy.float64'>'
	with 57 stored elements in Compressed Sparse Row format>

In [76]:
# Step by Step to get the dot_product

# Create the CSR matrix
#csr_mat = csr_matrix((data, indices, indptr), shape=shape)
csr_mat = vector_df['Searched_Movie_Vector'][0]

# Compute the squared values of each element in the CSR matrix
squared_values = csr_mat.data ** 2

# Create a new CSR matrix with the squared values and the same indices and indptr arrays
squared_mat = csr_matrix((squared_values, csr_mat.indices, csr_mat.indptr), shape=csr_mat.shape)

# Compute the squared norms along the rows
squared_norms = squared_mat.sum(axis=1)

# Take the square root of the squared norms to obtain the vector norms
vector_norms = np.sqrt(squared_norms)

# Divide each element of the CSR matrix by its corresponding vector norm to normalize the vectors
normalized_csr_mat = csr_mat.multiply(1.0 / vector_norms)

# Compute the dot product between the normalized vectors
dot_product = normalized_csr_mat.dot(normalized_csr_mat.T)

print(dot_product)


  (0, 0)	1.0000000000000007


In [77]:
# creating our own L2 Normalizer function
# function to calculate norms of vectors
def vector_norml2(csr_mat):
    # Square of non zero elements
    square_values = csr_mat.data**2
    
    # create a sparse matrix for squared_values using the csr_mat's indices and indptr
    squared_mat = csr_matrix((square_values, csr_mat.indices, csr_mat.indptr), shape = csr_mat.shape)
    
    # find the sum of the squared values
    squared_norms = squared_mat.sum(axis = 1)
    
    # find the vector norms
    vector_norms = np.sqrt(squared_norms)
    
    #normalized csr_mat
    normalized_csr_mat = csr_mat.multiply(1.0/vector_norms)
    
    return normalized_csr_mat

In [78]:
# cosine similarity function
def dot_prod(norm_vec1, norm_vec2):
    return (np.clip(np.dot(vector_norml2(norm_vec1), vector_norml2(norm_vec2).T),-1.0, 1.0)).data[0]

In [79]:
u1 = vector_norml2(vector_df['Searched_Movie_Vector'][0])
u2 = vector_norml2(vector_df['Comparison_Movie_Vector'][2])

In [80]:
dot_prod(u1,u2)



0.05534489787524083

In [81]:
# Finding the cosine similarity for each of the combinations
cosine_similarity = vector_df[['Searched_Movie_Vector','Comparison_Movie_Vector']].apply(lambda x : dot_prod(
                                                                     x['Searched_Movie_Vector'],
                                                                    x['Comparison_Movie_Vector']),
                                                                    axis = 1)

In [82]:
vector_df.insert(len(vector_df.columns),'Cosine_Similarity',cosine_similarity)

In [83]:
vector_df['rank'] = vector_df['Cosine_Similarity'].rank(method = 'dense',ascending = False)

In [85]:
vector_df[['Searched Movie','Comparison Movie','Cosine_Similarity','rank']].sort_values(by = 'rank').head(20)

Unnamed: 0,Searched Movie,Comparison Movie,Cosine_Similarity,rank
1960,Scream 3,Scream 2,0.451583,1.0
1218,Scream 3,Scream 4,0.418358,2.0
2281,Scream 3,Scream,0.310668,3.0
1710,Scream 3,Identity,0.310652,4.0
1429,Scream 3,Cursed,0.274449,5.0
1787,Scream 3,Red Eye,0.263414,6.0
2574,Scream 3,The Hills Have Eyes II,0.245473,7.0
4052,Scream 3,Friday the 13th: A New Beginning,0.224873,8.0
3264,Scream 3,Shadow of the Vampire,0.223244,9.0
895,Scream 3,"Me, Myself & Irene",0.221833,10.0
