## 1. Import necesary libraries

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

## 2. Reading TMDB 5000 movies data 

In [2]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

## 3(a). EDA - Printing Head of the data.

In [3]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [4]:
credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


## 3(b). EDA - Merging two dataframes (movies & credits) on title

In [5]:
print(movies.shape)
print(credits.shape)

(4803, 20)
(4803, 4)


In [6]:
movies = movies.merge(credits, on='title')

In [7]:
print(movies.shape)

(4809, 23)


In [8]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4809 entries, 0 to 4808
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4809 non-null   int64  
 1   genres                4809 non-null   object 
 2   homepage              1713 non-null   object 
 3   id                    4809 non-null   int64  
 4   keywords              4809 non-null   object 
 5   original_language     4809 non-null   object 
 6   original_title        4809 non-null   object 
 7   overview              4806 non-null   object 
 8   popularity            4809 non-null   float64
 9   production_companies  4809 non-null   object 
 10  production_countries  4809 non-null   object 
 11  release_date          4808 non-null   object 
 12  revenue               4809 non-null   int64  
 13  runtime               4807 non-null   float64
 14  spoken_languages      4809 non-null   object 
 15  status               

## 3(c). EDA - Keep only 'movie_id','title', 'overview', 'genres', 'keywords', 'cast', 'crew' as these are relevant while making recommendation

In [9]:
movies = movies[['movie_id','title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

In [10]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


## 3(d). EDA - Dropping null value rows

In [11]:
movies.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [12]:
movies = movies.dropna()

In [13]:
movies.isna().sum()

movie_id    0
title       0
overview    0
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [14]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


## 3(e). EDA - Extract only values from the key-value pair of the "genres", "keywords" , "cast", "crew" columns.

In [15]:
movies.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [16]:
# '[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'
# We will extract values from the above key-value pair dictionary.

['Action', 'Adventure', 'Fantasy', 'SciFi']

['Action', 'Adventure', 'Fantasy', 'SciFi']

# 3(e). EDA - Write a function named as "convert" that takes a single argument obj and converts a string representation of a "list of dictionaries" into a "list of names" extracted from those dictionaries.

In [17]:
# This function is gonna extract key pair value of given object
import ast
def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

In [18]:
# Testing the custom function "convert"
convert('[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]')

['Action', 'Adventure', 'Fantasy', 'Science Fiction']

# 3(e). EDA - Apply the custom function to "genres" and "keywords" columns.

In [19]:
movies['genres'] = movies['genres'].apply(convert)

In [20]:
movies['keywords'] = movies['keywords'].apply(convert)

# 3(f). EDA - Write a function named as "convert3" that takes a single argument obj and converts a string representation of a "list of dictionaries" into a "list of names" extracted from those dictionaries.

In [21]:
# This function is gonna extract key pair value of given object
import ast
def convert3(obj):
    L = []
    counter = 0
    for i in ast.literal_eval(obj):
        if counter != 3:
            L.append(i['name'])
            counter += 1
        else:
            break
    return L

In [22]:
movies['cast'].apply(convert3)

0        [Sam Worthington, Zoe Saldana, Sigourney Weaver]
1           [Johnny Depp, Orlando Bloom, Keira Knightley]
2            [Daniel Craig, Christoph Waltz, Léa Seydoux]
3            [Christian Bale, Michael Caine, Gary Oldman]
4          [Taylor Kitsch, Lynn Collins, Samantha Morton]
                              ...                        
4804    [Carlos Gallardo, Jaime de Hoyos, Peter Marqua...
4805         [Edward Burns, Kerry Bishé, Marsha Dietlein]
4806           [Eric Mabius, Kristin Booth, Crystal Lowe]
4807            [Daniel Henney, Eliza Coupe, Bill Paxton]
4808    [Drew Barrymore, Brian Herzlinger, Corey Feldman]
Name: cast, Length: 4806, dtype: object

In [23]:
movies['cast'] = movies['cast'].apply(convert3)

# 3(g). EDA - Write a function named as "fetch_director" that will extract name of directors from the "crew" column.

In [24]:
# This function is gonna extract key pair value of given object
import ast
def fetch_director(obj):
    L = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            L.append(i['name'])
            break
    return L

In [25]:
movies['crew'].apply(fetch_director)

0           [James Cameron]
1          [Gore Verbinski]
2              [Sam Mendes]
3       [Christopher Nolan]
4          [Andrew Stanton]
               ...         
4804     [Robert Rodriguez]
4805         [Edward Burns]
4806          [Scott Smith]
4807          [Daniel Hsia]
4808     [Brian Herzlinger]
Name: crew, Length: 4806, dtype: object

In [26]:
movies['crew'] = movies['crew'].apply(fetch_director)

In [27]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan]
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton]


In [28]:
movies['overview']

0       In the 22nd century, a paraplegic Marine is di...
1       Captain Barbossa, long believed to be dead, ha...
2       A cryptic message from Bond’s past sends him o...
3       Following the death of District Attorney Harve...
4       John Carter is a war-weary, former military ca...
                              ...                        
4804    El Mariachi just wants to play his guitar and ...
4805    A newlywed couple's honeymoon is upended by th...
4806    "Signed, Sealed, Delivered" introduces a dedic...
4807    When ambitious New York attorney Sam is sent t...
4808    Ever since the second grade when he first saw ...
Name: overview, Length: 4806, dtype: object

# 3(h). EDA - The "apply" method in below is used to apply a function to each element of a pandas Series or DataFrame. In this case, the lambda function is used to "split each string" in the 'overview' column into a list of words.

In [29]:
movies['overview'].apply(lambda x:x.split())

0       [In, the, 22nd, century,, a, paraplegic, Marin...
1       [Captain, Barbossa,, long, believed, to, be, d...
2       [A, cryptic, message, from, Bond’s, past, send...
3       [Following, the, death, of, District, Attorney...
4       [John, Carter, is, a, war-weary,, former, mili...
                              ...                        
4804    [El, Mariachi, just, wants, to, play, his, gui...
4805    [A, newlywed, couple's, honeymoon, is, upended...
4806    ["Signed,, Sealed,, Delivered", introduces, a,...
4807    [When, ambitious, New, York, attorney, Sam, is...
4808    [Ever, since, the, second, grade, when, he, fi...
Name: overview, Length: 4806, dtype: object

In [30]:
movies['overview'] = movies['overview'].apply(lambda x:x.split())

In [31]:
movies.head(3)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]


# 3(i). EDA - Now we have to remove space of any individual's name. for example, "Sam Worthington" should be converted to "SamWorthington"

In [32]:
movies['genres'].apply(lambda x:[i.replace(" ","") for i in x])

0       [Action, Adventure, Fantasy, ScienceFiction]
1                       [Adventure, Fantasy, Action]
2                         [Action, Adventure, Crime]
3                   [Action, Crime, Drama, Thriller]
4                [Action, Adventure, ScienceFiction]
                            ...                     
4804                       [Action, Crime, Thriller]
4805                               [Comedy, Romance]
4806               [Comedy, Drama, Romance, TVMovie]
4807                                              []
4808                                   [Documentary]
Name: genres, Length: 4806, dtype: object

In [33]:
movies['genres'] = movies['genres'].apply(lambda x:[i.replace(" ","") for i in x])

In [34]:
movies['keywords'] = movies['keywords'].apply(lambda x:[i.replace(" ","") for i in x])

In [35]:
movies['cast'] = movies['cast'].apply(lambda x:[i.replace(" ","") for i in x])

In [36]:
movies['crew'] = movies['crew'].apply(lambda x:[i.replace(" ","") for i in x])

In [37]:
movies.head(3)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski]
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes]


# 3(j). EDA - Add a new column named 'tag' to a pandas DataFrame named movies. The 'tag' column is created by concatenating values from several other columns in the DataFrame, namely 'overview', 'genres', 'keywords', 'cast', and 'crew'.

In [38]:
movies['tag'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew'] 

In [39]:
movies.head(3)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew,tag
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski],"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes],"[A, cryptic, message, from, Bond’s, past, send..."


# 3(k). EDA - Make a new final dataframe using "movie_id", "title" and "tag" columns.

In [40]:
new_df = movies[['movie_id', 'title', 'tag']]

In [41]:
new_df['tag'] = new_df['tag'].apply(lambda x: " ".join(x))

In [42]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4806 entries, 0 to 4808
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4806 non-null   int64 
 1   title     4806 non-null   object
 2   tag       4806 non-null   object
dtypes: int64(1), object(2)
memory usage: 150.2+ KB


In [43]:
new_df['tag'][2]

'A cryptic message from Bond’s past sends him on a trail to uncover a sinister organization. While M battles political forces to keep the secret service alive, Bond peels back the layers of deceit to reveal the terrible truth behind SPECTRE. Action Adventure Crime spy basedonnovel secretagent sequel mi6 britishsecretservice unitedkingdom DanielCraig ChristophWaltz LéaSeydoux SamMendes'

# 3(l). EDA - Make the characters lower of the tag column

In [44]:
new_df['tag'] = new_df['tag'].apply(lambda x:x.lower())

In [45]:
# Check if characters are lowered or not.
new_df['tag'][0]

'in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. action adventure fantasy sciencefiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d samworthington zoesaldana sigourneyweaver jamescameron'

In [46]:
new_df['tag'][1]

"captain barbossa, long believed to be dead, has come back to life and is headed to the edge of the earth with will turner and elizabeth swann. but nothing is quite as it seems. adventure fantasy action ocean drugabuse exoticisland eastindiatradingcompany loveofone'slife traitor shipwreck strongwoman ship alliance calypso afterlife fighter pirate swashbuckler aftercreditsstinger johnnydepp orlandobloom keiraknightley goreverbinski"

## Now we have to calculate similarities between two tags

# 4. Covert value of "tag" columns into "vectors"

In [47]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words='english') ## Here, 5000 is the most used words

In [48]:
vectors = cv.fit_transform(new_df['tag']).toarray()

In [49]:
vectors.shape

(4806, 5000)

In [50]:
cv.get_feature_names_out()

array(['000', '007', '10', ..., 'zone', 'zoo', 'zooeydeschanel'],
      dtype=object)

## 5. We will use nltk (Natural Language Tool Kit) library for removing repeatation of similar words. Like, "action", "actions" both are similar word, but till now our program is considering "action" and "actions" as two different features. We will use stem for making it single feature

In [51]:
import nltk

In [52]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [53]:
def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    
    return " ".join(y)

In [54]:
ps.stem('loved')

'love'

In [55]:
ps.stem('loving')

'love'

In [56]:
ps.stem('love')

'love'

In [57]:
ps.stem('dancing')

'danc'

In [58]:
stem('in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. action adventure fantasy sciencefiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d samworthington zoesaldana sigourneyweaver jamescameron')

'in the 22nd century, a parapleg marin is dispatch to the moon pandora on a uniqu mission, but becom torn between follow order and protect an alien civilization. action adventur fantasi sciencefict cultureclash futur spacewar spacecoloni societi spacetravel futurist romanc space alien tribe alienplanet cgi marin soldier battl loveaffair antiwar powerrel mindandsoul 3d samworthington zoesaldana sigourneyweav jamescameron'

In [59]:
new_df['tag'] = new_df['tag'].apply(stem)

# 6. Again Make the vector using our "stemmed" words

In [60]:
vectors = cv.fit_transform(new_df['tag']).toarray()

In [61]:
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [62]:
cv.vocabulary_

{'century': 740,
 'marin': 2806,
 'dispatch': 1288,
 'moon': 3017,
 'pandora': 3282,
 'uniqu': 4681,
 'mission': 2984,
 'becom': 442,
 'torn': 4554,
 'follow': 1729,
 'order': 3235,
 'protect': 3534,
 'alien': 157,
 'action': 79,
 'adventur': 106,
 'fantasi': 1629,
 'sciencefict': 3930,
 'cultureclash': 1087,
 'futur': 1807,
 'societi': 4153,
 'spacetravel': 4198,
 'futurist': 1809,
 'romanc': 3811,
 'space': 4192,
 'tribe': 4605,
 'alienplanet': 160,
 'soldier': 4160,
 'battl': 425,
 '3d': 47,
 'zoesaldana': 4994,
 'sigourneyweav': 4080,
 'jamescameron': 2325,
 'captain': 680,
 'long': 2695,
 'believ': 455,
 'dead': 1152,
 'ha': 1960,
 'come': 917,
 'life': 2647,
 'head': 2018,
 'edg': 1408,
 'earth': 1396,
 'turner': 4628,
 'elizabeth': 1436,
 'noth': 3172,
 'quit': 3572,
 'ocean': 3198,
 'drugabus': 1358,
 'exoticisland': 1571,
 'loveofone': 2729,
 'slif': 4132,
 'traitor': 4584,
 'shipwreck': 4056,
 'ship': 4055,
 'allianc': 167,
 'afterlif': 119,
 'fighter': 1680,
 'pirat': 3391,


# 7. Vectors are ready. Now find the cosine similarity between words.

In [63]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity(vectors)

array([[1.        , 0.08346223, 0.0860309 , ..., 0.04499213, 0.        ,
        0.        ],
       [0.08346223, 1.        , 0.06063391, ..., 0.02378257, 0.        ,
        0.02615329],
       [0.0860309 , 0.06063391, 1.        , ..., 0.02451452, 0.        ,
        0.        ],
       ...,
       [0.04499213, 0.02378257, 0.02451452, ..., 1.        , 0.03962144,
        0.04229549],
       [0.        , 0.        , 0.        , ..., 0.03962144, 1.        ,
        0.08714204],
       [0.        , 0.02615329, 0.        , ..., 0.04229549, 0.08714204,
        1.        ]])

In [64]:
similarity = cosine_similarity(vectors)

In [65]:
similarity.shape

(4806, 4806)

In [66]:
similarity[0]

array([1.        , 0.08346223, 0.0860309 , ..., 0.04499213, 0.        ,
       0.        ])

In [67]:
similarity[0].shape

(4806,)

## 8. Well, done with making "cosine similarities". Now its time to make a custom function, that will return you most similar 5 movie names.

In [68]:
sorted(list(enumerate(similarity[0])), reverse=True, key=lambda x:x[1])[1:6]

[(1216, 0.2867696673382022),
 (2409, 0.26901379342448517),
 (3730, 0.2605130246476754),
 (507, 0.255608593705383),
 (539, 0.25038669783359574)]

In [69]:
# We will find the index of the given movie name
new_df[new_df['title'] == 'Batman Begins']

Unnamed: 0,movie_id,title,tag
119,272,Batman Begins,"driven by tragedy, billionair bruce wayn dedic..."


In [70]:
# So, 119 is the name of my Batman Begins movie index.
new_df[new_df['title'] == 'Batman Begins'].index[0]

119

## 9. The recommend function takes a movie title as input, finds the 5 most similar movies based on a similarity matrix, and prints their titles.

In [71]:
def recommend(movie):
    movie_index = new_df[new_df['title'] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x:x[1])[1:6]
    
    for i in movies_list:
        print(new_df.iloc[i[0]].title)

In [72]:
recommend('Batman Begins')

The Dark Knight
Batman
Batman
The Dark Knight Rises
10th & Wolf


In [73]:
recommend('Avatar')

Aliens vs Predator: Requiem
Aliens
Falcon Rising
Independence Day
Titan A.E.


## 10. Make 2 pkl files while 1 .pkl file will contain movies and another will contain similarity values.

In [74]:
import pickle

In [75]:
pickle.dump(new_df, open('movies.pkl','wb'))

In [76]:
new_df['title'].values

array(['Avatar', "Pirates of the Caribbean: At World's End", 'Spectre',
       ..., 'Signed, Sealed, Delivered', 'Shanghai Calling',
       'My Date with Drew'], dtype=object)

In [77]:
pickle.dump(similarity, open('similarity.pkl','wb'))

# End: Finally, these 2 .pkl files will be used to develop the StreamLit web application. Thank You.