In [3]:
import pandas as pd
import numpy as np

In [4]:
# Load the TMDB movie dataset; the path is relative to the notebook's working directory.
movies = pd.read_csv('TMDB_movie_dataset_v11.csv')

In [5]:
# Drop every row that has a missing value in any column.
# Note: dropna() keeps the surviving rows' original index labels (the index is not reset),
# so labels and row positions no longer coincide after this step.
movies = movies.dropna()

In [6]:
movies.columns

Index(['id', 'title', 'vote_average', 'vote_count', 'status', 'release_date',
       'revenue', 'runtime', 'adult', 'backdrop_path', 'budget', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'tagline', 'genres',
       'production_companies', 'production_countries', 'spoken_languages',
       'keywords'],
      dtype='object')

In [7]:
# Select the columns we want to keep:
# id, title, genres, overview, tagline, production_companies, production_countries, keywords

movies = movies[['id', 'title', 'genres', 'overview', 'tagline', 'production_companies', 'production_countries', 'keywords']]

In [8]:
movies.head()

Unnamed: 0,id,title,genres,overview,tagline,production_companies,production_countries,keywords
0,27205,Inception,"Action, Science Fiction, Adventure","Cobb, a skilled thief who commits corporate es...",Your mind is the scene of the crime.,"Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","rescue, mission, dream, airplane, paris, franc..."
1,157336,Interstellar,"Adventure, Drama, Science Fiction",The adventures of a group of explorers who mak...,Mankind was born on Earth. It was never meant ...,"Legendary Pictures, Syncopy, Lynda Obst Produc...","United Kingdom, United States of America","rescue, future, spacecraft, race against time,..."
2,155,The Dark Knight,"Drama, Action, Crime, Thriller",Batman raises the stakes in his war on crime. ...,Welcome to a world without rules.,"DC Comics, Legendary Pictures, Syncopy, Isobel...","United Kingdom, United States of America","joker, sadism, chaos, secret identity, crime f..."
3,19995,Avatar,"Action, Adventure, Fantasy, Science Fiction","In the 22nd century, a paraplegic Marine is di...",Enter the world of Pandora.,"Dune Entertainment, Lightstorm Entertainment, ...","United States of America, United Kingdom","future, society, culture clash, space travel, ..."
4,24428,The Avengers,"Science Fiction, Action, Adventure",When an unexpected enemy emerges and threatens...,Some assembly required.,Marvel Studios,United States of America,"new york city, superhero, shield, based on com..."


In [9]:
movies.shape

(9706, 8)

In [10]:
# Keep only the rows whose title is present (i.e. discard rows with a null title).
movies = movies.loc[movies['title'].notna()]

In [11]:
movies.isnull().sum()

id                      0
title                   0
genres                  0
overview                0
tagline                 0
production_companies    0
production_countries    0
keywords                0
dtype: int64

In [12]:
# Now we can proceed to the next step

In [13]:
# Removing the spaces between keywords -- Example : united states of america => unitedstatesofamerica
def collapse(L):
    """Return a new list with every space removed from each string in ``L``.

    Example: ``['united states of america']`` -> ``['unitedstatesofamerica']``.
    The input iterable is not modified.
    """
    return [entry.replace(" ", "") for entry in L]

In [14]:
# Turn the comma-separated string into a list of company names.
# - each name is stripped, so "A, B" yields ['A', 'B'] rather than ['A', ' B'],
#   and empty fragments are dropped
# - values that are already lists are kept as they are, so re-running this cell
#   is a no-op instead of replacing every list with []
# - anything else (e.g. NaN) becomes an empty list
movies['production_companies'] = movies['production_companies'].apply(
    lambda x: [name.strip() for name in x.split(',') if name.strip()]
    if isinstance(x, str)
    else (x if isinstance(x, list) else [])
)

In [15]:
# Turn the comma-separated string into a list of country names.
# - each name is stripped, so "A, B" yields ['A', 'B'] rather than ['A', ' B'],
#   and empty fragments are dropped
# - values that are already lists are kept as they are, so re-running this cell
#   is a no-op instead of replacing every list with []
# - anything else (e.g. NaN) becomes an empty list
movies['production_countries'] = movies['production_countries'].apply(
    lambda x: [name.strip() for name in x.split(',') if name.strip()]
    if isinstance(x, str)
    else (x if isinstance(x, list) else [])
)

In [16]:
movies.sample(5)

Unnamed: 0,id,title,genres,overview,tagline,production_companies,production_countries,keywords
387088,1149974,The Expediter,Drama,A soulful coming of age story set in 1975 New ...,"War, work, women & sometimes...wisdom.",[Conscience Circle],[United States of America],"1970s, coming of age, manufacturing, work rela..."
1015581,863019,Ishq e Qalandar - The Beautiful Sindh,"Documentary, History",Ishq e Qalandar - The Beautiful Sindh is a tra...,"Highlighting the true beauty of Sindh, Pakista...",[Beats Ventures and Consulting],[Pakistan],"pakistan, road trip, travel, food, youtube, to..."
14914,76349,1911,"Adventure, Drama, Action, History, War",China's first President Sun Yat-Sen and milita...,The Fall of the Last Empire,"[Changchun Film Studio, Shanghai Film Group, ...","[China, Hong Kong, Taiwan]","martial arts, sword, revolution, gunfight, bru..."
82623,294861,An Evergreen Christmas,"Family, Music, Romance",Leaving her seemingly glamorous Hollywood life...,Home never sounded so good,"[WonderStar Productions, Kim and Jim Producti...",[United States of America],"christmas tree, family business , death of fat..."
1180660,787699,Wonka,"Adventure, Comedy, Family, Fantasy",Willy Wonka – chock-full of ideas and determin...,Discover how Willy became…,"[Warner Bros. Pictures, Heyday Films, Villag...","[Australia, United Kingdom, United States of...","chocolate, musical, prequel, duringcreditsstin..."


In [17]:
# Helper function to safely convert any value to string
def safe_to_string(x):
    """Convert a single cell value to text.

    * a ``list`` becomes its items, each passed through ``str``, joined by one space
    * a missing value (as reported by ``pd.isna``) becomes the empty string
    * anything else becomes ``str(x)``
    """
    # Lists must be handled before the pd.isna() check below.
    if isinstance(x, list):
        return ' '.join(map(str, x))
    return '' if pd.isna(x) else str(x)

# Build one free-text `tags` field per movie: every source column is converted
# to text with safe_to_string and the pieces are joined with single spaces,
# in the order genres, overview, tagline, companies, countries, keywords.
movies['tags'] = movies['genres'].map(safe_to_string).str.cat(
    [
        movies[column].map(safe_to_string)
        for column in (
            'overview',
            'tagline',
            'production_companies',
            'production_countries',
            'keywords',
        )
    ],
    sep=' ',
)

In [18]:
movies['tags'].sample(5)

32356    Drama, Romance, War Set in the fields of Devon...
1519     Comedy The story of a mild-mannered radio exec...
72437    Comedy, Romance An isolated guy with cerebral ...
31368    Thriller, Action After the controversial disap...
12338    Animation, Action, Thriller, Science Fiction A...
Name: tags, dtype: object

In [19]:
# The text columns have been folded into `tags`, so remove them
# and preview the slimmed-down frame.
movies = movies.drop(
    ['genres', 'overview', 'tagline', 'production_companies', 'production_countries', 'keywords'],
    axis='columns',
)
movies.head()

Unnamed: 0,id,title,tags
0,27205,Inception,"Action, Science Fiction, Adventure Cobb, a ski..."
1,157336,Interstellar,"Adventure, Drama, Science Fiction The adventur..."
2,155,The Dark Knight,"Drama, Action, Crime, Thriller Batman raises t..."
3,19995,Avatar,"Action, Adventure, Fantasy, Science Fiction In..."
4,24428,The Avengers,"Science Fiction, Action, Adventure When an une..."


In [20]:
movies.shape

(9706, 3)

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF generally works better for recommendation systems
# The vocabulary is capped at 5000 terms and English stop words are ignored.
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
# Fit on the `tags` text and convert the result to a dense NumPy array:
# one row per movie, one column per vocabulary term.
vectors = tfidf.fit_transform(movies['tags']).toarray()

In [22]:
vectors.shape

(9706, 5000)

In [23]:
from sklearn.metrics.pairwise import cosine_similarity
# Cosine similarity between every pair of rows of `vectors`:
# similarity[i][j] scores the movie at row position i against the one at row position j.
similarity = cosine_similarity(vectors)

In [24]:
similarity

array([[1.        , 0.10307986, 0.07331028, ..., 0.02000914, 0.02192231,
        0.01625047],
       [0.10307986, 1.        , 0.01978263, ..., 0.00356034, 0.01069411,
        0.04635341],
       [0.07331028, 0.01978263, 1.        , ..., 0.01591503, 0.01245388,
        0.00280225],
       ...,
       [0.02000914, 0.00356034, 0.01591503, ..., 1.        , 0.00458787,
        0.01574449],
       [0.02192231, 0.01069411, 0.01245388, ..., 0.00458787, 1.        ,
        0.02901385],
       [0.01625047, 0.04635341, 0.00280225, ..., 0.01574449, 0.02901385,
        1.        ]], shape=(9706, 9706))

In [25]:
def recommend(movie, top_n=5):
    """Print the titles of the movies most similar to ``movie``.

    The title is looked up in the global ``movies`` frame, the matching row
    of the global ``similarity`` matrix is read, and the ``top_n`` best
    scoring *other* titles are printed, best match first.

    Parameters
    ----------
    movie : str
        Exact title to look up (compared with ``==`` against ``movies['title']``).
        If several rows share the title, the first one is used.
    top_n : int, default 5
        Number of recommendations to print.

    Raises
    ------
    IndexError
        If no row of ``movies`` has that title.
    """
    # `similarity` is addressed by row *position*, while `movies` still carries
    # the index labels of the original CSV (rows were dropped without resetting
    # the index). So we need the position of the match, not its index label.
    matches = np.flatnonzero((movies['title'] == movie).to_numpy())
    if matches.size == 0:
        raise IndexError(f"No movie titled {movie!r} in the dataset")
    position = matches[0]

    scores = np.asarray(similarity[position])
    # Stable sort on the negated scores = descending order, ties kept in row order.
    ranked = np.argsort(-scores, kind='stable')
    # Exclude the queried movie explicitly instead of assuming it sorts first
    # (another movie with identical tags would tie with it at the top).
    ranked = ranked[ranked != position][:top_n]

    titles = movies['title']
    for pos in ranked:
        print(titles.iloc[pos])
        
    

In [27]:
recommend('Deadpool')

Deadpool 2
X-Men Origins: Wolverine
X-Men
X-Men: Apocalypse
Logan


In [None]:
import pickle

# Persist the artefacts needed to serve recommendations later.
# Context managers make sure each file is flushed and closed once written.
# `movies` is pickled as-is (the DataFrame itself), despite the file name.
# NOTE: only unpickle these files from a trusted location; loading a pickle
# can execute arbitrary code.
with open('movies_dict.pkl', 'wb') as movies_file:
    pickle.dump(movies, movies_file)
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)