## Content-based Movie Recommendation System

In [1]:
import pandas as pd
import numpy as np

In [3]:
movies = pd.read_csv('Dataset/tmdb_5000_movies.csv')
credits = pd.read_csv('Dataset/tmdb_5000_credits.csv')

In [4]:
movies.shape

(4803, 20)

In [5]:
credits.shape

(4803, 4)

In [6]:
# merging both datasets
movies = movies.merge(credits, on='title')
movies.shape

(4809, 23)

In [7]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [8]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4809 entries, 0 to 4808
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4809 non-null   int64  
 1   genres                4809 non-null   object 
 2   homepage              1713 non-null   object 
 3   id                    4809 non-null   int64  
 4   keywords              4809 non-null   object 
 5   original_language     4809 non-null   object 
 6   original_title        4809 non-null   object 
 7   overview              4806 non-null   object 
 8   popularity            4809 non-null   float64
 9   production_companies  4809 non-null   object 
 10  production_countries  4809 non-null   object 
 11  release_date          4808 non-null   object 
 12  revenue               4809 non-null   int64  
 13  runtime               4807 non-null   float64
 14  spoken_languages      4809 non-null   object 
 15  status               

In [9]:
# relevant columns:
# genres
# id
# keywords
# title
# overview
# cast
# crew
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [10]:
# check for missing data
movies.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [11]:
movies.dropna(inplace=True)

In [12]:
movies.isnull().sum()

movie_id    0
title       0
overview    0
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [13]:
# check for duplicate data
movies.duplicated().sum()

0

In [14]:
movies.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [15]:
movies.iloc[0].keywords

'[{"id": 1463, "name": "culture clash"}, {"id": 2964, "name": "future"}, {"id": 3386, "name": "space war"}, {"id": 3388, "name": "space colony"}, {"id": 3679, "name": "society"}, {"id": 3801, "name": "space travel"}, {"id": 9685, "name": "futuristic"}, {"id": 9840, "name": "romance"}, {"id": 9882, "name": "space"}, {"id": 9951, "name": "alien"}, {"id": 10148, "name": "tribe"}, {"id": 10158, "name": "alien planet"}, {"id": 10987, "name": "cgi"}, {"id": 11399, "name": "marine"}, {"id": 13065, "name": "soldier"}, {"id": 14643, "name": "battle"}, {"id": 14720, "name": "love affair"}, {"id": 165431, "name": "anti war"}, {"id": 193554, "name": "power relations"}, {"id": 206690, "name": "mind and soul"}, {"id": 209714, "name": "3d"}]'

In [16]:
# convert genres and keywords into a list

import ast
def convert(obj):
    list = [i['name'] for i in ast.literal_eval(obj)]
    return list


In [17]:
movies['genres'] = movies['genres'].apply(convert)

In [18]:
movies['keywords'] = movies['keywords'].apply(convert)

In [19]:
movies.head(2)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [20]:
# take only top 3 actors from cast
def convert3(obj):
    obj = ast.literal_eval(obj)[:3]
    list = [i['name'] for i in obj]
    return list

In [21]:
movies['cast'] = movies['cast'].apply(convert3)

In [22]:
movies.head(2)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [23]:
def fetch_director(obj):
    obj = ast.literal_eval(obj)
    director = [i['name'] for i in obj if i['job']=='Director']
    return director

In [24]:
movies['crew'] = movies['crew'].apply(fetch_director)

In [25]:
movies.rename(columns={'crew':'director'}, inplace=True)

In [26]:
movies.head(2)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,director
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]


In [27]:
# convert overview from string to list
movies['overview'] = movies['overview'].apply(lambda x:x.split())

In [28]:
movies.head(2)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,director
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]


In [29]:
# remove spaces from names in genres, keywords, cast, director
movies['genres'] = movies['genres'].apply(lambda x:[i.replace(" ","") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x:[i.replace(" ","") for i in x])
movies['cast'] = movies['cast'].apply(lambda x:[i.replace(" ","") for i in x])
movies['director'] = movies['director'].apply(lambda x:[i.replace(" ","") for i in x])
movies.head(2)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,director
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski]


In [30]:
# create a new column tags
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['director']
movies.head(2)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,director,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski],"[Captain, Barbossa,, long, believed, to, be, d..."


In [31]:
new_df = movies[['movie_id', 'title', 'tags']]
new_df.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili..."


In [32]:
new_df['tags'] = new_df['tags'].apply(lambda x:" ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x:" ".join(x))


In [33]:
new_df.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


In [34]:
# change every word to lowercase
new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())
new_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())


Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


In [35]:
!pip install nltk

Defaulting to user installation because normal site-packages is not writeable


DEPRECATION: Loading egg at c:\program files\python311\lib\site-packages\vboxapi-1.0-py3.11.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation.. Discussion can be found at https://github.com/pypa/pip/issues/12330

[notice] A new release of pip is available: 23.3 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [36]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [37]:
# stop word removal and stemming
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
ps = PorterStemmer()
stopword_list = stopwords.words(fileids = 'english')


In [38]:
def stem(text):
    word_list = text.split()
    stemmed_text = " ".join([ps.stem(word) for word in word_list if word not in stopword_list])
    return stemmed_text

In [39]:
new_df['tags'] = new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stem)


In [40]:
new_df.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"22nd century, parapleg marin dispatch moon pan..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believ dead, come back ..."
2,206647,Spectre,cryptic messag bond’ past send trail uncov sin...
3,49026,The Dark Knight Rises,"follow death district attorney harvey dent, ba..."
4,49529,John Carter,"john carter war-weary, former militari captain..."


In [41]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=10000)
vectors = cv.fit_transform(new_df['tags']).toarray()
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [42]:
list(cv.get_feature_names_out())

['000',
 '007',
 '10',
 '100',
 '10th',
 '11',
 '12',
 '12th',
 '13',
 '14',
 '15',
 '150',
 '15th',
 '16',
 '16th',
 '17',
 '17th',
 '18',
 '1863',
 '1890',
 '18th',
 '18thcenturi',
 '19',
 '1910',
 '1920',
 '1927',
 '1930',
 '1930s',
 '1937',
 '1940',
 '1940s',
 '1941',
 '1944',
 '1945',
 '1950',
 '1950s',
 '1955',
 '1959',
 '1960',
 '1960s',
 '1962',
 '1964',
 '1965',
 '1967',
 '1969',
 '1970',
 '1970s',
 '1971',
 '1972',
 '1973',
 '1974',
 '1976',
 '1977',
 '1979',
 '1980',
 '1980s',
 '1984',
 '1985',
 '1986',
 '1987',
 '1990',
 '1994',
 '1995',
 '1996',
 '1997',
 '1999',
 '19th',
 '19thcenturi',
 '20',
 '200',
 '2000',
 '2001',
 '2002',
 '2003',
 '2004',
 '2007',
 '2008',
 '2009',
 '2011',
 '2012',
 '20th',
 '21st',
 '21stcenturi',
 '22nd',
 '23',
 '24',
 '25',
 '27',
 '28',
 '29',
 '30',
 '300',
 '35',
 '3d',
 '40',
 '400',
 '47',
 '50',
 '500',
 '51',
 '60',
 '60s',
 '70',
 '7th',
 '80',
 'aaron',
 'aaroneckhart',
 'aaronseltz',
 'aarontaylor',
 'abandon',
 'abbi',
 'abbiecornis

In [43]:
vectors.shape

(4806, 10000)

In [44]:
# calculate cosine distance between vectors
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vectors)
similarity

array([[1.        , 0.07098728, 0.07098728, ..., 0.03933686, 0.        ,
        0.        ],
       [0.07098728, 1.        , 0.07894737, ..., 0.02187393, 0.        ,
        0.        ],
       [0.07098728, 0.07894737, 1.        , ..., 0.02187393, 0.        ,
        0.        ],
       ...,
       [0.03933686, 0.02187393, 0.02187393, ..., 1.        , 0.03776275,
        0.03669879],
       [0.        , 0.        , 0.        , ..., 0.03776275, 1.        ,
        0.0571662 ],
       [0.        , 0.        , 0.        , ..., 0.03669879, 0.0571662 ,
        1.        ]])

In [45]:
similarity.shape

(4806, 4806)

In [46]:
sorted(list(enumerate(similarity[0])) ,reverse=True,key=lambda x:x[1])#0th 0th movie saath distance
#1 wale number se sorting nhi , second wale se sorting hori

[(0, 1.0),
 (3730, 0.2342460684935891),
 (1216, 0.23323890508917103),
 (2409, 0.22983614055242746),
 (539, 0.2187974872468418),
 (507, 0.21226474130316747),
 (778, 0.20628424925175864),
 (582, 0.20377359689625751),
 (1204, 0.20036097492521526),
 (4048, 0.19569842191603265),
 (1194, 0.19296124624698996),
 (61, 0.18960514247076027),
 (1920, 0.18754070335443584),
 (942, 0.18685673434682065),
 (322, 0.18005965464253748),
 (495, 0.17864740025262413),
 (2971, 0.17560081436822506),
 (2786, 0.175109477645991),
 (4192, 0.17503798979747348),
 (74, 0.1723771054143206),
 (1089, 0.17202205435679124),
 (151, 0.1716388703071869),
 (260, 0.1701758234142103),
 (47, 0.16956460837017517),
 (3608, 0.16956460837017517),
 (4405, 0.1684303842133038),
 (972, 0.1678603981989089),
 (4, 0.1667028474261652),
 (2333, 0.16539535392599136),
 (305, 0.1628309295566865),
 (577, 0.1614429614050549),
 (973, 0.1614429614050549),
 (1329, 0.15978709238739225),
 (172, 0.15978709238739222),
 (3675, 0.15734745152973198),
 (94,

In [47]:
def recommend(movie):
  #convert in lower case
  movie = movie.lower()

  # Check if the movie exists in new_df (case-insensitive)
  if movie not in new_df['title'].str.lower().values:
      print(f"The movie '{movie}' is not in the database.")
      return

  # Get the index of the movie in new_df
  movie_index = new_df[new_df['title'].str.lower() == movie].index[0]

  # Calculate movie similarities and recommend similar movies
  distances = similarity[movie_index]
  movies_list = sorted(enumerate(distances), reverse=True, key=lambda x: x[1])[1:6]

  print(f"Recommendations for '{movie}':")
  for i in movies_list:
      recommended_movie = new_df.iloc[i[0]].title
      print(recommended_movie)



In [48]:
recommend('Batman')

Recommendations for 'batman':
Batman
Batman & Robin
The Dark Knight Rises
Batman Returns
The Dark Knight


In [49]:
recommend('aliens')

Recommendations for 'aliens':
Alien³
Alien
Meet Dave
Escape from Planet Earth
Alien: Resurrection


In [50]:
recommend('the matrix')

Recommendations for 'the matrix':
The Matrix Revolutions
The Matrix Reloaded
Hackers
WarGames
The Thirteenth Floor


In [51]:
import pickle

In [52]:
new_df.to_dict()

{'movie_id': {0: 19995,
  1: 285,
  2: 206647,
  3: 49026,
  4: 49529,
  5: 559,
  6: 38757,
  7: 99861,
  8: 767,
  9: 209112,
  10: 1452,
  11: 10764,
  12: 58,
  13: 57201,
  14: 49521,
  15: 2454,
  16: 24428,
  17: 1865,
  18: 41154,
  19: 122917,
  20: 1930,
  21: 20662,
  22: 57158,
  23: 2268,
  24: 254,
  25: 597,
  26: 271110,
  27: 44833,
  28: 135397,
  29: 37724,
  30: 558,
  31: 68721,
  32: 12155,
  33: 36668,
  34: 62211,
  35: 8373,
  36: 91314,
  37: 68728,
  38: 102382,
  39: 20526,
  40: 49013,
  41: 44912,
  42: 10193,
  43: 534,
  44: 168259,
  45: 72190,
  46: 127585,
  47: 54138,
  48: 81005,
  49: 64682,
  50: 9543,
  51: 68726,
  52: 38356,
  53: 217,
  54: 105864,
  55: 62177,
  56: 188927,
  57: 10681,
  58: 5174,
  59: 14161,
  60: 17979,
  61: 76757,
  62: 258489,
  63: 411,
  64: 246655,
  65: 155,
  66: 14160,
  67: 15512,
  68: 1726,
  69: 44826,
  70: 8487,
  71: 1735,
  72: 297761,
  73: 2698,
  74: 137113,
  75: 9804,
  76: 14869,
  77: 150540,
  78:

In [69]:
pickle.dump(new_df.to_dict(),open('movies_dict.pkl','wb'))

In [70]:
pickle.dump(similarity,open('similarity.pkl','wb'))

In [52]:
pip freeze > requirements_nlp.txt

Note: you may need to restart the kernel to use updated packages.


DEPRECATION: Loading egg at c:\program files\python311\lib\site-packages\vboxapi-1.0-py3.11.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation.. Discussion can be found at https://github.com/pypa/pip/issues/12330
