In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        file_path = os.path.join(dirname, filename)
        print(file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# title.basics.tsv.gz - Contains the following information for titles:
* **tconst (string)** - alphanumeric unique identifier of the title.
* **titleType (string)** – the type/format of the title (e.g. movie, short, tvseries, tvepisode, video, etc).
* **primaryTitle (string)** – the more popular title / the title used by the filmmakers on promotional materials at the point of release.
* **originalTitle (string)** - original title, in the original language.
* **isAdult (boolean)** - 0: non-adult title; 1: adult title.
* **startYear (YYYY)** – represents the release year of a title. In the case of TV Series, it is the series start year.
* **endYear (YYYY)** – TV Series end year; '\N' for all other title types.
* **runtimeMinutes** – primary runtime of the title, in minutes.
* **genres (string array)** – includes up to three genres associated with the title.

In [18]:
# Load the title basics table (tab-separated); 'nan' and '\N' are parsed as missing values.
df_title_info = pd.read_csv(
    '/kaggle/input/imdb-dataset/title.basics.tsv/title.basics.tsv',
    sep='\t',
    na_values=['nan', '\\N'],
    low_memory=False,
)

In [19]:
# Build the movie subset in one pass:
#   1. keep only the required columns for titles of type 'movie' released in 1970 or later,
#   2. drop rows with any missing value,
#   3. store runtimeMinutes as float32,
#   4. keep titles that run for at least 30 minutes.
df_title_info1 = (
    df_title_info
    .loc[
        (df_title_info['titleType'] == 'movie') & (df_title_info['startYear'] >= 1970),
        ['tconst', 'titleType', 'originalTitle', 'genres', 'runtimeMinutes', 'startYear', 'isAdult'],
    ]
    .dropna()
    .astype({'runtimeMinutes': 'float32'})
    .loc[lambda movies: movies['runtimeMinutes'] >= 30]
)

# title.akas.tsv.gz - Contains the following information for titles:
* **titleId (string)** - a tconst, an alphanumeric unique identifier of the title.
* **ordering (integer)** – a number to uniquely identify rows for a given titleId.
* **title (string)** – the localized title.
* **region (string)** - the region for this version of the title.
* **language (string)** - the language of the title.
* **types (array)** - Enumerated set of attributes for this alternative title. One or more of the following: "alternative", "dvd", "festival", "tv", "video", "working", "original", "imdbDisplay". New values may be added in the future without warning.
* **attributes (array)** - Additional terms to describe this alternative title, not enumerated.
* **isOriginalTitle (boolean)** – 0: not original title; 1: original title.

In [35]:
# Load the alternative-titles (akas) table (tab-separated); 'nan' and '\N' are parsed as missing values.
df_regional_title_info = pd.read_csv(
    '/kaggle/input/imdb-dataset/title.akas.tsv/title.akas.tsv',
    sep='\t',
    na_values=['nan', '\\N'],
    low_memory=False,
)

In [95]:
# Keep (titleId, region) for rows that are either English-language or released in region "IN",
# drop rows with a missing value, then keep a single row per title: the first one
# encountered for each titleId (so only that title's first region is retained).
df_regional_title_info1 = (
    df_regional_title_info
    .loc[
        (df_regional_title_info['language'] == "en")
        | (df_regional_title_info['region'].str.lower() == "in"),
        ['titleId', 'region'],
    ]
    .dropna()
    .groupby('titleId')
    .first()
    .reset_index()
)

# title.ratings.tsv.gz – Contains the IMDb rating and votes information for titles:
* **tconst (string)** - alphanumeric unique identifier of the title.
* **averageRating** – weighted average of all the individual user ratings.
* **numVotes** - number of votes the title has received.

In [29]:
# Load the ratings table (tab-separated); 'nan' and '\N' are parsed as missing values.
df_rating = pd.read_csv(
    '/kaggle/input/imdb-dataset/title.ratings.tsv/title.ratings.tsv',
    sep='\t',
    na_values=['nan', '\\N'],
    low_memory=False,
)

In [30]:
# Keep the id and average rating of titles that received at least 1000 votes,
# discarding any row with a missing value.
df_rating1 = (
    df_rating
    .loc[df_rating['numVotes'] >= 1000, ['tconst', 'averageRating']]
    .dropna()
)

# title.principals.tsv.gz – Contains the principal cast/crew for titles
* **tconst (string)**- alphanumeric unique identifier of the title
* **ordering (integer)** – a number to uniquely identify rows for a given titleId
* **nconst (string)** - alphanumeric unique identifier of the name/person
* **category (string)** - the category of job that person was in
* **job (string)** - the specific job title if applicable, else '\N'
* **characters (string)** - the name of the character played if applicable, else '\N'

In [47]:
# Load the principal cast/crew table (tab-separated); 'nan' and '\N' are parsed as missing values.
df_pricipal_cast_info = pd.read_csv(
    '/kaggle/input/imdb-dataset/title.principals.tsv/title.principals.tsv',
    sep='\t',
    na_values=['nan', '\\N'],
    low_memory=False,
)

In [48]:
# Split the principal cast/crew table into one frame per job category.
# Each frame keeps the title id, the person id and the category itself.
df_actor_info = df_pricipal_cast_info.loc[
    df_pricipal_cast_info['category'] == 'actor', ['tconst', 'nconst', 'category']
]
df_actress_info = df_pricipal_cast_info.loc[
    df_pricipal_cast_info['category'] == 'actress', ['tconst', 'nconst', 'category']
]
df_director_info = df_pricipal_cast_info.loc[
    df_pricipal_cast_info['category'] == 'director', ['tconst', 'nconst', 'category']
]
df_producer_info = df_pricipal_cast_info.loc[
    df_pricipal_cast_info['category'] == 'producer', ['tconst', 'nconst', 'category']
]

# title.crew.tsv.gz – Contains the director and writer information for all the titles in IMDb. Fields include:
* **tconst (string)** - alphanumeric unique identifier of the title
* **directors (array of nconsts)** - director(s) of the given title
* **writers (array of nconsts)** – writer(s) of the given title

In [None]:
# df_crew_info = pd.read_csv('/kaggle/input/imdb-crew-information/data.tsv',
#                           sep = '\t', na_values = ['nan', '\\N'], low_memory = False)

# # Filling na values with empty strings
# df_crew_info = df_crew_info.fillna('')

# name.basics.tsv.gz – Contains the following information for names:
* **nconst (string)** - alphanumeric unique identifier of the name/person.
* **primaryName (string)** – name by which the person is most often credited.
* **birthYear** – in YYYY format.
* **deathYear** – in YYYY format if applicable, else '\N'.
* **primaryProfession (array of strings)** – the top-3 professions of the person.
* **knownForTitles (array of tconsts)** – titles the person is known for.

In [49]:
# Load the names table (tab-separated; 'nan' and '\N' are parsed as missing values),
# keep only the person id and display name, and replace missing values with ''.
df_person_name_info = (
    pd.read_csv(
        '/kaggle/input/imdb-dataset/name.basics.tsv/name.basics.tsv',
        sep='\t',
        na_values=['nan', '\\N'],
        low_memory=False,
    )
    [['nconst', 'primaryName']]
    .fillna('')
)

In [None]:
### Functions not used
### These functions intended to be used for considering every director or writer name

# # convert director_nconst_val to director_nconst_val_list
# def get_director_nconst_list(director_nconst_val):
#     director_names = df_crew_info[df_crew_info.directors == director_nconst_val]['directors'].values[0]
#     return list(director_names.split(","))


# # convert writer_nconst_val to writer_nconst_val_list
# def get_writer_nconst_list(writer_nconst_val):
#     writer_names = df_crew_info[df_crew_info.writers == writer_nconst_val]['writers'].values[0]
#     return list(writer_names.split(","))


# # function to get primaryName from nconst_value_list
# def get_name_from_nconst(nconst_val_list):
#     primary_name_list = []
#     for i in range(len(nconst_val_list)):
#         pri_name = df_cast_info[df_cast_info['nconst'] == nconst_val_list[i]]['primaryName'].values[0]
#         primary_name_list.append(pri_name)
#     return primary_name_list

In [50]:
# Resolve actor ids to names (inner join on nconst), then collapse to one row per title
# with all of that title's actor names joined by ', ' in a column called 'actor'.
df_actor = (
    df_actor_info[['tconst', 'nconst']]
    .set_index('nconst')
    .join(df_person_name_info.set_index('nconst'), how='inner')
    .reset_index(drop=True)
    .groupby('tconst')['primaryName']
    .apply(', '.join)
    .reset_index(name='actor')
)

# Save the per-title actor list without the index column.
df_actor.to_csv('df_actor.csv', index=False)

In [51]:
# Resolve actress ids to names (inner join on nconst), then collapse to one row per title
# with all of that title's actress names joined by ', ' in a column called 'actress'.
df_actress = (
    df_actress_info[['tconst', 'nconst']]
    .set_index('nconst')
    .join(df_person_name_info.set_index('nconst'), how='inner')
    .reset_index(drop=True)
    .groupby('tconst')['primaryName']
    .apply(', '.join)
    .reset_index(name='actress')
)

# Save the per-title actress list without the index column.
df_actress.to_csv('df_actress.csv', index=False)

In [52]:
# Resolve director ids to names (inner join on nconst), then collapse to one row per title
# with all of that title's director names joined by ', ' in a column called 'director'.
df_director = (
    df_director_info[['tconst', 'nconst']]
    .set_index('nconst')
    .join(df_person_name_info.set_index('nconst'), how='inner')
    .reset_index(drop=True)
    .groupby('tconst')['primaryName']
    .apply(', '.join)
    .reset_index(name='director')
)

# index=False keeps this export consistent with df_actor.csv / df_actress.csv; without it
# the positional index is written as an extra unnamed first column.
df_director.to_csv('df_director.csv', index=False)

In [53]:
# Resolve producer ids to names (inner join on nconst), then collapse to one row per title
# with all of that title's producer names joined by ', ' in a column called 'producer'.
df_producer = (
    df_producer_info[['tconst', 'nconst']]
    .set_index('nconst')
    .join(df_person_name_info.set_index('nconst'), how='inner')
    .reset_index(drop=True)
    .groupby('tconst')['primaryName']
    .apply(', '.join)
    .reset_index(name='producer')
)

# index=False keeps this export consistent with df_actor.csv / df_actress.csv; without it
# the positional index is written as an extra unnamed first column.
df_producer.to_csv('df_producer.csv', index=False)

# Merging the dataframes to get final df

In [99]:
# Assemble the final movie table, starting from the filtered titles:
#   - inner join with ratings on tconst (only rated titles survive),
#   - index by tconst and inner join with the per-title region,
#   - left join the actor, actress, director and producer name lists, so titles
#     that lack one of these credits are kept (with a missing value in that column).
df = (
    df_title_info1
    .join(df_rating1.set_index('tconst'), on='tconst', how='inner')
    .set_index('tconst')
    .join(df_regional_title_info1.set_index('titleId'), how='inner')
    .join(df_actor.set_index('tconst'), how='left')
    .join(df_actress.set_index('tconst'), how='left')
    .join(df_director.set_index('tconst'), how='left')
    .join(df_producer.set_index('tconst'), how='left')
)

In [100]:
# Replace the tconst index with a fresh positional index (the tconst values are discarded)
# and turn every remaining missing value into an empty string.
df = df.reset_index(drop=True).fillna('')

In [147]:
# Write the merged movie table to 'imdb_data1.csv' without the index column.
df.to_csv(path_or_buf='imdb_data1.csv', index=False)

# Making the model - Content Based Movie Recommendation

In [102]:
# Work on an independent copy of the merged table so the modelling steps below do not
# modify `df`, and save a snapshot of that copy without the index column.
df_final = df.copy(deep=True)
df_final.to_csv(path_or_buf='imdb_data_x.csv', index=False)

In [103]:
# Display the (rows, columns) size of the modelling frame.
df_final.shape

(11419, 12)

In [104]:
# Keep titles of at most 6 whitespace-separated words; longer titles are replaced by ''
# so they contribute no title tokens to the combined feature text built later.
# The word count is computed with vectorized string methods; a missing title yields a
# NaN count, fails the comparison, and is therefore also replaced by ''.
title_word_counts = df_final['originalTitle'].str.split().str.len()
df_final['originalTitle'] = df_final['originalTitle'].where(title_word_counts <= 6, '')

In [105]:
# Replace any missing value with '' and renumber the rows 0..n-1.
df_final = df_final.fillna('').reset_index(drop=True)

In [106]:
# Convert the numeric 'isAdult' flag into a lowercase text token so it can be used as a
# bag-of-words feature: 0 -> 'notadult', anything else -> 'isadult'.
# (Previously both branches produced 'notAdult', so adult titles were never marked.)
# NOTE: run this once on the numeric column; re-running it on the already converted
# strings would label every row 'isadult'.
df_final['isAdult'] = (
    df_final['isAdult']
    .eq(0)
    .map({True: 'notAdult', False: 'isAdult'})
    .str.lower()
)

In [107]:
# Build one lowercase text blob per movie by joining the selected feature columns
# with single spaces; this is the text that gets vectorized.
df_final['combined_features'] = (
    df_final[['originalTitle', 'genres', 'isAdult', 'region', 'actor', 'actress', 'director', 'producer']]
    .apply(' '.join, axis=1)
    .str.lower()
)

**Creating Count Matrix for df_final and Calculating Cosine Similarity**
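* Cosine similarity measures the angle between the feature vectors of two movies; a value close to 1 means the two movies are highly similar.
* `cos_similarity` is a square matrix: entry (i, j) is the similarity between movie i and movie j. Later cells pair each score in a row with its movie index as `(index, score)` tuples.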

In [122]:
# Fit a bag-of-words vectorizer on the combined feature text and transform it into a
# count matrix with one row per movie and one column per distinct token.
cv = CountVectorizer()
feature_data_matrix = cv.fit_transform(df_final['combined_features'])

# Number of distinct features (vocabulary size). `vocabulary_` is used instead of
# `get_feature_names()`, which was deprecated in scikit-learn 1.0 and removed in 1.2.
len(cv.vocabulary_)

44182

In [123]:
# Pairwise cosine similarity between every pair of rows (movies) of the count matrix.
cos_similarity = cosine_similarity(X=feature_data_matrix)

## Helper Functions

In [117]:
# This function returns index based on the originalTitle
def get_index_from_title(val_title):
    """Return the df_final index label of the first row whose originalTitle matches.

    The comparison is case-insensitive (both sides are lowercased).

    Parameters
    ----------
    val_title : str
        Title to look up.

    Returns
    -------
    The index label of the first matching row of df_final.

    Raises
    ------
    IndexError
        If no row of df_final has that title.
    """
    matches = df_final[df_final.originalTitle.str.lower() == val_title.lower()].index
    if len(matches) == 0:
        # Same exception type as before, but with a message that names the missing title
        # instead of the opaque "index 0 is out of bounds".
        raise IndexError(f"No movie with originalTitle {val_title!r} found in df_final")
    return matches[0]


# This function returns originalTitle based on the index of the title
def get_title_from_index(val_index):
    """Return the originalTitle of the first df_final row whose index label equals val_index."""
    row_mask = df_final.index == val_index
    return df_final.loc[row_mask, 'originalTitle'].to_numpy()[0]


# This function returns genres based on the index of the title
def get_genre_from_index(val_index):
    """Return the genres value of the first df_final row whose index label equals val_index."""
    row_mask = df_final.index == val_index
    return df_final.loc[row_mask, 'genres'].to_numpy()[0]


# This function returns runTimeMinutes based on the index of the title
def get_runTime_from_index(val_index):
    """Return the runtimeMinutes value of the first df_final row whose index label equals val_index."""
    row_mask = df_final.index == val_index
    return df_final.loc[row_mask, 'runtimeMinutes'].to_numpy()[0]


# This function returns averageRating based on the index of the title
def get_rating_from_index(val_index):
    """Return the averageRating value of the first df_final row whose index label equals val_index."""
    row_mask = df_final.index == val_index
    return df_final.loc[row_mask, 'averageRating'].to_numpy()[0]


# This function returns Year based on the index of the title
def get_year_from_index(val_index):
    """Return the startYear value of the first df_final row whose index label equals val_index."""
    row_mask = df_final.index == val_index
    return df_final.loc[row_mask, 'startYear'].to_numpy()[0]


# Testing the model.
* Given a movie title as input, the model returns the most similar movies.

In [144]:
# Enter the exact title of the movie for which similar movies should be found
# (matching is case-insensitive).
user_movie = "Agneepath"

# Show the matching row(s); an empty result means the title is not in df_final and the
# cells below will fail.
df_final.loc[df_final['originalTitle'].str.lower() == user_movie.lower()]

Unnamed: 0,titleType,originalTitle,genres,runtimeMinutes,startYear,isAdult,averageRating,region,actor,actress,director,producer,combined_features
7720,movie,Agneepath,"Action,Drama",174.0,2012.0,notadult,6.9,IN,"Hrithik Roshan, Sanjay Dutt, Rishi Kapoor",Priyanka Chopra,Karan Malhotra,"Hiroo Johar, Karan Johar","agneepath action,drama notadult in hrithik ros..."


In [125]:
# Row label of the requested movie; it doubles as the row position in cos_similarity
# because df_final was re-indexed 0..n-1 before the matrix was built.
user_movie_index = get_index_from_title(user_movie)

# Pair every movie's row index with its similarity score to the requested movie.
similar_movies = [
    (movie_idx, score)
    for movie_idx, score in enumerate(cos_similarity[user_movie_index])
]

In [126]:
# Order the (index, score) pairs by score, highest similarity first.
most_similar_movies = sorted(similar_movies, key=lambda pair: pair[1], reverse=True)

# Print the details of the 20 best matches. Slicing replaces the manual counter/break.
# NOTE: the requested movie is itself part of similar_movies, so it is expected to
# appear among the top results.
print("Top 20 similar movies to: ", user_movie)
for movie_idx, _score in most_similar_movies[:20]:
    print('\nTitle:', get_title_from_index(movie_idx),
          '\nGenres:', get_genre_from_index(movie_idx),
          '\nRunTime Minutes:', get_runTime_from_index(movie_idx),
          '\nRating:', get_rating_from_index(movie_idx),
          '\nYear:', get_year_from_index(movie_idx))

Top 20 similar movies to:  lakshya

Title: Lakshya 
Genres: Action,Drama,Romance 
RunTime Minutes: 186.0 
Rating: 7.9 
Year: 2004.0

Title: Koi... Mil Gaya 
Genres: Action,Drama,Romance 
RunTime Minutes: 171.0 
Rating: 7.1 
Year: 2003.0

Title: Zindagi Na Milegi Dobara 
Genres: Comedy,Drama 
RunTime Minutes: 155.0 
Rating: 8.1 
Year: 2011.0

Title: Dil Chahta Hai 
Genres: Comedy,Drama,Romance 
RunTime Minutes: 183.0 
Rating: 8.1 
Year: 2001.0

Title: Armaan 
Genres: Drama,Family,Romance 
RunTime Minutes: 158.0 
Rating: 5.2 
Year: 2003.0

Title: Wazir 
Genres: Action,Crime,Drama 
RunTime Minutes: 103.0 
Rating: 7.1 
Year: 2016.0

Title: Dev 
Genres: Drama 
RunTime Minutes: 172.0 
Rating: 6.9 
Year: 2004.0

Title: Kyun! Ho Gaya Na... 
Genres: Comedy,Drama,Romance 
RunTime Minutes: 165.0 
Rating: 4.2 
Year: 2004.0

Title: Zanjeer 
Genres: Action,Crime,Drama 
RunTime Minutes: 145.0 
Rating: 7.6 
Year: 1973.0

Title: Mission Kashmir 
Genres: Action,Drama,Thriller 
RunTime Minutes: 154.0 
Ra