In [1]:
import pandas as pd

# Load the dataset
tmdb_data = pd.read_csv('dataset.csv')

# Display the first few rows to understand its structure
tmdb_data.head()

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280
3,424,Schindler's List,"Drama,History,War",en,The true story of how businessman Oskar Schind...,44.761,1993-12-15,8.6,12959
4,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,57.749,1974-12-20,8.6,9811


In [2]:
# Check for missing values
tmdb_data.isnull().sum()

id                    0
title                 0
genre                 3
original_language     0
overview             13
popularity            0
release_date          0
vote_average          0
vote_count            0
dtype: int64

In [4]:
# Drop rows with missing values in critical columns. Adjust the columns list as needed.
tmdb_data.dropna(subset=['genre', 'overview'], inplace=True)

In [5]:
# Remove duplicates based on the movie title
tmdb_data.drop_duplicates(subset='title', inplace=True)

In [6]:
# Reset index after dropping rows
tmdb_data.reset_index(drop=True, inplace=True)

In [8]:
tmdb_data['genre'].head()

0             Drama,Crime
1    Comedy,Drama,Romance
2             Drama,Crime
3       Drama,History,War
4             Drama,Crime
Name: genre, dtype: object

In [10]:
# Convert genre strings to lists by splitting on commas
tmdb_data['genre'] = tmdb_data['genre'].apply(lambda x: x.split(',') if isinstance(x, str) else [])


In [11]:
tmdb_data

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count
0,278,The Shawshank Redemption,"[Drama, Crime]",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862
1,19404,Dilwale Dulhania Le Jayenge,"[Comedy, Drama, Romance]",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731
2,238,The Godfather,"[Drama, Crime]",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280
3,424,Schindler's List,"[Drama, History, War]",en,The true story of how businessman Oskar Schind...,44.761,1993-12-15,8.6,12959
4,240,The Godfather: Part II,"[Drama, Crime]",en,In the continuing saga of the Corleone crime f...,57.749,1974-12-20,8.6,9811
...,...,...,...,...,...,...,...,...,...
9641,168098,Cell,"[Horror, Science Fiction, Thriller]",en,When a strange signal pulsates through all cel...,19.521,2016-07-06,4.7,910
9642,10196,The Last Airbender,"[Action, Adventure, Fantasy]",en,"The story follows the adventures of Aang, a yo...",98.322,2010-06-30,4.7,3347
9643,331446,Sharknado 3: Oh Hell No!,"[Action, TV Movie, Science Fiction, Comedy, Ad...",en,The sharks take bite out of the East Coast whe...,12.490,2015-07-22,4.7,417
9644,13995,Captain America,"[Action, Science Fiction, War]",en,"During World War II, a brave, patriotic Americ...",18.333,1990-12-14,4.6,332


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english')

# Fit and transform the 'overview' text to a matrix
tfidf_matrix = tfidf.fit_transform(tmdb_data['overview'])

# Print the shape of the tfidf_matrix to understand its size
print(tfidf_matrix.shape)


(9646, 27628)


In [16]:
from sklearn.feature_extraction.text import CountVectorizer

# Assuming your genres column has been successfully converted to a space-separated string of genres
tmdb_data['genres'] = tmdb_data['genre'].apply(lambda x: ' '.join(x))

# Initialize the CountVectorizer
count_vectorizer = CountVectorizer()

# Apply CountVectorizer to the 'genres' column to create a genre matrix
genres_matrix = count_vectorizer.fit_transform(tmdb_data['genres'])

# Print the shape to understand its size
print(genres_matrix.shape)


(9646, 20)


In [17]:
tmdb_data


Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count,genres
0,278,The Shawshank Redemption,"[Drama, Crime]",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862,Drama Crime
1,19404,Dilwale Dulhania Le Jayenge,"[Comedy, Drama, Romance]",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731,Comedy Drama Romance
2,238,The Godfather,"[Drama, Crime]",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280,Drama Crime
3,424,Schindler's List,"[Drama, History, War]",en,The true story of how businessman Oskar Schind...,44.761,1993-12-15,8.6,12959,Drama History War
4,240,The Godfather: Part II,"[Drama, Crime]",en,In the continuing saga of the Corleone crime f...,57.749,1974-12-20,8.6,9811,Drama Crime
...,...,...,...,...,...,...,...,...,...,...
9641,168098,Cell,"[Horror, Science Fiction, Thriller]",en,When a strange signal pulsates through all cel...,19.521,2016-07-06,4.7,910,Horror Science Fiction Thriller
9642,10196,The Last Airbender,"[Action, Adventure, Fantasy]",en,"The story follows the adventures of Aang, a yo...",98.322,2010-06-30,4.7,3347,Action Adventure Fantasy
9643,331446,Sharknado 3: Oh Hell No!,"[Action, TV Movie, Science Fiction, Comedy, Ad...",en,The sharks take bite out of the East Coast whe...,12.490,2015-07-22,4.7,417,Action TV Movie Science Fiction Comedy Adventure
9644,13995,Captain America,"[Action, Science Fiction, War]",en,"During World War II, a brave, patriotic Americ...",18.333,1990-12-14,4.6,332,Action Science Fiction War


In [18]:
from sklearn.metrics.pairwise import cosine_similarity

# Assuming 'tfidf_matrix' is your TF-IDF matrix and 'genres_matrix' is your one-hot encoded genres matrix
# Calculate the cosine similarity for descriptions
cosine_sim_desc = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Calculate the cosine similarity for genres
cosine_sim_genres = cosine_similarity(genres_matrix, genres_matrix)

# Combine these similarity matrices. You can start by averaging them.
cosine_sim_combined = (cosine_sim_desc + cosine_sim_genres) / 2


In [29]:
tmdb_data

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count,genres
0,278,The Shawshank Redemption,"[Drama, Crime]",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862,Drama Crime
1,19404,Dilwale Dulhania Le Jayenge,"[Comedy, Drama, Romance]",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731,Comedy Drama Romance
2,238,The Godfather,"[Drama, Crime]",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280,Drama Crime
3,424,Schindler's List,"[Drama, History, War]",en,The true story of how businessman Oskar Schind...,44.761,1993-12-15,8.6,12959,Drama History War
4,240,The Godfather: Part II,"[Drama, Crime]",en,In the continuing saga of the Corleone crime f...,57.749,1974-12-20,8.6,9811,Drama Crime
...,...,...,...,...,...,...,...,...,...,...
9641,168098,Cell,"[Horror, Science Fiction, Thriller]",en,When a strange signal pulsates through all cel...,19.521,2016-07-06,4.7,910,Horror Science Fiction Thriller
9642,10196,The Last Airbender,"[Action, Adventure, Fantasy]",en,"The story follows the adventures of Aang, a yo...",98.322,2010-06-30,4.7,3347,Action Adventure Fantasy
9643,331446,Sharknado 3: Oh Hell No!,"[Action, TV Movie, Science Fiction, Comedy, Ad...",en,The sharks take bite out of the East Coast whe...,12.490,2015-07-22,4.7,417,Action TV Movie Science Fiction Comedy Adventure
9644,13995,Captain America,"[Action, Science Fiction, War]",en,"During World War II, a brave, patriotic Americ...",18.333,1990-12-14,4.6,332,Action Science Fiction War


In [31]:
# Convert release_date to datetime format
tmdb_data['release_date'] = pd.to_datetime(tmdb_data['release_date'])

# Rename columns for clarity
tmdb_data.rename(columns={'vote_average': 'rating'}, inplace=True)


In [34]:
# Example: Extract year from release_date
tmdb_data['release_date'] = tmdb_data['release_date'].dt.year

In [35]:
# Rename columns for clarity
tmdb_data.rename(columns={'release_date': 'release_year'}, inplace=True)

In [36]:
tmdb_data

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_year,rating,vote_count,genres
0,278,The Shawshank Redemption,"[Drama, Crime]",en,Framed in the 1940s for the double murder of h...,94.075,1994,8.7,21862,Drama Crime
1,19404,Dilwale Dulhania Le Jayenge,"[Comedy, Drama, Romance]",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995,8.7,3731,Comedy Drama Romance
2,238,The Godfather,"[Drama, Crime]",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972,8.7,16280,Drama Crime
3,424,Schindler's List,"[Drama, History, War]",en,The true story of how businessman Oskar Schind...,44.761,1993,8.6,12959,Drama History War
4,240,The Godfather: Part II,"[Drama, Crime]",en,In the continuing saga of the Corleone crime f...,57.749,1974,8.6,9811,Drama Crime
...,...,...,...,...,...,...,...,...,...,...
9641,168098,Cell,"[Horror, Science Fiction, Thriller]",en,When a strange signal pulsates through all cel...,19.521,2016,4.7,910,Horror Science Fiction Thriller
9642,10196,The Last Airbender,"[Action, Adventure, Fantasy]",en,"The story follows the adventures of Aang, a yo...",98.322,2010,4.7,3347,Action Adventure Fantasy
9643,331446,Sharknado 3: Oh Hell No!,"[Action, TV Movie, Science Fiction, Comedy, Ad...",en,The sharks take bite out of the East Coast whe...,12.490,2015,4.7,417,Action TV Movie Science Fiction Comedy Adventure
9644,13995,Captain America,"[Action, Science Fiction, War]",en,"During World War II, a brave, patriotic Americ...",18.333,1990,4.6,332,Action Science Fiction War


In [37]:
from sklearn.metrics.pairwise import cosine_similarity

# Assuming 'tfidf_matrix' is your TF-IDF matrix and 'genres_matrix' is your one-hot encoded genres matrix
# Calculate the cosine similarity for descriptions
cosine_sim_desc = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Calculate the cosine similarity for genres
cosine_sim_genres = cosine_similarity(genres_matrix, genres_matrix)

# Combine these similarity matrices. You can start by averaging them.
cosine_sim_combined = (cosine_sim_desc + cosine_sim_genres) / 2


In [39]:
from sklearn.metrics.pairwise import cosine_similarity

# Example of calculating cosine similarity matrix for TF-IDF matrix of movie descriptions
# tfidf_matrix = tfidf_vectorizer.fit_transform(tmdb_data['description'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)


In [40]:
def recommend_movies(title, num_recommendations=5):
    """
    Recommend movies based on a given movie title and display additional details.

    Parameters:
    - title (str): The title of the movie to find recommendations for.
    - num_recommendations (int): The number of recommendations to return.
    
    Returns:
    - DataFrame with recommended movies and their details.
    """
    # Ensure title is in the index to avoid errors
    if title not in tmdb_data['title'].values:
        return "Movie title not found."
    
    # Find the index of the movie that matches the title
    idx = tmdb_data.index[tmdb_data['title'] == title].tolist()[0]
    
    # Get the pairwise similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the scores of the most similar movies
    sim_scores = sim_scores[1:num_recommendations+1]
    
    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
    
    # Return the top 10 most similar movies
    recommended_movies = tmdb_data.iloc[movie_indices][['title', 'genres', 'release_year', 'rating']]
    
    return recommended_movies

# Example usage
test_title = "Captain America"  # Replace with an actual movie title from your dataset
recommendations = recommend_movies(test_title, 5)
print(recommendations)


                                                  title  \
3288                 Captain America: The First Avenger   
1781                                          Team Thor   
859                 Captain America: The Winter Soldier   
4202                       Ultimate Avengers: The Movie   
7733  Indiana Jones and the Kingdom of the Crystal S...   

                                                 genres  release_year  rating  
3288                   Action Adventure Science Fiction          2011     7.0  
1781                             Comedy Science Fiction          2016     7.4  
859                    Action Adventure Science Fiction          2014     7.7  
4202  Action Animation Family Adventure Science Fiction          2006     6.8  
7733                                   Adventure Action          2008     6.0  


In [42]:
def recommend_movies_by_genre(genre, num_recommendations=10):
    """
    Recommend movies based on a specified genre.

    Parameters:
    - genre (str): The genre to filter by.
    - num_recommendations (int): The number of recommendations to return.

    Returns:
    - recommendations (DataFrame): A DataFrame of recommended movies.
    """
    # Filter movies that contain the specified genre
    filtered_movies = tmdb_data[tmdb_data['genres'].apply(lambda x: genre in x)]
    
    # Sort the filtered movies based on a criterion like popularity or ratings, if available.
    # Here, it's assumed there's a 'popularity' column. Adjust as per your dataset.
    # If no such column exists, you might simply return the first N movies.
    recommendations = filtered_movies.sort_values(by='rating', ascending=False).head(num_recommendations)
    
    return recommendations

# Example usage:
genre = "Action"
recommended_movies = recommend_movies_by_genre(genre, 10)
print(recommended_movies[['title', 'genres', 'rating', 'release_year']])


                                                title  \
15                                    The Dark Knight   
22                                      Seven Samurai   
18      The Lord of the Rings: The Return of the King   
27                                           Harakiri   
35                  Spider-Man: Into the Spider-Verse   
41     Neon Genesis Evangelion: The End of Evangelion   
42                            The Empire Strikes Back   
45  The Lord of the Rings: The Fellowship of the Ring   
47                          Primal: Tales of Savagery   
51              The Lord of the Rings: The Two Towers   

                                        genres  rating  release_year  
15                 Drama Action Crime Thriller     8.5          2008  
22                                Action Drama     8.5          1954  
18                    Adventure Fantasy Action     8.5          2003  
27                        Action Drama History     8.4          1962  
35  Action Advent

In [43]:
def recommend_movies_by_year(year, num_recommendations=5):
    """
    Recommend movies released in a specified year, sorted by a criterion (e.g., popularity).

    Parameters:
    - year (int or str): The release year to filter by.
    - num_recommendations (int): The number of recommendations to return.

    Returns:
    - DataFrame: A DataFrame of recommended movies including their titles, genres, release year, and sorting criterion.
    """
    # Filter movies by the specified year
    movies_in_year = tmdb_data[tmdb_data['release_year'] == year]
    
    # Sort the filtered movies based on a sorting criterion like 'rating'
    # Adjust the 'rating' column name if your dataset uses a different name or metric
    recommendations = movies_in_year.sort_values(by='rating', ascending=False).head(num_recommendations)
    
    # Assuming the dataset includes 'title', 'genres', 'release_year', 'rating', and 'popularity'
    return recommendations[['title', 'genres', 'release_year', 'rating']]

# Example usage
recommendations = recommend_movies_by_year(2020, 10)  # Replace 2020 with the desired year
print(recommendations)


                                                title  \
7                                      Your Eyes Tell   
14                         Gabriel's Inferno: Part II   
19                        Gabriel's Inferno: Part III   
24                       Violet Evergarden: The Movie   
11                                  Gabriel's Inferno   
29                      Josee, the Tiger and the Fish   
44                                        Wolfwalkers   
55  Demon Slayer -Kimetsu no Yaiba- The Movie: Mug...   
56                                              Given   
66                                             Clouds   

                                genres  release_year  rating  
7                        Romance Drama          2020     8.5  
14                             Romance          2020     8.5  
19                      Romance Comedy          2020     8.5  
24     Animation Fantasy Romance Drama          2020     8.5  
11                             Romance          2020     

In [44]:
import pickle

In [46]:

# Saving the dataset with context management
with open('tmdb_data_list.pkl', 'wb') as file:
    pickle.dump(tmdb_data, file)


In [47]:

# Assuming 'cosine_sim' is your cosine similarity matrix
with open('similarity.pkl', 'wb') as file:
    pickle.dump(cosine_sim, file)


In [49]:

# Load the dataset from 'tmdb_data_list.pkl'
with open('tmdb_data_list.pkl', 'rb') as file:
    tmdb_data = pickle.load(file)



In [50]:

# Load the data from 'tmdb_data_list.pkl' and assign it to a variable
tmdb_data = pickle.load(open('tmdb_data_list.pkl', 'rb'))

# Now you can work with 'tmdb_data' as needed
print(tmdb_data)  # For example, to print the data or part of it to verify it's loaded correctly


          id                                          title  \
0        278                       The Shawshank Redemption   
1      19404                    Dilwale Dulhania Le Jayenge   
2        238                                  The Godfather   
3        424                               Schindler's List   
4        240                         The Godfather: Part II   
...      ...                                            ...   
9641  168098                                           Cell   
9642   10196                             The Last Airbender   
9643  331446                       Sharknado 3: Oh Hell No!   
9644   13995                                Captain America   
9645    2312  In the Name of the King: A Dungeon Siege Tale   

                                                  genre original_language  \
0                                        [Drama, Crime]                en   
1                              [Comedy, Drama, Romance]                hi   
2           