In [1]:
# Cell 1: Import Libraries
import numpy as np
import pandas as pd
import ast # Used for processing stringified lists
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import joblib # For saving your model

In [3]:
# Cell 2: Load and Merge Data
movies = pd.read_csv('/data1/students/rupam/Self_Project/Movie_Recommendation/dataset_and_code/tmdb_5000_movies.csv')
credits = pd.read_csv('/data1/students/rupam/Self_Project/Movie_Recommendation/dataset_and_code/tmdb_5000_credits.csv')

# Merge the two dataframes on the 'title' column
movies = movies.merge(credits, on='title')

print("Shape of merged data:", movies.shape)
movies.head(1)

Shape of merged data: (4809, 23)


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [4]:
movies.head(2)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,285,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [5]:
# Cell 3: Data Preprocessing and Feature Selection
# Select the columns that will be most useful for recommendations
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

# Drop rows with missing values in these essential columns
movies.dropna(inplace=True)

print("Shape after dropping nulls:", movies.shape)

Shape after dropping nulls: (4806, 7)


In [6]:
# Cell 4: Helper Functions to Extract Features
# The 'genres', 'keywords', etc., are strings that look like lists of dictionaries.
# We need to extract the 'name' from each dictionary.

def convert(text):
    """Converts a stringified list of dictionaries to a list of names."""
    L = []
    for i in ast.literal_eval(text):
        L.append(i['name'])
    return L

def fetch_director(text):
    """Fetches the director's name from the crew list."""
    L = []
    for i in ast.literal_eval(text):
        if i['job'] == 'Director':
            L.append(i['name'])
    return L

# Apply the helper functions
movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)

# For cast, we'll just take the top 3 actors
movies['cast'] = movies['cast'].apply(lambda x: convert(x)[:3])
movies['crew'] = movies['crew'].apply(fetch_director)

print("Data after applying helper functions:")
movies.head(2)


Data after applying helper functions:


Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]


In [7]:
# Cell 5: Create the 'tags' Column
# We will create a single 'tags' column that contains a string of all relevant keywords for a movie.

# First, remove spaces between words in each feature to avoid confusion
# e.g., 'Science Fiction' becomes 'ScienceFiction'
movies['cast'] = movies['cast'].apply(lambda x: [i.replace(" ","") for i in x])
movies['crew'] = movies['crew'].apply(lambda x: [i.replace(" ","") for i in x])
movies['genres'] = movies['genres'].apply(lambda x: [i.replace(" ","") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(" ","") for i in x])

# Combine everything into a single list
movies['tags'] = movies['overview'].apply(lambda x: x.split()) + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

# Create a new dataframe with just the necessary columns
new_df = movies[['movie_id', 'title', 'tags']]

# Convert the list of tags into a single string
new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))

# Convert tags to lowercase for consistency
new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())

print("Final dataframe with 'tags':")
new_df.head()

Final dataframe with 'tags':


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())


Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


In [8]:
# Cell 6: Vectorization and Similarity Calculation
# Now we convert the text 'tags' into numerical vectors to calculate similarity.

# Use TF-IDF (Term Frequency-Inverse Document Frequency)
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
vectors = tfidf.fit_transform(new_df['tags']).toarray()

# Calculate cosine similarity between all movie vectors
similarity = cosine_similarity(vectors)

print("Shape of similarity matrix:", similarity.shape)

# ---

Shape of similarity matrix: (4806, 4806)


In [9]:
# Cell 7: Create the Recommendation Function (Updated for Case-Insensitivity)

def recommend(movie_title):
    """
    Recommends movies based on a given title, ignoring case.
    """
    try:
        # Convert both the dataframe titles and the input title to lowercase for matching
        movie_index = new_df[new_df['title'].str.lower() == movie_title.lower()].index[0]
    except IndexError:
        # If the movie is not found after the case-insensitive search, print a message.
        print(f"Sorry, the movie '{movie_title}' was not found in our dataset.")
        return

    # Get the similarity scores for that movie against all other movies
    distances = similarity[movie_index]

    # Sort the movies based on similarity and get the top 5 (excluding the movie itself)
    movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]

    print(f"Recommendations for '{new_df.iloc[movie_index].title}':\n")
    for i in movies_list:
        # new_df.iloc[i[0]].title will print the original, properly capitalized title
        print(new_df.iloc[i[0]].title)

In [10]:
# Cell 8: Test the Recommendation Function
recommend('Avatar')
print("\n" + "-"*20 + "\n")
recommend('The Dark Knight Rises')
print("\n" + "-"*20 + "\n")
recommend('Iron Man')
print("\n" + "-"*20 + "\n")
recommend('iron man')
# ---

Recommendations for 'Avatar':

Falcon Rising
Battle: Los Angeles
Apollo 18
Star Trek Into Darkness
Titan A.E.

--------------------

Recommendations for 'The Dark Knight Rises':

The Dark Knight
Batman Returns
Batman Begins
Batman Forever
Batman

--------------------

Recommendations for 'Iron Man':

Iron Man 2
Iron Man 3
Avengers: Age of Ultron
Captain America: Civil War
Ant-Man

--------------------

Recommendations for 'Iron Man':

Iron Man 2
Iron Man 3
Avengers: Age of Ultron
Captain America: Civil War
Ant-Man


In [11]:
# Cell 9: Save the Model and Data for your API
# This is the final and most important step to connect to your Flask API.

# Save the processed dataframe. Your API will need this to get movie titles from indices.
new_df.to_pickle('movie_data.pkl')

# Save the similarity matrix. This is your "model".
joblib.dump(similarity, 'movie_model.pkl')

print("\nProcessed data ('movie_data.pkl') and similarity model ('movie_model.pkl') have been saved!")


Processed data ('movie_data.pkl') and similarity model ('movie_model.pkl') have been saved!


In [12]:
# New Cell in your Jupyter Notebook - Run this!

# We need the original merged dataframe before we created the 'tags'
# Let's re-run the merge and cleaning steps to be safe.
# Cell 2: Load and Merge Data
movies = pd.read_csv('/data1/students/rupam/Self_Project/Movie_Recommendation/dataset_and_code/tmdb_5000_movies.csv')
credits = pd.read_csv('/data1/students/rupam/Self_Project/Movie_Recommendation/dataset_and_code/tmdb_5000_credits.csv')
movies = movies.merge(credits, on='title')

# We only need a few columns
full_data = movies[['movie_id', 'title', 'genres', 'cast', 'vote_average', 'vote_count']].copy()
full_data.dropna(inplace=True)

# Save this complete dataframe to a new pickle file
full_data.to_pickle('full_movie_data.pkl')

print("Saved 'full_movie_data.pkl' successfully!")

Saved 'full_movie_data.pkl' successfully!
