In [None]:
import os
import json
import gdown
import pandas as pd
import numpy as np
### Plot
import matplotlib.pyplot as plt
import seaborn as sns
### HTML
from bs4 import BeautifulSoup
import re
### Warnings
import warnings
### Text Preprocessing and Natural Language Processing
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
import nltk
import re
import spacy
from wordcloud import WordCloud


In [None]:
tmdb_df = pd.read_csv('TMDB_movie_dataset_v11.csv') #tmdb dataset we will be working with
tmdb_df
tmdb_df.describe()


In [None]:
# in order to create a robust key "title_year to join with the movielens dataset"
tmdb_df['release_date'] = pd.to_datetime(tmdb_df['release_date'], errors='coerce')
tmdb_df['year'] = tmdb_df['release_date'].dt.year.astype('Int64').astype(str).replace('nan', '')
tmdb_df['title_year'] = tmdb_df.apply(lambda x: f"{x['title']} ({x['year']})" if x['year'] else x['title'], axis=1)
tmdb_df


In [None]:
df_ratings = pd.read_csv('ratings.csv', sep='::', engine='python', encoding='ISO-8859-1', header=None, names=['UserID', 'MovieID', 'Rating', 'Timestamp']) # ^same
print(df_ratings.head())

In [None]:
df_users = pd.read_csv('users.csv', sep='::', engine='python', encoding='ISO-8859-1', header=None, names=['UserID', 'Gender', 'Age', 'Occupation','ZipCode']) # ^same
print(df_users.head())

In [None]:
df_ml_combined = pd.merge(pd.merge(df_ratings, df_movies, on='MovieID'), df_users, on='UserID') #this is the merged version of the MovieLens Dataset
df_ml_combined

In [None]:
#now to finally merge the datasets
merged_df = pd.merge(df_ml_combined, tmdb_df, left_on='Title', right_on='title_year', how='left') #main dataset
merged_df
print(merged_df)   #use Genres from movielens

In [None]:
print(merged_df.columns)
# Counting how many NA values are in the 'UserID' column
user_id_na_count = merged_df['UserID'].isna().sum()

print("Number of NA values in the UserID column:", user_id_na_count)
# Assuming 'df' is your DataFrame
important_columns = ['Title', 'Genres', 'overview', 'tagline', 'original_language', 'release_date', 'runtime', 'vote_average', 'vote_count', 'production_companies', 'production_countries', 'spoken_languages', 'year']

# Check for any NA values in these columns
na_columns = merged_df[important_columns].isna().any()

# Print columns with NA values
print("Columns with NA values:", na_columns[na_columns].index.tolist())



In [97]:
df = pd.read_csv('Merged_df.csv')
df.describe()
print(df.columns)
df[['title_year']]


  df = pd.read_csv('Merged_df.csv')


Index(['Unnamed: 0', 'UserID', 'MovieID', 'Rating', 'Timestamp', 'Title',
       'Genres', 'Gender', 'Age', 'Occupation', 'ZipCode', 'Movie_Title',
       'Movie_Year', 'id', 'title', 'vote_average', 'vote_count', 'status',
       'release_date', 'revenue', 'runtime', 'adult', 'backdrop_path',
       'budget', 'homepage', 'imdb_id', 'original_language', 'original_title',
       'overview', 'popularity', 'poster_path', 'tagline', 'genres',
       'production_companies', 'production_countries', 'spoken_languages',
       'year', 'title_year'],
      dtype='object')


Unnamed: 0,title_year
0,One Flew Over the Cuckoo's Nest (1975)
1,James and the Giant Peach (1996)
2,My Fair Lady (1964)
3,Erin Brockovich (2000)
4,A Bug's Life (1998)
...,...
995650,The Crying Game (1992)
995651,The Crying Game (1992)
995652,Welcome to the Dollhouse (1996)
995653,Sophie's Choice (1982)


In [99]:
# Assuming 'merged_df' is your DataFrame
merged_df = df
content_merged_df = merged_df.drop(['genres', 'poster_path', 'backdrop_path','Age', 'Gender', 'homepage','Occupation', 'ZipCode', 'Title', 'Movie_Title', 'title', 'original_title'], axis=1)
content_merged_df = content_merged_df.drop_duplicates().reset_index(drop=True) #no duplicates since row numbers remained the same, timestamp used for temporal analysis
# print(content_merged_df)
print(content_merged_df.columns)
#content_merged_df.to_csv('content.csv', index=False)

Index(['Unnamed: 0', 'UserID', 'MovieID', 'Rating', 'Timestamp', 'Genres',
       'Movie_Year', 'id', 'vote_average', 'vote_count', 'status',
       'release_date', 'revenue', 'runtime', 'adult', 'budget', 'imdb_id',
       'original_language', 'overview', 'popularity', 'tagline',
       'production_companies', 'production_countries', 'spoken_languages',
       'year', 'title_year'],
      dtype='object')


In [84]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeRegressor, export_text
from scipy.sparse import hstack

# Fill NaN values in the 'overview' column with an empty string
content_merged_df['overview'] = content_merged_df['overview'].fillna("")

# Now, proceed with TF-IDF vectorization (overview)
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_overviews = tfidf_vectorizer.fit_transform(content_merged_df['overview'])

# Scale numerical features (popularity)
scaler = StandardScaler()
popularity_scaled = scaler.fit_transform(content_merged_df[['popularity']].to_numpy())



In [85]:
content_merged_df['Genres'] = content_merged_df['Genres'].apply(lambda x: x.split("|"))

genres = set(g for G in content_merged_df['Genres'] for g in G)

for g in genres:
    content_merged_df[g] = content_merged_df.Genres.transform(lambda x: int(g in x))

# Adding a prefix to the genre columns to create a sparse matrix for feature engineering
prefix = 'Genre_'
for g in genres:
    content_merged_df.rename(columns={g: prefix + g}, inplace=True)
    
# Identifying genre columns by a common prefix "Genre_"
genre_columns = [col for col in content_merged_df.columns if col.startswith('Genre_')]

# Convert the identified genre columns to sparse data type
for col in genre_columns:
    content_merged_df[col] = pd.arrays.SparseArray(content_merged_df[col])

# Convert it to a sparse matrix
sparse_genres = pd.concat([content_merged_df[col] for col in genre_columns], axis=1).sparse.to_coo()

In [87]:
# Combine features into a single feature matrix
X = hstack([tfidf_overviews, popularity_scaled, sparse_genres])

# Target variable (ratings in this example)
y = content_merged_df['Rating'].to_numpy()

In [88]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Decision Tree Regressor
tree_regressor = DecisionTreeRegressor(random_state=42)
tree_regressor.fit(X_train, y_train)

In [92]:
from sklearn.metrics import mean_squared_error
import math

# Predict on the test set
y_pred = tree_regressor.predict(X_test)

# Calculate RMSE
rmse = math.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse}")

RMSE: 0.9794300117735689


In [96]:
# Get TF-IDF feature names and popularity column name
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
popularity_column_name = 'popularity'

# Combine feature names with their importances
features_importance = list(zip(tfidf_feature_names, feature_importances)) + [(popularity_column_name, feature_importances[-1])] + [(col, imp) for col, imp in zip(genre_columns, feature_importances[-len(genre_columns):])]

# Sort features based on their importance
ranked_features = sorted(features_importance, key=lambda x: x[1], reverse=True)

# Print ranked features
print("Ranked features based on importance:")
for feature, importance in ranked_features:
    print(f"{feature}: {importance}")


Ranked features based on importance:
Genre_Drama: 0.05999160603882507
popularity: 0.024624275540798424
Genre_Horror: 0.024624275540798424
Genre_Action: 0.023517983505721818
Genre_Children's: 0.018592341498030466
Genre_Film-Noir: 0.017977706261179612
death: 0.015849654832385124
Genre_Documentary: 0.010945178849429507
room: 0.008230034925948046
Genre_Comedy: 0.00818495012153006
tragedy: 0.007784188691881802
man: 0.007719879489154756
young: 0.0075848309100972015
wife: 0.007275231675447399
gets: 0.00594108107901286
minutes: 0.005908664385194055
murder: 0.005842758682734916
rocky: 0.005807079857755114
new: 0.0057664686898036945
Genre_Romance: 0.005709755310256203
gromit: 0.005565360668228199
superman: 0.005431234191417276
toting: 0.005358862681585407
miyagi: 0.005147231460350145
Genre_Thriller: 0.005043803779233722
lassard: 0.005036174735959555
plans: 0.005029455198638108
safety: 0.004784176357571996
suddenly: 0.0047505150413665095
ex: 0.004734283173635042
attractive: 0.004718907143027952
b

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Define the CountVectorizer
vectorizer = CountVectorizer(stop_words='english')

# Fit and transform the 'Genres' column
genres = vectorizer.fit_transform(content_merged_df['Genres']).toarray()

# Create a DataFrame from the transformed genres
contents = pd.DataFrame(genres, columns=vectorizer.get_feature_names_out())

# Print the shape of the content table
print('Shape of the content table:', contents.shape)

# Display the first few rows of the DataFrame
contents.head()

In [None]:
# Combine features into a single feature matrix
X = hstack([tfidf_overviews, popularity_scaled, genres_encoded])

# Target variable (ratings in this example)
y = movies_df['rating'].to_numpy()

In [None]:
from sklearn.neighbors import NearestNeighbors
nn_algo = NearestNeighbors(metric='cosine')
nn_algo.fit(contents)

In [None]:
def generate_user_watch_history(user_id, df):
    """
    Generate a particular user's watch history based on their user ID.

    Parameters:
        user_id (int): The ID of the user.
        df (pandas.DataFrame): The DataFrame containing the merged data.

    Returns:
        list: A list of unique movie titles representing the user's watch history.
    """
    user_history = df[df['UserID'] == user_id]['title_year'].tolist()
    
    # Remove duplicates by converting to a set and back to a list
    user_history = list(set(user_history))

    return user_history

In [None]:
class Recommender:
    def __init__(self):
        self.hist = [] 
        self.ishist = False

    def recommend_on_movie(self, movie, n_recommend=10):
        """
        Recommend movies similar to a given movie.

        Parameters:
            movie (str): The title of the movie.
            n_recommend (int): Number of recommendations to return.

        Returns:
            list: A list of recommended movie titles.
        """
        self.ishist = True
        iloc = content_merged_df[content_merged_df['title_year'] == movie].index[0]
        self.hist.append(iloc)
        distance, neighbors = nn_algo.kneighbors([contents.iloc[iloc]], n_neighbors=n_recommend+1)
        recommeds = [content_merged_df.iloc[i]['title_year'] for i in neighbors[0] if i not in [iloc]]
        return recommeds[:n_recommend]
    
    def recommend_on_history(self, n_recommend=10):
        """
        Recommend movies based on the user's watch history.

        Parameters:
            n_recommend (int): Number of recommendations to return.

        Returns:
            list: A list of recommended movie titles.
        """
        if not self.ishist:
            return print('No history found')
        
        unique_movies = set(content_merged_df['title_year'])  # Create a set of unique movie titles
        
        history = np.array([list(contents.iloc[iloc]) for iloc in self.hist])
        distance, neighbors = nn_algo.kneighbors([np.average(history, axis=0)], n_neighbors=len(unique_movies))
        
        recommended_movies = []
        recommended_indices = set()  # To store indices of recommended movies
        
        for i in neighbors[0]:
            # Check if the movie is not in user's watch history, not already recommended, and is in the unique movie set
            if i not in self.hist and i not in recommended_indices and content_merged_df.iloc[i]['title_year'] in unique_movies:
                recommended_movies.append(content_merged_df.iloc[i]['title_year'])
                recommended_indices.add(i)
                if len(recommended_movies) == n_recommend:
                    break
                    
        # Convert index locations to movie titles
        watched_movies = [content_merged_df.iloc[i]['title_year'] for i in self.hist]

        # Filter out movies that are already in the user's watch history
        recommended_movies = [movie for movie in recommended_movies if movie not in watched_movies]

        
        # Remove duplicates from recommended movies
        recommended_movies = list(set(recommended_movies))
        
        return recommended_movies[:n_recommend]

def create_recommender_for_user(user_id, df):
    """
    Create a recommender for a specific user's watch history.

    Parameters:
        user_id (int): The ID of the user.
        df (pandas.DataFrame): The DataFrame containing the merged data.

    Returns:
        Recommender: A Recommender instance populated with the user's watch history.
    """
    user_watch_history = generate_user_watch_history(user_id, df)
    
    # Create a new Recommender instance
    user_recommender = Recommender()
    
    # Populate the Recommender instance with the user's watch history
    for movie_title in user_watch_history:
        user_recommender.recommend_on_movie(movie_title)
    
    return user_recommender


In [None]:
from sklearn.decomposition import TruncatedSVD

class RecommenderSVD:
    def __init__(self, n_components=50):
        self.hist = [] 
        self.ishist = False
        self.n_components = n_components
        self.svd = TruncatedSVD(n_components=self.n_components)
    
    def fit(self, contents):
        self.svd.fit(contents)
    
    def recommend_on_movie(self, movie, n_recommend=5):
        """
        Recommend movies similar to a given movie.

        Parameters:
            movie (str): The title of the movie.
            n_recommend (int): Number of recommendations to return.

        Returns:
            list: A list of recommended movie titles.
        """
        self.ishist = True
        iloc = content_merged_df[content_merged_df['title_year'] == movie].index[0]
        self.hist.append(iloc)
        transformed_movie = self.svd.transform([contents.iloc[iloc]])
        distance, neighbors = nn_algo.kneighbors(transformed_movie, n_neighbors=n_recommend+1)
        recommeds = [content_merged_df.iloc[i]['title_year'] for i in neighbors[0] if i not in [iloc]]
        return recommeds[:n_recommend]
    
    def recommend_on_history(self, n_recommend=5):
        """
        Recommend movies based on the user's watch history.

        Parameters:
            n_recommend (int): Number of recommendations to return.

        Returns:
            list: A list of recommended movie titles.
        """
        if not self.ishist:
            return print('No history found')
        
        history = np.array([list(contents.iloc[iloc]) for iloc in self.hist])
        transformed_history = self.svd.transform(history)
        
        distance, neighbors = nn_algo.kneighbors([np.average(transformed_history, axis=0)], n_neighbors=len(contents))
        
        recommended_movies = []
        recommended_indices = set()  # To store indices of recommended movies
        
        for i in neighbors[0]:
            # Check if the movie is not in user's watch history, not already recommended
            if i not in self.hist and i not in recommended_indices:
                recommended_movies.append(content_merged_df.iloc[i]['title_year'])
                recommended_indices.add(i)
                if len(recommended_movies) == n_recommend:
                    break
                # Convert index locations to movie titles
        watched_movies = [content_merged_df.iloc[i]['title_year'] for i in self.hist]

        # Filter out movies that are already in the user's watch history
        recommended_movies = [movie for movie in recommended_movies if movie not in watched_movies]

        
        # Remove duplicates from recommended movies
        recommended_movies = list(set(recommended_movies))
        
        return recommended_movies[:n_recommend]
    
def create_recommender_for_userSVD(user_id, df, n_components=20):
    """
    Create a recommender for a specific user's watch history.

    Parameters:
        user_id (int): The ID of the user.
        df (pandas.DataFrame): The DataFrame containing the merged data.
        n_components (int): Number of components for TruncatedSVD.

    Returns:
        Recommender: A Recommender instance populated with the user's watch history.
    """
    user_watch_history = generate_user_watch_history(user_id, df)
    
    # Create a new Recommender instance
    user_recommender = RecommenderSVD(n_components=n_components)
    
    # Fit the Recommender instance with the content matrix
    user_recommender.fit(contents)
    
    # Populate the Recommender instance with the user's watch history
    for movie_title in user_watch_history:
        user_recommender.recommend_on_movie(movie_title)
    
    return user_recommender





In [None]:
user3_recommenderSVD = create_recommender_for_userSVD(1,df)


In [None]:
print(generate_user_watch_history(1, content_merged_df))
user3_recommenderSVD.recommend_on_history()

In [None]:
#old tfidf
from sklearn.feature_extraction.text import TfidfVectorizer

# Assuming 'overview' and 'tagline' are your textual columns
content_merged_df['combined_text'] = content_merged_df['overview'].fillna('') + ' ' + content_merged_df['tagline'].fillna('') + ' ' + content_merged_df['Title']

tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(content_merged_df['combined_text'])




In [None]:
#old training
import pandas as pd

# Assuming df is your DataFrame and 'Timestamp' is the column with Unix timestamps
# Step 1: Convert 'Timestamp' to a datetime object
content_merged_df['Timestamp'] = pd.to_datetime(content_merged_df['Timestamp'], unit='s')

# Step 2: Sort the DataFrame by 'Timestamp' in ascending order
content_merged_df = content_merged_df.sort_values(by='Timestamp', ascending=True)

# Step 3: Determine the split point (e.g., 80% for training, 20% for test)
split_point = int(len(content_merged_df) * 0.8)

# Step 4: Split the dataset into training and test sets
train_set = content_merged_df[:split_point]
test_set = content_merged_df[split_point:]

# You now have your training set and test set, with the test set containing the most recent data
print(train_set)
print(test_set)

In [None]:
#old
import pandas as pd
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def item_based_recom(input_dataframe, input_film_name): 
    pivot_item_based = pd.pivot_table(input_dataframe, index='Title', columns=['UserID'], values='Rating')
    sparse_pivot = sparse.csr_matrix(pivot_item_based.fillna(0)) 
    recommender = cosine_similarity(sparse_pivot) 
    recommender_df = pd.DataFrame(recommender, columns=pivot_item_based.index, index=pivot_item_based.index)
    cosine_df = pd.DataFrame(recommender_df[input_film_name].sort_values(ascending=False)) 
    cosine_df.reset_index(level=0, inplace=True) 
    cosine_df.columns = ['Title', 'cosine_sim']
    return cosine_df

def item_and_genre_based_recom(cosine_df, movies_df, categories):
    top_cos_genre = pd.merge(cosine_df, movies_df, on='Title')
    top_cos_genre['genre_similarity'] = [pairwise_row_diff(top_cos_genre, 0, row, categories) for row in top_cos_genre.index.values]
    return top_cos_genre[['Title', 'cosine_sim', 'genre_similarity']]

def pairwise_row_diff(dataframe, row1, row2, column_names):
    matrix_row1 = [[dataframe.loc[row1, cat] for cat in column_names.split('|')]] 
    matrix_row2 = [[dataframe.loc[row2, cat] for cat in column_names.split('|')]] 
    return round(cosine_similarity(matrix_row1, matrix_row2)[0][0], 5)

def generate_recommendations(df, UserID, top_results=10, cat=None): 
    # Get the top movie based on the rating of the UserID
    top_movie = df[df['UserID'] == UserID].sort_values(by='Rating', ascending=False)['Title'].iloc[0]
    
    print("Movie Recommender by Karan Walanj: ")
    print("User name: " + "Favorite Movie:", top_movie+'\n\n')
    print("Films you might enjoy based on what user", UserID, "watched:", top_movie)
    cos_sim = item_based_recom(df, top_movie) 
    display(cos_sim[1:top_results+1])

    print("Films you might enjoy with similar genre as", top_movie)
    display(item_and_genre_based_recom(item_based_recom(df, top_movie), df[['Title']], cat)
            .sort_values('cosine_sim', ascending=False)[top_results:]
            .sort_values('genre_similarity', ascending=False)[:top_results])
    return None

# Example usage
generate_recommendations(content_merged_df, UserID=25, top_results=10, cat=content_merged_df['Genres'])

In [None]:
#old
# #CONTENT BASED FILTERING 
# import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import cosine_similarity
# import numpy as np

# # Assuming merged_df is already loaded with your data

# # Step 1: Preprocess the dataset with TF-IDF
# tfidf_vectorizer = TfidfVectorizer(stop_words='english')
# merged_df['combined_features'] = merged_df[['Genres', 'spoken_languages', 'production_countries', 'production_companies', 'tagline', 'overview']].apply(lambda x: ' '.join(x.astype(str).values), axis=1)
# tfidf_matrix = tfidf_vectorizer.fit_transform(merged_df['combined_features'])

# # Function to create a user profile
# # def create_user_profile(user_id, merged_df, tfidf_matrix, rating_threshold=4):
#     user_data = merged_df[merged_df['UserID'] == user_id]
#     # Filter movies that the user has rated above the threshold
#     high_rating_indices = user_data[user_data['Rating'] > rating_threshold].index.tolist()
    
#     if not high_rating_indices:
#         return np.array([])  # Return an empty array if no high-rated movies

#     # We extract the rows from tfidf_matrix corresponding to high ratings
#     # and compute the mean. Note: toarray() converts sparse matrix to dense
#     # Ensure the result is a dense 2D array for compatibility with cosine_similarity
#     user_profile = np.mean(tfidf_matrix[high_rating_indices].toarray(), axis=0)
    
#     # Ensure user_profile is 2D: (1, number_of_features)
#     user_profile = user_profile.reshape(1, -1)
    
#     return user_profile


# # Function to recommend movies based on the user profile
# def recommend(user_profile, tfidf_matrix, merged_df, user_id, n_recommendations=5):
#     if user_profile is None:
#         return []

#     # Calculate similarity between user profile and all movie profiles
#     cosine_similarities = cosine_similarity(user_profile, tfidf_matrix)
    
#     # Get indices sorted by similarity (descending)
#     similar_indices = cosine_similarities.argsort().flatten()[-n_recommendations*2:]
    
#     # Filter out movies the user has already watched/rated
#     watched_movie_ids = set(merged_df[merged_df['UserID'] == user_id]['MovieID'])
#     recommended_movie_ids = [idx for idx in similar_indices if merged_df.iloc[idx]['MovieID'] not in watched_movie_ids]
    
#     # Limit to the top N recommendations
#     recommended_movie_ids = recommended_movie_ids[-n_recommendations:]
#     return merged_df.iloc[recommended_movie_ids]['MovieID'].values

# # Example usage
# user_id = 1  # Replace with an actual user ID
# user_profile = create_user_profile(user_id, merged_df, tfidf_matrix)
# recommended_movies = recommend(user_profile, tfidf_matrix, merged_df, user_id)
# print("Recommended movies for user {}: {}".format(user_id, recommended_movies))
