In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Data selection and preprocessing:

In [13]:
ratings = pd.read_csv('ratings.csv')
movies = pd.read_csv('movies.csv')

In [19]:
def train_test_split_user(ratings, test_size=0.2):
    train_list = []
    test_list = []

    # Get all unique users
    for user_id in ratings['userId'].unique():
        # Filter the dataset to include only rows corresponding to curr user
        user_data = ratings[ratings['userId'] == user_id]
        # Check if user has rated more than 5 movies to meaningfully split data into train and test
        # Ex. user with 10 ratings -> 8 train, 2 test, but user with 3 ratings -> 2 train, 1 test
            # In the latter case, we would not have enough data to train the model
        # If user has rated more than 5 movies, split the data into train and test
        # Else, include all data in train
        if len(user_data) >= 5:
            train_data, test_data = train_test_split(user_data, test_size=test_size, random_state=42)
            train_list.append(train_data)
            test_list.append(test_data)
        else:
            train_list.append(user_data)
    
    # Combine training and testing data for all users into train and test
    train = pd.concat(train_list)
    test = pd.concat(test_list)
    return train, test

train_data, test_data = train_test_split_user(ratings)
print(train_data.head())
# We don't need timestamp column
train_data = train_data.drop(columns=['timestamp'])
test_data = test_data.drop(columns=['timestamp'])

     userId  movieId  rating  timestamp
55        1     1031     5.0  964982653
230       1     4006     4.0  964982903
69        1     1197     5.0  964981872
168       1     2596     5.0  964981144
109       1     1777     4.0  964981230


In [20]:
train_genres, test_genres = [], []

test_movies = set(test_data['movieId'])

for movie_id in movies['movieId']:
    genres = movies[movies['movieId'] == movie_id]['genres'].values

    if movie_id in test_movies:
        test_genres.append({
            'movieId': movie_id,
            'tags': genres
        })
    else:
        train_genres.append({
            'movieId': movie_id,
            'tags': genres
        })

train_tags = pd.DataFrame(train_genres)
test_tags = pd.DataFrame(test_genres)

print("Train Tags:")
print(train_tags.head())
print("Test Tags:")
print(test_tags.head())


Train Tags:
   movieId               tags
0       28    [Drama|Romance]
1       30      [Crime|Drama]
2       40            [Drama]
3       49    [Drama|Romance]
4       53  [Adventure|Drama]
Test Tags:
   movieId                                           tags
0        1  [Adventure|Animation|Children|Comedy|Fantasy]
1        2                   [Adventure|Children|Fantasy]
2        3                               [Comedy|Romance]
3        4                         [Comedy|Drama|Romance]
4        5                                       [Comedy]


In [25]:
train_tags_str = train_tags['tags'].astype(str)
test_tags_str = test_tags['tags'].astype(str)

tfidf_vectorizer = TfidfVectorizer()

train_tfidf_matrix = tfidf_vectorizer.fit_transform(train_tags_str)
test_tfidf_matrix = tfidf_vectorizer.transform(test_tags_str)

train_tfidf_df = pd.DataFrame(train_tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
test_tfidf_df = pd.DataFrame(test_tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

print("Train TF-IDF Matrix:")
print(train_tfidf_df.head())
print("Test TF-IDF Matrix:")
print(test_tfidf_df.head())

Train TF-IDF Matrix:
   action  adventure  animation  children  comedy     crime  documentary  \
0     0.0   0.000000        0.0       0.0     0.0  0.000000          0.0   
1     0.0   0.000000        0.0       0.0     0.0  0.869641          0.0   
2     0.0   0.000000        0.0       0.0     0.0  0.000000          0.0   
3     0.0   0.000000        0.0       0.0     0.0  0.000000          0.0   
4     0.0   0.877486        0.0       0.0     0.0  0.000000          0.0   

      drama  fantasy   fi  ...  listed  musical  mystery   no  noir  romance  \
0  0.525003      0.0  0.0  ...     0.0      0.0      0.0  0.0   0.0   0.8511   
1  0.493685      0.0  0.0  ...     0.0      0.0      0.0  0.0   0.0   0.0000   
2  1.000000      0.0  0.0  ...     0.0      0.0      0.0  0.0   0.0   0.0000   
3  0.525003      0.0  0.0  ...     0.0      0.0      0.0  0.0   0.0   0.8511   
4  0.479603      0.0  0.0  ...     0.0      0.0      0.0  0.0   0.0   0.0000   

   sci  thriller  war  western  
0  0.0  

# Rating Prediction

In [49]:
class MF():

    def __init__(self, R, K, alpha, beta, iterations):
        self.R = R
        self.num_users, self.num_movies = R.shape
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations

    def train(self):
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_movies, self.K))

        self.b_u = np.zeros(self.num_users)
        self.b_i = np.zeros(self.num_movies)
        self.b = np.mean(self.R[np.where(self.R != 0)])

        self.samples = [
            (i, j, self.R[i, j])
            for i in range(self.num_users)
            for j in range(self.num_movies)
            if self.R[i, j] > 0
        ]

        for i in range(self.iterations):
            np.random.shuffle(self.samples)
            self.sgd()
            print("Iteration: %d ; mae = %.4f ; rmse = %.4f" % (i+1, self.mae(), self.rmse()))

    def mae(self):
        xs, ys = self.R.nonzero() 
        predicted = self.full_matrix()  
        error = 0
        
        for x, y in zip(xs, ys):
            error += abs(self.R[x, y] - predicted[x, y])  
        
        return error / len(xs)
    
    def rmse(self):
        xs, ys = self.R.nonzero()
        predicted = self.full_matrix()
        error = 0
        for x, y in zip(xs, ys):
            error += pow(self.R[x, y] - predicted[x, y], 2)

        return np.sqrt(error/len(xs))

    def sgd(self):
        for i, j, r in self.samples:
            prediction = self.get_rating(i, j)
            e = (r - prediction)

            self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
            self.b_i[j] += self.alpha * (e - self.beta * self.b_i[j])

            P_i = self.P[i, :].copy()
            Q_j = self.Q[j, :].copy()

            self.P[i, :] += self.alpha * (e * Q_j - self.beta * P_i)
            self.Q[j, :] += self.alpha * (e * P_i - self.beta * Q_j)

    def get_rating(self, i, j):
        prediction = self.b + self.b_u[i] + self.b_i[j] + self.P[i, :].dot(self.Q[j, :].T)
        return prediction

    def full_matrix(self):
        return self.b + self.b_u[:,np.newaxis] + self.b_i[np.newaxis:,] + self.P.dot(self.Q.T)


ERROR! Session/line number was not unique in database. History logging moved to new session 17


In [59]:
class MFWithVectorizer(MF):
    def __init__(self, R, K, alpha, beta, iterations, tfidf_matrix, lambda_reg=0.1):
        # Initialize the matrix factorization parameters
        super().__init__(R, K, alpha, beta, iterations)
        
        # Add the TF-IDF matrix (content features)
        self.tfidf_matrix = tfidf_matrix
        
        # Additional matrix for TF-IDF features
        self.T = np.random.normal(scale=1./self.K, size=(self.num_movies, self.K))  # movie features from TF-IDF
        self.lambda_reg = lambda_reg  # regularization for TF-IDF part

    def get_rating(self, i, j):
        """Modified to include content-based TF-IDF features in the rating prediction."""
        # Combine collaborative filtering (latent) and content-based features
        collaborative_rating = super().get_rating(i, j)
        
        # Content-based (TF-IDF) features
        content_rating = self.T[j, :].dot(self.P[i, :].T)
        
        # Final rating is a weighted sum of collaborative and content-based predictions
        return collaborative_rating + content_rating

    def sgd(self):
        """Update both latent factors (P, Q) and content-based features (T)."""
        for i, j, r in self.samples:
            prediction = self.get_rating(i, j)
            e = (r - prediction)

            # Update biases
            self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
            self.b_i[j] += self.alpha * (e - self.beta * self.b_i[j])

            # Update latent factors P and Q (collaborative part)
            P_i = self.P[i, :].copy()
            Q_j = self.Q[j, :].copy()
            self.P[i, :] += self.alpha * (e * Q_j - self.beta * P_i)
            self.Q[j, :] += self.alpha * (e * P_i - self.beta * Q_j)

            # Update TF-IDF features T (content-based part)
            T_j = self.T[j, :].copy()
            self.T[j, :] += self.alpha * (e * P_i - self.lambda_reg * T_j)  # Regularization term for TF-IDF features


In [5]:
def test_model(mf_model, test_data, user_to_idx, movie_to_idx):
    squared_error = 0
    absolute_error = 0
    n = len(test_data)
    unfound_movies = 0

    for _, row in test_data.iterrows():
        uid = int(row['userId'])
        mid = int(row['movieId'])

        if mid not in movie_to_idx:
            unfound_movies += 1
            continue 
        
        user_id = int(user_to_idx[uid])
        movie_id = int(movie_to_idx[mid])
        actual_rating = row['rating']
        
        predicted_rating = mf_model.get_rating(user_id, movie_id)
        
        squared_error += (actual_rating - predicted_rating) ** 2
        
        absolute_error += abs(actual_rating - predicted_rating)
    
    rmse = np.sqrt(squared_error / n)
    mae = absolute_error / n

    print(f"{unfound_movies} movies not found")
    return rmse, mae


In [None]:
user_ids = train_data['userId'].unique()  
movie_ids = train_data['movieId'].unique()  

user_to_idx = {user_id: idx for idx, user_id in enumerate(user_ids)}
movie_to_idx = {movie_id: idx for idx, movie_id in enumerate(movie_ids)}

R = np.zeros((len(user_ids), len(movie_ids)))

for row in train_data.itertuples():
    user_idx = user_to_idx[row.userId]
    movie_idx = movie_to_idx[row.movieId]
    R[user_idx, movie_idx] = row.rating

mf = MF(R, K=100, alpha=0.01, beta=0.05, iterations=300)
#Iteration: 800 ; mae = 0.1950 ; rmse = 0.2761

mf.train()

In [60]:
mf_with_vectorizer = MFWithVectorizer(R, K=10, alpha=0.01, beta=0.05, iterations=1000, tfidf_matrix=train_tfidf_matrix)
mf_with_vectorizer.train()
rmse, mae = mf_with_vectorizer.rmse(), mf_with_vectorizer.mae()
print("Test RMSE:", rmse)
print("Test MAE:", mae)

Iteration: 1 ; mae = 0.7008 ; rmse = 0.8986
Iteration: 2 ; mae = 0.6764 ; rmse = 0.8736
Iteration: 3 ; mae = 0.6645 ; rmse = 0.8596
Iteration: 4 ; mae = 0.6558 ; rmse = 0.8497
Iteration: 5 ; mae = 0.6492 ; rmse = 0.8421
Iteration: 6 ; mae = 0.6444 ; rmse = 0.8356
Iteration: 7 ; mae = 0.6392 ; rmse = 0.8294
Iteration: 8 ; mae = 0.6350 ; rmse = 0.8243
Iteration: 9 ; mae = 0.6311 ; rmse = 0.8193
Iteration: 10 ; mae = 0.6276 ; rmse = 0.8146
Iteration: 11 ; mae = 0.6237 ; rmse = 0.8096
Iteration: 12 ; mae = 0.6202 ; rmse = 0.8049
Iteration: 13 ; mae = 0.6172 ; rmse = 0.7996
Iteration: 14 ; mae = 0.6126 ; rmse = 0.7943
Iteration: 15 ; mae = 0.6087 ; rmse = 0.7886
Iteration: 16 ; mae = 0.6043 ; rmse = 0.7829
Iteration: 17 ; mae = 0.6010 ; rmse = 0.7773
Iteration: 18 ; mae = 0.5963 ; rmse = 0.7712
Iteration: 19 ; mae = 0.5914 ; rmse = 0.7651
Iteration: 20 ; mae = 0.5872 ; rmse = 0.7592
Iteration: 21 ; mae = 0.5835 ; rmse = 0.7533
Iteration: 22 ; mae = 0.5793 ; rmse = 0.7478
Iteration: 23 ; mae

KeyboardInterrupt: 

In [None]:
rmse, mae = test_model(mf, test_data, user_to_idx, movie_to_idx)
print("Test RMSE:", rmse)
print("Test MAE:", mae)

827 movies not found
Test RMSE: 0.8356285770966579
Test MAE: 0.6269698313285563


In [None]:
def test_model_with_content(mf_model, test_data, user_to_idx, movie_to_idx):
    squared_error = 0
    absolute_error = 0
    n = len(test_data)
    unfound_movies = 0

    for _, row in test_data.iterrows():
        uid = int(row['userId'])
        mid = int(row['movieId'])

        # If the movie is not in the ratings matrix (cold-start item), handle it using TF-IDF
        if mid not in movie_to_idx:
            unfound_movies += 1
            continue 
        
        user_id = int(user_to_idx[uid])
        movie_id = int(movie_to_idx[mid])
        actual_rating = row['rating']
        
        # Use the new model's get_rating function that includes both latent features and TF-IDF
        predicted_rating = mf_model.get_rating(user_id, movie_id)
        
        squared_error += (actual_rating - predicted_rating) ** 2
        absolute_error += abs(actual_rating - predicted_rating)
    
    rmse = np.sqrt(squared_error / n)
    mae = absolute_error / n

    print(f"{unfound_movies} movies not found (cold-start items)")
    return rmse, mae

rmse, mae = test_model_with_content(mf_with_vectorizer, test_data, user_to_idx, movie_to_idx)

print(f"Test RMSE: {rmse}")
print(f"Test MAE: {mae}")

# Item Recommendation

In [9]:
def unwatched_movies(ratings):
    all_movie_ids = set(ratings['movieId'])
    user_to_unwatched_movies = {}

    for user_id in ratings['userId'].unique():
        user = ratings[ratings['userId'] == user_id]
        watched_movies = set(user['movieId'])
        unwatched_movies = all_movie_ids - watched_movies
        user_to_unwatched_movies[user_id] = unwatched_movies

    return user_to_unwatched_movies

def recommend_movies(mf_model, user_to_idx, movie_to_idx, n=10):
    unwatched = unwatched_movies(train_data)
    recommendations = {}

    for user_id in user_to_idx.keys():
        print(f'\ruser_{user_id}', end='')
        user_idx = user_to_idx[user_id]
        predictions = mf_model.full_matrix()[user_idx]
        recommended_movie_idxs = np.argsort(predictions)[::-1]
        recommended_movies = [k for k, v in movie_to_idx.items() if v in recommended_movie_idxs and k in unwatched[user_id]][:n]
        recommendations[user_id] = recommended_movies

    return recommendations

top_10_recommendations = recommend_movies(mf, user_to_idx, movie_to_idx, n=10)

user_610

In [33]:
def dcg(recommended_movies, actual_movies):
    dcg_value = 0.0
    for i, movie in enumerate(recommended_movies):
        if movie in actual_movies:
            dcg_value += 1 / np.log2(i + 2)  
    return dcg_value

def idcg(actual_movies, length):
    idcg_value = 0.0
    for i in range(min(len(actual_movies), length)):
        idcg_value += 1 / np.log2(i + 2)
    return idcg_value

def recommendation_performance(recommendations, test_data):
    running_precision, running_recall, running_ndcg = 0, 0, 0

    for user_id, recommended_movies in recommendations.items():
        actual_movies = test_data[test_data['userId'] == user_id]['movieId']

        intersection = len(set(recommended_movies) & set(actual_movies))
        precision = (intersection / len(recommended_movies)) * 100
        recall = (intersection / len(actual_movies)) * 100

        running_precision += precision
        running_recall += recall

        dcg_value = dcg(recommended_movies, actual_movies)
        idcg_value = idcg(actual_movies, len(recommended_movies))
        ndcg = (dcg_value / idcg_value) * 100 if idcg_value > 0 else 0
        running_ndcg += ndcg

    precision = running_precision / len(recommendations)
    recall = running_recall / len(recommendations)
    f_measure = 2 * (precision * recall) / (precision + recall)
    ndcg = running_ndcg / len(recommendations)

    return precision, recall, f_measure, ndcg

print("Precision \t%.4f%%\nRecall \t\t%.4f%%\nF-Measure \t%.4f%%\nNDCG \t\t%.4f" % recommendation_performance(top_10_recommendations, test_data))

Precision 	1.8852%
Recall 		0.4796%
F-Measure 	0.7647%
NDCG 		0.0399
