In [13]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import pairwise_distances
from math import sqrt
import warnings

warnings.filterwarnings("ignore", message="Mean of empty slice")

class DataLoader:
    def __init__(self, file_path):
        self.file_path = file_path

    def load_data(self):
        ratings_df = pd.read_csv(self.file_path)
        return ratings_df

class DataPreprocessor:
    def __init__(self, ratings_df, movies_df, n):
        self.ratings_df = ratings_df
        self.movies_df = movies_df
        self.n = n

    def preprocess(self):
        ratings_df_sample = self.ratings_df[:self.n]
        self.n_users = len(ratings_df_sample['userId'].unique())
        self.n_movies = len(ratings_df_sample['movieId'].unique())
        movie_ids = ratings_df_sample['movieId'].unique()
        self.movies_df = self.movies_df[self.movies_df['movieId'].isin(movie_ids)]



        def scale_movie_id(movie_id):
            scaled = np.where(movie_ids == movie_id)[0][0] + 1
            return scaled

        def clean_title(x):
            x = x.split(" (")[0]  
            x = re.sub("[^a-zA-Z0-9 ]", "", x)  
            x = x.strip()
            return x

        ratings_df_sample.loc[:, 'movieId'] = ratings_df_sample['movieId'].apply(scale_movie_id)
        movie_ids =  self.movies_df['movieId'].unique()
        self.movies_df.loc[:, 'movieId'] = self.movies_df['movieId'].apply(scale_movie_id)
        self.movies_df.loc[:, 'title'] = self.movies_df['title'].apply(clean_title)
        vectorizer = TfidfVectorizer(ngram_range=(1, 2))
        tfidf_matrix = vectorizer.fit_transform(self.movies_df['title'])
        train_data, test_data = train_test_split(ratings_df_sample, test_size=0.2)
        return train_data, test_data, self.movies_df, tfidf_matrix, vectorizer, self.n_users, self.n_movies

class RecommenderModel:
    def __init__(self, n_users, n_movies, movies_df, tfidf_matrix, vectorizer):
        self.n_users = n_users
        self.n_movies = n_movies
        self.movies_df = movies_df
        self.tfidf_matrix = tfidf_matrix
        self.vectorizer = vectorizer

    def train(self, train_data):
        self.train_data_matrix = np.zeros((self.n_users, self.n_movies))
        for line in train_data.itertuples():
            self.train_data_matrix[line[1] - 1, line[2] - 1] = line[3]

        self.user_similarity = pairwise_distances(self.train_data_matrix, metric='cosine')
        self.item_similarity = pairwise_distances(self.train_data_matrix.T, metric='cosine')

    def search(self, title):
        title = title.strip() 
        query_vec = self.vectorizer.transform([title])
        similarity = cosine_similarity(query_vec, self.tfidf_matrix).flatten()
        index = similarity.argsort()[-1] 
        return self.movies_df.iloc[index]['movieId']
    
    def recommend_movies(self, movie_ratings, n_recommendations=5):
        user_data_matrix = np.zeros((1, self.n_movies))
        for movie_name, rating in movie_ratings.items():
            movie_index_test = self.search(movie_name)
            user_data_matrix[0, movie_index_test] = rating

        user_pred = self.k_fract_mean_predict(7)  
        user_recommendations = user_pred[0].argsort()[::-1][:n_recommendations]

        recommended_movies = []
        for movie_id in user_recommendations:
            recommended_movies.append(movie_id + 1) 

        return list(self.movies_df[self.movies_df['movieId'].isin(recommended_movies)]['title'])

    def predict(self, top, type='user'):
        if type == 'user':
            return self.naive_predict(top)
        elif type == 'item':
            return self.naive_predict_item(top)
        elif type == 'user_k_fract':
            return self.k_fract_predict(top)
        elif type == 'item_k_fract':
            return self.k_fract_predict_item(top)
        elif type == 'user_k_fract_mean':
            return self.k_fract_mean_predict(top)
        elif type == 'item_k_fract_mean':
            return self.k_fract_mean_predict_item(top)

    def naive_predict(self, top):
        top_similar_ratings = np.zeros((self.n_users, top, self.n_movies))
        for i in range(self.n_users):
            top_sim_users = self.user_similarity[i].argsort()[1:top + 1]
            top_similar_ratings[i] = self.train_data_matrix[top_sim_users]

        pred = np.zeros((self.n_users, self.n_movies))
        for i in range(self.n_users):
            pred[i] = top_similar_ratings[i].sum(axis=0) / top
        return pred

    def naive_predict_item(self, top):
        top_similar_ratings = np.zeros((self.n_movies, top, self.n_users))
        for i in range(self.n_movies):
            top_sim_movies = self.item_similarity[i].argsort()[1:top + 1]
            top_similar_ratings[i] = self.train_data_matrix.T[top_sim_movies]

        pred = np.zeros((self.n_movies, self.n_users))
        for i in range(self.n_movies):
            pred[i] = top_similar_ratings[i].sum(axis=0) / top
        return pred.T

    def k_fract_predict(self, top):
        top_similar = np.zeros((self.n_users, top))
        for i in range(self.n_users):
            user_sim = self.user_similarity[i]
            top_sim_users = user_sim.argsort()[1:top + 1]
            for j in range(top):
                top_similar[i, j] = top_sim_users[j]

        abs_sim = np.abs(self.user_similarity)
        pred = np.zeros((self.n_users, self.n_movies))
        for i in range(self.n_users):
            indexes = top_similar[i].astype(int)
            numerator = self.user_similarity[i][indexes]
            product = numerator.dot(self.train_data_matrix[indexes])
            denominator = abs_sim[i][top_similar[i].astype(int)].sum()
            pred[i] = product / denominator
        return pred

    def k_fract_predict_item(self, top):
        top_similar = np.zeros((self.n_movies, top))
        for i in range(self.n_movies):
            movies_sim = self.item_similarity[i]
            top_sim_movies = movies_sim.argsort()[1:top + 1]
            for j in range(top):
                top_similar[i, j] = top_sim_movies.T[j]

        abs_sim = np.abs(self.item_similarity)
        pred = np.zeros((self.n_movies, self.n_users))
        for i in range(self.n_movies):
            indexes = top_similar[i].astype(int)
            numerator = self.item_similarity[i][indexes]
            product = numerator.dot(self.train_data_matrix.T[indexes])
            denominator = abs_sim[i][indexes].sum()
            denominator = denominator if denominator != 0 else 1
            pred[i] = product / denominator
        return pred.T

    def k_fract_mean_predict(self, top):
        top_similar = np.zeros((self.n_users, top))
        for i in range(self.n_users):
            user_sim = self.user_similarity[i]
            top_sim_users = user_sim.argsort()[1:top + 1]
            for j in range(top):
                top_similar[i, j] = top_sim_users[j]

        abs_sim = np.abs(self.user_similarity)
        pred = np.zeros((self.n_users, self.n_movies))
        for i in range(self.n_users):
            indexes = top_similar[i].astype(int)
            numerator = self.user_similarity[i][indexes]
            mean_rating = np.nanmean([x for x in self.train_data_matrix[i] if x > 0])
            if np.isnan(mean_rating):
                mean_rating = 0
            diff_ratings = self.train_data_matrix[indexes] - np.nanmean(self.train_data_matrix[indexes], axis=0)
            numerator = numerator.dot(diff_ratings)
            denominator = abs_sim[i][top_similar[i].astype(int)].sum()
            pred[i] = mean_rating + numerator / denominator if denominator != 0 else mean_rating
        return pred

    def k_fract_mean_predict_item(self, top):
        top_similar = np.zeros((self.n_movies, top))
        for i in range(self.n_movies):
            movie_sim = self.item_similarity[i]
            top_sim_movies = movie_sim.argsort()[1:top + 1]
            for j in range(top):
                top_similar[i, j] = top_sim_movies[j]

        abs_sim = np.abs(self.item_similarity)
        pred = np.zeros((self.n_movies, self.n_users))
        for i in range(self.n_movies):
            indexes = top_similar[i].astype(int)
            numerator = self.item_similarity[i][indexes]
            diff_ratings = self.train_data_matrix.T[indexes] - np.nanmean(self.train_data_matrix.T[indexes], axis=0)
            numerator = numerator.dot(diff_ratings)
            denominator = abs_sim[i][top_similar[i].astype(int)].sum()
            denominator = denominator if denominator != 0 else 1
            mean_rating = np.nanmean([x for x in self.train_data_matrix.T[i] if x > 0])
            mean_rating = 0 if np.isnan(mean_rating) else mean_rating
            pred[i] = mean_rating + numerator / denominator
        return pred.T

def rmse(prediction, ground_truth):
    prediction = np.nan_to_num(prediction)[ground_truth.nonzero()].flatten()
    ground_truth = np.nan_to_num(ground_truth)[ground_truth.nonzero()].flatten()
    mse = mean_squared_error(prediction, ground_truth)
    return sqrt(mse)

In [14]:
data_loader_ratings = DataLoader('data/ratings.csv')
ratings_df = data_loader_ratings.load_data()

data_loader_movies = DataLoader('data/movies.csv')
movies_df = data_loader_movies.load_data()

print(f'Unique users count: {len(ratings_df["userId"].unique())}')
print(f'Unique movies count: {len(ratings_df["movieId"].unique())}')
print(f'DataFrame shape: {ratings_df.shape}')

# Preprocess data
n = 100000
data_preprocessor = DataPreprocessor(ratings_df, movies_df, n)
train_data, test_data, movies_df, tdif_matrix, vectorizer, n_users, n_movies = data_preprocessor.preprocess()
print(f'Train shape: {train_data.shape}')
print(f'Test shape: {test_data.shape}')

# Train model
recommender_model = RecommenderModel(n_users, n_movies, movies_df, tdif_matrix, vectorizer)
recommender_model.train(train_data)

# Create test data matrix
test_data_matrix = np.zeros((n_users, n_movies))
for line in test_data.itertuples():
    test_data_matrix[line[1] - 1, line[2] - 1] = line[3]

# Make predictions and evaluate
naive_pred = recommender_model.predict(7, type='user')
print('User-based CF RMSE:', rmse(naive_pred, test_data_matrix))

naive_pred_item = recommender_model.predict(7, type='item')
print('Item-based CF RMSE:', rmse(naive_pred_item, test_data_matrix))

k_predict = recommender_model.predict(7, type='user_k_fract')
print('User-based CF RMSE (k_fract):', rmse(k_predict, test_data_matrix))

k_predict_item = recommender_model.predict(7, type='item_k_fract')
print('Item-based CF RMSE (k_fract):', rmse(k_predict_item, test_data_matrix))

k_predict_mean = recommender_model.predict(7, type='user_k_fract_mean')
print('User-based CF RMSE (k_fract_mean):', rmse(k_predict_mean, test_data_matrix))

k_predict_item_mean = recommender_model.predict(7, type='item_k_fract_mean')
print('Item-based CF RMSE (k_fract_mean):', rmse(k_predict_item_mean, test_data_matrix))

Unique users count: 330975
Unique movies count: 83239
DataFrame shape: (33832162, 4)
Train shape: (80000, 4)
Test shape: (20000, 4)
User-based CF RMSE: 2.933015052112479
Item-based CF RMSE: 3.060378836426788
User-based CF RMSE (k_fract): 2.9338878493556977
Item-based CF RMSE (k_fract): 3.0628380911081603
User-based CF RMSE (k_fract_mean): 0.9474072447156595
Item-based CF RMSE (k_fract_mean): 1.265143026359543


In [22]:
def recommend_movies_from_terminal():
    movies_input = input("Enter movies separated by commas: ")
    ratings_input = input("Enter ratings separated by commas: ")

    movies_list = movies_input.split(',')
    ratings_list = [float(rating.strip()) for rating in ratings_input.split(',')]

    movie_ratings = dict(zip(movies_list, ratings_list))
    recommended_movie_ids = recommender_model.recommend_movies(movie_ratings, n_recommendations=5)
    print("Recommendations:")
    for movie_name in recommended_movie_ids:
        print(movie_name)

recommend_movies_from_terminal()

Recommendations:
Now and Then
Umbrellas of Cherbourg The
National Lampoons Senior Trip
Angus
Haunted World of Edward D Wood Jr The
