In [89]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from math import sqrt

In [90]:
#Users data
usersData = pd.read_csv('users.dat', sep='::', header=None, names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'], engine='python')

#Ratings data
ratingsData = pd.read_csv('ratings.dat', sep='::', header=None, names=['UserID', 'MovieID', 'Rating', 'Timestamp'], engine='python')

#Movies data
moviesData = pd.read_csv('movies.dat', sep='::', header=None, names=['MovieID', 'Title', 'Genres'], engine='python', encoding='latin1')

In [91]:
#Users data
print(usersData.head())

#Ratings data
print(ratingsData.head())

#Movies data
print(moviesData.head())

   UserID Gender  Age  Occupation Zip-code
0       1      F    1          10    48067
1       2      M   56          16    70072
2       3      M   25          15    55117
3       4      M   45           7    02460
4       5      M   25          20    55455
   UserID  MovieID  Rating  Timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408       4  978300275
4       1     2355       5  978824291
   MovieID                               Title                        Genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy


In [92]:
#Calculating the Global Average Rating
globalAvgRating = ratingsData["Rating"].mean()
print("The Global Average Rating is", globalAvgRating)

The Global Average Rating is 3.581564453029317


In [93]:
#Initialising the variables for error metrics
totalMAE = 0
totalRMSE = 0

#Performing 5 fold cross validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in kf.split(ratingsData):
    trainSet = ratingsData.iloc[train_index]
    testSet = ratingsData.iloc[test_index]

    #Global average Rating
    globalAverageRating = trainSet["Rating"].mean()

    #Dictionaries to store user and movie average ratings
    userAverageRating = trainSet.groupby("UserID")["Rating"].mean().to_dict()
    movieAverageRating = trainSet.groupby("MovieID")["Rating"].mean().to_dict()

    #List to store predictions
    predictions = []

    #Iterating through the test set and make predictions
    for _, row in testSet.iterrows():
        user_id = row["UserID"]
        movie_id = row["MovieID"]

        #Condition to check if user or movie is not in the training data
        if user_id not in userAverageRating or movie_id not in movieAverageRating:
            predictedRating = globalAverageRating
        else:
            #Calculating the predicted rating using the user and movie average
            predictedRating = (
                userAverageRating.get(user_id, globalAverageRating)
                + movieAverageRating.get(movie_id, globalAverageRating)
                - globalAverageRating
            )

        # Ensure the predicted rating is within the valid range [1, 5]
        predictedRating = min(5, max(1, predictedRating))

        predictions.append(predictedRating)

    #Replacing the NAN values with global average rating
    predictions = [globalAvgRating if np.isnan(x) else x for x in predictions]

    #Calculating MAE and RMSE
    mae = mean_absolute_error(testSet["Rating"], predictions)
    rmse = np.sqrt(mean_squared_error(testSet["Rating"], predictions))

    totalMAE += mae
    totalRMSE += rmse

# Calculate the average MAE and RMSE over all folds
averageMAE = totalMAE / 5
averageRMSE = totalRMSE / 5

print("The Average MAE is", averageMAE)
print("The Average RMSE is", averageRMSE)

The Average MAE is 0.732557217730404
The Average RMSE is 0.9338024482626637


In [94]:
#5 flod cross validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

MAE_Scores = []
RMSE_Scores = []

for train_index, test_index in kf.split(ratingsData):
    trainSet, testSet = ratingsData.iloc[train_index], ratingsData.iloc[test_index]

    # Avg user rating for the training set
    AvgUserRating = trainSet.groupby('UserID')['Rating'].mean().to_dict()

    #Predicting the user ratings in the test set and getting a copy of the slice
    testSet = testSet.copy()
    testSet['PredictedRating'] = testSet['UserID'].map(AvgUserRating)

    #MAE and RMSE
    MAE = mean_absolute_error(testSet['Rating'], testSet['PredictedRating'])
    RMSE = np.sqrt(mean_squared_error(testSet['Rating'], testSet['PredictedRating']))

    MAE_Scores.append(MAE)
    RMSE_Scores.append(RMSE)

# Calculate average MAE and RMSE over all folds
Avg_MAE = np.mean(MAE_Scores)
Avg_RMSE = np.mean(RMSE_Scores)

print("The Average MAE is", Avg_MAE)
print("The Average RMSE is", Avg_RMSE)

The Average MAE is 0.8289498348484926
The Average RMSE is 1.0354800404316435


In [95]:
#Avereage rating for each of the movies
AvgMovieRating = ratingsData.groupby('MovieID')['Rating'].mean().to_dict()

#Applying the recommender to the test set
testSet['PredictedRating'] = testSet['MovieID'].map(AvgMovieRating)

#MAE & RMSE
MAE = mean_absolute_error(testSet['Rating'], testSet['PredictedRating'])
RMSE = mean_squared_error(testSet['Rating'], testSet['PredictedRating'], squared=False)

print("MAE:", MAE)
print("RMSE:", RMSE)

MAE: 0.7796494048920112
RMSE: 0.9755265559644645


In [96]:
#global average rating
globalAverageRating = ratingsData['Rating'].mean()

#Applying the recommender to the test set
testSet['PredictedRating'] = globalAverageRating

#MAE and RMSE
MAE = mean_absolute_error(testSet['Rating'], testSet['PredictedRating'])
RMSE = mean_squared_error(testSet['Rating'], testSet['PredictedRating'], squared=False)

print("MAE:", MAE)
print("RMSE:", RMSE)

MAE: 0.9336170940472399
RMSE: 1.1164859152763942


In [97]:
#Average Rating for each of the user
userAverageRating = ratingsData.groupby('UserID')['Rating'].mean().to_dict()

#Applying the recommender to the test set
testSet['PredictedRating'] = testSet['UserID'].map(userAverageRating)

#Filling the missing average with average ratings
testSet['PredictedRating'].fillna(globalAverageRating, inplace=True)

#MAE & RMSE
MAE = mean_absolute_error(testSet['Rating'], testSet['PredictedRating'])
RMSE = mean_squared_error(testSet['Rating'], testSet['PredictedRating'], squared=False)

print("MAE:", MAE)
print("RMSE:", RMSE)

MAE: 0.8225786635080068
RMSE: 1.027543645855075


In [98]:
#Average rating for each of the movies
movieAvgRating = ratingsData.groupby('MovieID')['Rating'].mean().to_dict()

#Applying the recommender to the test set
testSet['PredictedRating'] = testSet['MovieID'].map(movieAvgRating)

#Filling the missing value with global average
testSet['PredictedRating'].fillna(globalAverageRating, inplace=True)

# MAE & RMSE
MAE = mean_absolute_error(testSet['Rating'], testSet['PredictedRating'])
RMSE = mean_squared_error(testSet['Rating'], testSet['PredictedRating'], squared=False)

print("MAE:", MAE)
print("RMSE:", RMSE)

MAE: 0.7796494048920112
RMSE: 0.9755265559644645


In [99]:
# Initialize variables for error metrics
totalMAE = 0
totalRMSE = 0

# Perform 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in kf.split(ratingsData):
    train_set = ratingsData.iloc[train_index]
    test_set = ratingsData.iloc[test_index]

    # Calculate the global average rating
    globalAverageRating = train_set["Rating"].mean()

    # Calculate user and movie average ratings
    userAverageRating = train_set.groupby("UserID")["Rating"].mean().to_dict()
    movieAverageRating = train_set.groupby("MovieID")["Rating"].mean().to_dict()

    # Linear regression to estimate alpha, beta, and gamma
    X = train_set[["UserID", "MovieID"]]
    y = train_set["Rating"]

    model = LinearRegression()
    model.fit(X, y)

    alpha = model.coef_[0]
    beta = model.coef_[1]
    gamma = model.intercept_

    #To store predictions
    predictions = []

    # Iterating through the testset and make predictions
    for _, row in test_set.iterrows():
        user_id = row["UserID"]
        movie_id = row["MovieID"]

        # Calculating the predicted rating using linear combination
        if user_id in userAverageRating and movie_id in movieAverageRating:
            predictedRating = alpha * userAverageRating[user_id] + beta * movieAverageRating[movie_id] + gamma
        else:
            predictedRating = globalAverageRating

        # Checking the predicted rating is within the valid range [1, 5]
        predictedRating = min(5, max(1, predictedRating))
        predictions.append(predictedRating)

    # Calculating the MAE and RMSE
    mae = mean_absolute_error(test_set["Rating"], predictions)
    rmse = np.sqrt(mean_squared_error(test_set["Rating"], predictions))

    totalMAE += mae
    totalRMSE += rmse

# Calculating the average MAE and RMSE over all folds
averageMAE = totalMAE / 5
averageRMSE = totalRMSE / 5

print("The Average MAE is", averageMAE)
print("The Average RMSE is", averageRMSE)

The Average MAE is 0.9189324683047826
The Average RMSE is 1.1215123148736876


In [100]:
# Initialising the variables for error metrics
totalMAE_userItem = 0
totalRMSE_userItem = 0

# Perform 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in kf.split(ratingsData):
    train_set = ratingsData.iloc[train_index]
    test_set = ratingsData.iloc[test_index]

    # Linear regression to estimate alpha, beta_user, and gamma_userItem
    X = train_set[['UserID', 'MovieID']]
    y = train_set['Rating']

    model = LinearRegression()
    model.fit(X, y)

    alpha_userItem = model.coef_[0]
    beta_userItem = model.coef_[1]
    gamma_userItem = model.intercept_

    # To store predictions for Ruser-item(user, item)
    predictions_userItem = []

    #Iterating the testset to make the predictions
    for _, row in test_set.iterrows():
        user_id = row['UserID']
        movie_id = row['MovieID']

        #Calculating the predicted rating
        if user_id in userAverageRating and movie_id in movieAverageRating:
            predictedRating_UserItem = alpha_userItem * userAverageRating[user_id] + beta_userItem * movieAverageRating[movie_id] + gamma_userItem
        else:
            predictedRating_UserItem = globalAvgRating

       #Checking that the predicted rating is within the valid range [1,5]
        predictedRating_UserItem = min(5, max(1, predictedRating_UserItem))
        predictions_userItem.append(predictedRating_UserItem)

    #Calculating MAE and RMSE for Ruser-item(user, item)
    mae_userItem = mean_absolute_error(test_set['Rating'], predictions_userItem)
    rmse_userItem = np.sqrt(mean_squared_error(test_set['Rating'], predictions_userItem))

    totalMAE_userItem += mae_userItem
    totalRMSE_userItem += rmse_userItem

#Calculating the average MAE & RMSE for Ruser-item(user,item) overall folds
averageMAE_userItem = totalMAE_userItem / 5
averageRMSE_userItem = totalRMSE_userItem / 5

print("The Average MAE for Ruser-item(user, item) is", averageMAE_userItem)
print("The Average RMSE for Ruser-item(user, item) is", averageRMSE_userItem)

The Average MAE for Ruser-item(user, item) is 0.9189390806677356
The Average RMSE for Ruser-item(user, item) is 1.1215232850932553
