In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from math import sqrt

In [2]:
# Column layout of the tab-separated ratings file (the file itself has no header row)
rating_columns = ['userId', 'movieId', 'rating', 'timestamp']

# Column layout of the pipe-separated movie file: five descriptive fields
# followed by the per-genre indicator columns
movie_columns = [
    'movieId', 'title', 'release_date', 'video_release_date', 'imdb_url',
    'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'
]

# Load ratings
ratings = pd.read_csv('u1.base', sep='\t', names=rating_columns)

# Load movie metadata
movies = pd.read_csv('u.item', sep='|', encoding='latin-1', names=movie_columns)

In [3]:
# Sanity check of the loaded ratings: print the (rows, columns) shape,
# then display the first ten rows as the cell output.
print(ratings.shape)
ratings.head(10)

(80000, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712
5,1,7,4,875071561
6,1,8,1,875072484
7,1,9,5,878543541
8,1,11,2,875072262
9,1,13,5,875071805


In [4]:
# Sanity check of the loaded movie metadata: print the (rows, columns) shape,
# then display the first ten rows as the cell output.
print(movies.shape)
movies.head(10)

(1682, 24)


Unnamed: 0,movieId,title,release_date,video_release_date,imdb_url,unknown,Action,Adventure,Animation,Children,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
5,6,Shanghai Triad (Yao a yao yao dao waipo qiao) ...,01-Jan-1995,,http://us.imdb.com/Title?Yao+a+yao+yao+dao+wai...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,7,Twelve Monkeys (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Twelve%20Monk...,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
7,8,Babe (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Babe%20(1995),0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
8,9,Dead Man Walking (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Dead%20Man%20...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,10,Richard III (1995),22-Jan-1996,,http://us.imdb.com/M/title-exact?Richard%20III...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [5]:
# Create user-item matrix: one row per userId, one column per movieId,
# holding the rating and NaN wherever the user has no rating for that movie
user_movie_matrix = (
    ratings
    .set_index(['userId', 'movieId'])['rating']
    .unstack('movieId')
)
user_movie_matrix.head(10)

movieId,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,,4.0,1.0,5.0,,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
6,4.0,,,,,,2.0,4.0,4.0,,...,,,,,,,,,,
7,,,,5.0,,,,,5.0,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,4.0,,,,...,,,,,,,,,,
10,,,,,,,,,4.0,,...,,,,,,,,,,


In [6]:
# Fill missing values with 0 (alternative: mean normalization)
# Note: a 0 here means "no rating", yet the cosine similarity that is computed
# from this filled matrix treats it like any other numeric value.
user_movie_matrix_filled = user_movie_matrix.fillna(0)
user_movie_matrix_filled.head(10)

movieId,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,0.0,4.0,1.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,4.0,0.0,0.0,0.0,0.0,0.0,2.0,4.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# User-User Collaborative Filtering
# Pairwise cosine similarity between the users' zero-filled rating rows
user_similarity = cosine_similarity(user_movie_matrix_filled)
print(user_similarity.shape)

# Label both axes with the user ids so a similarity can be looked up by userId
user_similarity_df = pd.DataFrame(
    data=user_similarity,
    index=user_movie_matrix.index,
    columns=user_movie_matrix.index,
)
user_similarity_df.head(10)

(943, 943)


userId,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.097021,0.052469,0.021162,0.193545,0.290114,0.200438,0.097786,0.060791,0.200926,...,0.252187,0.090728,0.216186,0.139478,0.156937,0.095265,0.222958,0.158151,0.13309,0.252778
2,0.097021,1.0,0.051348,0.084426,0.015516,0.187717,0.074479,0.023262,0.108167,0.078176,...,0.083045,0.298811,0.308608,0.40381,0.282896,0.213233,0.189257,0.132806,0.126597,0.101784
3,0.052469,0.051348,1.0,0.14566,0.03718,0.084526,0.015718,0.0864,0.0,0.030909,...,0.04133,0.036956,0.13575,0.06394,0.102082,0.034064,0.13372,0.083537,0.08123,0.019676
4,0.021162,0.084426,0.14566,1.0,0.017291,0.0,0.037566,0.046087,0.0,0.023232,...,0.014238,0.047742,0.127305,0.118947,0.089016,0.039116,0.120521,0.135669,0.125925,0.035586
5,0.193545,0.015516,0.03718,0.017291,1.0,0.158552,0.17089,0.157488,0.053846,0.089508,...,0.284295,0.074215,0.086345,0.063759,0.126358,0.068145,0.198118,0.142141,0.15,0.239803
6,0.290114,0.187717,0.084526,0.0,0.158552,1.0,0.201437,0.089909,0.075596,0.240896,...,0.287658,0.075532,0.136085,0.15145,0.102204,0.099518,0.289962,0.093607,0.178815,0.210308
7,0.200438,0.074479,0.015718,0.037566,0.17089,0.201437,1.0,0.088937,0.085366,0.25666,...,0.311565,0.087229,0.076907,0.100333,0.104598,0.075901,0.263561,0.017469,0.169986,0.292691
8,0.097786,0.023262,0.0864,0.046087,0.157488,0.089909,0.088937,1.0,0.0,0.11669,...,0.191657,0.030694,0.041758,0.031863,0.119228,0.039294,0.094273,0.106551,0.067465,0.214486
9,0.060791,0.108167,0.0,0.0,0.053846,0.075596,0.085366,0.0,1.0,0.066739,...,0.053205,0.0,0.099027,0.121492,0.097944,0.0,0.093677,0.110612,0.064484,0.104008
10,0.200926,0.078176,0.030909,0.023232,0.089508,0.240896,0.25666,0.11669,0.066739,1.0,...,0.24418,0.04255,0.109462,0.112436,0.072791,0.031693,0.246078,0.014989,0.156821,0.130262


In [8]:
def predict_rating(user_id, movie_id, user_movie_matrix, user_similarity_df):
    """
    Predict the rating of a specific movie for a specific user using collaborative filtering.

    The prediction is the similarity-weighted average of the ratings that other
    users gave to ``movie_id``, with weights taken from the target user's row
    of ``user_similarity_df``.

    Parameters
    ----------
    user_id
        Label of the user to predict for.
    movie_id
        Label of the movie to predict.
    user_movie_matrix : pandas.DataFrame
        Ratings with user ids as the index and movie ids as the columns;
        NaN marks a movie the user has not rated.
    user_similarity_df : pandas.DataFrame
        User-by-user similarity table labelled with user ids on both axes.

    Returns
    -------
    float or None
        * The weighted average, when the movie is a known column, the user has
          a row in ``user_similarity_df``, at least one user rated the movie and
          the similarity weights sum to a positive value.
        * Otherwise the user's mean rating over ``user_movie_matrix`` (NaN if
          that row holds no ratings).
        * ``None`` when the user has no row in ``user_movie_matrix``.
    """
    # A user without a row in the similarity table has no neighbours to draw
    # on; skip to the fallback instead of raising KeyError on the .loc lookup.
    can_use_neighbours = (
        movie_id in user_movie_matrix.columns
        and user_id in user_similarity_df.index
    )
    if can_use_neighbours:
        # Ratings of this movie, restricted to the users who actually rated it
        ratings_by_others = user_movie_matrix[movie_id].dropna()
        if len(ratings_by_others) > 0:
            # Both Series are indexed by user id, so the product aligns per user
            sim_scores = user_similarity_df.loc[user_id, ratings_by_others.index]
            weighted_sum = (sim_scores * ratings_by_others).sum()
            sum_of_weights = sim_scores.sum()
            # Guard against dividing by zero when no rater is similar to the user
            if sum_of_weights > 0:
                return weighted_sum / sum_of_weights

    # Default: return user's average rating (None for a user we know nothing about)
    if user_id in user_movie_matrix.index:
        return user_movie_matrix.loc[user_id].mean()
    return None


In [9]:
# Prediction generation
def generate_predictions(test_data, user_movie_matrix_train, user_similarity_train_df):
    """
    Predict a rating for every (userId, movieId) pair in ``test_data``.

    Parameters
    ----------
    test_data : pandas.DataFrame
        Must contain ``userId`` and ``movieId`` columns; other columns are ignored.
    user_movie_matrix_train : pandas.DataFrame
        Training user-item matrix, forwarded unchanged to ``predict_rating``.
    user_similarity_train_df : pandas.DataFrame
        Training user-user similarity table, forwarded unchanged to ``predict_rating``.

    Returns
    -------
    pandas.DataFrame
        One row per input row, in input order, with the columns ``userId``,
        ``movieId`` and ``predicted_rating``. The columns are present even
        when ``test_data`` is empty.
    """
    output_columns = ["userId", "movieId", "predicted_rating"]

    # Walk the two id columns directly instead of using iterrows(): iterrows()
    # builds one Series per row, which coerces every column to a common dtype,
    # so the ids would become floats if test_data carried any float column.
    predictions = [
        {
            "userId": user_id,
            "movieId": movie_id,
            "predicted_rating": predict_rating(
                user_id, movie_id, user_movie_matrix_train, user_similarity_train_df
            ),
        }
        for user_id, movie_id in zip(test_data['userId'], test_data['movieId'])
    ]

    # Naming the columns explicitly keeps them in place for an empty result,
    # so a later merge on userId/movieId still finds its keys.
    return pd.DataFrame(predictions, columns=output_columns)

In [10]:
# Evaluate using RMSE
def evaluate_predictions(predictions_df, test_data):
    """
    Score predictions against the true ratings with RMSE.

    ``test_data`` (which carries the ``rating`` column) is inner-joined with
    ``predictions_df`` (which carries ``predicted_rating``) on the
    ``userId``/``movieId`` pair, so only pairs present in both are scored.

    Returns a ``(rmse, merged_data)`` tuple: the root mean squared error and
    the joined frame it was computed from.
    """
    join_keys = ['userId', 'movieId']
    merged_data = pd.merge(test_data, predictions_df, how='inner', on=join_keys)
    squared_error = mean_squared_error(
        merged_data['rating'],
        merged_data['predicted_rating'],
    )
    return sqrt(squared_error), merged_data


In [11]:
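# NOTE: one-off data preparation, intentionally left commented out. It records
# how the 1000-row tagged/untagged test files were derived from u1.test.
# # Load original test dataset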
# test_data = pd.read_csv("u1.test", sep='\t', names=["userId", "movieId", "rating", "timestamp"])

# # Shuffle the data and select only 1000 examples
# test_data_shuffled = test_data.sample(n=1000, random_state=42).reset_index(drop=True)

# # Save the tagged version (with ratings)
# test_data_shuffled.to_csv("u1_test_tagged_1000.csv", index=False)

# # Create and save the untagged version (without ratings and timestamp)
# untagged_test_data = test_data_shuffled.drop(columns=['rating', 'timestamp'])
# untagged_test_data.to_csv("u1_test_untagged_1000.csv", index=False)

In [13]:
# Generate predictions for the untagged test pairs (scoring happens in a later cell)
untagged_test_data = pd.read_csv("u1_test_untagged_1000.csv")
# Pass the NaN-preserving matrix, not the zero-filled one: predict_rating uses
# dropna() to find the users who actually rated a movie.
predictions_df = generate_predictions(untagged_test_data, user_movie_matrix, user_similarity_df)
print(predictions_df)
# save
predictions_df.to_csv("u1_test_w_predictions_1000.csv", index=False)

     userId  movieId  predicted_rating
0       235      190          4.203340
1        42      428          3.692159
2       194      568          3.575321
3        14      655          3.833792
4       294      346          3.664135
..      ...      ...               ...
995     178        8          3.987168
996     369      988          2.363022
997     228      690          3.567471
998     279      732          3.665678
999       1      188          3.733003

[1000 rows x 3 columns]


In [21]:
# Load the full tagged test split; its `rating` column is the ground truth to score against
test_data = pd.read_csv("u1.test", sep='\t', names=["userId", "movieId", "rating", "timestamp"])

# Re-read the predictions saved to disk and score them. evaluate_predictions
# inner-joins on (userId, movieId), so only pairs that have a prediction are scored.
gilat_predictions = pd.read_csv("u1_test_w_predictions_1000.csv")
rmse_value, merged_data = evaluate_predictions(gilat_predictions, test_data)

# Report the score and show a few joined rows (true rating next to predicted rating)
print(f"RMSE: {rmse_value:.4f}")
print(merged_data.head())

RMSE: 0.9949
   userId  movieId  rating  timestamp  predicted_rating
0       1      128       4  875072573          3.601846
1       1      188       3  875073128          3.733003
2       1      232       3  878543196          3.200574
3       1      241       4  878543133          3.536787
4       1      255       2  885345822          3.297222


In [23]:
# Run all predictions: score every (name, predictions file) entry against test_data
data_list = [("Gilat", "u1_test_w_predictions_1000.csv")]
data_scores = []
for name, data_file in data_list:
    predictions = pd.read_csv(data_file)
    rmse_value, merged_data = evaluate_predictions(predictions, test_data)
    print(f"{name} RMSE : {rmse_value:.4f}")
    data_scores.append((name, rmse_value))

# print the top 3 (lowest RMSE first; fewer are shown if fewer entries exist)
data_scores.sort(key=lambda entry: entry[1])
print("Top 3:")
for name, rmse_value in data_scores[:3]:
    print(f"{name} RMSE: {rmse_value:.4f}")


Gilat RMSE : 0.9949
Top 3:
Gilat RMSE: 0.9949
