In [1]:
import pandas as pd
from surprise import Dataset, Reader, KNNBasic, dump
from surprise.model_selection import train_test_split
from surprise import accuracy

In [2]:
# read u.genre
u_genre = pd.read_csv('u.genre', sep='|', header=None)
u_genre.columns = ['genre', 'genre_id']

# read u1.base ... u5.base and combine it
u1_base = pd.read_csv('u1.base', sep='\t', header=None)
u1_base.columns = ['user_id', 'item_id', 'rating', 'timestamp']
u2_base = pd.read_csv('u2.base', sep='\t', header=None)
u2_base.columns = ['user_id', 'item_id', 'rating', 'timestamp']
u3_base = pd.read_csv('u3.base', sep='\t', header=None)
u3_base.columns = ['user_id', 'item_id', 'rating', 'timestamp']
u4_base = pd.read_csv('u4.base', sep='\t', header=None)
u4_base.columns = ['user_id', 'item_id', 'rating', 'timestamp']
u5_base = pd.read_csv('u5.base', sep='\t', header=None)
u5_base.columns = ['user_id', 'item_id', 'rating', 'timestamp']

u1_base = pd.concat([u1_base, u2_base, u3_base, u4_base, u5_base])


# read u1.test
u1_test = pd.read_csv('u1.test', sep='\t', header=None)
u1_test.columns = ['user_id', 'item_id', 'rating', 'timestamp']

# read u.user
u_user = pd.read_csv('u.user', sep='|', header=None)
u_user.columns = ['user_id', 'age', 'gender', 'occupation', 'zip code']

# read u.item
u_item = pd.read_csv('u.item', sep='|', header=None, encoding='latin-1')
u_item.columns = ['movie_id', 'movie_title', 'release_date', 'video_release_date', 'IMDB_URL'] + list(u_genre['genre'])


In [3]:
u_item

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDB_URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
u_genre

Unnamed: 0,genre,genre_id
0,unknown,0
1,Action,1
2,Adventure,2
3,Animation,3
4,Children's,4
5,Comedy,5
6,Crime,6
7,Documentary,7
8,Drama,8
9,Fantasy,9


In [5]:
u_user

Unnamed: 0,user_id,age,gender,occupation,zip code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213
...,...,...,...,...,...
938,939,26,F,student,33319
939,940,32,M,administrator,02215
940,941,20,M,student,97229
941,942,48,F,librarian,78209


In [6]:
u1_base

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712
...,...,...,...,...
79995,943,943,5,888639614
79996,943,1011,2,875502560
79997,943,1067,2,875501756
79998,943,1074,4,888640250


In [7]:
# Merge user and item data
u_data = pd.merge(u1_base, u_user, on='user_id')
u_data = pd.merge(u_data, u_item, left_on='item_id', right_on='movie_id')

# Create Surprise Dataset
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(u_data[['user_id', 'item_id', 'rating']], reader)

# Split the dataset into train and test sets
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Use the KNNBasic algorithm for collaborative filtering
sim_options = {'name': 'cosine', 'user_based': True}
model = KNNBasic(sim_options=sim_options)

# Train the model on the training set
model.fit(trainset)

# Make predictions on the test set
predictions = model.test(testset)

# Evaluate the model
accuracy.rmse(predictions)

# Function to get movie recommendations for a user
def get_top_n_recommendations(predictions, user_id, n=10):
    user_predictions = [pred for pred in predictions if pred.uid == user_id]
    user_predictions.sort(key=lambda x: x.est, reverse=True)
    return user_predictions[:n]

# Generate personalized recommendations for each user
num_users = u_data['user_id'].nunique()
for user_id in range(1, num_users + 1):
    user_top_n = get_top_n_recommendations(predictions, user_id, n=5)

    # Display the top recommendations for the user
    print(f"\nTop Recommendations for User {user_id}:")
    for pred in user_top_n:
        movie_title = u_item[u_item['movie_id'] == pred.iid]['movie_title'].values[0]
        print(f'Movie Title: {movie_title}, Estimated Rating: {pred.est}')

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8640

Top Recommendations for User 1:
Movie Title: Cinema Paradiso (1988), Estimated Rating: 4.70090018250504
Movie Title: Welcome to the Dollhouse (1995), Estimated Rating: 4.6510061850056506
Movie Title: Three Colors: Red (1994), Estimated Rating: 4.626998973067046
Movie Title: Three Colors: Red (1994), Estimated Rating: 4.626998973067046
Movie Title: Usual Suspects, The (1995), Estimated Rating: 4.57572579441017

Top Recommendations for User 2:
Movie Title: Godfather, The (1972), Estimated Rating: 4.800195619048908
Movie Title: Sense and Sensibility (1995), Estimated Rating: 4.625078468775705
Movie Title: Secrets & Lies (1996), Estimated Rating: 4.624499389706533
Movie Title: Titanic (1997), Estimated Rating: 4.575292165391551
Movie Title: Shall We Dance? (1996), Estimated Rating: 4.426076775721376

Top Recommendations for User 3:
Movie Title: Paradise Lost: The Child Murders at Robin Hood Hills (199

In [8]:
# Merge user and item data
u_data = pd.merge(u1_base, u_user, on='user_id')
u_data = pd.merge(u_data, u_item, left_on='item_id', right_on='movie_id')

# Create Surprise Dataset
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(u_data[['user_id', 'item_id', 'rating']], reader)

# Split the dataset into train and test sets
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Use the KNNBasic algorithm for collaborative filtering
sim_options = {'name': 'cosine', 'user_based': True}
model = KNNBasic(sim_options=sim_options)

# Train the model on the training set
model.fit(trainset)

# Make predictions on the test set
predictions = model.test(testset)

# Evaluate the model
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Mean Absolute Error (MAE): {mae}")

# Function to get movie recommendations for a user
def get_top_n_recommendations(predictions, user_id, n=10):
    user_predictions = [pred for pred in predictions if pred.uid == user_id]
    user_predictions.sort(key=lambda x: x.est, reverse=True)
    return user_predictions[:n]

# Evaluate the model on the test set (u1_test)
test_data = Dataset.load_from_df(u1_test[['user_id', 'item_id', 'rating']], reader)
testset = test_data.build_full_trainset().build_testset()
test_predictions = model.test(testset)

# Calculate RMSE and MAE for the test set
test_rmse = accuracy.rmse(test_predictions)
test_mae = accuracy.mae(test_predictions)

print(f"\nTest Set RMSE: {test_rmse}")
print(f"Test Set MAE: {test_mae}")

# Save the trained model to a file
model_filename = 'collaborative_filtering_model'
dump.dump(model_filename, algo=model)
#
# # You can later load the model using the following code
# loaded_model = dump.load(model_filename)[1]

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8640
MAE:  0.6766
Root Mean Squared Error (RMSE): 0.8640246570455596
Mean Absolute Error (MAE): 0.6765519172465901
RMSE: 0.8261
MAE:  0.6474

Test Set RMSE: 0.826080041507462
Test Set MAE: 0.6474142119506778
