In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise (from surprise)
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3095446 sha256=82432e3172df08b2ab2da5d8ae96f02f5d9cc4b9f2e76be3aeb3e15a416426be
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.

### Import Libraries

In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
from collections import defaultdict
from surprise import Dataset, Reader, SVD, accuracy
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

### Load Dataset

In [4]:
# Load the dataset from csv file
df1 = pd.read_csv('/content/drive/MyDrive/Notebooks/Rating/rats.csv')
df2 = pd.read_csv('/content/drive/MyDrive/Notebooks/Rating/icat.csv')
merged_df = pd.merge(df1, df2, on='itemId')

# create new dataframe
df3 = pd.DataFrame(merged_df)
df3.head()

Unnamed: 0,userId,itemId,rating,ItemName,Category,Quality
0,13,0,1.533462,Restaurant Fake,['Gastro'],1.647351
1,15,0,1.283205,Restaurant Fake,['Gastro'],1.647351
2,17,0,1.506836,Restaurant Fake,['Gastro'],1.647351
3,19,0,1.260289,Restaurant Fake,['Gastro'],1.647351
4,23,0,3.203168,Restaurant Fake,['Gastro'],1.647351


In [5]:
# Round the rating values to the nearest integer
df3['rating'] = df3['rating'].round()

# Convert the rating values to integer type
df3['rating'] = df3['rating'].astype(int)

# Collaborative Filtering

In [6]:
# Define the reader for the Surprise library
reader = Reader(rating_scale=(1, 5))
# Split the dataset into training and testing sets
train_set, test_set = train_test_split(df3, test_size=0.2, random_state=42)

# Load the training dataset into the Surprise format
data = Dataset.load_from_df(train_set[['userId', 'itemId', 'rating']], reader)

# Split the dataset into training and testing sets
trainset = data.build_full_trainset()
testset = trainset.build_anti_testset()

# Train the SVD model
svd = SVD(random_state=42)
svd.fit(trainset)

# Test the model on the testing set
predictions = svd.test(testset)

# Calculate metrics
print('CF RMSE:', accuracy.rmse(predictions))
print('CF MAE:', accuracy.mae(predictions))

RMSE: 0.6601
CF RMSE: 0.6600582475462539
MAE:  0.5705
CF MAE: 0.5704795894758707


# Content Based Filtering

In [7]:
# Load the training dataset into the Surprise format
data = Dataset.load_from_df(train_set[['userId', 'Category', 'rating']], reader)

# Split the dataset into training and testing sets
trainset = data.build_full_trainset()
testset = trainset.build_anti_testset()

# Train the SVD model
svd = SVD(random_state=42)
svd.fit(trainset)

# Test the model on the testing set
predictions = svd.test(testset)

# Calculate metrics
print('Content Based RMSE:', accuracy.rmse(predictions))
print('Content Based MAE:', accuracy.mae(predictions))

RMSE: 0.5981
Content Based RMSE: 0.5981440474571738
MAE:  0.4891
Content Based MAE: 0.48913407895147143


In [8]:
# Extract the predicted ratings and the original ratings
y_pred = [round(pred.est) for pred in predictions]
y_true = [int(pred.r_ui) for pred in predictions]

# Evaluate the performance of the model using classification metrics
# Precision
precision = precision_score(y_true, y_pred, average='macro')

# Recall
recall = recall_score(y_true, y_pred, average='macro')

# F1 score micro
f1_micro = f1_score(y_true, y_pred, average='micro')

# F1 score macro
f1_macro = f1_score(y_true, y_pred, average='macro')

# Print the classification metrics
print(f"CF Precision: {precision}")
print(f"CF Recall: {recall}")
print(f"CF Micro F1 score: {f1_micro}")
print(f"CF Macro F1 score: {f1_macro}")

  _warn_prf(average, modifier, msg_start, len(result))


CF Precision: 0.2
CF Recall: 0.09338787595449587
CF Micro F1 score: 0.46693937977247935
CF Macro F1 score: 0.12732342895993456


### Top 5 & 10

In [9]:
# Evaluate the model using RMSE and MAE metrics on top 5 & 10
rmse_5 = accuracy.rmse(predictions[:5])
rmse_10 = accuracy.rmse(predictions[:10])
mae_5 = accuracy.mae(predictions[:5])
mae_10 = accuracy.mae(predictions[:10])
print('RMSE@5:', rmse_5)
print('RMSE@10:', rmse_10)
print('MAE@5:', mae_5)
print('MAE@10:', mae_10)

RMSE: 0.3357
RMSE: 0.4044
MAE:  0.2661
MAE:  0.3079
RMSE@5: 0.3356734644783255
RMSE@10: 0.4044118416572703
MAE@5: 0.26607430391129144
MAE@10: 0.3079062641698405


### Top 5 Fold

In [10]:
from surprise.model_selection import cross_validate

# Train and evaluate the SVD model using 5-fold cross-validation
cv_results = cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# Print the average RMSE and MAE across all folds
print('Average RMSE:', np.mean(cv_results['test_rmse']))
print('Average MAE:', np.mean(cv_results['test_mae']))

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9651  0.9658  0.9648  0.9716  0.9678  0.9670  0.0025  
MAE (testset)     0.7775  0.7789  0.7787  0.7868  0.7823  0.7808  0.0034  
Fit time          7.25    5.75    6.39    4.08    5.50    5.79    1.05    
Test time         0.72    0.65    1.03    0.40    0.40    0.64    0.23    
Average RMSE: 0.9670208734696576
Average MAE: 0.7808307392821937


### Top 5 Item Id/Name

In [11]:
# make dictionary of users and their catogries
user_categories = defaultdict(list)
for row in trainset.all_ratings():
    user = trainset.to_raw_uid(row[0])
    category = trainset.to_raw_iid(row[1])
    user_categories[user].append(category)

# Slice the dictionary
user_categories = {key: user_categories[key] for key in list(user_categories.keys())[:1000]}
# Print the subset
print(user_categories)

{2721: ["['Beach']", "['Themeprk']", "['Gastro']", "['Shop', 'Relax']", "['Shop']"], 22619: ["['Nature', 'Relax']", "['Nightlf']", "['Sports', 'Events']"], 73150: ["['Nightlf']", "['Gastro']", "['Sports']", "['Shop']", "['Culture']", "['Culture']"], 90550: ["['Sports']", "['Nature', 'Relax']", "['Gastro']", "['Relax']"], 9589: ["['Events', 'Culture']", "['Shop', 'Relax']", "['Sports', 'Nature']", "['Nightlf']"], 23324: ["['Events', 'Culture']", "['Sports', 'Nature']", "['Nightlf']"], 27397: ["['Themeprk']", "['Gastro']", "['Events', 'Culture']", "['Gastro', 'Nightlf']"], 54996: ["['Gastro']", "['Sports', 'Nature']", "['Culture']", "['Sports']"], 70677: ["['Sports', 'Events']", "['Themeprk']"], 60220: ["['Gastro', 'Nightlf']", "['Sports', 'Nature']", "['Shop']", "['Sports']", "['Themeprk']"], 18210: ["['Beach']"], 18163: ["['Gastro', 'Nightlf']", "['Sports', 'Events']", "['Nightlf']", "['Relax']", "['Beach']"], 93113: ["['Sports', 'Nature']", "['Themeprk']"], 29222: ["['Nature', 'Relax'

In [12]:
# function to set top user with top Categories
def predict_user_category_ratings(user, category):
    items_in_category = merged_df[merged_df['Category'] == category][['itemId', 'ItemName']].drop_duplicates()
    user_ratings = []
    for _, row in items_in_category.iterrows():
        item_id = row['itemId']
        item_name = row['ItemName']
        user_ratings.append((item_id, item_name, svd.predict(user, item_id, verbose=False).est))
    user_ratings = sorted(user_ratings, key=lambda x: x[2], reverse=True)[:5]
    return [(user, rating[0], merged_df[merged_df['itemId'] == rating[0]]['ItemName'].iloc[0], rating[1]) for rating in user_ratings]

# collect top categories
user_top_items = {}
for user in user_categories.keys():
    user_top_items[user] = {}
    for category in set(user_categories[user]):
        user_top_items[user][category] = predict_user_category_ratings(user, category)

# collect top 5 users items
for i, (user, top_items) in enumerate(user_top_items.items()):
    print(f"Top 5 items for user {user}:")
    for category, items in top_items.items():
        if items:
            print(f"\tIn category {category}:")
            for item in items:
                print(f"\t\tItem Id: {item[1]} - Name: {item[2]}")
    print('\n')
    
    if i == 4:
        break

Top 5 items for user 2721:
	In category ['Gastro']:
		Item Id: 0 - Name: Restaurant Fake
		Item Id: 15 - Name: Best Imaginary Restaurant
	In category ['Shop', 'Relax']:
		Item Id: 2 - Name: Random Shopping Mall
	In category ['Shop']:
		Item Id: 14 - Name: Fake Brands Boutique
	In category ['Beach']:
		Item Id: 21 - Name: Secret Beach
		Item Id: 22 - Name: Fake Beach
	In category ['Themeprk']:
		Item Id: 3 - Name: Bogus Waterpark


Top 5 items for user 22619:
	In category ['Nightlf']:
		Item Id: 1 - Name: Fiction Nightclub
		Item Id: 19 - Name: Fake Klub
	In category ['Nature', 'Relax']:
		Item Id: 4 - Name: Unknown Nature Route
	In category ['Sports', 'Events']:
		Item Id: 5 - Name: Some Sport Event
		Item Id: 9 - Name: Another Sport Event


Top 5 items for user 73150:
	In category ['Sports']:
		Item Id: 18 - Name: Nonexisting Zipline
		Item Id: 20 - Name: Random Golf Lessons
	In category ['Gastro']:
		Item Id: 0 - Name: Restaurant Fake
		Item Id: 15 - Name: Best Imaginary Restaurant
	

# Hybrid Model

In [13]:
# create data for hybrid model
data = Dataset.load_from_df(train_set[['userId', 'itemId', 'rating']], reader)
# Build a full training set from the Dataset object.
trainset_surprise = data.build_full_trainset()

# Train the SVD model
model_surprise = SVD(random_state=42)
# fit svd model on training set
model_surprise.fit(trainset_surprise)

# list of tuples represent test data
testset_surprise = [(row['userId'], row['itemId'], row['rating']) for _, row in test_set.iterrows()]
# Predict the ratings on the testing set using the SVD model
predictions = model_surprise.test(testset_surprise)
# list of estimated ratings from the predictions generated by the SVD model
y_pred_svd = np.array([pred.est for pred in predictions])
# select independent features from train set
X_train = train_set[['userId', 'itemId']]
# select target feature from train set
y_train = train_set['rating']
# select independent features from test set
X_test = test_set[['userId', 'itemId']]
# select dependent feature from test set
y_test = test_set['rating']
# SVD model predict the ratings for all user-item pairs 
predicted_ratings = model_surprise.test(data.build_full_trainset().build_testset())
# add SVD model predictions in train data
X_train['predicted_rating'] = np.array([pred.est for pred in predicted_ratings])
# add SVD model predictions in test data
X_test['predicted_rating'] = y_pred_svd

# Train the SVM model using the predicted ratings from the SVD model as features
model_svm = SVC(random_state=42)
model_svm.fit(X_train[:10000], y_train[:10000])
# Test the model on the testing set and calculate accuracy and precision
y_pred = model_svm.predict(X_test)

# Evaluate the performance of the model using classification metrics
# Accuracy
accuracy = accuracy_score(y_test, y_pred)*100

# Precision
precision = precision_score(y_test, y_pred, average='macro')

# Recall
recall = recall_score(y_test, y_pred, average='macro')

# F1 score micro
f1_micro = f1_score(y_test, y_pred, average='micro')
# F1 score macro
f1_macro = f1_score(y_test, y_pred, average='macro')

# Print the classification metrics
print(f"Hybrid Model Accuracy: {accuracy}")
print(f"Hybrid Model Precision: {precision}")
print(f"Hybrid Model Recall: {recall}")
print(f"Hybrid Model F1 score: {f1_micro}") 
print(f"Hybrid Model F1 score: {f1_macro}") 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['predicted_rating'] = np.array([pred.est for pred in predicted_ratings])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['predicted_rating'] = y_pred_svd


Hybrid Model Accuracy: 35.97011900280858
Hybrid Model Precision: 0.07194023800561716
Hybrid Model Recall: 0.2
Hybrid Model F1 score: 0.3597011900280858
Hybrid Model F1 score: 0.1058177172061329


  _warn_prf(average, modifier, msg_start, len(result))


# SVM 

In [14]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df3[['userId', 'itemId']], df3['rating'], test_size=0.2, random_state=42)

# Train the SVM model
svm = SVC(random_state=42)
# fit model on train X and y
svm.fit(X_train[:10000], y_train[:10000])

# Test the model on the testing set
y_pred = svm.predict(X_test)

# Evaluate the performance of the model using classification metrics
# Accuracy
accuracy = accuracy_score(y_test, y_pred)*100

# Precision
precision = precision_score(y_test, y_pred, average='macro')

# Recall
recall = recall_score(y_test, y_pred, average='macro')

# F1 score micro
f1_micro = f1_score(y_test, y_pred, average='micro')

# F1 score macro
f1_macro = f1_score(y_test, y_pred, average='macro')

# Print the classification metrics
print(f"SVM Accuracy: {accuracy}")
print(f"SVM Precision: {precision}")
print(f"SVM Recall: {recall}")
print(f"SVM Mico F1 score: {f1_micro}")
print(f"SVM Maco F1 score: {f1_macro}")

SVM Accuracy: 35.97011900280858
SVM Precision: 0.07194023800561716
SVM Recall: 0.2
SVM Mico F1 score: 0.3597011900280858
SVM Maco F1 score: 0.1058177172061329


  _warn_prf(average, modifier, msg_start, len(result))


# K-Means

In [15]:
from sklearn.cluster import KMeans

# apply K-means clustering
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(X_train)

# get the predicted cluster labels
y_pred = kmeans.predict(X_test)

# Evaluate the performance of the model using classification metrics
# Accuracy
accuracy = accuracy_score(y_test, y_pred)*100

# Precision
precision = precision_score(y_test, y_pred, average='macro')

# Recall
recall = recall_score(y_test, y_pred, average='macro')

# F1 score micro
f1_micro = f1_score(y_test, y_pred, average='micro')
# F1 score macro
f1_macro = f1_score(y_test, y_pred, average='macro')

# Print the classification metrics
print(f"K-Means Accuracy: {accuracy}")
print(f"K-Means Precision: {precision}")
print(f"K-Means Recall: {recall}")
print(f"K-Means F1 score: {f1_micro}") 
print(f"K-Means F1 score: {f1_macro}") 



K-Means Accuracy: 18.995569968439643
K-Means Precision: 0.1590090627528293
K-Means Recall: 0.13422931313202568
K-Means F1 score: 0.18995569968439643
K-Means F1 score: 0.14002856066195005


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## XGBoost

In [16]:
from xgboost import XGBClassifier

# Train the XGB model
xgb = XGBClassifier()
# fit model on train X and y
xgb.fit(X_train, y_train-1)

# Test the model on the testing set
y_pred = xgb.predict(X_test)

# Evaluate the performance of the model using classification metrics
# Accuracy
accuracy = accuracy_score(y_test-1, y_pred)*100

# Precision
precision = precision_score(y_test-1, y_pred, average='macro')

# Recall
recall = recall_score(y_test-1, y_pred, average='macro')

# F1 score micro
f1_micro = f1_score(y_test-1, y_pred, average='micro')

# F1 score macro
f1_macro = f1_score(y_test-1, y_pred, average='macro')

# Print the classification metrics
print(f"XGBoost Accuracy: {accuracy}")
print(f"XGBoost Precision: {precision}")
print(f"XGBoost Recall: {recall}")
print(f"XGBoost Mico F1 score: {f1_micro}")
print(f"XGBoost Maco F1 score: {f1_macro}")

XGBoost Accuracy: 46.01297159568
XGBoost Precision: 0.42635418730701724
XGBoost Recall: 0.33171211536220346
XGBoost Mico F1 score: 0.46012971595679997
XGBoost Maco F1 score: 0.32745825768569214
