## Data Preprocessing
Import Libraries

In [1]:
# import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

Load Dataset

In [2]:
# read CSV files
df1 = pd.read_csv('icat.csv')
df2 = pd.read_csv('rats.csv')
df3 = pd.read_csv('ufeat.csv')
df4 = pd.read_csv('user_info.csv')

# merge the dataframes
merged_df = pd.merge(df2, df3, on='userId')
merged_df = pd.merge(merged_df, df4, on='userId')
merged_df = pd.merge(merged_df, df1, on='itemId')

# create new dataframe
data = pd.DataFrame(merged_df)
data.head()

Unnamed: 0,userId,itemId,rating,Age,AcDeg,Budget,Accom,Gender,Job,Region,GroupComp,Email Address,FirstName LastName,ItemName,Category,Quality
0,13,0,1.533462,4,3,2,2,Female,blue collar,North Europe,1Adlt,Eryn_Calderwood9648@gompie.com,Eryn Calderwood,Restaurant Fake,['Gastro'],1.647351
1,15,0,1.283205,4,4,2,2,Male,blue collar,South Europe,2Adlt,Alex_Hall6434@twace.org,Alex Hall,Restaurant Fake,['Gastro'],1.647351
2,17,0,1.506836,4,4,2,2,Female,blue collar,East Europe,1Adlt,Jacqueline_Vaughn7868@cispeto.com,Jacqueline Vaughn,Restaurant Fake,['Gastro'],1.647351
3,19,0,1.260289,2,1,1,1,Male,blue collar,Asia,2Adlt,Domenic_Howard3974@twipet.com,Domenic Howard,Restaurant Fake,['Gastro'],1.647351
4,23,0,3.203168,3,3,1,1,Male,white collar,Africa,1Adlt,Fred_Herbert5917@ubusive.com,Fred Herbert,Restaurant Fake,['Gastro'],1.647351


Data Description

In [3]:
data.shape

(345368, 16)

In [4]:
data.describe()

Unnamed: 0,userId,itemId,rating,Age,AcDeg,Budget,Accom,Quality
count,345368.0,345368.0,345368.0,345368.0,345368.0,345368.0,345368.0,345368.0
mean,49984.786677,10.997067,2.496675,3.084206,3.114516,1.809062,2.047677,2.729969
std,28888.39816,6.636583,0.987163,1.269043,0.751531,0.623599,0.67319,1.319768
min,0.0,0.0,1.134487,1.0,1.0,1.0,1.0,1.188559
25%,24978.0,5.0,1.644446,2.0,3.0,1.0,2.0,1.633559
50%,49968.0,11.0,2.362574,3.0,3.0,2.0,2.0,2.133209
75%,74994.0,17.0,3.202133,4.0,4.0,2.0,2.0,4.476363
max,99999.0,22.0,4.76243,5.0,4.0,3.0,4.0,4.984842


In [5]:
# remove some columns
data = data.drop(['userId', 'Email Address', 'FirstName LastName'], axis=1)
data.head()

Unnamed: 0,itemId,rating,Age,AcDeg,Budget,Accom,Gender,Job,Region,GroupComp,ItemName,Category,Quality
0,0,1.533462,4,3,2,2,Female,blue collar,North Europe,1Adlt,Restaurant Fake,['Gastro'],1.647351
1,0,1.283205,4,4,2,2,Male,blue collar,South Europe,2Adlt,Restaurant Fake,['Gastro'],1.647351
2,0,1.506836,4,4,2,2,Female,blue collar,East Europe,1Adlt,Restaurant Fake,['Gastro'],1.647351
3,0,1.260289,2,1,1,1,Male,blue collar,Asia,2Adlt,Restaurant Fake,['Gastro'],1.647351
4,0,3.203168,3,3,1,1,Male,white collar,Africa,1Adlt,Restaurant Fake,['Gastro'],1.647351


### Convert Feature Values to Numbers

Gender & Job

In [6]:
# total no of values in gender and job column
print(data['Gender'].value_counts())
print(data['Job'].value_counts())

Male      172759
Female    172609
Name: Gender, dtype: int64
white collar    220969
blue collar     124399
Name: Job, dtype: int64


In [7]:
# change gender and job into categorical feature
data['Gender'] = data['Gender'].astype('category').cat.codes
data['Job'] = data['Job'].astype('category').cat.codes

Region & GroupComp

In [8]:
print(data['Region'].value_counts())
print(data['GroupComp'].value_counts())

South Europe     96150
North Europe     95053
Asia             38470
Africa           38231
North America    20012
East Europe      19280
Middle East      19177
South America    18995
Name: Region, dtype: int64
2Adlt          132823
2Adlt+Child    132758
1Adlt           52946
GrpFriends      26841
Name: GroupComp, dtype: int64


In [9]:
data['Region'] = data['Region'].astype('category').cat.codes
data['GroupComp'] = data['GroupComp'].astype('category').cat.codes

ItemName & Category

In [10]:
data['ItemName'].unique()

array(['Restaurant Fake', 'Random Shopping Mall', 'Bogus Waterpark',
       'Nonexisting Zipline', 'Fake Klub', 'Surprise Concert',
       'Museum of Fake History', 'Random Cultural Tour',
       'Unknown Nature Route', 'False Tavern', 'Another Sport Event',
       'Random Surfing Lessons', 'Fiction Nightclub', 'MakeBelieve Pub',
       'Some Sport Event', 'Bogus Spa', 'Random Golf Lessons',
       'Secret Beach', 'Fake Brands Boutique', 'Fake BTT Route',
       'Fake Beach', 'Never Happened Festival',
       'Best Imaginary Restaurant'], dtype=object)

In [11]:
print(data['Category'].value_counts())

['Beach']                30184
['Sports', 'Nature']     30153
['Events', 'Culture']    30125
['Nightlf']              30121
['Sports', 'Events']     30090
['Sports']               30041
['Culture']              30011
['Gastro']               29842
['Shop']                 15177
['Nature', 'Relax']      15012
['Themeprk']             14992
['Gastro', 'Culture']    14990
['Shop', 'Relax']        14935
['Gastro', 'Nightlf']    14892
['Relax']                14803
Name: Category, dtype: int64


In [12]:
data['ItemName'] = data['ItemName'].astype('category').cat.codes
data['Category'] = data['Category'].astype('category').cat.codes
data.head()

Unnamed: 0,itemId,rating,Age,AcDeg,Budget,Accom,Gender,Job,Region,GroupComp,ItemName,Category,Quality
0,0,1.533462,4,3,2,2,0,0,5,0,18,5,1.647351
1,0,1.283205,4,4,2,2,1,0,7,1,18,5,1.647351
2,0,1.506836,4,4,2,2,0,0,2,0,18,5,1.647351
3,0,1.260289,2,1,1,1,1,0,1,1,18,5,1.647351
4,0,3.203168,3,3,1,1,1,1,0,0,18,5,1.647351


Rating

In [13]:
data['rating'].value_counts()

4.662240    2074
4.666387     893
1.228280     807
4.677095     712
1.188851     687
            ... 
3.574407       1
2.598941       1
3.415163       1
2.605091       1
2.317777       1
Name: rating, Length: 322762, dtype: int64

In [14]:
# Round the rating values to the nearest integer
data['rating'] = data['rating'].round()

# Convert the rating values to integer type
data['rating'] = data['rating'].astype(int)
data.head()

Unnamed: 0,itemId,rating,Age,AcDeg,Budget,Accom,Gender,Job,Region,GroupComp,ItemName,Category,Quality
0,0,2,4,3,2,2,0,0,5,0,18,5,1.647351
1,0,1,4,4,2,2,1,0,7,1,18,5,1.647351
2,0,2,4,4,2,2,0,0,2,0,18,5,1.647351
3,0,1,2,1,1,1,1,0,1,1,18,5,1.647351
4,0,3,3,3,1,1,1,1,0,0,18,5,1.647351


## Feature Selection

In [15]:
from sklearn.feature_selection import mutual_info_classif

# Separate the features (X) and target variable (y)
X = data.drop('rating', axis=1)
y = data['rating']

# Compute Pearson correlation coefficients
correlations = X.corrwith(y)

# Compute mutual information scores
mutual_info = mutual_info_classif(X, y)

# Print the results
print("Correlations:")
print(correlations)
print("Mutual information scores:")
print(mutual_info)

Correlations:
itemId       0.055518
Age         -0.013712
AcDeg        0.000381
Budget      -0.012923
Accom       -0.009887
Gender       0.022055
Job          0.014191
Region       0.034200
GroupComp   -0.014632
ItemName     0.039606
Category    -0.141262
Quality      0.473812
dtype: float64
Mutual information scores:
[0.248308   0.00624647 0.0074118  0.00933659 0.01055582 0.01493454
 0.01747767 0.00928472 0.00895851 0.24768132 0.14277082 0.24702006]


In [16]:
data.columns

Index(['itemId', 'rating', 'Age', 'AcDeg', 'Budget', 'Accom', 'Gender', 'Job',
       'Region', 'GroupComp', 'ItemName', 'Category', 'Quality'],
      dtype='object')

## Train and Test Data

In [17]:
# Input and output variable
X = data.drop('rating', axis=1)
y = data['rating']
X.shape, y.shape

((345368, 12), (345368,))

In [18]:
#Train , Test and  Split data
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X, y, test_size=0.20, random_state=42)
X_train.shape, X_test.shape ,y_train.shape ,y_test.shape 

((276294, 12), (69074, 12), (276294,), (69074,))

Data Scaling

In [19]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


# Collaborating Filtering

In [21]:
from surprise import SVD
from surprise import accuracy

# Train SVD on X_train
algo_X = SVD()
trainset_X = data.build_full_trainset(X_train)
algo_X.fit(trainset_X)

# Make predictions on X_test
X_test_pred = algo_X.test(X_test)

# Evaluate performance on X_test
X_test_rmse = accuracy.rmse(X_test_pred)

# Train SVD on y_train
algo_y = SVD()
trainset_y = data.build_full_trainset(y_train)
algo_y.fit(trainset_y)

# Make predictions on y_test
y_test_pred = algo_y.test(y_test)

# Evaluate performance on y_test
y_test_rmse = accuracy.rmse(y_test_pred)

ModuleNotFoundError: No module named 'surprise'

# SVM

In [None]:
# Fit the model SVC
from sklearn.svm import SVC
# SVM model building
svc = SVC()
svc.fit(X_train[:50000],y_train[:50000])

# Predic the y_test data
y_pred=svc.predict(X_test[:50000])
print(y_pred)

# comparing actual response values (y_test) with predicted response values (y_pred)
print("SVM model accuracy(in %):", accuracy_score(y_test[:50000], y_pred)*100)

# precision_score
precision_score_svc = precision_score(y_test[:50000], y_pred, average='macro')
print(precision_score_svc)

# recall score
recall_score_svc = recall_score(y_test[:50000], y_pred, average='macro')
print(recall_score_svc)

# F1 score macro
f1_score_svc = f1_score(y_test[:50000], y_pred, average='macro')
print(f1_score_svc)
# F1 score micro
f1_score_svc = f1_score(y_test[:50000], y_pred, average='micro')
print(f1_score_svc)

# K- Means Clustering

In [None]:
from sklearn.cluster import KMeans

# apply K-means clustering
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(X_train)

# get the predicted cluster labels
y_pred = kmeans.predict(X_test)

# calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

precision = precision_score(y_test, y_pred, average='macro')
print("Precision:", precision)

recall = recall_score(y_test, y_pred, average='macro')
print("Recall:", recall)

f1 = f1_score(y_test, y_pred, average='macro')
print("F1-score Macro:", f1)

f1 = f1_score(y_test, y_pred, average='micro')
print("F1-score Micro:", f1)