In [1]:
pip install surprise



In [2]:
from surprise import Dataset, Reader
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn import preprocessing
from sklearn.model_selection import KFold

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
#df = pd.read_csv("/content/drive/MyDrive/costumerdata/ratings.csv", dtype='str')
df = pd.read_csv("/content/drive/MyDrive/Analysis of Customer Data/ratings.csv", dtype='str')
df = df.drop(['timestamp'], axis=1)

types_dict = {'userId': int, 'movieId': int, 'rating':float}
for col, col_type in types_dict.items():
    df[col] = df[col].astype(col_type)
    
df

Unnamed: 0,userId,movieId,rating
0,1,307,3.5
1,1,481,3.5
2,1,1091,1.5
3,1,1257,4.5
4,1,1449,4.5
...,...,...,...
27753439,283228,8542,4.5
27753440,283228,8712,4.5
27753441,283228,34405,4.5
27753442,283228,44761,4.5


In [5]:
def select(df, k, col):
    top_values = df[col].value_counts().nlargest(k)
    return df.loc[df[col].isin(top_values.index)]

filtered_df = select(df, k=100, col='movieId')
filtered_df


Unnamed: 0,userId,movieId,rating
34,3,1221,4.0
40,3,2028,5.0
42,4,1,4.0
46,4,10,4.0
54,4,32,4.5
...,...,...,...
27753419,283228,3578,5.0
27753422,283228,4886,5.0
27753423,283228,4995,4.0
27753429,283228,6377,5.0


In [6]:
# Preparing the data. The KNN-classifier doesn't take continious labels so we need to use the LabelEncoder.
x = np.array(filtered_df[['userId', 'movieId']])
le = preprocessing.LabelEncoder()
y = np.array(filtered_df['rating'])
le.fit(y)
y = le.transform(y)

# Here we perform 5-fold cross-validation for two configurations of the knn algorithm. 
kf = KFold(n_splits=5)
configurations = np.array([[500,1], [400,1]])


# configurations = np.array([[3,1], [5,1], [3,2], [5,2]])
# output for the configurations above: [3.327787599283532, 2.9307420635726285, 3.6984209609424887, 3.45740603954357]
# [2.26]

avg_rmse = []

for config in configurations:
    rmse_config = []
    for train_index, test_index in kf.split(x):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]    
        KNN = KNeighborsClassifier(n_neighbors=config[0], p=config[1]).fit(x_train, y_train)
        y_pred = KNN.predict(x_test)
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        rmse_config.append(rmse)
        print(rmse_config)
    avg_rmse.append(np.average(rmse_config))
    print(avg_rmse)



best_conf = np.argmin(avg_rmse)
print(best_conf)



[2.267890737391432]


KeyboardInterrupt: ignored