In [1]:
#import
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from itertools import product
from torch.utils.tensorboard import SummaryWriter


In [2]:
#import dataset
df = pd.read_csv('dataset-ml-25m/dataset.csv')

In [3]:
#split data and labels 
X = df.drop(['rating'], axis=1)
y = df['rating']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
#count the numebr of x_train 
print("Number of train set: ", X_train.shape[0])
print("Numebr of test set: ", X_test.shape[0])

pca = PCA(n_components=0.95)
pca.fit(X_train)
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)

Number of train set:  10362
Numebr of test set:  3454


In [None]:
#plot dataset after PCA
plt.figure(figsize=(8, 6))
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap="Blues")
plt.show()

### Linear regressor

In [None]:
log_name = f"linear_regression"
writer = SummaryWriter(f"results/tradML/pca/LinearRegression/{log_name}")

lin_regr = linear_model.LinearRegression()
lin_regr.fit(X_train, y_train)
y_pred = lin_regr.predict(X_test)

# Compute the RSS
mse = mean_squared_error(y_test, y_pred) 
print('Mean Square Error:', mse)
writer.add_scalar('Loss', mse)
writer.flush()

# Compute the R-square index
rsquare = r2_score(y_test, y_pred) 
print('R-square:', rsquare)

### Lasso

In [None]:
# lasso regression
import copy
from sklearn.linear_model import Lasso

alpha = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6]

best_mse = float('inf')
best_a = None
best_lasso = None

i = 0
max_iter = len(alpha)

for a in alpha:
    i += 1
    log_name = f"alpha"
    writer = SummaryWriter(f"results/tradML/pca/Lasso/{log_name}")

    lasso = Lasso(alpha=a)
    lasso.fit(X_train, y_train)
    y_pred = lasso.predict(X_test)
    mse = mean_squared_error(y_test, y_pred) 

    writer.add_scalar('Loss', mse, a)
    writer.add_hparams(
        {'alpha': a},
        {'mse': mse}
    )

    if mse < best_mse:
        best_mse = mse
        best_a = a
        lasso_best = copy.deepcopy(lasso)

    print("Iteration: {}/{} - Alpha: {} - MSE: {:.4f} - Best MSE: {:.4f}".format(i, max_iter, a, mse, best_mse), end='\r')

y_pred = lasso_best.predict(X_test)
r2 = r2_score(y_test, y_pred)

print("\nHyperparameter Tuning")
print("Alpha: ", best_a)
print("MSE: ", best_mse)
print("R2: ", r2)

### SVR

In [None]:
from sklearn.svm import SVR


kernel = ['linear', 'poly', 'rbf']
c = [0.001, 0.01, 0.1, 1]
epsilon = [0.001, 0.01, 0.1, 1]

best_mse = float('inf')
best_kernel = None
best_c = None
best_epsilon = None
best_svr = None

i = 0
max_iter = len(kernel) * len(c) * len(epsilon)

for k, c, e in product(kernel, c, epsilon):
    i+=1
    log_name = f"kernel={k}, c={c}, epsilon={e}"
    writer = SummaryWriter(f"results/tradML/pca/SVR/{log_name}")

    svr = SVR(kernel=k, C=c, epsilon=e)
    svr.fit(X_train, y_train)
    y_pred = svr.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)

    writer.add_scalar('Loss', mse)
    writer.add_hparams(
        {'kernel': k, 'C': c, 'epsilon': e},
        {'mse': mse}
    )
    
    if mse < best_mse:
        best_mse = mse
        best_kernel = k
        best_c = c
        best_epsilon = e
        svr_best = copy.deepcopy(svr)

    print("Iteration: {}/{} - Kernel: {} - C: {} - Epsilon: {} - MSE: {:.4f} - Best MSE: {:.4f}".format(i, max_iter, k, c, e, mse, best_mse), end='\r')

y_pred = svr_best.predict(X_test)
r2 = r2_score(y_test, y_pred)

print("\nHyperparameter Tuning")
print("Kernel: ", best_kernel)
print("C: ", best_c)
print("Epsilon: ", best_epsilon)
print("MSE: ", best_mse)
print("R2: ", r2)


In [4]:
dt = DecisionTreeRegressor(max_depth= 10)
dt= dt.fit(X_train,y_train)

In [5]:
from sklearn.metrics import mean_absolute_error

y_pred= dt.predict(X_train)
print("Mean Squared Error: ", mean_squared_error(y_train, y_pred))
print ("R2 Score: ", r2_score(y_train, y_pred))

Mean Squared Error:  0.020054903976457348
R2 Score:  0.9157019426735893
