In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

data = pd.read_excel(r'satisfaction_pred.xlsx')

data

Unnamed: 0,Age,Income,Number of Purchases,Time Spent on Website,Region,Membership Type,Subscription Status,Satisfaction Score
0,64,20856,5,1.544254,South,Gold,Active,3.931273
1,39,69991,3,22.158004,East,Gold,Active,8.813718
2,31,64297,7,17.411279,South,Gold,Active,4.423490
3,40,24440,10,31.671055,South,Gold,Active,1.018383
4,33,49525,5,43.892811,South,Basic,Inactive,8.948029
...,...,...,...,...,...,...,...,...
31995,31,54536,1,41.384237,North,Premium,Inactive,5.354833
31996,53,63890,8,45.561462,North,Gold,Active,4.959559
31997,50,71687,10,38.220942,East,Gold,Inactive,0.002699
31998,38,30655,19,17.848667,South,Basic,Active,6.750565


In [3]:
data.describe(include='all')

Unnamed: 0,Age,Income,Number of Purchases,Time Spent on Website,Region,Membership Type,Subscription Status,Satisfaction Score
count,32000.0,32000.0,32000.0,32000.0,32000,32000,32000,32000.0
unique,,,,,4,3,2,
top,,,,,North,Basic,Active,
freq,,,,,8076,10734,16088,
mean,43.415094,60151.562469,10.018812,29.944621,,,,4.986471
std,14.996725,23008.904191,5.500571,17.338381,,,,2.887159
min,18.0,20004.0,1.0,0.003282,,,,0.000272
25%,30.0,40365.75,5.0,14.973681,,,,2.480716
50%,43.0,60158.0,10.0,29.900169,,,,4.977834
75%,56.0,79996.0,15.0,44.781076,,,,7.477885


In [4]:
data.isnull().sum()

Age                      0
Income                   0
Number of Purchases      0
Time Spent on Website    0
Region                   0
Membership Type          0
Subscription Status      0
Satisfaction Score       0
dtype: int64

In [5]:
data = pd.get_dummies(data, drop_first=True)

In [6]:
data

Unnamed: 0,Age,Income,Number of Purchases,Time Spent on Website,Satisfaction Score,Region_North,Region_South,Region_West,Membership Type_Gold,Membership Type_Premium,Subscription Status_Inactive
0,64,20856,5,1.544254,3.931273,False,True,False,True,False,False
1,39,69991,3,22.158004,8.813718,False,False,False,True,False,False
2,31,64297,7,17.411279,4.423490,False,True,False,True,False,False
3,40,24440,10,31.671055,1.018383,False,True,False,True,False,False
4,33,49525,5,43.892811,8.948029,False,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...
31995,31,54536,1,41.384237,5.354833,True,False,False,False,True,True
31996,53,63890,8,45.561462,4.959559,True,False,False,True,False,False
31997,50,71687,10,38.220942,0.002699,False,False,False,True,False,True
31998,38,30655,19,17.848667,6.750565,False,True,False,False,False,False


In [7]:
inputs = data.drop('Satisfaction Score', axis=1)
target = data['Satisfaction Score']

In [8]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(inputs, target, test_size=0.2, random_state=42)

In [9]:
from sklearn import metrics

def evaluate(model, x_test, y_test):
    
    y_pred = model.predict(x_test)
    
    mae_test = metrics.mean_absolute_error(y_test, y_pred)
    mse_test = metrics.mean_squared_error(y_test, y_pred)
    rmse_test = np.sqrt(mse_test)
    r2_test = metrics.r2_score(y_test, y_pred)


    y_pred_train = model.predict(x_train)

    mae_train = metrics.mean_absolute_error(y_train, y_pred_train)
    mse_train = metrics.mean_squared_error(y_train, y_pred_train)
    rmse_train = np.sqrt(mse_train)
    r2_train = metrics.r2_score(y_train, y_pred_train)


    results_dict = {
        'Metric': ['MAE', 'MSE', 'RMSE', 'R2'],
        'Train': [mae_train, mse_train, rmse_train, r2_train*100],
        'Test': [mae_test, mse_test, rmse_test, r2_test*100]
    }

    results_df = pd.DataFrame(results_dict)
    
    print(results_df)

# Default model

In [10]:
from sklearn.svm import SVR

base_model = SVR(kernel='rbf')
base_model.fit(x_train, y_train)
base_accuracy = evaluate(base_model, x_test, y_test)

  Metric     Train      Test
0    MAE  2.500077  2.500571
1    MSE  8.341807  8.326677
2   RMSE  2.888219  2.885598
3     R2 -0.030918 -0.084720


# Optuna Search


In [None]:
from sklearn.model_selection import cross_val_score
import optuna

def best_params_for_model(trial):

    param = {
        'C': trial.suggest_loguniform('C', 0.1, 10),  # Regularization parameter
        'kernel': trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid']),  # Kernel type
        'degree': trial.suggest_int('degree', 2, 5),  # Degree for polynomial kernel
        'gamma': trial.suggest_categorical('gamma', ['auto', 'scale']),  # Kernel coefficient for 'rbf', 'poly', and 'sigmoid'
        'epsilon': trial.suggest_loguniform('epsilon', 0.01, 1.0)  # Epsilon in the epsilon-SVR model
    }

    svr = SVR(**param)

    score = cross_val_score(svr, x_train, y_train, cv=3, scoring='r2', n_jobs=-1).mean()

    return score

study = optuna.create_study(direction='maximize')
study.optimize(best_params_for_model, n_trials=1)

print('Best trial:')
best_params = study.best_params
print('  Value: {:.3f}'.format(study.best_value))
print('  Params: ', best_params)


[I 2024-06-30 18:29:29,296] A new study created in memory with name: no-name-ccb1e819-536c-4b83-a3c1-45d5c97dd598
  'C': trial.suggest_loguniform('C', 0.1, 10),  # Regularization parameter
  'epsilon': trial.suggest_loguniform('epsilon', 0.01, 1.0)  # Epsilon in the epsilon-SVR model


In [None]:
best_svr_model = SVR(**best_params)
best_svr_model.fit(x_train, y_train)

evaluate(best_svr_model, x_test, y_test)

# Univariate Analysis

In [None]:
variables = []
train_r2_scores = []
test_r2_scores = []

for i in x_train.columns: 
    X_train_single_var = x_train[[i]]
    X_test_single_var = x_test[[i]]

    best_svr_model.fit(X_train_single_var, y_train)
    y_pred_train_single_var = best_svr_model.predict(X_train_single_var)

    train_r2 = metrics.r2_score(y_train, y_pred_train_single_var)

    y_pred_test_single_var = best_svr_model.predict(X_test_single_var)

    test_r2 = metrics.r2_score(y_test, y_pred_test_single_var)

    variables.append(i)
    train_r2_scores.append(train_r2)
    test_r2_scores.append(test_r2)

results_df = pd.DataFrame({'Variable': variables, 'Train R2': train_r2_scores, 'Test R2': test_r2_scores})

results_df_sorted = results_df.sort_values(by='Test R2', ascending=False)

pd.options.display.float_format = '{:.4f}'.format

results_df_sorted


# Final inputs

In [None]:
fin_inputs = data[[]]

x_train, x_test, y_train, y_test = train_test_split(fin_inputs, targets, test_size=0.2, random_state=42)

In [None]:
best_svr_model.fit(x_train, y_train)

evaluate(best_svr_model, x_test, y_test)