# Input
After we had focused on a classification problem for multiple reasons, our problem owner suggested looking into logistic/polynomial regression again. His reasoning was that a regression model can rank its predictions; for example, it knows that a 9 is better than a 5. In this file I therefore compare the ground truth with the predictions of a logistic regression model and with those of a polynomial regression model.

In [2]:
# Importing libraries 
import random

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# The wildcard import is kept ahead of the scipy/sklearn imports (as in the original
# ordering) so that the explicit imports below take precedence over anything it exports.
from ml import *

from scipy.special import expit
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

In [3]:
# Load the threshold features and user scores; the first CSV column becomes the index.
df = pd.read_csv(
    '/datc/nano/notebooks/Target variable & Features (V3).csv',
    index_col=0,
)

# Keep only the rows whose threshold method is 'yen', then preview the first few.
df_yen = df.loc[df['Threshold method'].eq('yen')]
df_yen.head()

Unnamed: 0_level_0,Threshold method,Threshold: area spread,Threshold: border,Threshold: count,Threshold: fill,Threshold: intensity,Threshold: separation,User score
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
5f11480a2d99b96663b97fdb,yen,0.265214,1.866963,13.192785,0.215541,1.711409,0.106493,9
5f15419a79e27502789bbbd1,yen,0.154306,5.64838,0.010033,0.1849,4.396552,0.355659,4
5f27c1cced3aa3f0c260b246,yen,0.012426,1.814439,2.246626,0.184488,1.734694,0.139754,1
5f30f7b9ed3aa3f0c260b563,yen,0.408562,4.563709,0.019553,0.220224,2.833333,0.162283,2
5f30fafded3aa3f0c260b6bc,yen,0.035861,2.068226,0.671646,0.160934,1.902985,0.130774,4


In [4]:
# Predictors: the two threshold metrics. Target: the user's score.
# `y` is kept as a one-column DataFrame; later cells copy `y_test` and add a
# prediction column next to it.
x = df_yen.loc[:, ['Threshold: separation', 'Threshold: border']]
y = df_yen.loc[:, ['User score']]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=5,
)

# Show how often each score occurs in the held-out set.
print(y_test.value_counts())

User score
8             2
7             2
6             2
1             2
10            1
9             1
4             1
3             1
dtype: int64


In [5]:
# # Normalizing training data

# scaler = StandardScaler()
# x_train_array = scaler.fit_transform(x_train)
# x_train = pd.DataFrame(x_train_array, index=x_train.index, columns=x_train.columns)

# #sns.pairplot(x_train)

# Logistic regression (10 classes)

In [6]:
# Training model
# Logistic regression that treats every distinct user score as its own class.
# - `multi_class` is left at its default: 'auto' already was the default (the fitted
#   estimator prints as LogisticRegression(random_state=0)) and passing it explicitly
#   is deprecated in recent scikit-learn releases.
# - The target is passed as a 1-D Series. Passing the one-column DataFrame makes
#   scikit-learn emit a DataConversionWarning before flattening it itself.
model = LogisticRegression(random_state=0)
model.fit(x_train, y_train['User score'])

LogisticRegression(random_state=0)

### Comparison to ground truth (LogReg)

In [7]:
# # Training set
# compare_log = y_train.copy()
# compare_log['Predicted score'] = model.predict(x_train)

# def highlight(val):
#     if val['User score'] == val['Predicted score']:
#         return ['background: green']*2 #because the green is "True" for one column by default but we have 2
#     elif val['User score'] == val['Predicted score']+1:
#         return ['background: yellow']*2
#     elif val['User score'] == val['Predicted score']-1:
#         return ['background: yellow']*2
#     else:
#         return ['background: red']*2
    
# compare_log.style.apply(highlight, axis=1)

In [8]:
# Test set: put the logistic-regression prediction next to each true user score.
# Feature scaling is switched off in this notebook, so x_test is passed unscaled.
compare_log = y_test.assign(**{'Predicted score': model.predict(x_test)})

def highlight(val):
    """Return the background style for one row of a score-comparison table.

    Parameters
    ----------
    val : mapping-like row
        Must provide 'User score' and 'Predicted score'.

    Returns
    -------
    list of str
        The same CSS declaration twice (one entry per displayed column):
        green when the scores are equal, yellow when the user score is exactly
        1 or 2 above or below the prediction, red otherwise.
    """
    actual = val['User score']
    predicted = val['Predicted score']

    if actual == predicted:
        style = 'background: green'
    elif any(actual == predicted + offset for offset in (1, 2, -1, -2)):
        style = 'background: yellow'
    else:
        style = 'background: red'

    return [style, style]
    
# Colour every row of the comparison table by how close the prediction is.
compare_log.style.apply(highlight, axis='columns')

Unnamed: 0_level_0,User score,Predicted score
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
5f491885298cf94b214e8f40,3,9
5f4caa47298cf94b214e9991,6,8
5f491c1d298cf94b214e90a4,1,9
5f4cb652298cf94b214ea0a8,6,8
5f4cb886298cf94b214ea3ca,8,8
5f4629d78d62faf2c4d4e268,1,8
5f4caa60298cf94b214e99a8,10,8
5f4911a5298cf94b214e8967,9,8
5f48ee847495efe38e28c50b,7,8
5f491278298cf94b214e8b6e,4,1


# Polynomial regression

In [9]:
# Creating polynomial
# Expand the two threshold features into all polynomial terms up to degree 4.
# The constant column is left out (include_bias=False).
poly = PolynomialFeatures(degree=4, include_bias=False)
x2_train = poly.fit_transform(x_train)  # the expansion is fitted on the training data only
x2_test = poly.transform(x_test)        # the test data reuses the fitted expansion

# Creating model
# Linear regression on the expanded features. LinearRegression is imported explicitly
# from sklearn.linear_model in the imports cell instead of relying on `from ml import *`.
# NOTE: from this cell onwards `model` refers to this regressor and no longer to the
# logistic-regression classifier trained earlier.
model = LinearRegression()
model.fit(x2_train, y_train)

LinearRegression()

In [10]:
# # Training set
# compare_pol = y_train.copy()
# compare_pol['Predicted score'] = model.predict(x2_train).round()

# def highlight(val):
#     if val['User score'] == val['Predicted score']:
#         return ['background: green']*2 #because the green is "True" for one column by default but we have 2
#     elif val['User score'] == val['Predicted score']+1:
#         return ['background: yellow']*2
#     elif val['User score'] == val['Predicted score']-1:
#         return ['background: yellow']*2
#     else:
#         return ['background: red']*2
    
# compare_pol.style.apply(highlight, axis=1)

In [11]:
# Test set: put the rounded polynomial-regression prediction next to each true user score.
# The predictions are rounded but not clipped, so values outside the score scale stay visible.
compare_pol = y_test.assign(**{'Predicted score': model.predict(x2_test).round()})

def highlight(val):
    """Return the background style for one row of a score-comparison table.

    Parameters
    ----------
    val : mapping-like row
        Must provide 'User score' and 'Predicted score'.

    Returns
    -------
    list of str
        The same CSS declaration twice (one entry per displayed column):
        green when the scores are equal, yellow when the user score is exactly
        1 or 2 above or below the prediction, red otherwise.
    """
    actual = val['User score']
    predicted = val['Predicted score']

    if actual == predicted:
        style = 'background: green'
    elif any(actual == predicted + offset for offset in (1, 2, -1, -2)):
        style = 'background: yellow'
    else:
        style = 'background: red'

    return [style, style]
    
# Colour every row of the comparison table by how close the prediction is.
compare_pol.style.apply(highlight, axis='columns')

Unnamed: 0_level_0,User score,Predicted score
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
5f491885298cf94b214e8f40,3,3.0
5f4caa47298cf94b214e9991,6,8.0
5f491c1d298cf94b214e90a4,1,4.0
5f4cb652298cf94b214ea0a8,6,12.0
5f4cb886298cf94b214ea3ca,8,7.0
5f4629d78d62faf2c4d4e268,1,19.0
5f4caa60298cf94b214e99a8,10,19.0
5f4911a5298cf94b214e8967,9,7.0
5f48ee847495efe38e28c50b,7,9.0
5f491278298cf94b214e8b6e,4,5.0


# Output

Runs 5f491c1d298cf94b214e90a4 and 5f4629d78d62faf2c4d4e268 are predicted extremely poorly by both models. They should be looked into, as they could be outliers. The results are not very promising: the predictions shoot outside the bounds of 0-10 and are not very accurate. Regardless, we decided to run a small experiment to look into this a little further and gain more insight, as our problem owner really saw potential in the idea.