In [1]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss, confusion_matrix
from sklearn.preprocessing import minmax_scale

from numpy.random import default_rng
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

%matplotlib inline
np.set_printoptions(suppress=True, precision=2)
plt.style.use('seaborn') # pretty matplotlib plots
sns.set(font_scale=2)

In [83]:
# Load 3 feature version of x arrays
x_train = np.loadtxt('./data_sneaker_vs_sandal/x_train.csv', delimiter=',', skiprows=1)
x_test = np.loadtxt('./data_sneaker_vs_sandal/x_test.csv', delimiter=',', skiprows=1)

y_train = np.loadtxt('./data_sneaker_vs_sandal/y_train.csv', delimiter=',', skiprows=1)
x_test.shape

(2000, 784)

In [87]:
x_train_flip = np.flip(x_train.reshape(12000,28,28), axis = 2)
x_test_flip = np.flip(x_test.reshape(2000,28,28), axis = 2)
x_train_ft_flip = np.append(x_train, x_train_flip.reshape(12000,784),axis=1)
x_test_ft_flip =  np.append(x_test, x_test_flip.reshape(2000,784),axis=1)
y_train_ft_flip = y_train

In [70]:
# model = LogisticRegression(solver='liblinear')
# model.fit(x_train_ft_flip, y_train_ft_flip)

# prob_train = model.predict_proba(x_train_ft_flip)
# acc_train = model.score(x_train_ft_flip, y_train_ft_flip)
# train_log_loss = log_loss(y_train_ft_flip,prob_train)

In [59]:
x_tr, x_va, y_tr, y_va = train_test_split(x_train_ft_flip, y_train_ft_flip, test_size=0.4)

In [78]:
C_grid = np.logspace(-9, 6, 31)

In [79]:
model_list = list()
train_loss_list = list()
train_acc_list = list()

In [80]:
for C in C_grid:
    model = LogisticRegression(solver='liblinear', C=C,max_iter=100000)
    model.fit(x_train_ft_flip, y_train_ft_flip)
    model_list.append(model)
    
    prob_train = model.predict_proba(x_train_ft_flip)
    acc_train = model.score(x_train_ft_flip, y_train_ft_flip)

    train_log_loss = log_loss(y_train_ft_flip,prob_train)
    train_loss_list.append(train_log_loss)
   
    train_acc_list.append(acc_train)

In [89]:
min_train_loss = min(train_loss_list)
min_train_loss_index = train_loss_list.index(min_train_loss)
best_C =  C_grid[min_train_loss_index]
best_acc = train_acc_list[min_train_loss_index]
best_model = model_list[min_train_loss_index]
yproba1_test = best_model.predict_proba(x_test_ft_flip)[:, 1] 
np.savetxt('yproba1_test.txt', yproba1_test)

In [None]:
# split

In [None]:
model_list = list()
tr_loss_list = list()
va_loss_list = list()
tr_acc_list = list()
va_acc_list = list()

In [44]:
for C in C_grid:
    model = LogisticRegression(solver='liblinear', C=C,max_iter=100000)
    model.fit(x_tr, y_tr)
    model_list.append(model)
    
    prob_tr = model.predict_proba(x_tr)
    prob_va = model.predict_proba(x_va)

    acc_tr = model.score(x_tr, y_tr)
    acc_va = model.score(x_va, y_va)

    tr_log_loss = log_loss(y_tr,prob_tr)
    va_log_loss = log_loss(y_va,prob_va)
    
    tr_loss_list.append(tr_log_loss)
    va_loss_list.append(va_log_loss)
    
    tr_acc_list.append(acc_tr)
    va_acc_list.append(acc_va)

In [45]:
min_va_loss = min(va_loss_list)
min_va_loss_index = va_loss_list.index(min_va_loss)
best_C =  C_grid[min_va_loss_index]
best_acc = va_acc_list[min_va_loss_index]
best_model = model_list[min_va_loss_index]
yproba1_test = best_model.predict_proba(x_test)[:, 1] 
np.savetxt('yproba1_test.txt', yproba1_test)

In [50]:
best_model

LogisticRegression(C=0.31622776601683794, max_iter=100000, solver='liblinear')

In [60]:
# cross validation 
k = 5
kfold = KFold(n_splits=k)
tr_scores = []
va_scores = []

for train_idx, test_idx in kfold.split(x_train):
    x_tr, x_va = x_train[train_idx,:], x_train[test_idx,:]
    y_tr, y_va = y_train[train_idx], y_train[test_idx]
    
    model.fit(x_tr, y_tr)
    pred_tr = model.predict(x_tr)
    pred_va = model.predict(x_va)

    acc_tr = accuracy_score(pred_tr, y_tr)
    acc_va = accuracy_score(pred_va, y_va)
    print("Train accuracy: ", acc_tr)
    print("Test accuracy: ", acc_va)
    
    tr_scores.append(acc_tr)
    va_scores.append(acc_va)
    
print("\nAverage train accuracy: ", np.average(acc_tr))
print("Average test accuracy: ", np.average(acc_va))

Train accuracy:  0.9842708333333333
Test accuracy:  0.9445833333333333
Train accuracy:  0.9857291666666667
Test accuracy:  0.93375
Train accuracy:  0.9848958333333333
Test accuracy:  0.9458333333333333
Train accuracy:  0.9847916666666666
Test accuracy:  0.9391666666666667
Train accuracy:  0.9841666666666666
Test accuracy:  0.9483333333333334

Average train accuracy:  0.9841666666666666
Average test accuracy:  0.9483333333333334


In [None]:
# minmax

In [None]:
X_scaled_train = minmax_scale(x_train_ft_flip)
model = LogisticRegression(solver='liblinear', C=0.031, max_iter=100000)
model.fit(X_scaled_train, y_train_ft_flip)
pred_train = model.predict(X_scaled_train)
acc_train = accuracy_score(pred_train, y_train_ft_flip)
print("Train accuracy: ", acc_train)

yproba1_test = model.predict_proba(x_test)[:, 1] 
np.savetxt('yproba1_test.txt', yproba1_test)
# plt.imshow(np.reshape(x_train_ft_flip[12001], (28,28)), cmap=plt.cm.gray, vmin=0.0, vmax=1.0)