# Objective

Here, we'll run through our training data to gather information and try to achieve the best possible model

In [1]:
import numpy as np
from sklearn.linear_model import LogisticRegression
import csv
import pandas as pd
import time

In [2]:
# Functions

start = time.clock()
def addColumnThetaZero (array):
    return np.c_[np.ones(array.shape[0]), array]

def formatArray (dataFrame, columnToExtract) :
    array = dataFrame.values
    target = array[:,columnToExtract]
    params = np.delete(array, columnToExtract, axis = 1)
    return params, target

def loadFashionTrainData():
    return pd.read_csv("fashion-mnist-dataset/fashion-mnist_train.csv")

def loadFashionTestData():
    return pd.read_csv("fashion-mnist-dataset/fashion-mnist_test.csv")

def split_train_test(data, test_ratio):
    np.random.seed(42)
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]
print (time.clock() - start)

0.0003269999999999662


In [3]:
fashionTrainDataset = loadFashionTrainData()
fashionTestDataset = loadFashionTestData()

trainSet, validationSet = split_train_test(fashionTrainDataset, 0.2)

fashionTrainParams, fashionTrainTarget = formatArray(trainSet, 0)
fashionValidationSetParams, fashionValidationSetTarget = formatArray(validationSet, 0)

fashionTrainParams = addColumnThetaZero(fashionTrainParams)
fashionValidationSetParams = addColumnThetaZero(fashionValidationSetParams)
print (fashionTrainParams[:5])
print (type(fashionTrainParams))

[[1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]
<class 'numpy.ndarray'>


In [17]:

# C = 1.0
# penalt = 'l1'
# stopFit = 0.0001
# solverMode = 'liblinear'
maxIter = 1000
multiClass = 'ovr'
# talk = True
# reuse = False

In [18]:
# logisticModel = LogisticRegression(C=C, penalty=penalt, tol = stopFit, solver = solverMode, max_iter = maxIter, multi_class = multiClass, verbose = talk, warm_start = reuse)
logisticModel = LogisticRegression(max_iter = maxIter, multi_class = multiClass, verbose = talk)


In [None]:
start = time.clock()
logisticModel.fit(fashionTrainParams, fashionTrainTarget)
print (time.clock() - start)

[LibLinear]

In [8]:
print("Score : "+ str(logisticModel.score(fashionValidationSetParams, fashionValidationSetTarget)))

Score : 0.8421666666666666


In [16]:
import sklearn.metrics as metrics
print(metrics.accuracy_score(logisticModel.predict(fashionValidationSetParams), fashionValidationSetTarget))
print(metrics.precision_recall_fscore_support(logisticModel.predict(fashionValidationSetParams), fashionValidationSetTarget))

0.8421666666666666
(array([0.80925325, 0.95229983, 0.76833333, 0.86070853, 0.76455696,
       0.91586328, 0.56395817, 0.93055556, 0.93124456, 0.94132231]), array([0.79569034, 0.93478261, 0.7557377 , 0.82167563, 0.72829582,
       0.92559787, 0.66319773, 0.91706924, 0.93124456, 0.94210091]), array([0.80241449, 0.94345992, 0.76198347, 0.84073928, 0.745986  ,
       0.92070485, 0.60956522, 0.92376318, 0.93124456, 0.94171145]), array([1253, 1196, 1220, 1301, 1244, 1129, 1057, 1242, 1149, 1209]))
