In [1]:
import csv
import numpy as np
import sklearn
from sklearn.model_selection import KFold
import math
from sklearn.metrics import accuracy_score

In [2]:
# Read the training data file
szDatasetPath = 'winequality-white.csv'
listClasses = []
listAttrs = []
bFirstRow = True
with open(szDatasetPath) as csvFile:
    csvReader = csv.reader(csvFile, delimiter=',')
    for row in csvReader:
        if bFirstRow:
            bFirstRow = False
            continue
        if int(row[-1]) < 6:
            listClasses.append(-1)
            listAttrs.append(list(map(float, row[1:len(row) - 1])))
        elif int(row[-1]) > 6:
            listClasses.append(+1)
            listAttrs.append(list(map(float, row[1:len(row) - 1])))

dataX = np.array(listAttrs)
dataY = np.array(listClasses)


In [65]:
def logistic_loss(train_y, pred_y):
    toMinimize = np.dot(train_y, pred_y)
    log_loss = np.log(1 + math.exp(-toMinimize))
    return log_loss

def hinge_loss(train_y, pred_y):
    toMinimize = np.dot(train_y, pred_y)
    if (1 - toMinimize > 0):
        return 1 - toMinimize
    else:
        return 0 

'''
The regularizers shall compute the loss without considering the bias term in the weights
'''

def l1_reg(w):
    # take sum of absolute values 
    l1_loss = 0;
    for i in range(len(w)):
        l1_loss += abs(w[i])  
    return l1_loss

def l2_reg(w):
    l2_loss = np.dot(w[1:], np.transpose(w[1:]))
    return np.sqrt(l2_loss)

def test_classifier(w, test_x):
    pred_y = np.zeros(len(test_x))
    for i in range(len(test_x)):
        pred_y[i] = np.dot(w[1:], test_x[i]) + w[0]
    return pred_y    

In [89]:
#Classifier with lambda and regularizer
def train_classifier(train_x, train_y, learn_rate, loss, lambda_val=None, regularizer=None):
    
    if(lambda_val!=None and regularizer!=None):
        # w = w - learning_rate * deriv(loss function/w)    
        weight_vector = np.random.rand(len(dataX[0]) + 1)/100 # bias term included
        num_iters = 10
        # numerical differentiation 
        h = 0.0001
        for i in range(num_iters):
            current_weight = np.copy(weight_vector)

            delta_weight = np.zeros(len(dataX[0]) + 1) # produce delta_weight to update weight w = w - delta_weight

            pred_y = test_classifier(current_weight,train_x) #sua
            current_loss = loss(train_y, pred_y) #sua

            for i in range(len(delta_weight)):
                temp_current_weight = np.copy(current_weight)
                temp_current_weight[i] = temp_current_weight[i] + h
                temp_pred_y = test_classifier(temp_current_weight,dataX)

                # produce loss
                temp_loss = loss(dataY, temp_pred_y)

                # partial differentiation
                delta_weight[i] = (temp_loss - current_loss) / h

            # update weight vector :W = W - n dl/dW    
            weight_vector = current_weight - learn_rate * delta_weight    
        return weight_vector
    
    else:
        # create pred_y --> goes into loss function  

        # w = w - learning_rate * deriv(loss function/w)    
        weight_vector = np.random.rand(len(dataX[0]) + 1) # bias term included
        num_iters = 10
        # numerical differentiation 
        h = 0.001
        for i in range(num_iters):
            current_weight = np.copy(weight_vector)

            delta_weight = np.zeros(len(dataX[0]) + 1) # produce delta_weight to update weight w = w - delta_weight

            pred_y = test_classifier(current_weight,train_x) #sua
            current_loss = loss(train_y, pred_y) + lambda_val*regularizer(current_weight) #sua

            for i in range(len(delta_weight)):
                temp_current_weight = np.copy(current_weight)
                temp_current_weight[i] = temp_current_weight[i] + h
                temp_pred_y = test_classifier(temp_current_weight,dataX)

                # produce loss
                temp_loss = loss(dataY, temp_pred_y) + regularizer(temp_current_weight)

                # partial differentiation
                delta_weight[i] = (temp_loss - current_loss) / h

            # update weight vector :W = W - n dl/dW    
            weight_vector = current_weight - learn_rate * delta_weight    
        return weight_vector

In [90]:
# Perform feature normalization after spliting the data to training and validation set.
# (F - mean)/std_dev 
def normalize(train_data):
    normalized_data = []
    for i in range(len(train_data[0])):
        temp = train_data[:,i]
        normalized_col = np.copy(temp)
        for j in range(len(temp)):
            normalized_col[j] = (temp[j]-np.mean(temp))/np.std(temp)
        normalized_data.append(list(normalized_col))
    return np.transpose(np.array(normalized_data))

In [100]:
# 5-fold cross-validation
kf = KFold(n_splits=5)
kf.get_n_splits(dataX)
for train_index, test_index in kf.split(dataX):
    X_train, X_test = dataX[train_index], dataX[test_index]
    y_train, y_test = dataY[train_index], dataY[test_index]
    
    X_train = normalize(X_train)
    w=train_classifier(X_train,y_train,0.00001,hinge_loss,0.0001,l2_reg)
    print(w)
    pred = test_classifier(w,normalize(X_test))
    print(logistic_loss(y_test,pred))

[-36.51348398 -36.50999269 -36.50985792 -36.5721983  -36.50895208
 -36.72540038 -37.62417927 -36.5197047  -36.52724969 -36.51177183
 -36.55645977]


OverflowError: math range error