In [None]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Question 1: Developing the code

In this question, we will develop the code for logistic regression where model fitting would be carried out through gradient descent. You can use the code provided in class to assemble your final code here. Particularly the code for sigmoid function $(h(X,\beta))$, negative log-liklihood model cost $(model\_cost(X,y,\beta))$ and gradient descent algorithm $(gradient\_descent(X,y,\beta_0, \alpha, num\_iter))$ will be useful here.

## 1.1

Write a function $data\_preprocessing(X)$, that takes a matrix $X$ and:

*   Standardizes the columns of $X$, i.e. make them centered around 0. You can use the StandardScaler from sklearn for this.
*   Adds a column of 1's to the standardized $X$ matrix. Hence if the input matrix had 4 columns for the 4 features, the returned matrix will have 5 columns with first column containing all 1's. This will provide bias to the classification model.

$data\_preprocessing(X)$ should return this final modified matrix and the standardscaler object learnt.



In [None]:
#create function
def data_preprocessing(X):
  #standardscaler object
  scaler = StandardScaler()

  #standardize columns
  X_standard = scaler.fit_transform(X)

  #add column of 1's
  ones = np.ones((X.shape[0], 1))
  X_result = np.hstack((ones, X_standard))

  return X_result, scaler

In [None]:
#test use of function
X_test_1 = np.array([[3, 6, 3, 7], [4, 8, 3, 6], [5, 7, 2, 9], [9, 6, 4, 1]])
X_result_1, scaler_1 = data_preprocessing(X_test_1)

#display result
print('Input X matrix: \n', X_test_1)
print('\nProcessed X matrix: \n', X_result_1)
print('\nScaler object mean: \n', scaler_1.mean_)
print('\nScalar object standard deviation: \n', scaler_1.scale_)

Input X matrix: 
 [[3 6 3 7]
 [4 8 3 6]
 [5 7 2 9]
 [9 6 4 1]]

Processed X matrix: 
 [[ 1.         -0.98787834 -0.90453403  0.          0.42409446]
 [ 1.         -0.5488213   1.50755672  0.          0.08481889]
 [ 1.         -0.10976426  0.30151134 -1.41421356  1.10264561]
 [ 1.          1.6464639  -0.90453403  1.41421356 -1.61155897]]

Scaler object mean: 
 [5.25 6.75 3.   5.75]

Scalar object standard deviation: 
 [2.27760839 0.8291562  0.70710678 2.94745653]


## 1.2

Write a function $multiclass\_classification(X,y,\beta_0,\alpha,num\_iter)$that implements the multiclass logistic regression algorithm discussed in class. Here the function should

*   First compute the number of classes in the dataset
*   For fitting each individual classifider, modify the target vector as needed.
*   Fit individual binary logistic models using $gradient\_descent()$ function from class.
*   Save the final $\beta$ vectors in a list: $best\_beta$

This function should return the list $(best\_beta)$ containing multiple $\beta$ vectors (one for each class).





In [None]:
#bring in necessary functions from lectures (copied directly from lecture Multiclass_probabilistic_classification)

def h(X,beta):
    ## returns the value of the sigmoid function
    ypred = 1/(1 + np.exp(-1*X.dot(beta)))
    return ypred

def model_cost(X,y,beta):
    ## computes the value of the model fitting cost
    cost = 0
    for j in range(X.shape[0]):
        term1 = y[j]*np.log(h(X[j,:],beta))[0]
        term2 = (1 - y[j])*np.log(1 - h(X[j,:],beta))[0]
        cost = cost + (term1+term2)
    return -1*cost

def gradient_descent(X,y,beta0,alpha,iteration):
    ## Implemented iteration for the gradient descent algorithm
    ypred = h(X,beta0)
    beta = [beta0]
    cost = [model_cost(X,y,beta0)]
    for j in range(iteration):
        grad_l = -1*X.T.dot(y.reshape(-1,1) - ypred)
        beta_new = beta[-1] - alpha * grad_l
        ypred = h(X,beta_new)
        beta.append(beta_new)
        #print(f'beta: {beta_new[0][0],beta_new[1][0]}')
        print('Cost at iteration '+str(j)+' is: ',model_cost(X,y,beta_new))
        cost.append(model_cost(X,y,beta_new))
    return [beta_new,cost]

In [None]:
#create multiclass_classification function
def multiclass_classification(X, y, beta0, alpha, num_iter):
  #compute number of classes in dataset
  classes = list(set(y))
  num_classes = len(classes)

  #create empty array for best beta
  best_beta = []

  #utilize steps from lecture for finding betas
  for i in classes:
    #modify target vector
    y_new = y.copy()
    y_new[y == i] = 1
    y_new[y!= i] = 0

    #utilize gradient_descent
    beta_new, cost = gradient_descent(X, y_new, beta0, alpha, num_iter)
    best_beta.append(beta_new)

  return best_beta

## 1.3

Write a function $predict\_multiclass(X,best\_beta)$, that:

*   Computes the number of possible classes from the target vector $best\_beta$.
*   For each sample of $X$, predict the probability of each of the class. Hence if you had 100 samples in $X$ and 3 possible classes, this step should generate a 100 x 3 matrix. Name this matrix: $prob\_multiclass$
*   Predict the final class label using $prob\_multiclass$ and save these predicted labels in $label\_multiclass$ vector.

This function should return the matrix $prob\_multiclass$ and vector $label\_multiclass$.




In [None]:
#utilize logic from pred(X, beta_vec)
def predict_multiclass(X, best_beta):

  #define number of classes
  num_classes = len(best_beta)

  #create empty array for probabilities
  prob1 = []

  #for loop for X generate probabiltiies of classes
  for i in X:
    temp = []
    for j in range(num_classes):
      beta = best_beta[j].ravel()
      prob = np.exp(i.dot(beta))
      temp.append(prob)

    prob1.append([ii / sum(temp) for ii in temp])
  probability = np.array(prob1)

  #create probability multiclass matrix
  prob_multiclass = probability.reshape(len(X), len(best_beta))

  #create label multiclass vector
  label_multiclass = np.argmax(prob_multiclass, axis = 1)

  return prob_multiclass, label_multiclass

# Question 2: Applying Logistic Regression code

For this problem we will be using the $wine$ dataset. The dataset is available directly from sklearn. This dataset has 3 classes of wine and 12 features to predict this class.

In [None]:
#load in wine dataset from sklearn
from sklearn.datasets import load_wine
wine = load_wine()

## 2.1

Divide the dataset into training and testing dataset with 30\% of the samples in the testing set. Use random state 0.

In [None]:
X = wine.data
y = wine.target

#use train test split from sklearn
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

#check sizes to ensure 30% of samples in training set
print(X_train.shape)
print(X_test.shape)

(124, 13)
(54, 13)


## 2.2

Use the function $data\_preprocessing()$ you developed in question 1.1 to standardize $X\_train$ and add a column of 1 to it. Name this modified $X\_train$ as $X\_train\_st$.

In [None]:
#apply data_preprocessing function
X_train_st, scaler_st = data_preprocessing(X_train)

In [None]:
#show results, ensure column of ones is applied
print(X_train_st)

[[ 1.          0.91083058 -0.46259897 ...  0.65706596  1.94354495
   0.93700997]
 [ 1.         -0.95609928 -0.96608672 ... -0.40859506  0.58118003
  -1.41336684]
 [ 1.          0.35952243  1.67501572 ... -1.55950896 -1.44846566
   0.28683658]
 ...
 [ 1.         -0.70550467 -0.68342693 ...  0.44393375  0.49776993
  -1.30608823]
 [ 1.          1.14889546 -0.6215951  ... -0.19546286  1.0121322
   0.77446662]
 [ 1.          1.47466845  0.11155374 ... -1.43162964 -1.23994042
  -0.28206514]]


## 2.3

By experimenting with different initialization for $\alpha$ and $\beta$ (assuming 500 iterations), and using the function $multi\_classification()$ you developed in question 1.2, fit the multiclass logistic regression on the training set $(X\_train\_st, y\_train)$.

In [None]:
#DO NOT RUN THIS CELL - runtime ~ 5min
#Testing purposes for alpha and beta values.

from sklearn.metrics import accuracy_score

alpha_values = [0.001, 0.01, 0.05, 0.1, 0.5]
beta0_values = [0.0001*np.ones([X.shape[1]+1,1]), 0.001*np.ones([X.shape[1]+1,1]), 0.01*np.ones([X.shape[1]+1,1]), 0.05*np.ones([X.shape[1]+1,1]), 0.1*np.ones([X.shape[1]+1,1])]

num_iter = 500

results = {}

best_combo = None
best_accuracy = 0.0

X_test = scaler_st.fit_transform(X_test)
X_test_st = np.concatenate((np.ones([len(X_test), 1]), X_test), axis = 1)

for alpha in alpha_values:
  for beta0 in beta0_values:
    best_beta = multiclass_classification(X_train_st, y_train, beta0, alpha, num_iter)
    prob_test, label_test = predict_multiclass(X_test_st, best_beta)
    accuracy = accuracy_score(y_test, label_test)

    beta_tuple = tuple(np.array(best_beta).flatten())
    results[(alpha, beta_tuple)] = {'Best Beta': best_beta}

    if accuracy > best_accuracy:
      best_accuracy = accuracy
      best_combination = (alpha, beta_tuple)

print("\nBest Hyperparameters:")
print(f"Alpha: {best_combination[0]}, Beta Tuple: {best_combination[1]}")
print(f"Best Accuracy: {best_accuracy}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Cost at iteration 477 is:  0.6785405996121907
Cost at iteration 478 is:  0.677369187964183
Cost at iteration 479 is:  0.6762020812603442
Cost at iteration 480 is:  0.675039255071149
Cost at iteration 481 is:  0.6738806851549716
Cost at iteration 482 is:  0.6727263474562657
Cost at iteration 483 is:  0.671576218103754
Cost at iteration 484 is:  0.6704302734086478
Cost at iteration 485 is:  0.6692884898628951
Cost at iteration 486 is:  0.6681508441374212
Cost at iteration 487 is:  0.6670173130804269
Cost at iteration 488 is:  0.6658878737156765
Cost at iteration 489 is:  0.6647625032408175
Cost at iteration 490 is:  0.6636411790257248
Cost at iteration 491 is:  0.6625238786108539
Cost at iteration 492 is:  0.6614105797056185
Cost at iteration 493 is:  0.6603012601867839
Cost at iteration 494 is:  0.6591958980968827
Cost at iteration 495 is:  0.6580944716426467
Cost at iteration 496 is:  0.656996959193461
Cost at iteration 4

  term2 = (1 - y[j])*np.log(1 - h(X[j,:],beta))[0]
  term2 = (1 - y[j])*np.log(1 - h(X[j,:],beta))[0]


Cost at iteration 477 is:  nan
Cost at iteration 478 is:  nan
Cost at iteration 479 is:  nan
Cost at iteration 480 is:  nan
Cost at iteration 481 is:  nan
Cost at iteration 482 is:  nan
Cost at iteration 483 is:  nan
Cost at iteration 484 is:  nan
Cost at iteration 485 is:  nan
Cost at iteration 486 is:  nan
Cost at iteration 487 is:  nan
Cost at iteration 488 is:  nan
Cost at iteration 489 is:  nan
Cost at iteration 490 is:  nan
Cost at iteration 491 is:  nan
Cost at iteration 492 is:  nan
Cost at iteration 493 is:  nan
Cost at iteration 494 is:  nan
Cost at iteration 495 is:  nan
Cost at iteration 496 is:  nan
Cost at iteration 497 is:  nan
Cost at iteration 498 is:  nan
Cost at iteration 499 is:  nan
Cost at iteration 0 is:  40.16494069582217
Cost at iteration 1 is:  17.019676282939677
Cost at iteration 2 is:  10.386824973077943
Cost at iteration 3 is:  7.130343422719961
Cost at iteration 4 is:  5.171075218446944
Cost at iteration 5 is:  4.044366714551041
Cost at iteration 6 is:  3.

  term2 = (1 - y[j])*np.log(1 - h(X[j,:],beta))[0]
  term2 = (1 - y[j])*np.log(1 - h(X[j,:],beta))[0]


 nan
Cost at iteration 492 is:  nan
Cost at iteration 493 is:  nan
Cost at iteration 494 is:  nan
Cost at iteration 495 is:  nan
Cost at iteration 496 is:  nan
Cost at iteration 497 is:  nan
Cost at iteration 498 is:  nan
Cost at iteration 499 is:  nan
Cost at iteration 0 is:  40.34263551752081
Cost at iteration 1 is:  17.09044044785458
Cost at iteration 2 is:  10.420583856586866
Cost at iteration 3 is:  7.151379895265716
Cost at iteration 4 is:  5.182003475432519
Cost at iteration 5 is:  4.0476020206011025
Cost at iteration 6 is:  3.414461755198851
Cost at iteration 7 is:  3.041757119651255
Cost at iteration 8 is:  2.798182782997836
Cost at iteration 9 is:  2.620595714976607
Cost at iteration 10 is:  2.4805312083781828
Cost at iteration 11 is:  2.364856211361473
Cost at iteration 12 is:  2.2667224627834375
Cost at iteration 13 is:  2.181973878937858
Cost at iteration 14 is:  2.1077836488926893
Cost at iteration 15 is:  2.042095917946795
Cost at iteration 16 is:  1.9833585547975459
Cos

  term2 = (1 - y[j])*np.log(1 - h(X[j,:],beta))[0]
  term2 = (1 - y[j])*np.log(1 - h(X[j,:],beta))[0]


Cost at iteration 483 is:  nan
Cost at iteration 484 is:  nan
Cost at iteration 485 is:  nan
Cost at iteration 486 is:  nan
Cost at iteration 487 is:  nan
Cost at iteration 488 is:  nan
Cost at iteration 489 is:  nan
Cost at iteration 490 is:  nan
Cost at iteration 491 is:  nan
Cost at iteration 492 is:  nan
Cost at iteration 493 is:  nan
Cost at iteration 494 is:  nan
Cost at iteration 495 is:  nan
Cost at iteration 496 is:  nan
Cost at iteration 497 is:  nan
Cost at iteration 498 is:  nan
Cost at iteration 499 is:  nan
Cost at iteration 0 is:  42.138517675260864
Cost at iteration 1 is:  17.809582819620132
Cost at iteration 2 is:  10.759382619386665
Cost at iteration 3 is:  7.365209483175607
Cost at iteration 4 is:  5.297102642105391
Cost at iteration 5 is:  4.085434685875324
Cost at iteration 6 is:  3.4072495122316684
Cost at iteration 7 is:  3.014137430762753
Cost at iteration 8 is:  2.7643076774234387
Cost at iteration 9 is:  2.5868387016509247
Cost at iteration 10 is:  2.448980885

  term2 = (1 - y[j])*np.log(1 - h(X[j,:],beta))[0]
  term2 = (1 - y[j])*np.log(1 - h(X[j,:],beta))[0]


Cost at iteration 465 is:  nan
Cost at iteration 466 is:  nan
Cost at iteration 467 is:  nan
Cost at iteration 468 is:  nan
Cost at iteration 469 is:  nan
Cost at iteration 470 is:  nan
Cost at iteration 471 is:  nan
Cost at iteration 472 is:  nan
Cost at iteration 473 is:  nan
Cost at iteration 474 is:  nan
Cost at iteration 475 is:  nan
Cost at iteration 476 is:  nan
Cost at iteration 477 is:  nan
Cost at iteration 478 is:  nan
Cost at iteration 479 is:  nan
Cost at iteration 480 is:  nan
Cost at iteration 481 is:  nan
Cost at iteration 482 is:  nan
Cost at iteration 483 is:  nan
Cost at iteration 484 is:  nan
Cost at iteration 485 is:  nan
Cost at iteration 486 is:  nan
Cost at iteration 487 is:  nan
Cost at iteration 488 is:  nan
Cost at iteration 489 is:  nan
Cost at iteration 490 is:  nan
Cost at iteration 491 is:  nan
Cost at iteration 492 is:  nan
Cost at iteration 493 is:  nan
Cost at iteration 494 is:  nan
Cost at iteration 495 is:  nan
Cost at iteration 496 is:  nan
Cost at 

  term2 = (1 - y[j])*np.log(1 - h(X[j,:],beta))[0]
  term2 = (1 - y[j])*np.log(1 - h(X[j,:],beta))[0]


Cost at iteration 475 is:  nan
Cost at iteration 476 is:  nan
Cost at iteration 477 is:  nan
Cost at iteration 478 is:  nan
Cost at iteration 479 is:  nan
Cost at iteration 480 is:  nan
Cost at iteration 481 is:  nan
Cost at iteration 482 is:  nan
Cost at iteration 483 is:  nan
Cost at iteration 484 is:  nan
Cost at iteration 485 is:  nan
Cost at iteration 486 is:  nan
Cost at iteration 487 is:  nan
Cost at iteration 488 is:  nan
Cost at iteration 489 is:  nan
Cost at iteration 490 is:  nan
Cost at iteration 491 is:  nan
Cost at iteration 492 is:  nan
Cost at iteration 493 is:  nan
Cost at iteration 494 is:  nan
Cost at iteration 495 is:  nan
Cost at iteration 496 is:  nan
Cost at iteration 497 is:  nan
Cost at iteration 498 is:  nan
Cost at iteration 499 is:  nan
Cost at iteration 0 is:  61.006729119520415
Cost at iteration 1 is:  25.28872882472914
Cost at iteration 2 is:  13.85352309671964
Cost at iteration 3 is:  9.451064189039975
Cost at iteration 4 is:  6.684802042295884
Cost at i

  term2 = (1 - y[j])*np.log(1 - h(X[j,:],beta))[0]
  term2 = (1 - y[j])*np.log(1 - h(X[j,:],beta))[0]


Cost at iteration 3 is:  nan
Cost at iteration 4 is:  nan
Cost at iteration 5 is:  nan
Cost at iteration 6 is:  nan
Cost at iteration 7 is:  nan
Cost at iteration 8 is:  nan
Cost at iteration 9 is:  nan
Cost at iteration 10 is:  nan
Cost at iteration 11 is:  nan
Cost at iteration 12 is:  nan
Cost at iteration 13 is:  nan
Cost at iteration 14 is:  nan
Cost at iteration 15 is:  nan
Cost at iteration 16 is:  nan
Cost at iteration 17 is:  nan
Cost at iteration 18 is:  nan
Cost at iteration 19 is:  nan
Cost at iteration 20 is:  nan
Cost at iteration 21 is:  nan
Cost at iteration 22 is:  nan
Cost at iteration 23 is:  nan
Cost at iteration 24 is:  nan
Cost at iteration 25 is:  nan
Cost at iteration 26 is:  nan
Cost at iteration 27 is:  nan
Cost at iteration 28 is:  nan
Cost at iteration 29 is:  nan
Cost at iteration 30 is:  nan
Cost at iteration 31 is:  nan
Cost at iteration 32 is:  nan
Cost at iteration 33 is:  nan
Cost at iteration 34 is:  nan
Cost at iteration 35 is:  nan
Cost at iteration

  term2 = (1 - y[j])*np.log(1 - h(X[j,:],beta))[0]
  term2 = (1 - y[j])*np.log(1 - h(X[j,:],beta))[0]


Cost at iteration 2 is:  nan
Cost at iteration 3 is:  nan
Cost at iteration 4 is:  nan
Cost at iteration 5 is:  nan
Cost at iteration 6 is:  nan
Cost at iteration 7 is:  nan
Cost at iteration 8 is:  nan
Cost at iteration 9 is:  nan
Cost at iteration 10 is:  nan
Cost at iteration 11 is:  nan
Cost at iteration 12 is:  nan
Cost at iteration 13 is:  nan
Cost at iteration 14 is:  nan
Cost at iteration 15 is:  nan
Cost at iteration 16 is:  nan
Cost at iteration 17 is:  nan
Cost at iteration 18 is:  nan
Cost at iteration 19 is:  nan
Cost at iteration 20 is:  nan
Cost at iteration 21 is:  nan
Cost at iteration 22 is:  nan
Cost at iteration 23 is:  nan
Cost at iteration 24 is:  nan
Cost at iteration 25 is:  nan
Cost at iteration 26 is:  nan
Cost at iteration 27 is:  nan
Cost at iteration 28 is:  nan
Cost at iteration 29 is:  nan
Cost at iteration 30 is:  nan
Cost at iteration 31 is:  nan
Cost at iteration 32 is:  nan
Cost at iteration 33 is:  nan
Cost at iteration 34 is:  nan
Cost at iteration 

  term2 = (1 - y[j])*np.log(1 - h(X[j,:],beta))[0]
  term2 = (1 - y[j])*np.log(1 - h(X[j,:],beta))[0]


Cost at iteration 12 is:  nan
Cost at iteration 13 is:  nan
Cost at iteration 14 is:  nan
Cost at iteration 15 is:  nan
Cost at iteration 16 is:  nan
Cost at iteration 17 is:  nan
Cost at iteration 18 is:  nan
Cost at iteration 19 is:  nan
Cost at iteration 20 is:  nan
Cost at iteration 21 is:  nan
Cost at iteration 22 is:  nan
Cost at iteration 23 is:  nan
Cost at iteration 24 is:  nan
Cost at iteration 25 is:  nan
Cost at iteration 26 is:  nan
Cost at iteration 27 is:  nan
Cost at iteration 28 is:  nan
Cost at iteration 29 is:  nan
Cost at iteration 30 is:  nan
Cost at iteration 31 is:  nan
Cost at iteration 32 is:  nan
Cost at iteration 33 is:  nan
Cost at iteration 34 is:  nan
Cost at iteration 35 is:  nan
Cost at iteration 36 is:  nan
Cost at iteration 37 is:  nan
Cost at iteration 38 is:  nan
Cost at iteration 39 is:  nan
Cost at iteration 40 is:  nan
Cost at iteration 41 is:  nan
Cost at iteration 42 is:  nan
Cost at iteration 43 is:  nan
Cost at iteration 44 is:  nan
Cost at it

  term2 = (1 - y[j])*np.log(1 - h(X[j,:],beta))[0]
  term2 = (1 - y[j])*np.log(1 - h(X[j,:],beta))[0]


Cost at iteration 7 is:  1.0561185953193564
Cost at iteration 8 is:  0.9288165056809433
Cost at iteration 9 is:  0.8336495633851413
Cost at iteration 10 is:  0.759899471455184
Cost at iteration 11 is:  0.700960810549008
Cost at iteration 12 is:  0.652621747638367
Cost at iteration 13 is:  0.6121073436957758
Cost at iteration 14 is:  0.577532503078759
Cost at iteration 15 is:  0.5475796812407009
Cost at iteration 16 is:  0.521303428630813
Cost at iteration 17 is:  0.498008602624025
Cost at iteration 18 is:  0.4771725495511013
Cost at iteration 19 is:  0.4583942012351716
Cost at iteration 20 is:  0.44136003433950133
Cost at iteration 21 is:  0.4258208263581825
Cost at iteration 22 is:  0.4115754666043203
Cost at iteration 23 is:  0.39845946712485564
Cost at iteration 24 is:  0.386336662773953
Cost at iteration 25 is:  0.37509311368481324
Cost at iteration 26 is:  0.36463255446555504
Cost at iteration 27 is:  0.3548729472063276
Cost at iteration 28 is:  0.3457438343355839
Cost at iteratio

  term2 = (1 - y[j])*np.log(1 - h(X[j,:],beta))[0]
  term2 = (1 - y[j])*np.log(1 - h(X[j,:],beta))[0]


Cost at iteration 13 is:  0.6434390114887155
Cost at iteration 14 is:  0.6086984181578434
Cost at iteration 15 is:  0.5785930015979143
Cost at iteration 16 is:  0.5521942692712194
Cost at iteration 17 is:  0.5288119354121066
Cost at iteration 18 is:  0.5079223112026839
Cost at iteration 19 is:  0.4891207006989821
Cost at iteration 20 is:  0.4720890703740249
Cost at iteration 21 is:  0.4565736530663243
Cost at iteration 22 is:  0.4423691454645381
Cost at iteration 23 is:  0.4293073642904772
Cost at iteration 24 is:  0.41724897050694754
Cost at iteration 25 is:  0.4060773393994216
Cost at iteration 26 is:  0.39569395485024167
Cost at iteration 27 is:  0.386014902125654
Cost at iteration 28 is:  0.3769681633921254
Cost at iteration 29 is:  0.36849150755316734
Cost at iteration 30 is:  0.3605308256014837
Cost at iteration 31 is:  0.3530388038865828
Cost at iteration 32 is:  0.3459738565487225
Cost at iteration 33 is:  0.3392992588177793
Cost at iteration 34 is:  0.332982437537853
Cost at i

  term2 = (1 - y[j])*np.log(1 - h(X[j,:],beta))[0]
  term2 = (1 - y[j])*np.log(1 - h(X[j,:],beta))[0]


Cost at iteration 19 is:  nan
Cost at iteration 20 is:  nan
Cost at iteration 21 is:  nan
Cost at iteration 22 is:  nan
Cost at iteration 23 is:  nan
Cost at iteration 24 is:  nan
Cost at iteration 25 is:  nan
Cost at iteration 26 is:  nan
Cost at iteration 27 is:  nan
Cost at iteration 28 is:  nan
Cost at iteration 29 is:  nan
Cost at iteration 30 is:  nan
Cost at iteration 31 is:  nan
Cost at iteration 32 is:  nan
Cost at iteration 33 is:  nan
Cost at iteration 34 is:  nan
Cost at iteration 35 is:  nan
Cost at iteration 36 is:  nan
Cost at iteration 37 is:  nan
Cost at iteration 38 is:  nan
Cost at iteration 39 is:  nan
Cost at iteration 40 is:  nan
Cost at iteration 41 is:  nan
Cost at iteration 42 is:  nan
Cost at iteration 43 is:  nan
Cost at iteration 44 is:  nan
Cost at iteration 45 is:  nan
Cost at iteration 46 is:  nan
Cost at iteration 47 is:  nan
Cost at iteration 48 is:  nan
Cost at iteration 49 is:  nan
Cost at iteration 50 is:  nan
Cost at iteration 51 is:  nan
Cost at it

  term2 = (1 - y[j])*np.log(1 - h(X[j,:],beta))[0]
  term2 = (1 - y[j])*np.log(1 - h(X[j,:],beta))[0]


Cost at iteration 9 is:  nan
Cost at iteration 10 is:  nan
Cost at iteration 11 is:  nan
Cost at iteration 12 is:  nan
Cost at iteration 13 is:  nan
Cost at iteration 14 is:  nan
Cost at iteration 15 is:  nan
Cost at iteration 16 is:  nan
Cost at iteration 17 is:  nan
Cost at iteration 18 is:  nan
Cost at iteration 19 is:  nan
Cost at iteration 20 is:  nan
Cost at iteration 21 is:  nan
Cost at iteration 22 is:  nan
Cost at iteration 23 is:  nan
Cost at iteration 24 is:  nan
Cost at iteration 25 is:  nan
Cost at iteration 26 is:  nan
Cost at iteration 27 is:  nan
Cost at iteration 28 is:  nan
Cost at iteration 29 is:  nan
Cost at iteration 30 is:  nan
Cost at iteration 31 is:  nan
Cost at iteration 32 is:  nan
Cost at iteration 33 is:  nan
Cost at iteration 34 is:  nan
Cost at iteration 35 is:  nan
Cost at iteration 36 is:  nan
Cost at iteration 37 is:  nan
Cost at iteration 38 is:  nan
Cost at iteration 39 is:  nan
Cost at iteration 40 is:  nan
Cost at iteration 41 is:  nan
Cost at ite

  term2 = (1 - y[j])*np.log(1 - h(X[j,:],beta))[0]
  term2 = (1 - y[j])*np.log(1 - h(X[j,:],beta))[0]


Cost at iteration 9 is:  nan
Cost at iteration 10 is:  nan
Cost at iteration 11 is:  nan
Cost at iteration 12 is:  nan
Cost at iteration 13 is:  nan
Cost at iteration 14 is:  nan
Cost at iteration 15 is:  nan
Cost at iteration 16 is:  nan
Cost at iteration 17 is:  nan
Cost at iteration 18 is:  nan
Cost at iteration 19 is:  nan
Cost at iteration 20 is:  nan
Cost at iteration 21 is:  nan
Cost at iteration 22 is:  nan
Cost at iteration 23 is:  nan
Cost at iteration 24 is:  nan
Cost at iteration 25 is:  nan
Cost at iteration 26 is:  nan
Cost at iteration 27 is:  nan
Cost at iteration 28 is:  nan
Cost at iteration 29 is:  nan
Cost at iteration 30 is:  nan
Cost at iteration 31 is:  nan
Cost at iteration 32 is:  nan
Cost at iteration 33 is:  nan
Cost at iteration 34 is:  nan
Cost at iteration 35 is:  nan
Cost at iteration 36 is:  nan
Cost at iteration 37 is:  nan
Cost at iteration 38 is:  nan
Cost at iteration 39 is:  nan
Cost at iteration 40 is:  nan
Cost at iteration 41 is:  nan
Cost at ite

  term2 = (1 - y[j])*np.log(1 - h(X[j,:],beta))[0]
  term2 = (1 - y[j])*np.log(1 - h(X[j,:],beta))[0]


Cost at iteration 15 is:  nan
Cost at iteration 16 is:  nan
Cost at iteration 17 is:  nan
Cost at iteration 18 is:  nan
Cost at iteration 19 is:  nan
Cost at iteration 20 is:  nan
Cost at iteration 21 is:  nan
Cost at iteration 22 is:  nan
Cost at iteration 23 is:  nan
Cost at iteration 24 is:  nan
Cost at iteration 25 is:  nan
Cost at iteration 26 is:  nan
Cost at iteration 27 is:  nan
Cost at iteration 28 is:  nan
Cost at iteration 29 is:  nan
Cost at iteration 30 is:  nan
Cost at iteration 31 is:  nan
Cost at iteration 32 is:  nan
Cost at iteration 33 is:  nan
Cost at iteration 34 is:  nan
Cost at iteration 35 is:  nan
Cost at iteration 36 is:  nan
Cost at iteration 37 is:  nan
Cost at iteration 38 is:  nan
Cost at iteration 39 is:  nan
Cost at iteration 40 is:  nan
Cost at iteration 41 is:  nan
Cost at iteration 42 is:  nan
Cost at iteration 43 is:  nan
Cost at iteration 44 is:  nan
Cost at iteration 45 is:  nan
Cost at iteration 46 is:  nan
Cost at iteration 47 is:  nan
Cost at it

  term2 = (1 - y[j])*np.log(1 - h(X[j,:],beta))[0]
  term2 = (1 - y[j])*np.log(1 - h(X[j,:],beta))[0]


Cost at iteration 2 is:  nan
Cost at iteration 3 is:  nan
Cost at iteration 4 is:  nan
Cost at iteration 5 is:  nan
Cost at iteration 6 is:  nan
Cost at iteration 7 is:  nan
Cost at iteration 8 is:  nan
Cost at iteration 9 is:  nan
Cost at iteration 10 is:  nan
Cost at iteration 11 is:  nan
Cost at iteration 12 is:  nan
Cost at iteration 13 is:  nan
Cost at iteration 14 is:  nan
Cost at iteration 15 is:  nan
Cost at iteration 16 is:  nan
Cost at iteration 17 is:  nan
Cost at iteration 18 is:  nan
Cost at iteration 19 is:  nan
Cost at iteration 20 is:  nan
Cost at iteration 21 is:  nan
Cost at iteration 22 is:  nan
Cost at iteration 23 is:  nan
Cost at iteration 24 is:  nan
Cost at iteration 25 is:  nan
Cost at iteration 26 is:  nan
Cost at iteration 27 is:  nan
Cost at iteration 28 is:  nan
Cost at iteration 29 is:  nan
Cost at iteration 30 is:  nan
Cost at iteration 31 is:  nan
Cost at iteration 32 is:  nan
Cost at iteration 33 is:  nan
Cost at iteration 34 is:  nan
Cost at iteration 

In [None]:
beta0 = 0.0001*np.ones([X.shape[1]+1,1])
alpha = 0.001
num_iter = 500
best_beta = multiclass_classification(X_train_st, y_train, beta0, alpha, num_iter)

Cost at iteration 0 is:  75.2789547052209
Cost at iteration 1 is:  67.19553547435574
Cost at iteration 2 is:  60.9387735804044
Cost at iteration 3 is:  55.98019346603246
Cost at iteration 4 is:  51.95806303038975
Cost at iteration 5 is:  48.62595758260991
Cost at iteration 6 is:  45.81404421315966
Cost at iteration 7 is:  43.4030616054382
Cost at iteration 8 is:  41.30747200924431
Cost at iteration 9 is:  39.46460986261543
Cost at iteration 10 is:  37.82762835377315
Cost at iteration 11 is:  36.36084235563676
Cost at iteration 12 is:  35.03659695941913
Cost at iteration 13 is:  33.8331216718683
Cost at iteration 14 is:  32.73303247043828
Cost at iteration 15 is:  31.7222673769808
Cost at iteration 16 is:  30.789317315143624
Cost at iteration 17 is:  29.92466156720651
Cost at iteration 18 is:  29.12034732175524
Cost at iteration 19 is:  28.36967226720676
Cost at iteration 20 is:  27.66694194630978
Cost at iteration 21 is:  27.007282085987303
Cost at iteration 22 is:  26.3864918652087
Co

In [None]:
print(tuple(np.array(best_beta).flatten()))

(-1.5671107237744462, 1.303092668915191, 0.35929692964581783, 0.7635697090967184, -1.4189960485064903, -0.04749276316134652, 0.5114531756727843, 1.0820198515453745, -0.14836905869704312, -0.1665092548471237, 0.2843213045495142, -0.004873468460159282, 0.9385491756584317, 1.8834511540689605, -1.177587685104686, -1.4927057007360092, -0.8082079418029153, -1.1802777911821034, 0.8278267209830801, -0.15418925041149234, -0.02854893893584923, 0.22488535592895037, 0.2935235577659074, 0.43358638460890103, -1.8146633780320258, 1.0217524736899912, 0.27865242905298027, -1.8506432599644702, -2.1854800299536765, 0.4337018388841976, 0.5064335498660081, 0.4352697434381746, 0.2976994877169169, 0.29218422826726237, -0.18882234319473362, -1.2051988042019446, -0.0008260602647677797, -0.445874079468241, 1.5630419822792792, -1.0219103557632496, -1.13662446272789, -0.00414853346515825)


## 2.4

Using the standardscaler object from question 2.2, standardize $X\_test$. Also, then add a column of 1 as the first column.

In [None]:
#standardize x_test
X_test = scaler_st.fit_transform(X_test)
X_test_st = np.concatenate((np.ones([len(X_test), 1]), X_test), axis = 1)

In [None]:
print(X_test_st)

[[ 1.00000000e+00  8.38394341e-01 -5.24382663e-01 -4.61079968e-01
  -1.11446600e+00  1.42732176e+00  3.27650893e-01  7.09495998e-01
  -1.17251987e+00  1.55841793e-01  2.94490258e-01 -1.94122110e-01
   8.31387018e-01  9.52918164e-01]
 [ 1.00000000e+00 -3.00933598e-01  4.16057849e-01  3.92364623e-01
   9.49309376e-01  9.77902662e-01 -1.33618840e+00 -7.80069444e-01
  -9.32079086e-01 -5.56996903e-01  2.50875878e+00 -2.27175334e+00
  -1.71011086e+00 -8.13918025e-01]
 [ 1.00000000e+00 -8.04636476e-01 -1.03222054e+00 -7.95036547e-01
  -1.56284577e-01 -8.94676903e-01  1.66466461e+00  9.02946055e-01
  -1.33281373e+00  6.50868666e-01 -3.31767506e-01  1.22244464e+00
   3.46592624e-01 -9.96694182e-01]
 [ 1.00000000e+00  6.22521679e-01 -4.67956232e-01  3.18152050e-01
   3.96512399e-01  1.27751539e+00  8.62456381e-01  5.93425963e-01
  -1.25266680e+00  1.79933101e+00  4.73421048e-01  8.91912395e-02
   5.81644451e-01  1.13569432e+00]
 [ 1.00000000e+00  1.08824696e-02  3.35963665e+00 -9.06355407e-01
  

## 2.5

Use the function $predict\_multiclass()$ and make label predictions on the testing set. Then print the accuracy value on testing set.

In [None]:
#apply predict_multiclass() function
prob_test, label_test = predict_multiclass(X_test_st, best_beta)

#display results
print(prob_test)
print(label_test)

[[9.99459067e-01 9.73868076e-05 4.43545991e-04]
 [3.45224664e-07 1.59000538e-07 9.99999496e-01]
 [1.65371439e-04 9.99832785e-01 1.84316021e-06]
 [9.95383075e-01 1.29666316e-03 3.32026154e-03]
 [7.92768125e-03 9.85446337e-01 6.62598173e-03]
 [3.18907687e-03 9.96457426e-01 3.53496709e-04]
 [9.99955594e-01 9.78584069e-07 4.34270880e-05]
 [5.21122694e-06 7.21628566e-05 9.99922626e-01]
 [4.01228873e-05 9.99941052e-01 1.88249700e-05]
 [4.63298872e-06 9.99951136e-01 4.42309775e-05]
 [1.91518615e-03 6.82073616e-04 9.97402740e-01]
 [3.62429482e-06 1.46163495e-05 9.99981759e-01]
 [9.99996056e-01 2.17213908e-08 3.92244702e-06]
 [7.04457004e-03 9.92943111e-01 1.23186297e-05]
 [1.28849217e-05 2.61028425e-06 9.99984505e-01]
 [5.31915761e-08 9.99999942e-01 4.61502094e-09]
 [9.99463071e-01 1.05440090e-04 4.31488512e-04]
 [9.99999122e-01 9.72066223e-10 8.77231390e-07]
 [4.32779759e-04 7.49089662e-01 2.50477559e-01]
 [9.99978212e-01 1.63733314e-05 5.41510295e-06]
 [6.65801612e-03 9.93339012e-01 2.972117

In [None]:
#review y test results to visually compare to label test
y_test

array([0, 2, 1, 0, 1, 1, 0, 2, 1, 1, 2, 2, 0, 1, 2, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 1, 1, 1, 1, 1, 2, 0, 0, 1, 0, 0, 0, 2, 1, 1, 2, 0, 0, 1, 1,
       1, 0, 2, 1, 2, 0, 2, 2, 0, 2])

In [None]:
#import accuracy score metric
from sklearn.metrics import accuracy_score

#compare y_test to label_test
score = accuracy_score(y_test, label_test)

print(score)

1.0


## 2.6

Plot the confusion matrix for the testing set.

In [None]:
#import confusion matrix metric
from sklearn.metrics import confusion_matrix

#compare y_test to label_test
conf_mat = confusion_matrix(y_test, label_test)

print(conf_mat)

[[19  0  0]
 [ 0 22  0]
 [ 0  0 13]]


True positives for class 1: 19

True positives for class 2: 22

True positives for class 3: 13