In [1]:

##########################################################
#  Python script template for Question 2 (IAML Level 10)
#  Note that
#  - You should not change the filename of this file, 'iaml01cw2_q2.py', which is the file name you should use when you submit your code for this question.
#  - You should define the functions shown below in your code.
#  - You can define function arguments (parameters) and returns (attributes) if necessary.
#  - In case you define helper functions, do not define them here, but put them in a separate Python module file, "iaml01cw2_helpers.py", and import it in this script.
#  - For those questions requiring you to show results in tables, your code does not need to present them in tables - just showing them with print() is fine.
#  - You do not need to include this header in your submission
##########################################################

#--- Code for loading the data set and pre-processing --->
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
sys.path.insert(0, os.path.abspath('../helpers'))
from iaml01cw2_helpers import *

# Load the data:
dataPath = os.path.join(os.getcwd(),'../data')
Xtrn, Ytrn, Xtst, Ytst = load_FashionMNIST(dataPath)
Xtrn_orig = Xtrn.copy
Xtst_orig = Xtst.copy
Xtrn = Xtrn/255
Xtst = Xtst/255

Xmean = Xtrn.mean(0)
Xtrn_nm = Xtrn - Xmean
Xtst_nm = Xtst - Xmean
#<----

# Q2.1
def iaml01cw2_q2_1():
    from sklearn.linear_model import LogisticRegression
    import pandas as pd
    
    print("starting...")
    print()
    lr = LogisticRegression()
    lr.fit(Xtrn_nm,Ytrn)
    pred = lr.predict(Xtst_nm)
    
    print("FREQUENCY CONFUSION MATRIX:")
    print()
    cm = pd.crosstab(pred,Ytst,rownames=["Actual"],colnames=["Predicted"])
    print(cm.round(3))
    print()
    print()
    print("PERCENTAGE CONFUSION MATRIX:")
    print()
    cm2 = cm/np.sum(cm,axis=1)
    print((cm2*100).round(1))
    print()
    print()
    print("Total classification accuracy = " + str((((cm*np.identity(10)).sum().sum()/cm.sum().sum())*100).round(3)) + "%")
    
#
# iaml01cw2_q2_1()   # comment this out when you run the function

# Q2.2
def iaml01cw2_q2_2():
    from sklearn.svm import SVC
    import pandas as pd
    
    print("starting...")
    print()
    svc = SVC(kernel = "rbf", C=1.0, gamma = "auto")
    svc.fit(Xtrn_nm,Ytrn)
    pred = svc.predict(Xtst_nm)
    
    print("FREQUENCY CONFUSION MATRIX:")
    print()
    cm = pd.crosstab(pred,Ytst,rownames=["Actual"],colnames=["Predicted"])
    print(cm.round(3))
    print()
    print()
    print("PERCENTAGE CONFUSION MATRIX:")
    print()
    cm2 = cm/np.sum(cm,axis=1)
    print((cm2*100).round(1))
    print()
    print()
    print("Mean classification accuracy = " + str((((cm2*np.identity(10)).sum().sum()/10)*100).round(3)) + "%")
    
#
# iaml01cw2_q2_2()   # comment this out when you run the function

# Q2.3
def iaml01cw2_q2_3():
    from sklearn.linear_model import LogisticRegression
    from sklearn.decomposition import PCA
    import statistics as stats
    from matplotlib.colors import ListedColormap
    from matplotlib import ticker

    print("starting...")
    print()
    lr = LogisticRegression()
    lr.fit(Xtrn_nm[0:100,:],Ytrn[0:100])
    
    pca = PCA(n_components=2).fit(Xtrn_nm[0:100,:])
    newXtrn_nm = pca.transform(Xtrn_nm[0:100,:])
    lr1 = LogisticRegression().fit(newXtrn_nm[0:100,:],Ytrn[0:100])
    
    pc0 = pca.components_[0]
    pc1 = pca.components_[1]
    
    pc0Stdev = stats.stdev(pc0)
    pc1Stdev = stats.stdev(pc1)
    
    #x = np.linspace(-5*pc0Stdev,5*pc0Stdev,784)
    #y = np.linspace(-5*pc1Stdev,5*pc1Stdev,784)
    #X, Y = np.meshgrid(x,y)
    
    xx, yy = np.mgrid[-5*pc0Stdev:5*pc0Stdev:10*pc0Stdev/784, -5*pc1Stdev:5*pc1Stdev:10*pc1Stdev/784]
    grid = np.c_[xx.ravel(), yy.ravel()]
    probs = lr1.predict_proba(grid)[:, 1].reshape(xx.shape)
    
    #Z = np.dot(x,pca.components_.T)
    #print(Z)
    #print(pca.components_.shape)

    #inputs = np.empty((2,784))
    #inputs[0,:] = x
    #inputs[1,:] = y
    
    #z = np.matmul(pca.components_.T,inputs)
    
    colors = plt.cm.get_cmap("coolwarm")(np.linspace(0,1,10))
    newCmap = ListedColormap(colors)
    
    fig, ax = plt.subplots()
    cs = ax.contourf(xx,yy,probs,cmap=newCmap)
        
    cbar = fig.colorbar(cs)
    tick_locator = ticker.MaxNLocator(nbins=12)
    cbar.locator = tick_locator
    cbar.update_ticks()
    cbar.ax.set_yticklabels(np.linspace(0,9,10))
    cbar.set_label("\nClass")
    
    plt.xlim(-5*pc0Stdev,5*pc0Stdev)
    plt.ylim(-5*pc1Stdev,5*pc1Stdev)
    plt.title("A graph to show the 2D decision regions for our\ntrained logistic regression classifier")
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    
    newX = [-5*pc0Stdev,-2.5*pc0Stdev,0,2.5*pc0Stdev,5*pc0Stdev]
    plt.xticks(newX,["-5$\sigma_1$","-2.5$\sigma_1$","0$\sigma_1$","2.5$\sigma_1$","5$\sigma_1$"])
    newY = [-5*pc1Stdev,-2.5*pc1Stdev,0,2.5*pc1Stdev,5*pc1Stdev]
    plt.yticks(newY,["-5$\sigma_2$","-2.5$\sigma_2$","0$\sigma_2$","2.5$\sigma_2$","5$\sigma_2$"])
    
    plt.axhline(0, color='black')
    plt.axvline(0, color='black')
    plt.show()
#
# iaml01cw2_q2_3()   # comment this out when you run the function

# Q2.4
#def iaml01cw2_q2_4():
#
# iaml01cw2_q2_4()   # comment this out when you run the function

# Q2.5
def iaml01cw2_q2_5():
    from sklearn.svm import SVC
    from sklearn.model_selection import cross_val_score
    import math
        
    print("starting...")    
        
    Xsmall = np.empty((10000,784))
    Ysmall = np.empty(10000)
    classAmounts = np.zeros(10)
    
    #Initialize Xsmall and Ysmall
    index = 0
    for row in range(Xtrn_nm.shape[0]):
        
        label = Ytrn[row]
        
        if classAmounts[label] < 1000:
            Xsmall[index,:] = Xtrn_nm[row,:]
            Ysmall[index] = label
            classAmounts[label] += 1
            index += 1
            
        if classAmounts.sum() == 10000:
            break
    
    
    C = np.logspace(-2,3,10,endpoint=True)
    print(C)
    accuracies = np.empty(10)
    print(math.log(C[6],10))
    
    #Iterate through each value of C
    for i in range(C.size):
        c = C[i]
        print(c)
        
        #Initialize our SVM model
        svc = SVC(kernel = "rbf", C=c, gamma = "auto")
        
        #Get the cross-validated classification accuracy
        score = cross_val_score(svc,Xsmall,Ysmall,cv=3)
        accuracies[i] = np.sum(score)/len(score)
        print(accuracies[i])
    
    print(accuracies)
    print()
    plt.scatter(C,accuracies,c="black")
    plt.plot(C,accuracies,c="black")
    
    maxIndex = np.where(accuracies == max(accuracies))
    plt.scatter(C[maxIndex],accuracies[maxIndex],c="red",label="Highest mean accuracy")
    plt.legend(loc="upper left")
    
    plt.grid(True)
    plt.xscale('log')
    plt.xlabel("C")
    plt.ylabel("Mean cross-validated classification accuracy")
    plt.title("A graph to show the relationship between the regularisation\nparameter C and the cross-validation accuracy")
        
    
#
# iaml01cw2_q2_5()   # comment this out when you run the function

# Q2.6 
#def iaml01cw2_q2_6():
#
# iaml01cw2_q2_6()   # comment this out when you run the function



In [None]:
iaml01cw2_q2_5()

  from collections import Sequence


starting...
[1.00000000e-02 3.59381366e-02 1.29154967e-01 4.64158883e-01
 1.66810054e+00 5.99484250e+00 2.15443469e+01 7.74263683e+01
 2.78255940e+02 1.00000000e+03]
1.3333333333333333
0.01
