In [172]:

##########################################################
#  Python script template for Question 3 (IAML Level 10)
#  Note that:
#  - You should not change the filename of this file, 'iaml01cw2_q3.py', which is the file name you should use when you submit your code for this question.
#  - You should define the functions shown below in your code.
#  - You can define function arguments (parameters) and returns (attributes) if necessary.
#  - In case you define helper functions, do not define them here, but put them in a separate Python module file, "iaml01cw2_my_helpers.py", and import it in this script.
#  - For those questions requiring you to show results in tables, your code does not need to present them in tables - just showing them with print() is fine.
#  - You do not need to include this header in your submission
##########################################################

#--- Code for loading the data set and pre-processing --->
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
sys.path.insert(0, os.path.abspath('../helpers'))
from iaml01cw2_helpers import *

#Load the data:
dataPath = os.path.join(os.getcwd(),'../data')
Xtrn, Ytrn, Xtst, Ytst = load_CoVoST2(dataPath)

from numpy import loadtxt
# Read one language name per line into a 1-D string array.
# The file is read directly instead of via loadtxt(..., delimiter="\n"):
# NumPy >= 1.23 rejects a newline as a delimiter and raises ValueError.
# Blank lines are skipped, as loadtxt did.
with open(os.path.join(dataPath, "languages.txt"), encoding="utf-8") as languagesFile:
    languages = np.array([name.strip() for name in languagesFile if name.strip()], dtype=str)
#<----

# Q3.1
def iaml01cw2_q3_1():
    """Fit k-means (k=22) on the training set and print summary statistics.

    Reads the module-level ``Xtrn`` array. Prints:
      * the sum of squared Euclidean distances between every sample and the
        centre of the cluster it was assigned to;
      * the number of samples assigned to each cluster (clusters are
        reported 1-based, i.e. "Cluster 1" is label 0).

    Returns nothing; all results go to stdout.
    """
    from sklearn.cluster import KMeans

    nClusters = 22

    #Get the cluster centers
    km = KMeans(n_clusters=nClusters, random_state=1)
    km.fit(Xtrn)
    centers = km.cluster_centers_

    #Calculate the sum of squared distances between samples and their clusters.
    #centers[km.labels_] picks, for every sample, the centre it was assigned to,
    #so the whole computation is one vectorised expression. The squared distance
    #is summed directly (no sqrt followed by re-squaring) and accumulated in
    #float64 regardless of the dtype of Xtrn.
    diffs = Xtrn - centers[km.labels_]
    sumSquaredDists = float(np.sum(diffs ** 2, dtype=np.float64))

    print("Sum of squared distances (Euclidean) of samples to their closest cluster center:")
    print(sumSquaredDists)
    print()

    #Calculate the number of samples for each cluster.
    #minlength guarantees one entry per cluster even if a cluster ended up empty.
    frequencies = np.bincount(km.labels_, minlength=nClusters)

    print("Number of samples for each cluster:")
    for i in range(nClusters):
        print("Cluster " + str(i+1) + " = " + str(int(frequencies[i])))
        
#
# iaml01cw2_q3_1()   # uncomment this line to run the function

# Q3.2
def iaml01cw2_q3_2():
    """Plot class mean vectors and k-means centres on a shared 2D PCA plane.

    Reads the module-level ``Xtrn`` (samples) and ``Ytrn`` (integer class
    labels 0..21). Steps:
      1. fit k-means with 22 clusters on ``Xtrn``;
      2. compute the mean feature vector of each of the 22 classes;
      3. fit a 2-component PCA on the class means and project both the means
         and the cluster centres onto it;
      4. draw a scatter plot with the class means coloured by class number
         and the cluster centres as black triangles.

    The figure is created at 10x10 inches without touching the global
    ``plt.rcParams``, so later plots keep their own default size.
    Returns nothing; the figure is left as the current Matplotlib figure.
    """
    from sklearn.cluster import KMeans
    from sklearn.decomposition import PCA
    from matplotlib import lines

    nClasses = 22
    classIds = np.arange(nClasses)

    print("starting...")
    print()

    #Get the cluster centers
    km = KMeans(n_clusters=nClasses, random_state=1)
    km.fit(Xtrn)
    centers = km.cluster_centers_

    #Calculate the mean vectors for each class (one row per class)
    means = np.vstack([Xtrn[Ytrn == c].mean(axis=0) for c in classIds])

    #Perform PCA on this data: the projection is fitted on the class means only
    #and the cluster centres are mapped into that same plane.
    pca = PCA(n_components=2)

    newMeans = pca.fit_transform(means)
    newCenters = pca.transform(centers)

    #Plot this 2D-PCA plane
    fig, ax = plt.subplots(figsize=(10, 10))

    #The colormap is passed by name; plt.cm.get_cmap is deprecated and was
    #removed in recent Matplotlib releases.
    sc = ax.scatter(newMeans[:,0],newMeans[:,1],s=50,c=classIds,alpha=0.8,marker="o",
                    vmin=0,vmax=nClasses-1,cmap="nipy_spectral")
    ax.scatter(newCenters[:,0],newCenters[:,1],s=65,c="black",alpha=0.45,marker="^")

    #Implement a colorbar with exactly one tick per class, so every tick label
    #is guaranteed to match the class number it sits on.
    cbar = fig.colorbar(sc, ax=ax, ticks=classIds)
    cbar.set_label("\nClass number", fontsize=15)

    #Custom legend: proxy artists so both marker shapes are shown in black,
    #independent of the per-class colours used in the scatter.
    legEls = [lines.Line2D([0], [0], marker='o', color='w', label='Mean class vectors',
                          markerfacecolor='black', markersize=10),
              lines.Line2D([0], [0], marker='^', color='w', label='Cluster centers',
                          markerfacecolor='black', markersize=10)]
    ax.legend(handles=legEls,loc="upper right",fontsize=12)

    #Plot features for further readability
    ax.grid(True)
    ax.set_xlabel("PC1",fontsize=16)
    ax.set_ylabel("PC2",fontsize=16)
    ax.set_title("A 2D-PCA plane to show the relationship between the mean\nclass vectors and the trained cluster centers",fontsize=18)
    
#
# iaml01cw2_q3_2()   # uncomment this line to run the function

# Q3.3
def iaml01cw2_q3_3():
    """Draw a Ward-linkage dendrogram of the 22 class mean vectors.

    Reads the module-level ``Xtrn`` (samples), ``Ytrn`` (integer class labels
    0..21) and ``languages`` (leaf labels, one per class). The mean feature
    vector of each class is computed, the means are clustered hierarchically
    with Ward's linkage, and the resulting dendrogram is drawn on a new
    figure with the leaves on the vertical axis.

    Returns nothing; the figure is left as the current Matplotlib figure.
    """
    import scipy.cluster.hierarchy as hierarchy

    nClasses = 22

    print("starting...")
    print()

    #Calculate the mean vectors for each class (one row per class)
    means = np.vstack([Xtrn[Ytrn == c].mean(axis=0) for c in range(nClasses)])

    #Carry out Hierarchical clustering with Ward's linkage
    z = hierarchy.linkage(means,method="ward")

    #Display the respective dendrogram on its own axes, so it never gets drawn
    #on top of a plot that happens to be current.
    fig, ax = plt.subplots(figsize=(10, 8))
    hierarchy.dendrogram(z,orientation="right", labels=languages, ax=ax)
    ax.set_xlabel("Linkage distance")
    ax.set_title("Hierarchical clustering (Ward) of the class mean vectors")
    
#
# iaml01cw2_q3_3()   # uncomment this line to run the function

# Q3.4
def iaml01cw2_q3_4():
    """Cluster each class into 3 sub-clusters and compare linkage methods.

    Reads the module-level ``Xtrn`` (samples) and ``Ytrn`` (integer class
    labels 0..21). For every class, k-means with 3 clusters is fitted on that
    class's samples only, giving 66 centres in total. The 66 centres are then
    clustered hierarchically with the "ward", "single" and "complete" linkage
    methods, and one dendrogram per method is drawn, each on its own figure.

    Leaf labels have the form "l<class>: c<k>" with k in 1..3.
    Returns nothing; the three figures are left open in Matplotlib.
    """
    from sklearn.cluster import KMeans
    import scipy.cluster.hierarchy as hierarchy

    nClasses = 22
    nSubClusters = 3

    print("starting...")
    print()

    #Numpy array to store the 3 cluster centers for all 22 classes
    centers = np.empty((nClasses * nSubClusters, Xtrn.shape[1]))
    labels = []

    #Iterates through each class
    for c in range(nClasses):
        #Select the samples of class c with a boolean mask; converted to float64
        #so the k-means input has the same dtype for every class.
        classData = np.asarray(Xtrn[Ytrn == c], dtype=np.float64)

        #Fit a KMeans model using this class data (fixed seed => reproducible)
        km = KMeans(n_clusters=nSubClusters,random_state=1)
        km.fit(classData)

        #Add the class' cluster centers to 'centers' (rows 3c, 3c+1, 3c+2)
        first = c * nSubClusters
        centers[first:first + nSubClusters,:] = km.cluster_centers_

        #Add the appropriate labels for each cluster
        for k in range(nSubClusters):
            labels.append("l" + str(c) + ": c" + str(k+1))

    #Produce separate dendrograms using these 3 linkage methods.
    #Each method gets its own figure: drawing them all on the current axes
    #would stack the three trees on top of each other.
    methods = ["ward","single","complete"]
    for m in methods:
        z = hierarchy.linkage(centers,method=m)
        fig, ax = plt.subplots(figsize=(14, 6))
        hierarchy.dendrogram(z,labels=labels,leaf_font_size=6,ax=ax)
        ax.set_ylabel("Linkage distance")
        ax.set_title("Hierarchical clustering of per-class k-means centers (" + m + " linkage)")
        
    
#
# iaml01cw2_q3_4()   # uncomment this line to run the function

# Q3.5
#def iaml01cw2_q3_5():
#
# iaml01cw2_q3_5()   # comment this out when you run the function



In [None]:
# Run Q3.2: fits k-means on Xtrn, projects the class means and the cluster
# centres onto a 2D PCA plane, and draws the scatter plot.
iaml01cw2_q3_2()

starting...

