In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
#pass the data.csv file here
x_data = pd.read_csv("/Users/karunparashar/Downloads/New Folder With Items/coursework/home_work_assignments/DIC/Assignment1/data.csv")
x_t = x_data.iloc[:,0:48]
y_t = x_data.iloc[:,48]
x_train, x_test, y_train, y_test = train_test_split(x_t,y_t,train_size=0.7,test_size=0.3,
                                                    random_state = 7)

### Random Forest Implementation -The complete data takes more time to build parallel trees, hence the training data has been limited to 1000 data points and is tested upon 300 testing points(can be changed)

The following cell calculates gini index for whatever dataframe we pass through assuming that the last column is the response column(for standardisation purpose)

In [2]:
def gini_calc(x):
    gini = 1
    prob_sq = 0
    x_last = x.iloc[:,-1]
    xuni = x_last.unique()
    for i in xuni:
        prob_sq = prob_sq + (len(x_last[x_last==i])/len(x_last))**2
    return 1-prob_sq

The following cell finds the feature with most information gain, the feature name and the splitting value again with the assumption that the last column is the dataframe response column

In [3]:
def info_gain(x,m):
    import random
    df_a = x
    df_to_be_passed = df_a.iloc[:,0:len(df_a.columns)-1]
    df_WR = df_a.iloc[:,0:len(df_a.columns)-1]
    features_selected = random.sample(population=list(df_WR.columns), k=m)
    df_selected = df_WR[features_selected]
    df_joined = pd.concat([df_selected,df.iloc[:,-1]],axis=1)
    df2 = df_joined.copy(deep = True)
    best_info = 0
    x = df_joined
    for i in x:
        if i!=x.columns[-1]:
            for j in x[i].quantile([0.25,0.5,0.75]):
                df2 = x[x[i]<=j]
                gini_left = gini_calc(df2)
                WI_1 = gini_left*(len(x[i][x[i]<=j])/len(x[i]))
                df2 = x[x[i]>j]
                gini_right = gini_calc(df2)
                WI_2 = gini_right*(len(x[i][x[i]>j])/len(x[i]))
                WI = WI_1+WI_2
                info_gain = gini_calc(x) - WI
                if info_gain > best_info:
                    best_info = info_gain
                    condition_value = j
                    condition_column = i
                else:
                    pass
    return [condition_column,condition_value,best_info]

The following cell includes the building of a single tree number of features for that tree can be specified in line #21

The depth of the tree , specified in line # 13 has also been selected as 150 which increases the time taken by the random forest to build more number of trees

In [31]:
#defining the class Node with required attributes
class Node:
    def __init__(self,subtree,columnname,splitvalue):
        self.subtree=subtree
        self.columnname=columnname
        self.splitvalue=splitvalue
        self.leftchild=None
        self.rightchild=None
        self.classification=None

#to split the data at a node if the data is higher than a threshold, in this case, it is 150 data points
def split(data):
    if (len(data.index)<=150): 
        class_leafnode=(data[data.columns[-1]].value_counts().idxmax()) 
        #this part defines the leaf node, that is there is no left or right node with respect to that node
        newnode=Node(data,None,None)
        #the classification part is dedicated to the leaf node as there are no splits ahead
        newnode.classification=class_leafnode
        return newnode
#here we calculate the column and the value at which the node will split, on the basis of number of features selected(can be changed in this line)
    [condition_column,condition_value,best_info] = info_gain(data,15)
#the node is now assigned the dataframe(sliced), the splitting column and its value
    newnode= Node(data,condition_column,condition_value)
#extract left child data
    leftchild_data=data.loc[data[condition_column] <= condition_value]
    newnode.leftchild=split(leftchild_data)
#extract right child data
    rightchild_data=data.loc[data[condition_column] > condition_value]
    newnode.rightchild=split(rightchild_data)

    return newnode
#this function takes in the root node calculated from above class to start with and the test data to predict
def predictClass(root,testdata):
    list_pred=[]
    root_initial=root
    for i in range(len(testdata)):
        data=testdata.iloc[i,:]
        root=root_initial
        while(root.leftchild != None and root.rightchild != None):
            col_data=data[root.columnname]
            if (col_data <= root.splitvalue):
                root=root.leftchild
            else:
                root=root.rightchild    
        list_pred.append(root.classification)
    return list_pred    

The following cell implements RandomForest using multi-threading

In [22]:
import concurrent.futures
from tqdm import tqdm
def randomForest(dataset,nTree,testdata):
    trees=[]
    with concurrent.futures.ThreadPoolExecutor() as executor:
        for i in range(nTree):
            dp_data = dataset.copy(deep=True)
            future=executor.submit(split,dp_data)
            rootnode=future.result()
            trees.append(rootnode)
                
    ensemble_pred=[]
    for tree in trees:
        list_pred=predictClass(tree,testdata)
        ensemble_pred.append(list_pred)
    
    matrix_response = np.array(ensemble_pred).T
    return [np.argmax(np.bincount(i)) for i in matrix_response]

Here the number of trees to be built is implemented on a smaller dataset, because the size of the data hampers the speed at which the trees are being built for the algorithm implemented above
Key points to be taken care of- the depth of the tree is 150 and for only 1000 data points, the algorithm takes 11 minutes to make 100 trees(a timer has also been included for the same)
The tradeoff is hence between the training sample size and number of trees created

In [32]:
import time
s = time.time()
df = x_data
#first 1000 records have been taken to implement the algorithm
prediction=randomForest(df.iloc[0:1000,:],10,df.iloc[1100:1400,:])
e = time.time()
print(prediction,[e-s])

[1, 11, 4, 7, 7, 2, 11, 5, 7, 1, 1, 8, 7, 1, 1, 10, 4, 11, 7, 8, 11, 7, 5, 2, 4, 4, 6, 6, 6, 10, 6, 1, 3, 7, 5, 6, 9, 7, 10, 10, 10, 11, 3, 5, 8, 10, 11, 1, 10, 11, 2, 6, 11, 9, 4, 1, 4, 7, 1, 2, 8, 1, 2, 9, 4, 2, 4, 3, 4, 3, 3, 10, 7, 11, 3, 3, 3, 4, 5, 3, 5, 7, 10, 8, 7, 4, 6, 4, 10, 4, 5, 10, 6, 3, 10, 8, 5, 2, 3, 4, 8, 10, 6, 5, 8, 6, 1, 4, 6, 6, 1, 7, 3, 1, 9, 4, 7, 6, 7, 2, 1, 4, 11, 10, 2, 4, 5, 10, 3, 5, 5, 6, 7, 7, 3, 11, 6, 1, 5, 7, 4, 4, 7, 6, 5, 6, 5, 1, 7, 11, 4, 1, 2, 10, 9, 3, 3, 4, 6, 4, 5, 6, 7, 8, 3, 3, 2, 6, 1, 10, 3, 2, 8, 1, 3, 2, 1, 6, 4, 4, 3, 5, 2, 11, 11, 4, 10, 1, 3, 11, 2, 5, 9, 3, 4, 10, 9, 6, 4, 7, 3, 9, 6, 5, 7, 1, 5, 4, 10, 6, 1, 2, 7, 5, 3, 5, 6, 1, 9, 2, 8, 6, 1, 4, 10, 2, 9, 1, 11, 5, 7, 7, 4, 7, 3, 4, 7, 6, 7, 5, 3, 10, 5, 5, 8, 9, 6, 6, 10, 2, 10, 7, 3, 5, 8, 11, 3, 6, 11, 4, 2, 4, 3, 7, 6, 11, 1, 6, 1, 10, 2, 4, 7, 5, 11, 7, 11, 11, 2, 8, 1, 9, 4, 7, 7, 6, 7, 7, 5, 4, 11, 10, 8, 7, 1, 9, 2, 4, 2, 10] [112.7381820678711]


Predicting data using sklearn's in-built classifier

Here same amount of training and testing data has been used

In [33]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
rfc=  RandomForestClassifier(n_estimators=10)
rf_mod = rfc.fit(df.iloc[0:1000,0:48],df.iloc[0:1000,48])
rf_pred = rf_mod.predict(df.iloc[1100:1400,0:48])
print("in built classifier accuracy is = ",accuracy_score(rf_pred,df.iloc[1100:1400,48]))
print("Implemented RandomForest classifier accuracy is = ",accuracy_score(prediction,df.iloc[1100:1400,48]))

in built classifier accuracy is =  0.9133333333333333
Implemented RandomForest classifier accuracy is =  0.8133333333333334


### KNN implementation

In [5]:
def KNN_Implementation(training_data,training_y,testing_data,Number_of_neighbors):
    #time module has only been imported to calculate the time taken to complete the algorithm
    import time
    y_test =[]
    min_d = []
    s = time.time()
    #in this for loop, we are traversing across the rows of test dataset, so as to find the distances from all training points
    for i in range(len(testing_data.iloc[:,:])):
        d3 = pd.Series(np.linalg.norm(np.array(training_data)-np.array(testing_data.iloc[i,:]), axis = 1)).sort_values(ascending=True)
        #the min_d generates the list which contains the training responses for the indexes from distance series generated
        min_d = [training_y.iloc[d3.index[i],] for i in range(Number_of_neighbors)]
        #the response_per_sample calculates the majority votes of classes from all given neighbors
        response_per_sample = np.argmax(np.bincount(min_d))
        #this is supposed to be the output from this function or the test response
        y_test.append(response_per_sample)
    e = time.time()
    print("total running time is ",e-s)
    return y_test

Validating across complete data with Sklearn's KNN module, n=5 has been taken, can be changed at lines 5 and 6

In [6]:
x_train, x_test, y_train, y_test = train_test_split((x_t/x_t.max(axis = 0)),y_t,train_size=0.7,test_size=0.3,
                                                    random_state = 7)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
manual_knn=KNN_Implementation(x_train,y_train,x_test,5)
knc_mod = KNeighborsClassifier(n_neighbors=5).fit(x_train,y_train)
kn_pred = knc_mod.predict(x_test)
print("Sklearn accuracy score is ",accuracy_score(kn_pred,y_test))
print("manual accuracy score is ",accuracy_score(manual_knn,y_test))

total running time is  219.43598103523254
Sklearn accuracy score is  0.982094897045658
manual accuracy score is  0.982094897045658


### Accuracy, precision and recall functions

In [396]:
def accuracy_manual(y_true,y_pred):
    crossT = pd.crosstab(y_true,y_pred)
    sum_d = np.trace(crossT)
    return sum_d/len(y_true)

0.9746073085374787

In [397]:
def recall_manual(y_true,y_pred):
    import pandas as pd
    p ={}
    dt = pd.crosstab(y_true,y_pred)    
    dc = pd.DataFrame(dt)
    for i in range(dc.shape[0]):
        p[i+1] = dt.iloc[i,i]/sum(dt.iloc[:,i])
    return p

{1: 0.9938325991189427,
 2: 0.9776586237712243,
 3: 0.9956140350877193,
 4: 0.9981884057971014,
 5: 0.9814651368049426,
 6: 0.960431654676259,
 7: 1.0,
 8: 0.954954954954955,
 9: 0.9522935779816514,
 10: 0.9010270774976658,
 11: 1.0}

In [398]:
def precision_manual(y_true,y_pred):
    import pandas as pd
    p ={}
    dt = pd.crosstab(y_true,y_pred)    
    dc = pd.DataFrame(dt)
    for i in range(dc.shape[0]):
        p[i+1] = dt.iloc[i,i]/sum(dt.iloc[i,:])
    return p

{1: 0.967409948542024,
 2: 0.9109075770191507,
 3: 0.9826839826839827,
 4: 0.9963833634719711,
 5: 0.9569707401032702,
 6: 0.9535714285714286,
 7: 1.0,
 8: 0.9962406015037594,
 9: 0.9885714285714285,
 10: 0.9747474747474747,
 11: 1.0}

### Kmeans implementation

In [352]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
import random as rd
from scipy.spatial import distance
from copy import deepcopy
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing
from numpy.linalg import norm
import collections 
import cmath as math
import sys
from sklearn.cluster import KMeans
from sklearn import preprocessing

# # IMPORT AND READ DATA
df=pd.read_csv("/Users/karunparashar/Downloads/New Folder With Items/coursework/home_work_assignments/DIC/Assignment1/data.csv")
X=df.drop(columns=['48'])
Y=df['48'].values

sx=X.shape
sy=Y.shape

# # SPLIT DATA INTO TRAINING AND TESTING DATA
train_x, test_x, train_y, test_y = train_test_split(X, Y, test_size=0.15)

# # NORMALISE THE DATA
scaler = preprocessing.MinMaxScaler()
train_X = preprocessing.normalize(train_x)
# print(train_x)
train_x_norm = scaler.fit_transform(train_x)

#Convert ndarray to dataframe
train_x_norm = pd.DataFrame(train_x_norm)

# # INITIALISE NUMBER OF CLUSTERS (K)
K=11
centroids = train_x.iloc[np.random.choice(np.arange(len(train_x)), 11, False)]
centroids = np.asarray(centroids) 
distances=np.zeros((train_x.shape[0], K))
clusters=[]

for i in range(len(centroids)): 
    distances[:,i] = np.linalg.norm(train_x-centroids[i],axis=1) 
     
clusters = np.argmin(distances, axis=1)
for i in range(K):
        centroids[i,:] = np.mean(train_x[clusters == i], axis=0)

# # Making list of clusters
dat = train_x.copy(deep=True)
dat['index_number'] = clusters
d ={}
for i in np.unique(clusters):
    d[i] = np.array(dat[dat['index_number']==i].iloc[:,0:48])

Within cluster sum of squares and cluster calculation

In [353]:
def cal_wcss(clusters):
    
    #Making list of clusters and storing it as dictionary of ndarray
    dat = train_x_norm.copy(deep=True)
    dat['num'] = clusters
    d ={}
    for i in np.unique(clusters):
        d[i] = np.array(dat[dat['num']==i].iloc[:,0:48])
    
    #Making centroids
    cent = train_x_norm.iloc[np.random.choice(np.arange(len(train_x_norm)), 11, False)]
    cent = np.asarray(cent) 
    for i in range(K):
        cent[i,:] = np.mean(train_x_norm[clusters == i], axis=0)
    
    #Calculating WCSS
    wcss=0
    for k in range(K):
        wcss+=np.sum((d[k]-cent[k,:])**2)
    return wcss


In [354]:
def KM():
    
    #Making random centroids
    centroids = train_x_norm.iloc[np.random.choice(np.arange(len(train_x_norm)), 11, False)]
    centroids = np.asarray(centroids) 
    
    distances=np.zeros((train_x_norm.shape[0], K))
    clusters=[]

    for i in range(len(centroids)): 
        distances[:,i] = np.linalg.norm(train_x_norm-centroids[i],axis=1) 
     
    clusters = np.argmin(distances, axis=1)
    
    #Reassigning clusters by mean value
    for i in range(K):
        centroids[i,:] = np.mean(train_x_norm[clusters == i], axis=0)
    
    wcss=cal_wcss(clusters)
    
    
    List = []
    
    List.append(centroids)
    List.append(clusters)
    List.append(wcss)
    
    return List 

In [355]:
Answer = []
Answer = KM()
print("Centroids :", Answer[0])
print()
print("Clusters :", Answer[1])
print()
print("Within Clusters Sum of Squares (WCSS) :", Answer[2])
print()

Centroids : [[5.96281147e-01 5.44581837e-01 6.45347928e-01 8.38933253e-01
  7.21542030e-01 7.83073317e-01 7.37011719e-01 7.36992424e-01
  7.36993424e-01 6.74244543e-01 6.74429566e-01 6.74667957e-01
  2.66908658e-02 3.33543099e-02 9.97415520e-02 2.62398492e-02
  5.96215574e-02 8.41182613e-02 9.11717519e-01 9.11700523e-01
  9.11868872e-01 9.12340111e-01 9.12386892e-01 9.12540375e-01
  5.41910522e-01 5.56888248e-01 4.46514565e-01 5.43713143e-01
  5.08377548e-01 5.87188775e-01 3.30098839e-01 3.45495385e-01
  3.35521652e-01 2.14272555e-01 2.17767975e-01 2.20605450e-01
  2.83335144e-04 3.11594607e-02 5.61393395e-02 2.77557967e-04
  4.12043535e-02 6.88400578e-02 3.47470384e-01 3.95385842e-01
  4.01009789e-01 3.33732503e-01 3.62300736e-01 3.26128047e-01]
 [5.96287320e-01 5.45516665e-01 6.45879542e-01 8.38951396e-01
  7.25590930e-01 7.83551216e-01 7.42249020e-01 7.42183631e-01
  7.42160008e-01 5.78481452e-01 5.78589565e-01 5.78788663e-01
  8.46532742e-03 1.03914422e-02 3.16841254e-02 8.27741982

Sklearn Kmeans

In [356]:
# Number of clusters
kmeans = KMeans(n_clusters=11)
# Fitting the input data
kmeans = kmeans.fit(train_x_norm)
# Getting the cluster labels
labels = kmeans.predict(train_x_norm)
# Centroid values
centroids_2 = kmeans.cluster_centers_
wcss1 = []
for i in range(0, 10):
    wcss1.append(kmeans.inertia_)
print("Centroids : ", centroids_2)
print()
print("Clusters : ", labels)
print()
print("Within Clusters Sum of Squares (WCSS) :", wcss1[0])
print()

Centroids :  [[5.96291177e-01 5.44336162e-01 6.43878312e-01 8.38881897e-01
  7.20909249e-01 7.82186635e-01 7.80093027e-01 7.80090312e-01
  7.80164030e-01 7.44302216e-01 7.44523315e-01 7.44809441e-01
  1.45991544e-02 1.84215010e-02 5.83755822e-02 1.46057748e-02
  3.30748639e-02 4.77674387e-02 6.04060648e-01 6.04066504e-01
  6.04208153e-01 6.04358984e-01 6.04403656e-01 6.04535402e-01
  5.42151497e-01 5.55280246e-01 4.44483912e-01 5.43625405e-01
  5.09283923e-01 5.89542682e-01 3.31468440e-01 3.46867370e-01
  3.36794338e-01 2.24016425e-01 2.27691337e-01 2.30755470e-01
  2.34201777e-04 4.18285318e-02 7.33308768e-02 2.37296410e-04
  5.95422972e-02 8.73471663e-02 3.68322092e-01 4.18514751e-01
  4.26704987e-01 3.22017157e-01 3.50502539e-01 3.14376361e-01]
 [5.96341643e-01 5.45256408e-01 6.47279098e-01 8.39337412e-01
  7.25434556e-01 7.84142846e-01 6.39492754e-01 6.39429999e-01
  6.39332188e-01 5.73396035e-01 5.73503274e-01 5.73680210e-01
  2.50757094e-02 3.19032678e-02 9.56532949e-02 2.4556055

### PCA Implementation

In [357]:
def pca_implementation(dataframe,k):
    #taking all columns except the response
    df_a = dataframe.iloc[:,0:48]
    #calculating mean across all columns to normalise the data
    mean_df = np.mean(df_a)
    #removing the mean from all records for all columns to center the data
    df_c = df_a - mean_df
    df_c = df_c.T
    #calculating covariance matrix for centered data
    df_var = np.cov(df_c)
    #calculating eigen values and eigen vectors for the covariance matrix
    eig_val, eig_vec = np.linalg.eig(df_var)
    #extracting top k eigen vectors
    df_test = pd.DataFrame(eig_vec)
    df_test.iloc[:,k:]=0
    #projecting the top-k eigen vector plane onto the existing dataset
    Projection = -(np.array(df_test).T.dot(df_c)).T
    return pd.DataFrame(Projection).iloc[:,0:k]

Validating with the sklearn's PCA module, number of features can be specified in the below cell

In [368]:
from sklearn.decomposition import PCA
manual_imp = pca_implementation(df,3)
pca = PCA(n_components=3)
principalComponents = pca.fit_transform(df.iloc[:,0:48])
principalDf = pd.DataFrame(data = principalComponents)
print("Principal components with Sklearn-",principalDf)
print("Principal components with above implementation-",manual_imp)

Principal components with Sklearn-                0          1         2
0      -7.585132   0.253804  0.246843
1      -8.752380  -0.037331  0.122418
2      -2.086987  -1.278239  0.150201
3     -10.575990   0.122271  0.325414
4      -3.330015   0.054105  0.406775
...          ...        ...       ...
40951  -8.162366   0.916487  0.341272
40952  -5.773473   1.832014  0.150703
40953  12.678249   8.674939  0.441712
40954  29.135663 -20.213507 -1.722138
40955  -7.752290   2.051951  0.362834

[40956 rows x 3 columns]
Principal components with above implementation-                0          1         2
0      -7.585132   0.253804  0.246843
1      -8.752380  -0.037331  0.122418
2      -2.086987  -1.278239  0.150201
3     -10.575990   0.122271  0.325414
4      -3.330015   0.054105  0.406775
...          ...        ...       ...
40951  -8.162366   0.916487  0.341272
40952  -5.773473   1.832014  0.150703
40953  12.678249   8.674939  0.441712
40954  29.135663 -20.213507 -1.722138
40955  -7.752290 