# COMPSCI 361 ASSIGNMENT TWO

## Task One

### Load Libraries and Dataset

* Load the dataset and deal with missing values in an appropriate manner. Describe how you handled them and why you did it that way.

In [385]:
#Loading Libraries

from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.datasets

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from autorank import autorank, plot_stats, create_report, latex_table
from statistics import stdev

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, VotingClassifier, GradientBoostingClassifier


# Render matplotlib plots in the notebook
%matplotlib inline

In [386]:
#Load Datasets
X = pd.read_csv('data_A2.csv')
y = pd.read_csv('labels_A2.csv')


Prior to imputation, we would split the data into training and test data so we avoid indirectly informing the training set about the test set and maintain a distinct separation. However, for this assignment we ignore this.

In [380]:
#Train-Test Split
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

### Handling Missing Data

The best imputation strategy depends on the data. However, in the given case, we hypothesise that the best method for us to use is a measure of central tendency to replace missing data as we retain the same shape and do not lose data, whilst replacing domain knowledge with given data that would not require the  knowledge of an expert.

In [387]:
#Method 4: Use a measure of Central Tendency for the Missing attributes
from sklearn.impute import SimpleImputer

#Fill with mean
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')

imputer.fit(X)

X = imputer.transform(X)

In [388]:
check = np.isnan(X)
check.any(axis=1)


array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

## Task Two

### Feature Selection
* Determine the 10 most important features of the cleaned dataset, explain how you found them and why you think they are important. This is the dataset you will be using for the rest of the assignment. 


In [383]:
#Univariate feature selection
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from numpy import transpose

features = SelectKBest(score_func = f_classif)

X = pd.DataFrame(X)
fit = features.fit(X, np.ravel(y))
imp_features = X.columns[features.get_support()].to_list()
print(imp_features)

for col in X.columns:
    if col not in imp_features:
        X = X.drop([col], axis = 1)


[0, 8, 11, 46, 60, 65, 69, 77, 78, 97]


Feature selection is done to reduce the amount of attributes and regulate dimensionality by eliminating features that are irrelevant, redundant or less useful than other features. Univariate selection works by selecting the best features based on univariate statistical tests and removing all but the highest-scoring features and in particular, we used the f_classif argument for classification. This method is based on the F-test and estimates the degree of linear dependency between two random variables.

## Task Three

### Classifiers

* Get results for RF, pruned DT (make sure to use a validation set for pruning!), unpruned DT, and decision stumps and determine if any are statistically significantly better than others. Why are the worst methods performing so badly compared to the others?


In [311]:
rs = 1234 #Random State

In [312]:
#Train-test-split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = rs)

In [313]:
#Random Forest
rf = RandomForestClassifier(random_state = rs)
rf.fit(X_train, np.ravel(y_train))

RandomForestClassifier(random_state=1234)

In [314]:
#Unpruned Decision Tree
dt = DecisionTreeClassifier(random_state = rs)
dt.fit(X_train, y_train)

DecisionTreeClassifier(random_state=1234)

In [315]:
#Decision stump
dts = DecisionTreeClassifier(max_depth = 1, random_state = rs)
dts.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=1, random_state=1234)

In [316]:
#Pruned Decision Tree
path = DecisionTreeClassifier().cost_complexity_pruning_path(X_train, y_train)
alphas = path['ccp_alphas']


acc_val = []
for c in path.ccp_alphas:
    #X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3)
    acc_val.append(DecisionTreeClassifier(ccp_alpha=c).fit(X_train, y_train).score(X_val, y_val))

best_ccp_alpha = path.ccp_alphas[np.argmax(acc_val)]
best_clf = DecisionTreeClassifier(ccp_alpha=best_ccp_alpha).fit(X, y)



In [317]:
#Accuracy Scores

#Unpruned Decision Tree Score
dt_score = dt.score(X_train, y_train)
print(f'Decision Tree Accuracy on train set: {dt_score:.2f}')

dt_score = dt.score(X_test, y_test)
print(f'Decision Tree Accuracy on test set: {dt_score:.2f}')
print('')

#Random Forest Score
rf_score = rf.score(X_train, y_train)
print(f'Random Forest Accuracy on train set: {rf_score:.2f}')

rf_score = rf.score(X_test, y_test)
print(f'Random Forest Accuracy on test set: {rf_score:.2f}')
print('')



Decision Tree Accuracy on train set: 1.00
Decision Tree Accuracy on test set: 0.57

Random Forest Accuracy on train set: 1.00
Random Forest Accuracy on test set: 0.68



In [318]:
#Decision Stump Score
dts_score = dts.score(X_train, y_train)
print(f'Decision Stump Accuracy on train set: {dts_score:.2f}')

dts_score = dts.score(X_test, y_test)
print(f'Decision Stump Accuracy on test set: {dts_score:.2f}')
print('')

Decision Stump Accuracy on train set: 0.62
Decision Stump Accuracy on test set: 0.59



In [319]:
#Pruned Decision Tree
pdt_score = best_clf.score(X_train, y_train)
print(f'Pruned Decision Tree Accuracy on train set: {pdt_score:.2f}')

pdt_score = best_clf.score(X_val, y_test)
print(f'Pruned Decision Tree Accuracy on validation set: {pdt_score:.2f}')
print('')

Pruned Decision Tree Accuracy on train set: 0.97
Pruned Decision Tree Accuracy on validation set: 0.47



In [320]:
#Cross Validation
means = []
sds = []
y = np.ravel(y)

#Unpruned Decision Trees
dt_scores = cross_val_score(DecisionTreeClassifier(random_state = rs), X, y, cv=KFold(10))
#print(dt_scores)
print(f"Decision Tree Mean test accuracy: {np.mean(dt_scores):.3f}")
means += [dt_scores]

std = stdev(dt_scores)
sds += [std]
print(f"Decision Tree Standard Deviation: {std:.3f}")
print('')

#Random Forest
rf_scores = cross_val_score(RandomForestClassifier(random_state = rs), X, y, cv=KFold(10))
#print(rf_scores)
print(f"Random Forest Mean test accuracy: {np.mean(rf_scores):.3f}")
means += [rf_scores]

std = stdev(rf_scores)
sds += [std]
print(f"Random Forest Standard Deviation: {std:.3f}")
print('')

#Decision Stumps
dts_scores = cross_val_score(DecisionTreeClassifier(max_depth = 1, random_state = rs), X, y, cv=KFold(10))
#print(dt_scores)
print(f"Decision Stump Mean test accuracy: {np.mean(dts_scores):.3f}")
means += [dts_scores]

std = stdev(dts_scores)
sds += [std]
print(f"Decision Stump Standard Deviation: {std:.3f}")
print('')


Decision Tree Mean test accuracy: 0.612
Decision Tree Standard Deviation: 0.047

Random Forest Mean test accuracy: 0.690
Random Forest Standard Deviation: 0.044

Decision Stump Mean test accuracy: 0.608
Decision Stump Standard Deviation: 0.028



In [321]:
#print(type(X), type(y))
y = pd.DataFrame(y)
#print(type(y))

In [322]:
#Pruned Decision Tree
kf = KFold(n_splits = 10, shuffle = True)
pdt = DecisionTreeClassifier(ccp_alpha = best_ccp_alpha, random_state = rs)

pdt_scores = []
for i in range(10):
    result = next(kf.split(X), None)
    X_train = X.iloc[result[0]]
    X_test = X.iloc[result[1]]
    y_train = y.iloc[result[0]]
    y_test = y.iloc[result[1]]
    model = pdt.fit(X_train, y_train)
    predictions = pdt.predict(X_test)
    pdt_scores.append(model.score(X_test, y_test))

#print('Scores from each Iteration: ', pdt_scores)
print('Pruned Decision Tree Mean test accuracy:', np.mean(pdt_scores))
means += [np.mean(pdt_scores)]

std = stdev(pdt_scores)
sds += [std]
print(f"Pruned Decision Tree Standard Deviation: {std:.3f}")
print('')


Pruned Decision Tree Mean test accuracy: 0.6329999999999999
Pruned Decision Tree Standard Deviation: 0.040



In [323]:
#Autorank
classifiers = ['Decision Tree', 'Random Forest', 'Decision Stumps', 'Pruned Decision Tree']

data = pd.DataFrame()
for i in range(4):
     data[classifiers[i]] = np.random.normal(means[i], sds[i], 10).clip(0, 1)

result = autorank(data, alpha=0.05, verbose=False, approach = 'frequentist')
print(result)

RankResult(rankdf=
                      meanrank      mean       std  ci_lower  ci_upper  \
Random Forest              1.7  0.695457  0.098841  0.598221  0.792693   
Pruned Decision Tree       2.6  0.638364  0.052219  0.586993  0.689735   
Decision Stumps            2.7  0.620456  0.033515  0.587485  0.653426   
Decision Tree              3.0  0.604572  0.090419  0.515621  0.693522   

                     effect_size   magnitude  
Random Forest                  0  negligible  
Pruned Decision Tree    0.722281      medium  
Decision Stumps          1.01629       large  
Decision Tree           0.959484       large  
pvalue=0.13050077008525457
cd=1.483221853685529
omnibus=friedman
posthoc=nemenyi
all_normal=True
pvals_shapiro=[0.3560923635959625, 0.15305356681346893, 0.2787017226219177, 0.9948100447654724]
homoscedastic=False
pval_homogeneity=0.01158797227943928
homogeneity_test=bartlett
alpha=0.05
alpha_normality=0.0125
num_samples=10
posterior_matrix=
None
decision_matrix=
None
rope=

## Task Four

### Additive Noise 
* Add 20% normal additive noise to the features and train the classifiers from step 3 and determine if any are performing significantly worse/better than on the clean dataset. Give reasons based on your knowledge of the classifiers.

In [384]:
#Additive normal noise
noise = np.random.normal(0, 0.2, np.shape(X))
X_noise = X + np.multiply(noise, np.average(X, axis=0))
 

In [325]:
rs = 5678

#Train-test-split
X_train, X_test, y_train, y_test = train_test_split(X_noise, y, test_size=0.3, random_state = rs)

#Random Forest
rf = RandomForestClassifier(random_state = rs)
rf.fit(X_train, np.ravel(y_train))

#Unpruned Decision Tree
dt = DecisionTreeClassifier(random_state = rs)
dt.fit(X_train, y_train)

#Decision stump
dts = DecisionTreeClassifier(max_depth = 1, random_state = rs)
dts.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=1, random_state=5678)

In [326]:
#Accuracy Scores

#Unpruned Decision Tree Score
dt_score = dt.score(X_train, y_train)
print(f'Decision Tree Accuracy on train set: {dt_score:.2f}')

dt_score = dt.score(X_test, y_test)
print(f'Decision Tree Accuracy on test set: {dt_score:.2f}')
print('')

#Random Forest Score
rf_score = rf.score(X_train, y_train)
print(f'Random Forest Accuracy on train set: {rf_score:.2f}')

rf_score = rf.score(X_test, y_test)
print(f'Random Forest Accuracy on test set: {rf_score:.2f}')
print('')

#Decision Stump Score
dts_score = dts.score(X_train, y_train)
print(f'Decision Stump Accuracy on train set: {dts_score:.2f}')

dts_score = dts.score(X_test, y_test)
print(f'Decision Stump Accuracy on test set: {dts_score:.2f}')
print('')

Decision Tree Accuracy on train set: 1.00
Decision Tree Accuracy on test set: 0.61

Random Forest Accuracy on train set: 1.00
Random Forest Accuracy on test set: 0.68

Decision Stump Accuracy on train set: 0.64
Decision Stump Accuracy on test set: 0.58



In [327]:
#Cross Validation
means = []
sds = []
y = np.ravel(y)

#Unpruned Decision Trees
dt_scores = cross_val_score(DecisionTreeClassifier(random_state = rs), X_noise, y, cv=KFold(10))
#print(dt_scores)
print(f"Decision Tree Mean test accuracy: {np.mean(dt_scores):.3f}")
means += [dt_scores]

std = stdev(dt_scores)
sds += [std]
print(f"Decision Tree Standard Deviation: {std:.3f}")
print('')

#Random Forest
rf_scores = cross_val_score(RandomForestClassifier(random_state = rs), X_noise, y, cv=KFold(10))
#print(rf_scores)
print(f"Random Forest Mean test accuracy: {np.mean(rf_scores):.3f}")
means += [rf_scores]

std = stdev(rf_scores)
sds += [std]
print(f"Random Forest Standard Deviation: {std:.3f}")
print('')

#Decision Stumps
dts_scores = cross_val_score(DecisionTreeClassifier(max_depth = 1, random_state = rs), X_noise, y, cv=KFold(10))
#print(dt_scores)
print(f"Decision Stump Mean test accuracy: {np.mean(dts_scores):.3f}")
means += [dts_scores]

std = stdev(dts_scores)
sds += [std]
print(f"Decision Stump Standard Deviation: {std:.3f}")
print('')

Decision Tree Mean test accuracy: 0.585
Decision Tree Standard Deviation: 0.060

Random Forest Mean test accuracy: 0.699
Random Forest Standard Deviation: 0.038

Decision Stump Mean test accuracy: 0.603
Decision Stump Standard Deviation: 0.032



In [328]:
y = pd.DataFrame(y)
#Pruned Decision Tree
kf = KFold(n_splits = 10, shuffle = True)
pdt = DecisionTreeClassifier(ccp_alpha = best_ccp_alpha, random_state = rs)

pdt_scores = []
for i in range(10):
    result = next(kf.split(X_noise), None)
    X_train = X_noise.iloc[result[0]]
    X_test = X_noise.iloc[result[1]]
    y_train = y.iloc[result[0]]
    y_test = y.iloc[result[1]]
    model = pdt.fit(X_train, y_train)
    predictions = pdt.predict(X_test)
    pdt_scores.append(model.score(X_test, y_test))

#print('Scores from each Iteration: ', pdt_scores)
print('Pruned Decision Tree Mean test accuracy:', np.mean(pdt_scores))
means += [np.mean(pdt_scores)]

std = stdev(pdt_scores)
sds += [std]
print(f"Pruned Decision Tree Standard Deviation: {std:.3f}")
print('')

Pruned Decision Tree Mean test accuracy: 0.599
Pruned Decision Tree Standard Deviation: 0.041



In [329]:
#Autorank
classifiers = ['Decision Tree', 'Random Forest', 'Decision Stumps', 'Pruned Decision Tree']

data = pd.DataFrame()
for i in range(4):
     data[classifiers[i]] = np.random.normal(means[i], sds[i], 10).clip(0, 1)

result = autorank(data, alpha=0.05, verbose=False)
print(result)

RankResult(rankdf=
                      meanrank      mean       std  ci_lower  ci_upper  \
Random Forest              1.3  0.703513  0.053407  0.650973  0.756053   
Decision Stumps            2.6  0.603753  0.039493  0.564901  0.642605   
Pruned Decision Tree       3.0  0.592255  0.035760  0.557076  0.627434   
Decision Tree              3.1  0.578865  0.097291  0.483154  0.674577   

                     effect_size   magnitude  
Random Forest                  0  negligible  
Decision Stumps          2.12401       large  
Pruned Decision Tree     2.44804       large  
Decision Tree             1.5883       large  
pvalue=0.006246400552273741
cd=1.483221853685529
omnibus=friedman
posthoc=nemenyi
all_normal=True
pvals_shapiro=[0.3412385582923889, 0.8706462979316711, 0.5630434155464172, 0.8345291018486023]
homoscedastic=False
pval_homogeneity=0.00910950246508038
homogeneity_test=bartlett
alpha=0.05
alpha_normality=0.0125
num_samples=10
posterior_matrix=
None
decision_matrix=
None
rope=

It would make sense that the Decision tree would perform better with noisy data given that it would usually overfit on the clean data.

## Task Five

### Multiplicative Noise
* Repeat step 4 with 20% normal multiplicative noise. Additionally, explain why you think additive and multiplicative noise influence the classifiers differently.

In [330]:
# multiplicative normal noise
noise = np.random.normal(1, 0.2, np.shape(X))
X_noise = np.multiply(X, noise)

In [331]:
rs = 8910

#Train-test-split
X_train, X_test, y_train, y_test = train_test_split(X_noise, y, test_size=0.3, random_state = rs)

#Random Forest
rf = RandomForestClassifier(random_state = rs)
rf.fit(X_train, np.ravel(y_train))

#Unpruned Decision Tree
dt = DecisionTreeClassifier(random_state = rs)
dt.fit(X_train, y_train)

#Decision stump
dts = DecisionTreeClassifier(max_depth = 1, random_state = rs)
dts.fit(X_train, y_train)


DecisionTreeClassifier(max_depth=1, random_state=8910)

In [332]:
#Accuracy Scores

#Unpruned Decision Tree Score
dt_score = dt.score(X_train, y_train)
print(f'Decision Tree Accuracy on train set: {dt_score:.2f}')

dt_score = dt.score(X_test, y_test)
print(f'Decision Tree Accuracy on test set: {dt_score:.2f}')
print('')

#Random Forest Score
rf_score = rf.score(X_train, y_train)
print(f'Random Forest Accuracy on train set: {rf_score:.2f}')

rf_score = rf.score(X_test, y_test)
print(f'Random Forest Accuracy on test set: {rf_score:.2f}')
print('')

#Decision Stump Score
dts_score = dts.score(X_train, y_train)
print(f'Decision Stump Accuracy on train set: {dts_score:.2f}')

dts_score = dts.score(X_test, y_test)
print(f'Decision Stump Accuracy on test set: {dts_score:.2f}')
print('')

Decision Tree Accuracy on train set: 1.00
Decision Tree Accuracy on test set: 0.64

Random Forest Accuracy on train set: 1.00
Random Forest Accuracy on test set: 0.71

Decision Stump Accuracy on train set: 0.63
Decision Stump Accuracy on test set: 0.59



In [333]:
#Cross Validation
means = []
sds = []
y = np.ravel(y)

#Unpruned Decision Trees
dt_scores = cross_val_score(DecisionTreeClassifier(random_state = rs), X_noise, y, cv=KFold(10))
#print(dt_scores)
print(f"Decision Tree Mean test accuracy: {np.mean(dt_scores):.3f}")
means += [dt_scores]

std = stdev(dt_scores)
sds += [std]
print(f"Decision Tree Standard Deviation: {std:.3f}")
print('')

#Random Forest
rf_scores = cross_val_score(RandomForestClassifier(random_state = rs), X_noise, y, cv=KFold(10))
#print(rf_scores)
print(f"Random Forest Mean test accuracy: {np.mean(rf_scores):.3f}")
means += [rf_scores]

std = stdev(rf_scores)
sds += [std]
print(f"Random Forest Standard Deviation: {std:.3f}")
print('')

#Decision Stumps
dts_scores = cross_val_score(DecisionTreeClassifier(max_depth = 1, random_state = rs), X_noise, y, cv=KFold(10))
#print(dt_scores)
print(f"Decision Stump Mean test accuracy: {np.mean(dts_scores):.3f}")
means += [dts_scores]

std = stdev(dts_scores)
sds += [std]
print(f"Decision Stump Standard Deviation: {std:.3f}")
print('')

Decision Tree Mean test accuracy: 0.633
Decision Tree Standard Deviation: 0.053

Random Forest Mean test accuracy: 0.691
Random Forest Standard Deviation: 0.036

Decision Stump Mean test accuracy: 0.596
Decision Stump Standard Deviation: 0.035



In [334]:
y = pd.DataFrame(y)
#Pruned Decision Tree
kf = KFold(n_splits = 10, shuffle = True)
pdt = DecisionTreeClassifier(ccp_alpha = best_ccp_alpha, random_state = rs)

pdt_scores = []
for i in range(10):
    result = next(kf.split(X_noise), None)
    X_train = X_noise.iloc[result[0]]
    X_test = X_noise.iloc[result[1]]
    y_train = y.iloc[result[0]]
    y_test = y.iloc[result[1]]
    model = pdt.fit(X_train, y_train)
    predictions = pdt.predict(X_test)
    pdt_scores.append(model.score(X_test, y_test))

#print('Scores from each Iteration: ', pdt_scores)
print('Pruned Decision Tree Mean test accuracy:', np.mean(pdt_scores))
means += [np.mean(pdt_scores)]

std = stdev(pdt_scores)
sds += [std]
print(f"Pruned Decision Tree Standard Deviation: {std:.3f}")
print('')

Pruned Decision Tree Mean test accuracy: 0.617
Pruned Decision Tree Standard Deviation: 0.050



In [335]:
#Autorank
classifiers = ['Decision Tree', 'Random Forest', 'Decision Stumps', 'Pruned Decision Trees']

data = pd.DataFrame()
for i in range(4):
     data[classifiers[i]] = np.random.normal(means[i], sds[i], 10).clip(0, 1)

result = autorank(data, alpha=0.05, verbose=False)
print(result)

  ax1.set_yticklabels(np.insert(self.groupsunique.astype(str), 0, ''))


RankResult(rankdf=
                       meanrank      mean       std  ci_lower  ci_upper  \
Random Forest               1.9  0.674488  0.054919  0.639471  0.709505   
Decision Tree               2.6  0.634482  0.069610  0.599465    0.6695   
Pruned Decision Trees       2.6  0.605374  0.062716  0.570357  0.640391   
Decision Stumps             2.9  0.604249  0.041579  0.569232  0.639266   

                      effect_size   magnitude  
Random Forest                   0  negligible  
Decision Tree            0.638092      medium  
Pruned Decision Trees     1.17249       large  
Decision Stumps           1.44206       large  
pvalue=0.041358914536295366
cd=None
omnibus=anova
posthoc=tukeyhsd
all_normal=True
pvals_shapiro=[0.5078120231628418, 0.3019663691520691, 0.616109311580658, 0.46593230962753296]
homoscedastic=True
pval_homogeneity=0.5022449262889062
homogeneity_test=bartlett
alpha=0.05
alpha_normality=0.0125
num_samples=10
posterior_matrix=
None
decision_matrix=
None
rope=None
ro

## Task Six

### Class Noise
* Use 5% class noise (that is, you flip 5% of the labels to the other class) and investigate which classifiers’ performance is affected significantly. Why do you think class noise affects the classifiers differently from feature noise?

In [342]:
y = np.ravel(y)
flip = np.random.binomial(1, 0.05, y.shape).astype(bool)
y_flipped = np.where(flip, 1 - y, y)
print(np.c_[y, y_flipped][flip])

[[0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [0. 1.]
 [1. 0.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [0. 1.]
 [0. 1.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [1. 0.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [0. 1.]]


In [343]:
rs = 1112

#Train-test-split
X_train, X_test, y_train, y_test = train_test_split(X, y_flipped, test_size=0.3)

#Random Forest
rf = RandomForestClassifier(random_state = rs)
rf.fit(X_train, np.ravel(y_train))

#Unpruned Decision Tree
dt = DecisionTreeClassifier(random_state = rs)
dt.fit(X_train, y_train)

#Decision stump
dts = DecisionTreeClassifier(max_depth = 1, random_state = rs)
dts.fit(X_train, y_train)


DecisionTreeClassifier(max_depth=1, random_state=1112)

In [344]:
#Accuracy Scores

#Unpruned Decision Tree Score
dt_score = dt.score(X_train, y_train)
print(f'Decision Tree Accuracy on train set: {dt_score:.2f}')

dt_score = dt.score(X_val, y_test)
print(f'Decision Tree Accuracy on test set: {dt_score:.2f}')
print('')

#Random Forest Score
rf_score = rf.score(X_train, y_train)
print(f'Random Forest Accuracy on train set: {rf_score:.2f}')

rf_score = rf.score(X_val, y_test)
print(f'Random Forest Accuracy on test set: {rf_score:.2f}')
print('')

#Decision Stump Score
dts_score = dts.score(X_train, y_train)
print(f'Decision Stump Accuracy on train set: {dts_score:.2f}')

dts_score = dts.score(X_val, y_test)
print(f'Decision Stump Accuracy on test set: {dts_score:.2f}')
print('')


Decision Tree Accuracy on train set: 1.00
Decision Tree Accuracy on test set: 0.52

Random Forest Accuracy on train set: 1.00
Random Forest Accuracy on test set: 0.54

Decision Stump Accuracy on train set: 0.61
Decision Stump Accuracy on test set: 0.50



In [345]:
#Cross Validation
means = []
sds = []

#Unpruned Decision Trees
dt_scores = cross_val_score(DecisionTreeClassifier(random_state = rs), X, y_flipped, cv=KFold(10))
#print(dt_scores)
print(f"Decision Tree Mean test accuracy: {np.mean(dt_scores):.3f}")
means += [dt_scores]

std = stdev(dt_scores)
sds += [std]
print(f"Decision Tree Standard Deviation: {std:.3f}")
print('')

#Random Forest
rf_scores = cross_val_score(RandomForestClassifier(random_state = rs), X, y_flipped, cv=KFold(10))
#print(rf_scores)
print(f"Random Forest Mean test accuracy: {np.mean(rf_scores):.3f}")
means += [rf_scores]

std = stdev(rf_scores)
sds += [std]
print(f"Random Forest Standard Deviation: {std:.3f}")
print('')

#Decision Stumps
dts_scores = cross_val_score(DecisionTreeClassifier(max_depth = 1, random_state = rs), X, y_flipped, cv=KFold(10))
#print(dt_scores)
print(f"Decision Stump Mean test accuracy: {np.mean(dts_scores):.3f}")
means += [dts_scores]

std = stdev(dts_scores)
sds += [std]
print(f"Decision Stump Standard Deviation: {std:.3f}")
print('')

Decision Tree Mean test accuracy: 0.580
Decision Tree Standard Deviation: 0.045

Random Forest Mean test accuracy: 0.661
Random Forest Standard Deviation: 0.040

Decision Stump Mean test accuracy: 0.579
Decision Stump Standard Deviation: 0.027



In [346]:
y_flipped = pd.DataFrame(y_flipped)
#Pruned Decision Tree
kf = KFold(n_splits = 10, shuffle = True)
pdt = DecisionTreeClassifier(ccp_alpha = best_ccp_alpha, random_state = rs)

pdt_scores = []
for i in range(10):
    result = next(kf.split(X_noise), None)
    X_train = X.iloc[result[0]]
    X_test = X.iloc[result[1]]
    y_train = y_flipped.iloc[result[0]]
    y_test = y_flipped.iloc[result[1]]
    model = pdt.fit(X_train, y_train)
    predictions = pdt.predict(X_test)
    pdt_scores.append(model.score(X_test, y_test))

#print('Scores from each Iteration: ', pdt_scores)
print('Pruned Decision Tree Mean test accuracy:', np.mean(pdt_scores))
means += [np.mean(pdt_scores)]

std = stdev(pdt_scores)
sds += [std]
print(f"Pruned Decision Tree Standard Deviation: {std:.3f}")
print('')

Pruned Decision Tree Mean test accuracy: 0.594
Pruned Decision Tree Standard Deviation: 0.034



In [347]:
#Autorank
classifiers = ['Decision Tree', 'Random Forest', 'Decision Stumps', 'Pruned Decision Trees']

data = pd.DataFrame()
for i in range(4):
     data[classifiers[i]] = np.random.normal(means[i], sds[i], 10).clip(0, 1)

result = autorank(data, alpha=0.05, verbose=False)
print(result)

  ax1.set_yticklabels(np.insert(self.groupsunique.astype(str), 0, ''))


RankResult(rankdf=
                       meanrank      mean       std  ci_lower  ci_upper  \
Random Forest               1.5  0.645182  0.076839  0.609736  0.680627   
Pruned Decision Trees       2.7  0.589153  0.030703  0.553707  0.624598   
Decision Tree               2.9  0.591672  0.068819  0.556226  0.627117   
Decision Stumps             2.9  0.580831  0.047684  0.545386  0.616277   

                      effect_size   magnitude  
Random Forest                   0  negligible  
Pruned Decision Trees    0.957601       large  
Decision Tree            0.733623      medium  
Decision Stumps           1.00634       large  
pvalue=0.04444426074756692
cd=None
omnibus=anova
posthoc=tukeyhsd
all_normal=True
pvals_shapiro=[0.5800461769104004, 0.14320023357868195, 0.31138208508491516, 0.585840106010437]
homoscedastic=True
pval_homogeneity=0.056523377705134714
homogeneity_test=bartlett
alpha=0.05
alpha_normality=0.0125
num_samples=10
posterior_matrix=
None
decision_matrix=
None
rope=None


## Task Seven

* If you split the data into a training and test set first, and only afterwards add 20% normal multiplicative noise to the training set, how differently does your algorithm behave? Try this again and add noise only to the test set. Are the results different? Discuss how each of these approaches affect your results

In [389]:
rs = 121314

#Train-test-split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

#Noise
noise = np.random.normal(1, 0.2, np.shape(X_test))
X_test = np.multiply(X_test, noise)

#Random Forest
rf = RandomForestClassifier(random_state = rs)
rf.fit(X_train, np.ravel(y_train))

#Unpruned Decision Tree
dt = DecisionTreeClassifier(random_state = rs)
dt.fit(X_train, y_train)

#Decision stump
dts = DecisionTreeClassifier(max_depth = 1, random_state = rs)
dts.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=1, random_state=121314)

In [390]:
#Accuracy Scores

#Unpruned Decision Tree Score
dt_score = dt.score(X_train, y_train)
print(f'Decision Tree Accuracy on train set: {dt_score:.2f}')

dt_score = dt.score(X_test, y_test)
print(f'Decision Tree Accuracy on test set: {dt_score:.2f}')
print('')

#Random Forest Score
rf_score = rf.score(X_train, y_train)
print(f'Random Forest Accuracy on train set: {rf_score:.2f}')

rf_score = rf.score(X_test, y_test)
print(f'Random Forest Accuracy on test set: {rf_score:.2f}')
print('')

#Decision Stump Score
dts_score = dts.score(X_train, y_train)
print(f'Decision Stump Accuracy on train set: {dts_score:.2f}')

dts_score = dts.score(X_test, y_test)
print(f'Decision Stump Accuracy on test set: {dts_score:.2f}')
print('')


Decision Tree Accuracy on train set: 1.00
Decision Tree Accuracy on test set: 0.61

Random Forest Accuracy on train set: 1.00
Random Forest Accuracy on test set: 0.73

Decision Stump Accuracy on train set: 0.63
Decision Stump Accuracy on test set: 0.57



In [391]:
means = []
sds = []
X = pd.DataFrame(X)
y = pd.DataFrame(y)

#Decision Tree
kf = KFold(n_splits = 10, shuffle = True)
dt = (DecisionTreeClassifier(random_state = rs))

dt_scores = []
for i in range(10):
    result = next(kf.split(X), None)
    X_train = X.iloc[result[0]]
    X_test = X.iloc[result[1]]
    noise = np.random.normal(1, 0.2, np.shape(X_test))
    X_test = np.multiply(X_test, noise)
    
    y_train = y.iloc[result[0]]
    y_test = y.iloc[result[1]]
    model = dt.fit(X_train, y_train)
    predictions = dt.predict(X_test)
    dt_scores.append(model.score(X_test, y_test))

#print('Scores from each Iteration: ', pdt_scores)
print('Decision Tree Mean test accuracy:', np.mean(dt_scores))
means += [np.mean(dt_scores)]

std = stdev(dt_scores)
sds += [std]
print(f"Decision Tree Standard Deviation: {std:.3f}")
print('')

#y = np.ravel(y)
#Random Forest
rf = (RandomForestClassifier(random_state = rs))

rf_scores = []
for i in range(10):
    result = next(kf.split(X), None)
    X_train = X.iloc[result[0]]
    X_test = X.iloc[result[1]]
    noise = np.random.normal(1, 0.2, np.shape(X_test))
    X_test = np.multiply(X_test, noise)
    
    y_train = y.iloc[result[0]]
    y_train = np.ravel(y_train)
    y_test = y.iloc[result[1]]
    model = rf.fit(X_train, y_train)
    predictions = rf.predict(X_test)
    rf_scores.append(model.score(X_test, y_test))

#print('Scores from each Iteration: ', pdt_scores)
print('Random Forest Mean test accuracy:', np.mean(rf_scores))
means += [np.mean(rf_scores)]

std = stdev(rf_scores)
sds += [std]
print(f"Random Forest Standard Deviation: {std:.3f}")
print('')

#Decision Stump
ds = (DecisionTreeClassifier(max_depth = 1, random_state = rs))

ds_scores = []
for i in range(10):
    result = next(kf.split(X), None)
    X_train = X.iloc[result[0]]
    X_test = X.iloc[result[1]]
    noise = np.random.normal(1, 0.2, np.shape(X_test))
    X_test = np.multiply(X_test, noise)
    
    y_train = y.iloc[result[0]]
    y_test = y.iloc[result[1]]
    y_train = np.ravel(y_train)
    model = ds.fit(X_train, y_train)
    predictions = ds.predict(X_test)
    ds_scores.append(model.score(X_test, y_test))

#print('Scores from each Iteration: ', pdt_scores)
print('Decision Tree Mean test accuracy:', np.mean(ds_scores))
means += [np.mean(ds_scores)]

std = stdev(ds_scores)
sds += [std]
print(f"Decision Tree Standard Deviation: {std:.3f}")
print('')



Decision Tree Mean test accuracy: 0.6589999999999999
Decision Tree Standard Deviation: 0.034

Random Forest Mean test accuracy: 0.724
Random Forest Standard Deviation: 0.044

Decision Tree Mean test accuracy: 0.615
Decision Tree Standard Deviation: 0.049



In [392]:

#Pruned Decision Tree
pdt = DecisionTreeClassifier(ccp_alpha = best_ccp_alpha, random_state = rs)

pdt_scores = []
for i in range(10):
    result = next(kf.split(X_noise), None)
    X_train = X.iloc[result[0]]
    X_test = X.iloc[result[1]]
    noise = np.random.normal(1, 0.2, np.shape(X_test))
    X_test = np.multiply(X_test, noise)
    
    
    y_train = y.iloc[result[0]]
    y_test = y.iloc[result[1]]
    model = pdt.fit(X_train, y_train)
    predictions = pdt.predict(X_test)
    pdt_scores.append(model.score(X_test, y_test))

#print('Scores from each Iteration: ', pdt_scores)
print('Pruned Decision Tree Mean test accuracy:', np.mean(pdt_scores))
means += [np.mean(pdt_scores)]

std = stdev(pdt_scores)
sds += [std]
print(f"Pruned Decision Tree Standard Deviation: {std:.3f}")
print('')

Pruned Decision Tree Mean test accuracy: 0.619
Pruned Decision Tree Standard Deviation: 0.055



In [393]:
#Autorank
classifiers = ['Decision Tree', 'Random Forest', 'Decision Stumps', 'Pruned Decision Trees']

data = pd.DataFrame()
for i in range(4):
     data[classifiers[i]] = np.random.normal(means[i], sds[i], 10).clip(0, 1)

result = autorank(data, alpha=0.05, verbose=False)
print(result)

RankResult(rankdf=
                       meanrank      mean       std  ci_lower  ci_upper  \
Random Forest               1.2  0.730022  0.037257  0.703174  0.756869   
Decision Tree               2.5  0.658249  0.023950  0.631402  0.685097   
Decision Stumps             3.0  0.615517  0.048870  0.588669  0.642364   
Pruned Decision Trees       3.3  0.612904  0.059996  0.586056  0.639751   

                      effect_size   magnitude  
Random Forest                   0  negligible  
Decision Tree              2.2917       large  
Decision Stumps           2.63514       large  
Pruned Decision Trees     2.34527       large  
pvalue=8.299012785904896e-06
cd=None
omnibus=anova
posthoc=tukeyhsd
all_normal=True
pvals_shapiro=[0.3737439513206482, 0.1279202401638031, 0.6751849055290222, 0.9680892825126648]
homoscedastic=True
pval_homogeneity=0.06921407977297021
homogeneity_test=bartlett
alpha=0.05
alpha_normality=0.0125
num_samples=10
posterior_matrix=
None
decision_matrix=
None
rope=None


  ax1.set_yticklabels(np.insert(self.groupsunique.astype(str), 0, ''))
