In [97]:
import numpy as np
import pandas as pd
from scipy.io.arff import loadarff
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

%matplotlib inline

### Obscured A

#### Decision Tree - Accuracy vs Dummy Classifier

In [227]:
data = loadarff("Data\\obscuredA.arff")
df = pd.DataFrame(data[0])

#iterate through each column and convert the bytes to string
columns = list(df.columns)
for colName in columns:
    df[colName] = df[colName].str.decode("utf-8")
    

In [228]:
#create features and labels
labels = df['contam_code']
features = df.drop("contam_code", axis=1)

In [229]:
#encode features and labels as integers
oe = OrdinalEncoder()
encodedFeatures = oe.fit_transform(features)

lbe = LabelEncoder()
encodedLabels = lbe.fit_transform(labels)

In [187]:
#create decision tree 
clf = DecisionTreeClassifier()

In [188]:
#cross validate
acc = cross_val_score(clf, encodedFeatures, encodedLabels, cv=10, n_jobs=-1)



In [189]:
acc

array([0.92085954, 0.92007338, 0.91954927, 0.9216457 , 0.92059748,
       0.91902516, 0.92348008, 0.92085954, 0.92295597, 0.91797694])

#### Confusion Matrix

In [241]:
#80 20 split
trainFeatures, testFeatures, trainLabels, testLabels = train_test_split(encodedFeatures, encodedLabels, test_size=0.2, random_state=42)

In [242]:
#create decision tree 
clf = DecisionTreeClassifier()
clf.fit(trainFeatures,trainLabels)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [243]:
acc = accuracy_score(testLabels,predictions)
print(acc)

0.8819444444444444


In [249]:
#predict and create confusion matrix
predictions = clf.predict(testFeatures)
cm = confusion_matrix(testLabels,predictions, labels=range(0,len(lbe.classes_)))

In [250]:
#confusion matrix 
df_cm = pd.DataFrame(cm)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(df_cm)

    0  1     2  3  4  5  6  7  8  9  10  11  12  13  14  15  16  17  18   19  \
0   0  0     0  0  0  0  0  0  0  0   0   0   0   0   0   0   0   0   0    0   
1   0  2    31  0  0  0  0  0  0  0   0   0   0   0   0   0   0   0   0    0   
2   0  2  6715  0  0  0  1  9  0  1   1   3   0   1   0   0   7   0   0   25   
3   0  0     3  0  0  0  0  0  0  0   0   0   0   0   0   0   0   0   0    0   
4   0  0     2  0  0  0  0  0  0  0   0   0   0   0   0   0   0   0   0    0   
5   0  0    10  0  0  0  0  0  0  0   0   0   0   0   0   0   0   0   0    0   
6   0  0    19  0  0  0  0  0  0  0   0   0   0   0   0   0   0   0   0    0   
7   0  0    63  0  0  0  1  3  0  0   0   0   0   0   0   0   0   0   0   17   
8   0  0     6  0  0  0  0  0  0  0   0   0   0   0   0   0   1   0   0    0   
9   0  0     0  0  0  0  0  0  0  0   0   0   0   0   0   0   0   0   0    0   
10  0  0    12  0  0  0  0  0  0  0   0   0   0   0   0   0   0   0   0    0   
11  0  0    13  0  0  0  0  0  0  0   0 

In [251]:
df_cm.to_csv("ConfusionMatrixA.csv")

### Dummy Classifier - Majority Class

In [231]:
#look at distribution of labels in the class
labels = np.unique(encodedLabels)

#count labels
labelCount = list()
for label in labels:
    #count how many times a specific label occurs in the entire label set
    count = np.sum(np.equal(label,encodedLabels))
    labelCount.append(count)

#create dataframe to examine
labelInfoDf = pd.DataFrame({"raw label":np.unique(df['contam_code']), "encoded label": labels, "count":labelCount})

In [232]:
labelInfoDf

Unnamed: 0,raw label,encoded label,count
0,a,0,3
1,b,1,146
2,c,2,33950
3,d,3,14
4,e,4,7
5,f,5,47
6,g,6,107
7,h,7,482
8,i,8,33
9,j,9,10


In [225]:
#if you pick label c the entire time what is the accuracy 
acc = labelInfoDf['count'][2]/np.sum(labelInfoDf['count'])
print(acc)

#accuarcy is high but this does not mean there is a signal

0.8896750524109015


### Obscured B 

#### Decision Tree - Accuracy vs Dummy Classifier

In [252]:
data = loadarff("Data\\obscuredB.arff")
df = pd.DataFrame(data[0])

#iterate through each column and convert the bytes to string
columns = list(df.columns)
for colName in columns:
    df[colName] = df[colName].str.decode("utf-8")
    

In [253]:
#create features and labels
labels = df['contam_code']
features = df.drop("contam_code", axis=1)

In [254]:
#encode features and labels as integers
oe = OrdinalEncoder()
encodedFeatures = oe.fit_transform(features)

lbe = LabelEncoder()
encodedLabels = lbe.fit_transform(labels)

In [255]:
#create decision tree 
clf = DecisionTreeClassifier()

In [256]:
acc = cross_val_score(clf, encodedFeatures, encodedLabels, cv=10, n_jobs=-1)



In [257]:
acc

array([0.8865304 , 0.88548218, 0.88757862, 0.88679245, 0.88600629,
       0.88443396, 0.88574423, 0.88574423, 0.88574423, 0.88548218])

#### Confusion Matrix

In [258]:
#80 20 split
trainFeatures, testFeatures, trainLabels, testLabels = train_test_split(encodedFeatures, encodedLabels, test_size=0.2, random_state=42)

In [259]:
#create decision tree 
clf = DecisionTreeClassifier()
clf.fit(trainFeatures,trainLabels)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [260]:
acc = accuracy_score(testLabels,predictions)
print(acc)

0.8343815513626834


In [261]:
#predict and create confusion matrix
predictions = clf.predict(testFeatures)
cm = confusion_matrix(testLabels,predictions, labels=range(0,len(lbe.classes_)))

In [262]:
#confusion matrix 
df_cm = pd.DataFrame(cm)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(df_cm)

    0  1     2  3  4  5  6  7  8  9  10  11  12  13  14  15  16  17  18  19  \
0   0  0     1  0  0  0  0  0  0  0   0   0   0   0   0   0   0   0   0   0   
1   0  0    22  0  0  0  0  0  0  0   0   0   0   0   0   0   0   0   1   0   
2   0  0  6743  0  0  1  3  1  0  0   0   1   0   0   0   0   6   0   0  20   
3   0  0     4  0  0  0  0  0  0  0   0   0   0   0   0   0   0   0   0   0   
4   0  0     2  0  0  0  0  0  0  0   0   0   0   0   0   0   0   0   0   0   
5   0  0     8  0  0  0  0  0  0  0   0   0   0   0   0   0   0   0   0   0   
6   0  0    21  0  0  0  0  0  0  0   0   0   0   0   0   0   0   0   0   0   
7   0  0   101  0  0  0  0  0  0  0   0   0   0   0   0   0   0   0   0   0   
8   0  0     5  0  0  0  0  0  0  0   0   0   0   0   0   0   0   0   0   0   
9   0  0     2  0  0  0  0  0  0  0   0   0   0   0   0   0   0   0   0   0   
10  0  0    13  0  0  0  0  0  0  0   0   0   0   0   0   0   0   0   0   0   
11  0  0    13  0  0  0  0  0  0  0   0   0   0   0 

In [263]:
df_cm.to_csv("ConfusionMatrixB.csv")