In [97]:
import numpy as np
import pandas as pd
from scipy.io.arff import loadarff
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

%matplotlib inline

### Obscured A

#### Decision Tree - Accuracy vs Dummy Classifier

In [85]:
data = loadarff("Data\\obscuredA.arff")
df = pd.DataFrame(data[0])

#iterate through each column and convert the bytes to string
columns = list(df.columns)
for colName in columns:
    df[colName] = df[colName].str.decode("utf-8")
    

In [86]:
#create features and labels
labels = df['contam_code']
features = df.drop("contam_code", axis=1)

In [87]:
#encode features and labels as integers
oe = OrdinalEncoder()
encodedFeatures = oe.fit_transform(features)

lbe = LabelEncoder()
encodedLabels = lbe.fit_transform(labels)

In [88]:
#create decision tree 
clf = DecisionTreeClassifier()

In [89]:
#cross validate
acc = cross_val_score(clf, encodedFeatures, encodedLabels, cv=10, n_jobs=-1)



In [90]:
acc

array([0.92085954, 0.92059748, 0.91981132, 0.92112159, 0.92085954,
       0.91902516, 0.92348008, 0.92085954, 0.92321803, 0.91771488])

#### Confusion Matrix

In [117]:
#80 20 split
trainFeatures, testFeatures, trainLabels, testLabels = train_test_split(encodedFeatures, encodedLabels, test_size=0.2, random_state=42)

In [118]:
#create decision tree 
clf = DecisionTreeClassifier()
clf.fit(trainFeatures,trainLabels)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [119]:
acc = accuracy_score(testLabels,predictions)
print(acc)

0.9183700209643606


In [120]:
#predict and create confusion matrix
predictions = clf.predict(testFeatures)
cm = confusion_matrix(testLabels,predictions)

In [121]:
#confusion matrix 
df_cm = pd.DataFrame(cm)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(df_cm)

    0     1  2  3  4  5  6  7  8  9  10  11  12  13  14  15  16   17  18  19  \
0   2    31  0  0  0  0  0  0  0  0   0   0   0   0   0   0   0    0   0   0   
1   4  6713  0  0  0  1  9  0  1  1   3   1   0   0   7   0   0   25   0   2   
2   0     3  0  0  0  0  0  0  0  0   0   0   0   0   0   0   0    0   0   0   
3   0     2  0  0  0  0  0  0  0  0   0   0   0   0   0   0   0    0   0   0   
4   0    10  0  0  0  0  0  0  0  0   0   0   0   0   0   0   0    0   0   0   
5   0    19  0  0  0  0  0  0  0  0   0   0   0   0   0   0   0    0   0   0   
6   0    63  0  0  0  1  3  0  0  0   0   0   0   0   0   0   0   17   0   0   
7   0     6  0  0  0  0  0  0  0  0   0   0   0   0   1   0   0    0   0   0   
8   0     0  0  0  0  0  0  0  0  0   0   0   0   0   0   0   0    0   0   0   
9   0    12  0  0  0  0  0  0  0  0   0   0   0   0   0   0   0    0   0   1   
10  0    13  0  0  0  0  0  0  0  0   0   0   0   0   0   0   0    0   0   0   
11  0     1  0  0  0  0  0  0  0  0   0 

### Dummy Classifier - Majority Class

In [55]:
#look at distribution of labels in the class
labels = np.unique(encodedLabels)

#count labels
labelCount = list()
for label in labels:
    #count how many times a specific label occurs in the entire label set
    count = np.sum(np.equal(label,encodedLabels))
    labelCount.append(count)

#create dataframe to examine
labelInfoDf = pd.DataFrame({"raw label":np.unique(df['contam_code']), "encoded label": labels, "count":labelCount})

In [60]:
#if you pick label c the entire time what is the accuracy 
acc = labelInfoDf['count'][2]/np.sum(labelInfoDf['count'])
print(acc)

#accuarcy is high but this does not mean there is a signal

0.8896750524109015


### Obscured B 

#### Decision Tree - Accuracy vs Dummy Classifier

In [69]:
data = loadarff("Data\\obscuredB.arff")
df = pd.DataFrame(data[0])

#iterate through each column and convert the bytes to string
columns = list(df.columns)
for colName in columns:
    df[colName] = df[colName].str.decode("utf-8")
    

In [70]:
#create features and labels
labels = df['contam_code']
features = df.drop("contam_code", axis=1)

In [71]:
#encode features and labels as integers
oe = OrdinalEncoder()
encodedFeatures = oe.fit_transform(features)

lbe = LabelEncoder()
encodedLabels = lbe.fit_transform(labels)

In [72]:
#create decision tree 
clf = DecisionTreeClassifier()

In [77]:
acc = cross_val_score(clf, encodedFeatures, encodedLabels, cv=10, n_jobs=-1)



In [78]:
acc

array([0.8865304 , 0.88574423, 0.88731656, 0.88679245, 0.88574423,
       0.88443396, 0.88495807, 0.88574423, 0.88600629, 0.88548218])

#### Confusion Matrix