In [9]:
import numpy as np
import pandas as pd
from scipy.io.arff import loadarff
import copy

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder


### Obscured A

#### Decision Tree - Accuracy vs Dummy Classifier

In [78]:
# Parse the ARFF file; only the first element of loadarff's result (the
# records) is used to build the frame.
# NOTE(review): this cell sits under the "Obscured A" heading but reads
# obscuredB.arff - confirm this is the intended file.
data = loadarff("Data/obscuredB.arff")
df = pd.DataFrame(data[0])

# The values arrive as bytes, so decode every column to utf-8 text in one pass
columns = list(df.columns)
df[columns] = df[columns].apply(lambda col: col.str.decode("utf-8"))
    

In [79]:
# Percentage share of each class label, ordered like np.unique(labelsA)
labelsA = df['contam_code']
countsA = np.unique(labelsA, return_counts=True)[1]
distA = [count * 100 / len(labelsA) for count in countsA]

In [6]:
# 'contam_code' is the target; every remaining column is used as a feature
features = df.drop(columns=["contam_code"])
labels = df['contam_code']

In [7]:
# Map the label column to integer codes
lbe = LabelEncoder()
encodedLabels = lbe.fit_transform(labels)

# Map every feature column to ordinal codes
oe = OrdinalEncoder()
encodedFeatures = oe.fit_transform(features)

In [187]:
#create decision tree
# random_state is fixed so repeated runs build the same tree: the classifier
# randomly permutes the features at each split, so an unseeded fit can give
# different scores from run to run.
clf = DecisionTreeClassifier(random_state=42)

In [188]:
# 10-fold cross-validation of the tree, using all available cores
acc = cross_val_score(
    estimator=clf,
    X=encodedFeatures,
    y=encodedLabels,
    cv=10,
    n_jobs=-1,
)



In [189]:
# Display the per-fold scores stored in acc by cross_val_score
acc

array([0.92085954, 0.92007338, 0.91954927, 0.9216457 , 0.92059748,
       0.91902516, 0.92348008, 0.92085954, 0.92295597, 0.91797694])

### Randomisation

In [10]:
# Randomisation baseline: the features stay as they are while the labels are
# put in random order.
# A seeded Generator makes the shuffled labels (and every score computed from
# them) repeatable on a fresh kernel. permutation() returns a shuffled copy,
# so encodedLabels itself is left untouched.
labelRng = np.random.default_rng(42)
randomisedEncodedLabels = labelRng.permutation(encodedLabels)

In [11]:
#create decision tree
# random_state is fixed so repeated runs build the same tree: the classifier
# randomly permutes the features at each split, so an unseeded fit can give
# different scores from run to run.
clf = DecisionTreeClassifier(random_state=42)

In [13]:
# 10-fold cross-validation against the shuffled labels, using all cores
acc = cross_val_score(
    estimator=clf,
    X=encodedFeatures,
    y=randomisedEncodedLabels,
    cv=10,
    n_jobs=-1,
)



In [14]:
# Display the per-fold scores stored in acc by cross_val_score
acc

array([0.88731656, 0.88784067, 0.88522013, 0.88705451, 0.88810273,
       0.88679245, 0.88548218, 0.88600629, 0.88495807, 0.88626834])

### Obscured B 

#### Decision Tree - Accuracy vs Dummy Classifier

In [252]:
# A forward slash keeps the path usable on Windows, Linux and macOS, and
# matches the other load cells in this notebook.
data = loadarff("Data/obscuredB.arff")
df = pd.DataFrame(data[0])

#iterate through each column and convert the bytes to string
columns = list(df.columns)
for colName in columns:
    df[colName] = df[colName].str.decode("utf-8")

# Shuffle the rows. np.random.shuffle is meant for arrays and mutable
# sequences; a DataFrame indexes by column label, so it is not a supported
# input. sample(frac=1) returns every row in random order; the seed makes the
# order repeatable, and the index is rebuilt to 0..n-1.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [253]:
# 'contam_code' is the target; every remaining column is used as a feature
features = df.drop(columns=["contam_code"])
labels = df['contam_code']

In [254]:
# Map the label column to integer codes
lbe = LabelEncoder()
encodedLabels = lbe.fit_transform(labels)

# Map every feature column to ordinal codes
oe = OrdinalEncoder()
encodedFeatures = oe.fit_transform(features)

In [255]:
#create decision tree
# random_state is fixed so repeated runs build the same tree: the classifier
# randomly permutes the features at each split, so an unseeded fit can give
# different scores from run to run.
clf = DecisionTreeClassifier(random_state=42)

In [256]:
# 10-fold cross-validation of the tree, using all available cores
acc = cross_val_score(
    estimator=clf,
    X=encodedFeatures,
    y=encodedLabels,
    cv=10,
    n_jobs=-1,
)



In [257]:
# Display the per-fold scores stored in acc by cross_val_score
acc

array([0.8865304 , 0.88548218, 0.88757862, 0.88679245, 0.88600629,
       0.88443396, 0.88574423, 0.88574423, 0.88574423, 0.88548218])

#### Obscured B distribution as the set gets smaller

### Decrease size

In [86]:
# Parse the ARFF file; only the first element of loadarff's result (the
# records) is used to build the frame.
# NOTE(review): the following cells store results from this frame under
# "A50" names, yet the file read here is obscuredB.arff - confirm the path.
data = loadarff("Data/obscuredB.arff")
df = pd.DataFrame(data[0])

# The values arrive as bytes, so decode every column to utf-8 text in one pass
columns = list(df.columns)
df[columns] = df[columns].apply(lambda col: col.str.decode("utf-8"))

In [87]:
# Percentage share of each class label, ordered like np.unique(labelsA50)
labelsA50 = df['contam_code']
countsA50 = np.unique(labelsA50, return_counts=True)[1]
distA50 = [count * 100 / len(labelsA50) for count in countsA50]

In [88]:
# One row per class label next to its percentage share (column 'A')
pd.DataFrame({
    'labels': np.unique(labelsA50),
    'A': distA50,
})

Unnamed: 0,labels,A
0,a,0.007862
1,b,0.3826
2,c,88.967505
3,d,0.036688
4,e,0.018344
5,f,0.123166
6,g,0.280398
7,h,1.263103
8,i,0.086478
9,j,0.026205


In [74]:
# Parse the ARFF file; only the first element of loadarff's result (the
# records) is used to build the frame.
data = loadarff("Data/obscuredA-25.arff")
df = pd.DataFrame(data[0])

# The values arrive as bytes, so decode every column to utf-8 text in one pass
columns = list(df.columns)
df[columns] = df[columns].apply(lambda col: col.str.decode("utf-8"))

In [75]:
# Percentage share of each class label, ordered like np.unique(labelsA25)
labelsA25 = df['contam_code']
countsA25 = np.unique(labelsA25, return_counts=True)[1]
distA25 = [count * 100 / len(labelsA25) for count in countsA25]

In [77]:
# One row per class label next to its percentage share (column 'A')
pd.DataFrame({
    'labels': np.unique(labelsA25),
    'A': distA25,
})

Unnamed: 0,labels,A
0,a,0.010482
1,b,0.398323
2,c,88.144654
3,d,0.041929
4,e,0.031447
5,f,0.125786
6,g,0.303983
7,h,1.310273
8,i,0.062893
9,j,0.010482


In [63]:
# One row per class label next to its percentage share (column 'A')
pd.DataFrame({
    'labels': np.unique(labelsA),
    'A': distA,
})

Unnamed: 0,labels,A
0,a,0.007862
1,b,0.3826
2,c,88.967505
3,d,0.036688
4,e,0.018344
5,f,0.123166
6,g,0.280398
7,h,1.263103
8,i,0.086478
9,j,0.026205
