In [264]:
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.metrics import accuracy_score 
from sklearn.preprocessing import MinMaxScaler
from ReliefF import ReliefF
from sklearn.feature_selection import VarianceThreshold

In [265]:
# The CSV has no header row, so pandas labels the columns 0..N-1.
df = pd.read_csv("data.csv", header=None)
print(df.shape)

# Column 3120 (the last column, given the printed shape) is the class label;
# every other column is treated as a feature.
features = df.drop(columns=3120)
labels = df[3120]

(1085, 3121)


### Baseline: majority-class ("idiot") classifier

In [247]:
# Class balance: fraction of samples carrying label 1, then label 0.
# The larger of the two is the accuracy of always predicting the majority class.
for classValue in (1, 0):
    print(np.sum(labels == classValue) / labels.size)

0.34654377880184334
0.6534562211981567


### 1. Drop any NA rows

In [248]:
# Keep only the rows in which no column is missing, then check how many survive.
method1Df = df.dropna(axis=0, how="any")
method1Df.shape

(0, 3121)

Every row is missing at least one value, so dropping incomplete rows leaves no data; the missing values have to be imputed instead.

### 2. Simple Imputation - Mean of column

In [266]:
# Split before any fitting (imputer, scaler, selectors) so that nothing learned
# from the test rows leaks into training. train_test_split returns a
# (train, test) pair per input array, in input order: labels first, then features.
(
    labelsTrain,
    labelsTest,
    featuresTrain,
    featuresTest,
) = train_test_split(labels, features, test_size=0.3, random_state=42)
print(featuresTrain.shape, featuresTest.shape, sep="\n")

(759, 3120)
(326, 3120)


In [295]:
# Mean imputation, one column at a time. The column means are learned from the
# training split only and then reused to fill the gaps in both splits.
imp = SimpleImputer(strategy="mean", missing_values=np.nan)
imp.fit(featuresTrain)
featuresTrainImputed = imp.transform(featuresTrain)
featuresTestImputed = imp.transform(featuresTest)

In [296]:
# Baseline: many attributes and few instances, so the feature count will need
# reducing later. First record Gaussian Naive Bayes accuracy on the
# mean-imputed features with every column kept.
clf = GaussianNB().fit(featuresTrainImputed, labelsTrain)
predictions = clf.predict(featuresTestImputed)
acc = accuracy_score(y_true=labelsTest, y_pred=predictions)
print(acc)

0.6932515337423313


### 3. Normalisation using MinMaxScaler

In [297]:
# Min-max scaling: the per-column min/max are learned from the imputed training
# split and the same mapping is then applied to both splits.
scaler = MinMaxScaler().fit(featuresTrainImputed)
featuresTrainScaled = scaler.transform(featuresTrainImputed)
featuresTestScaled = scaler.transform(featuresTestImputed)

In [298]:
# Re-run Gaussian Naive Bayes on the min-max scaled features to see what
# scaling alone does to test accuracy.
clf = GaussianNB().fit(featuresTrainScaled, labelsTrain)
predictions = clf.predict(featuresTestScaled)
acc = accuracy_score(y_true=labelsTest, y_pred=predictions)
print(acc)

0.6993865030674846


### 4. Remove low variance features

In [299]:
# Drop columns whose variance on the scaled training split does not exceed 0.05.
# The surviving column set is decided on the training split and reused for test.
sel = VarianceThreshold(threshold=0.05)
sel.fit(featuresTrainScaled)
featuresTrainVarianceAdjusted = sel.transform(featuresTrainScaled)
featuresTestVarianceAdjusted = sel.transform(featuresTestScaled)

In [300]:
# Gaussian Naive Bayes again, this time on the features that survived the
# variance filter.
clf = GaussianNB().fit(featuresTrainVarianceAdjusted, labelsTrain)
predictions = clf.predict(featuresTestVarianceAdjusted)
acc = accuracy_score(y_true=labelsTest, y_pred=predictions)
print(acc)

0.6779141104294478


### 5. Class imbalance

### Feature Selection - using ReliefF

In [210]:
# ReliefF set-up: keep the 500 top-ranked features, using 20 neighbours
# per sample when scoring.
featuresToKeep = 500
fs = ReliefF(n_features_to_keep=featuresToKeep, n_neighbors=20)

In [211]:
reducedFeaturesTrain = fs.fit_transform(featuresTrain, labelsTrain.values)

In [212]:
#extract best features
reducedFeaturesTest = featuresTest[:,fs.top_features][:,:featuresToKeep] 

In [213]:
# Gaussian Naive Bayes on the ReliefF-selected features.
clf = GaussianNB().fit(reducedFeaturesTrain, labelsTrain)
predictions = clf.predict(reducedFeaturesTest)
acc = accuracy_score(y_true=labelsTest, y_pred=predictions)
print(acc)

0.6319018404907976
