In [1]:
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.metrics import accuracy_score 
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from ReliefF import ReliefF
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import RFECV
from sklearn import tree

### Helper function

In [2]:
def runNB(featuresTrain, labelsTrain, featuresTest, labelsTest):    
    clf = GaussianNB()
    clf.fit(featuresTrain, labelsTrain)
    predictions = clf.predict(featuresTest)
    acc = accuracy_score(labelsTest,predictions)
    print(acc)

In [3]:
def meanOrMode(x):
    if x.dtype == 'object':
        return x.mode()
    else:
        return x.mean()

## Main

In [141]:
df = pd.read_csv("data.csv", header=None)

labels = df[3120]
features = df.drop(3120, axis=1)

### Idiot Classifier

In [247]:
#class balance
print((np.sum(labels == 1))/labels.size)
print((np.sum(labels == 0))/labels.size)

0.34654377880184334
0.6534562211981567


## Imputation

### 2. Impute - Class mean

In [142]:
imputedDf = pd.DataFrame()
for label in labels.unique():
    tempDf = features[labels==label].apply(lambda x: x.fillna(meanOrMode(x)), axis=0)
    imputedDf = pd.concat([imputedDf,tempDf], axis=0)
    
imputedDf = imputedDf.sort_index()

In [143]:
labelsTrain, labelsTest, featuresTrain, featuresTest = train_test_split(labels,imputedDf,test_size=0.3, random_state=42)

In [144]:
runNB(featuresTrain, labelsTrain, featuresTest, labelsTest)

0.6993865030674846


### 3. Normalisation using MinMaxScaler

In [145]:
scaler = MinMaxScaler()
featuresTrainScaled = scaler.fit_transform(featuresTrain)
featuresTestScaled = scaler.transform(featuresTest)

In [146]:
runNB(featuresTrainScaled, labelsTrain, featuresTestScaled, labelsTest)

0.7147239263803681


### 4. Recursive Feature Elimination

In [147]:
clf = tree.DecisionTreeClassifier()
selector = RFECV(clf, step=0.1, cv=10, n_jobs=-1)
featuresTrainReduced = selector.fit_transform(featuresTrainScaled, labelsTrain)
featuresTestReduced = selector.transform(featuresTestScaled)

In [148]:
featuresTestReduced.shape

(326, 1248)

In [149]:
runNB(featuresTrainReduced, labelsTrain, featuresTestReduced, labelsTest)

0.7760736196319018


### Feature Selection - using ReliefF

In [160]:
featuresToKeep = 100
fs = ReliefF(n_neighbors=20, n_features_to_keep=featuresToKeep)

In [161]:
reducedFeaturesTrain = fs.fit_transform(featuresTrainReduced, labelsTrain.values)

In [162]:
#extract best features
reducedFeaturesTest = fs.transform(featuresTestReduced)

In [163]:
runNB(reducedFeaturesTrain, labelsTrain, reducedFeaturesTest, labelsTest)

0.9386503067484663


### Top 5 Features

In [181]:
sortedDf = pd.DataFrame(featuresTrainReduced)[fs.top_features]
sortedDf

Unnamed: 0,1148,1178,303,713,693,1177,712,1147,692,302,...,244,164,724,414,799,664,949,1009,144,54
0,0.180801,0.358771,0.527640,0.282962,0.302648,0.388143,0.288655,0.180084,0.311655,0.543096,...,0.396522,0.883553,0.546681,0.196126,0.182257,0.614470,0.255813,0.487738,0.589567,0.045295
1,0.180801,0.358771,0.527640,0.282962,0.302648,0.388143,0.288655,0.180084,0.311655,0.543096,...,0.178842,0.155559,0.122775,0.158051,0.851660,0.598413,0.105225,0.403876,0.524496,0.866622
2,0.057432,0.440684,0.550914,0.217817,0.156841,0.277338,0.133664,0.146919,0.247760,0.819526,...,0.265566,0.280168,0.069191,0.062438,0.165734,0.415687,0.441714,0.380846,0.192838,0.845130
3,0.618699,0.493165,0.385806,0.492996,0.432608,0.534052,0.491678,0.604108,0.473886,0.390886,...,0.800694,0.867029,0.442132,0.311441,0.177214,0.308868,0.446441,0.310418,0.703393,0.025908
4,0.180801,0.358771,0.527640,0.282962,0.302648,0.388143,0.288655,0.180084,0.311655,0.543096,...,0.655879,0.891801,0.489394,0.279762,0.135927,0.414494,0.180550,0.494330,0.688114,0.050136
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
754,0.922221,0.814669,0.414147,0.497143,0.637205,0.484904,0.518048,0.784261,0.553191,0.686778,...,0.161839,0.155416,0.506176,0.739155,0.569086,0.287075,0.292170,0.579183,0.611233,0.861116
755,0.801897,0.235689,0.333711,0.386360,0.616213,0.042335,0.594147,0.946161,0.448090,0.643750,...,0.204138,0.116447,0.349077,0.352315,0.684089,0.291026,0.437605,0.390397,0.493745,0.874752
756,0.618699,0.493165,0.385806,0.492996,0.432608,0.534052,0.491678,0.604108,0.473886,0.390886,...,0.812339,0.932494,0.386266,0.567797,0.141876,0.268601,0.152100,0.477531,0.782348,0.051094
757,0.085312,0.000000,0.253839,0.256388,0.297132,0.404359,0.484862,0.077360,0.380118,0.517870,...,0.144064,0.217634,0.124467,0.104235,0.125405,0.289966,0.358314,0.337667,0.122295,0.819535


In [183]:
fs.top_features[:5]

array([1148, 1178,  303,  713,  693], dtype=int64)