In [108]:
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.metrics import accuracy_score 
from sklearn.preprocessing import StandardScaler
from ReliefF import ReliefF
from sklearn import tree
from sklearn.impute import SimpleImputer

### Helper function

In [95]:
def runNB(featuresTrain, labelsTrain, featuresTest, labelsTest):    
    clf = GaussianNB()
    clf.fit(featuresTrain, labelsTrain)
    predictions = clf.predict(featuresTest)
    acc = accuracy_score(labelsTest,predictions)
    return acc

## Main

In [63]:
df = pd.read_csv("data.csv", header=None)

labels = df[3120]
features = df.drop(3120, axis=1)

### Idiot Classifier

In [64]:
#class balance
print((np.sum(labels == 1))/labels.shape[0])
print((np.sum(labels == 0))/labels.shape[0])

0.34654377880184334
0.6534562211981567


### Main

In [111]:
labelSplitTrain1 = list()
labelSplitTrain0 = list()
labelSplitTest1 = list()
labelSplitTest0 = list()

accPreSimple = list()
accPostSimple = list()
accPreClass = list()
accPostClass = list()

for _ in range(0,10):
    
    labelsTrain, labelsTest, featuresTrain, featuresTest = train_test_split(labels,features,test_size=0.3)
    
    labelSplitTrain1.append((np.sum(labelsTrain == 1))/labelsTrain.shape[0])
    labelSplitTrain0.append((np.sum(labelsTrain == 0))/labelsTrain.shape[0])
    labelSplitTest1.append((np.sum(labelsTest == 1))/labelsTest.shape[0])
    labelSplitTest0.append((np.sum(labelsTest == 0))/labelsTest.shape[0])
    
#     accPre, accPost = simpleImputationRelief(labelsTrain,labelsTest,featuresTrain,featuresTest)
#     accPreSimple.append(accPre)
#     accPostSimple.append(accPost)
    
    accPre, accPost = classMeanImputationRelif(labelsTrain,labelsTest,featuresTrain,featuresTest)
    accPreClass.append(accPre)
    accPostClass.append(accPost)


[3019 2669  309  699  719]
[3049  719 2669 3019  309]
[3048  699 2669  719  718]
[ 309 3019 2669  719  699]
[ 719 2669 3019  699  309]
[3048  699 2668 3019 3018]
[2669  719  309  699 3019]
[ 719 3049 2669 3019  699]
[ 309 3019  699 2669  719]
[3049 3019  719  699 2669]


In [89]:
#write out as pdf
pd.DataFrame({'Simple Imputation Pre': accPreSimple,
             'Simple Imputation Post': accPostSimple, 
             'Class Imputation Pre': accPreClass,
             'Class Imputation Post': accPostClass}).to_excel("results.xlsx")

## Strategies

In [84]:
def simpleImputationRelief(labelsTrain,labelsTest,featuresTrain,featuresTest):
    #imputation
    imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
    imputedTrainFeatureDf = imp_mean.fit_transform(featuresTrain)
    imputedTestFeatureDf = imp_mean.transform(featuresTest)
    acc1 = runNB(imputedTrainFeatureDf, labelsTrain, imputedTestFeatureDf, labelsTest)
    
    #relief
    featuresToKeep = 100
    fs = ReliefF(n_neighbors=100, n_features_to_keep=featuresToKeep)
    reducedFeaturesTrain = fs.fit_transform(imputedTrainFeatureDf, labelsTrain.values)
    #extract best features
    reducedFeaturesTest = fs.transform(imputedTestFeatureDf)
    acc2 = runNB(reducedFeaturesTrain, labelsTrain, reducedFeaturesTest, labelsTest)
    
    return acc1, acc2
    

In [110]:
def classMeanImputationRelif(labelsTrain,labelsTest,featuresTrain,featuresTest):
    #class mean imputation
    imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')

    labels = pd.unique(labelsTest)
    imputedTrainFeatureDf = pd.DataFrame()
    imputedTestFeatureDf = pd.DataFrame()

    trainingLabels = list()
    testingLabels = list()

    for label in labels:
        #subset dataframe
        trainDf = featuresTrain[labelsTrain == label]
        testDf = featuresTest[labelsTest == label]

        trainDf = imp_mean.fit_transform(trainDf)
        testDf = imp_mean.transform(testDf)

        trainingLabels = trainingLabels + ([label] * trainDf.shape[0])
        testingLabels = testingLabels + ([label] * testDf.shape[0])

        imputedTrainFeatureDf = pd.concat([imputedTrainFeatureDf, pd.DataFrame(trainDf)], ignore_index=True)
        imputedTestFeatureDf = pd.concat([imputedTestFeatureDf, pd.DataFrame(testDf)], ignore_index=True)

    labelsTrain = np.array(trainingLabels)
    labelsTest = np.array(testingLabels)
    acc1 = runNB(imputedTrainFeatureDf, labelsTrain, imputedTestFeatureDf, labelsTest)
    
    #relief
    featuresToKeep = 100
    fs = ReliefF(n_neighbors=500, n_features_to_keep=featuresToKeep)
    reducedFeaturesTrain = fs.fit_transform(imputedTrainFeatureDf.values, labelsTrain)
    #extract best features
    reducedFeaturesTest = fs.transform(imputedTestFeatureDf.values)
    acc2 = runNB(reducedFeaturesTrain, labelsTrain, reducedFeaturesTest, labelsTest)
    
    #print top 5 features
    print(fs.top_features[:5])
#     print(fs.feature_scores[:5])
    
    return acc1, acc2

### Top 5 Features

In [147]:
top_features = pd.read_excel("results.xlsx", sheet_name="Feature Ranking", dtype=str)

In [149]:
ranking = top_features.groupby(['Top features']).size().sort_values(ascending=False)

In [150]:
ranking.to_excel("ranking.xlsx")