In [108]:
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.metrics import accuracy_score 
from sklearn.preprocessing import StandardScaler
from ReliefF import ReliefF
from sklearn import tree
from sklearn.impute import SimpleImputer

### Helper function

In [95]:
def runNB(featuresTrain, labelsTrain, featuresTest, labelsTest):
    """Train a Gaussian Naive Bayes classifier and score it on held-out data.

    Parameters
    ----------
    featuresTrain, labelsTrain
        Feature matrix and labels the classifier is fitted on.
    featuresTest, labelsTest
        Feature matrix to predict for and the true labels to compare against.

    Returns
    -------
    The accuracy of the predictions on ``featuresTest`` as computed by
    ``accuracy_score``.
    """
    model = GaussianNB().fit(featuresTrain, labelsTrain)
    return accuracy_score(labelsTest, model.predict(featuresTest))

## Main

In [63]:
# Load the raw table; the file has no header row, so columns are numbered 0..N.
df = pd.read_csv("data.csv", header=None)

# Column 3120 holds the class label; every other column is a feature.
features = df.drop(columns=[3120])
labels = df.loc[:, 3120]

### Idiot Classifier (majority-class baseline)

In [64]:
# Class balance: fraction of samples labelled 1, then fraction labelled 0.
print((labels == 1).sum() / len(labels))
print((labels == 0).sum() / len(labels))

0.34654377880184334
0.6534562211981567


### Main

In [109]:
# Repeated hold-out evaluation: ten 70/30 train/test splits, recording the
# class balance of every split and the accuracies returned by each strategy.

# Fraction of label 1 / label 0 in the train and test part of every split.
labelSplitTrain1 = list()
labelSplitTrain0 = list()
labelSplitTest1 = list()
labelSplitTest0 = list()

# Accuracy before ("Pre") and after ("Post") ReliefF feature selection,
# one entry per split, for the simple and the class-mean imputation strategy.
accPreSimple = list()
accPostSimple = list()
accPreClass = list()
accPostClass = list()

for repeat in range(0,10):

    # Seed every split with its repeat index: the ten splits still differ from
    # each other, but re-running the notebook reproduces the same results.
    labelsTrain, labelsTest, featuresTrain, featuresTest = train_test_split(
        labels, features, test_size=0.3, random_state=repeat)

    labelSplitTrain1.append((labelsTrain == 1).mean())
    labelSplitTrain0.append((labelsTrain == 0).mean())
    labelSplitTest1.append((labelsTest == 1).mean())
    labelSplitTest0.append((labelsTest == 0).mean())

    # The simple-imputation strategy is currently disabled, so accPreSimple and
    # accPostSimple stay empty.
#     accPre, accPost = simpleImputationRelief(labelsTrain,labelsTest,featuresTrain,featuresTest)
#     accPreSimple.append(accPre)
#     accPostSimple.append(accPost)

    accPre, accPost = classMeanImputationRelif(labelsTrain,labelsTest,featuresTrain,featuresTest)
    accPreClass.append(accPre)
    accPostClass.append(accPost)


[3019 2669  719  309  699]
[-31248. -31226. -31208. -31190. -31124.]
[3048 3018  699  719  718]
[-27148. -27124. -27082. -27034. -26906.]
[2669  719 3019  699  309]
[-32024. -32030. -31972. -31930. -31814.]
[ 309  719  699 2669 3019]
[-28302. -28314. -28288. -28244. -28196.]
[ 309  699 2669 3019  719]
[-23268. -23250. -23206. -23174. -23088.]
[3049 2669  699 3019  309]
[-31926. -31924. -31896. -31856. -31756.]
[3019  309  699  719 2669]
[-26890. -26872. -26844. -26764. -26652.]
[3048  308  718 2668  309]
[-25722. -25704. -25684. -25638. -25550.]
[3049  309 3019  699 2669]
[-29010. -28998. -28972. -28946. -28884.]
[3049 2669 3019  719  309]
[-30414. -30394. -30360. -30298. -30254.]


In [89]:
# Write the per-split accuracies to an Excel workbook (results.xlsx).
# Each list is wrapped in a Series so that lists of different lengths (for
# example a strategy that was not run and left its list empty) are padded with
# missing values instead of making the DataFrame constructor raise.
pd.DataFrame({'Simple Imputation Pre': pd.Series(accPreSimple, dtype='float64'),
              'Simple Imputation Post': pd.Series(accPostSimple, dtype='float64'),
              'Class Imputation Pre': pd.Series(accPreClass, dtype='float64'),
              'Class Imputation Post': pd.Series(accPostClass, dtype='float64')}).to_excel("results.xlsx")

## Strategies

In [84]:
def simpleImputationRelief(labelsTrain,labelsTest,featuresTrain,featuresTest):
    """Mean-impute the features, then compare Naive Bayes before/after ReliefF.

    The mean imputer is fitted on the training features only and then applied
    to both the training and the test features. ReliefF is fitted on the
    imputed training data (100 neighbours, 100 features kept) and the same
    selection is applied to the imputed test data.

    ``labelsTrain`` must expose ``.values`` (e.g. a pandas Series).

    Returns
    -------
    (accuracy on all imputed features, accuracy on the ReliefF-selected features)
    """
    # Imputation: column means are learned from the training split only.
    meanImputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    trainImputed = meanImputer.fit_transform(featuresTrain)
    testImputed = meanImputer.transform(featuresTest)
    accAllFeatures = runNB(trainImputed, labelsTrain, testImputed, labelsTest)

    # ReliefF: rank features on the training split and keep the best ones.
    selector = ReliefF(n_neighbors=100, n_features_to_keep=100)
    trainSelected = selector.fit_transform(trainImputed, labelsTrain.values)
    # Apply the selection learned on the training split to the test split.
    testSelected = selector.transform(testImputed)
    accSelectedFeatures = runNB(trainSelected, labelsTrain, testSelected, labelsTest)

    return accAllFeatures, accSelectedFeatures
    

In [104]:
def classMeanImputationRelif(labelsTrain,labelsTest,featuresTrain,featuresTest):
    """Class-mean-impute the training data, then compare Naive Bayes before/after ReliefF.

    Missing training values are replaced by the mean of that feature among the
    training samples of the same class. Test labels are what is being
    predicted, so they are never used for imputation: missing test values are
    replaced by the overall training mean of the feature. A feature that has
    no observed value within one training class also falls back to the overall
    training mean, and features with no observed training value at all are
    dropped from both splits.

    Parameters
    ----------
    labelsTrain, labelsTest
        Class labels of the training / test samples (array-like).
    featuresTrain, featuresTest
        Feature tables of the training / test samples, row-aligned with the
        corresponding labels and sharing the same columns.

    Returns
    -------
    (accuracy on all imputed features, accuracy on the ReliefF-selected features)

    Side effects
    ------------
    Prints the original column labels of the five top-ranked ReliefF features.
    """
    # Work positionally so labels and feature rows stay aligned regardless of
    # the index the caller's split left on them.
    trainLabels = np.asarray(labelsTrain)
    testLabels = np.asarray(labelsTest)
    trainDf = pd.DataFrame(featuresTrain).reset_index(drop=True)
    testDf = pd.DataFrame(featuresTest).reset_index(drop=True)

    # Overall training means: used for the test split and as a fallback.
    overallMeans = trainDf.mean()

    # A feature without any observed training value cannot be imputed; drop it
    # from both splits so train and test keep identical columns.
    keptColumns = trainDf.columns[overallMeans.notna().to_numpy()]
    trainDf = trainDf[keptColumns]
    testDf = testDf[keptColumns]
    overallMeans = overallMeans[keptColumns]

    # Class mean imputation (training split only). transform("mean") yields,
    # for every row, the per-feature mean of that row's class.
    classMeans = trainDf.groupby(trainLabels).transform("mean")
    imputedTrainFeatureDf = trainDf.fillna(classMeans).fillna(overallMeans)

    # Test rows are imputed without looking at their labels.
    imputedTestFeatureDf = testDf.fillna(overallMeans)

    trainValues = imputedTrainFeatureDf.to_numpy()
    testValues = imputedTestFeatureDf.to_numpy()
    acc1 = runNB(trainValues, trainLabels, testValues, testLabels)

    #relief
    featuresToKeep = 100
    fs = ReliefF(n_neighbors=500, n_features_to_keep=featuresToKeep)
    reducedFeaturesTrain = fs.fit_transform(trainValues, trainLabels)
    #extract best features
    reducedFeaturesTest = fs.transform(testValues)
    acc2 = runNB(reducedFeaturesTrain, trainLabels, reducedFeaturesTest, testLabels)

    #print top 5 features, reported by their original column label rather than
    #by their position in the array handed to ReliefF
    print(np.asarray(keptColumns)[fs.top_features[:5]])

    return acc1, acc2

### Top 5 Features

In [34]:
# Reorder the imputed training columns by ReliefF rank (best feature first).
# NOTE(review): `imputedTrainFeatureDf` and `fs` are only assigned inside the
# strategy functions in the cells visible here; this cell presumably relies on
# variables left over from an earlier interactive session - confirm.
rankedColumns = fs.top_features
sortedDf = pd.DataFrame(imputedTrainFeatureDf).loc[:, rankedColumns]
sortedDf

Unnamed: 0,3047,3048,699,2669,3049,719,3019,309,697,698,...,1310,190,470,3080,1040,1370,0,1410,1510,1790
0,1.277675,1.251375,0.275110,0.188708,1.255038,0.346858,0.655775,1.007470,0.272668,0.269376,...,0.898679,69.7772,14.8323,0.983728,4.29053,0.332307,1051.160,2.35395,0.692904,0.751020
1,1.277675,1.251375,0.275110,0.188708,1.255038,0.346858,0.655775,1.007470,0.272668,0.269376,...,0.930511,70.2354,16.4235,1.635370,3.42465,0.256522,921.464,2.26761,0.736669,0.781069
2,1.229800,1.105800,0.248065,0.172040,1.188340,0.263858,0.603416,1.326580,0.259206,0.257855,...,0.582305,70.0793,13.0901,0.685188,2.64425,0.354710,771.188,1.19248,0.365511,0.674632
3,1.277675,1.251375,0.275110,0.188708,1.255038,0.346858,0.655775,1.007470,0.272668,0.269376,...,1.009940,110.7630,11.6436,0.350582,1.09466,0.693424,765.374,1.84226,1.785510,2.964010
4,1.277675,1.251375,0.275110,0.188708,1.255038,0.346858,0.655775,1.007470,0.272668,0.269376,...,0.736879,69.5602,16.9353,1.550760,3.36894,0.332045,969.171,2.21095,0.827404,0.720115
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
754,1.342332,1.347992,0.332973,0.218379,1.346903,0.462932,1.218788,0.829584,0.324611,0.332276,...,1.086750,112.0420,11.4311,0.425897,1.14175,0.636677,750.271,1.91232,1.906580,2.875620
755,1.433730,1.537990,0.496854,0.165573,1.311540,0.585602,1.423160,0.637661,0.352362,0.378251,...,1.041070,65.4637,13.6933,0.760714,3.02086,0.335463,1123.720,1.82158,0.901600,0.693700
756,1.267230,1.514610,0.267908,0.220346,1.441310,0.395297,0.588791,0.951802,0.221326,0.195777,...,0.553528,70.7964,13.8329,0.617122,2.31757,0.345972,671.886,1.05556,0.449086,0.462930
757,1.342332,1.347992,0.332973,0.218379,1.346903,0.462932,1.218788,0.829584,0.324611,0.332276,...,0.988567,111.6200,12.7461,0.396878,1.23115,0.678457,762.541,1.81302,1.732990,2.984340


In [27]:
# Show the full ReliefF ranking held by the selector.
# NOTE(review): `fs` is only assigned inside the strategy functions in the
# cells visible here, so this presumably depends on leftover kernel state - confirm.
fs.top_features

array([2669,  699,  309, ..., 1040, 1370,    0], dtype=int64)