# Missing data and classification analysis

### Classifiers that provide a prediction probability (`predict_proba`)

GradientBoostingClassifier()
DecisionTreeClassifier()
RandomForestClassifier()
LinearDiscriminantAnalysis()
LogisticRegression()
KNeighborsClassifier()
GaussianNB()
ExtraTreesClassifier()
BaggingClassifier()

In [1]:
import numpy as np
import pandas as pd
import fancyimpute
from sklearn import model_selection
from sklearn.model_selection import StratifiedKFold







Using TensorFlow backend.


In [2]:
from sklearn.ensemble import AdaBoostClassifier # PROBABILITY
from sklearn.tree import DecisionTreeClassifier # PROBABILITY
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier # PROBABILITY
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier # PROBABILITY
from sklearn.linear_model import LogisticRegression # PROBABILITY
from sklearn.naive_bayes import GaussianNB # PROBABILITY
from sklearn.ensemble import ExtraTreesClassifier # PROBABILITY
from sklearn.neighbors import KNeighborsClassifier # PROBABILITY
from sklearn.ensemble import BaggingClassifier # PROBABILITY


class dataTest:
    """Train and score a fixed panel of scikit-learn classifiers on one dataset.

    The constructor splits ``X``/``target`` into a training and a test part and
    builds the list of models.  Typical use afterwards is ``train()`` followed
    by ``predict()``; ``crossValidation()`` works on the full ``X``/``target``
    and does not need ``train()`` to be called first.

    Attributes set by ``__init__``:
        x_train, y_train: training split returned by ``train_test_split``.
        x_test: test features.
        y_test: test labels flattened to a 1-D numpy array.
        models: list of (unfitted) estimators, in a fixed order; every result
            table has one row per model in this order.
        cv_x, cv_y: the complete, unsplit data used for cross validation.
        k_fold: number of stratified folds for cross validation.
    """

    def __init__(self, X, target, test_size=0.33, random_state=12345678, k_fold=5):
        """Split the data and instantiate the models.

        Args:
            X: feature table passed to ``train_test_split``.
            target: labels aligned with ``X``; ``statistics`` counts the
                values 1 (positive) and 0 (negative).
            test_size: fraction of samples held out for ``predict()``.
            random_state: seed used for the split, the models and the folds.
            k_fold: number of folds used by ``crossValidation()``.
        """
        train_x, test_x, train_y, test_y = model_selection.train_test_split(
            X, target, test_size=test_size, random_state=random_state)

        self.x_train = train_x
        self.y_train = train_y
        self.x_test = test_x
        # np.asarray(...).ravel() works for Series, arrays and lists alike and
        # avoids the deprecated pandas Series.ravel().
        self.y_test = np.asarray(test_y).ravel()
        self.random_state = random_state
        self.models = []  # list of the models
        self.models_definition(self.random_state)

        self.cv_x = X
        self.cv_y = target
        self.k_fold = k_fold

    @staticmethod
    def _is_default(value, sentinel):
        """Return True when ``value`` is ``None`` or the given sentinel string.

        The ``isinstance`` check comes first because comparing a DataFrame or
        an array to a string with ``==`` is element-wise and cannot be used in
        a boolean context.
        """
        return value is None or (isinstance(value, str) and value == sentinel)

    def crossValidation(self):
        """Cross-validate every model on the full data set.

        Uses shuffled ``StratifiedKFold`` with ``k_fold`` splits seeded by
        ``random_state`` and each estimator's default score.

        Returns:
            DataFrame with one row per model; column 0 is the mean score and
            column 1 its standard deviation, both expressed in percent.
        """
        print ("begin cross validation")
        evaluation = []
        for model in self.models:
            folds = StratifiedKFold(n_splits=self.k_fold,
                                    random_state=self.random_state,
                                    shuffle=True)
            scores = model_selection.cross_val_score(model, self.cv_x, self.cv_y, cv=folds)
            evaluation.append((round(np.average(scores), 4) * 100,
                               round(np.std(scores), 4) * 100))

        print ("end cross validation")
        return pd.DataFrame(evaluation)

    def models_definition(self, random_state):
        """Append the panel of classifiers to ``self.models``.

        The parameters of the models can be tuned here.  Keywords that have
        been removed from scikit-learn and were only ever set to their default
        (``normalize``, ``min_impurity_split``, ``base_estimator``,
        ``multi_class``) are no longer passed, so the same code runs on old
        and current releases.

        Args:
            random_state: seed given to every estimator that accepts one.
        """
        self.models.append(AdaBoostClassifier(
            DecisionTreeClassifier(max_depth=1, random_state=random_state),
            algorithm="SAMME", n_estimators=200))
        # RadiusNeighborsClassifier(radius=10.0) is currently disabled.
        self.models.append(RidgeClassifier(
            alpha=1.0, fit_intercept=True, copy_X=True, max_iter=None,
            tol=0.001, class_weight=None, solver='auto',
            random_state=random_state))
        paramsGB = {'n_estimators': 120, 'max_depth': 3, 'subsample': 0.5,
                    'learning_rate': 0.01, 'min_samples_leaf': 1,
                    'random_state': random_state}
        self.models.append(GradientBoostingClassifier(**paramsGB))
        self.models.append(DecisionTreeClassifier(random_state=random_state))

        # max_features='sqrt' is what 'auto' meant for classifiers; 'auto'
        # itself is rejected by recent scikit-learn releases.
        self.models.append(RandomForestClassifier(
            bootstrap=True, class_weight=None, criterion='gini', max_depth=2,
            max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False,
            random_state=random_state, verbose=0, warm_start=False))

        self.models.append(LinearDiscriminantAnalysis(
            n_components=None, priors=None, shrinkage=None, solver='svd',
            store_covariance=False, tol=0.0001))
        # liblinear always fits one-vs-rest, so multi_class='ovr' was redundant.
        self.models.append(LogisticRegression(
            penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True,
            intercept_scaling=1, class_weight=None, random_state=random_state,
            solver='liblinear', max_iter=100, verbose=0, warm_start=False,
            n_jobs=1))
        self.models.append(KNeighborsClassifier(
            n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30,
            p=2, metric='minkowski', metric_params=None, n_jobs=1))
        self.models.append(GaussianNB())

        self.models.append(ExtraTreesClassifier(n_estimators=250, random_state=random_state))

        # The base estimator is left at its default (a decision tree).
        self.models.append(BaggingClassifier(
            n_estimators=10, max_samples=1.0, max_features=1.0,
            bootstrap=True, bootstrap_features=False, oob_score=False,
            warm_start=False, n_jobs=1, random_state=random_state, verbose=0))

        ## add other models ...

    def train(self, x_train='self_train_x', y_train='self_train_y'):
        """Fit every model.

        Args:
            x_train: training features; when left at its default (or ``None``)
                the internal training split is used.
            y_train: training labels; same defaulting rule as ``x_train``.
        """
        if self._is_default(x_train, 'self_train_x'):
            x_train = self.x_train
        if self._is_default(y_train, 'self_train_y'):
            y_train = self.y_train

        print ("START TRAINING")
        for model in self.models:
            model.fit(x_train, y_train)
        print ("END TRAINING")

    def statistics(self, predicted_y, test_y):
        """Compute confusion counts and summary metrics for binary labels.

        Labels equal to 1 are the positive class and labels equal to 0 the
        negative class; samples with any other true label are not counted.

        Args:
            predicted_y: predicted labels.
            test_y: true labels, same length as ``predicted_y``.

        Returns:
            list ``[TP, TN, FP, FN, accuracy, precision, recall]``.  A ratio
            whose denominator is zero is reported as ``0.0`` instead of
            raising ``ZeroDivisionError``.

        Raises:
            ValueError: if the two inputs do not have the same length.
        """
        # Work on plain arrays so indexing is positional even when a pandas
        # Series with a non-default index is passed.
        actual = np.asarray(test_y).ravel()
        predicted = np.asarray(predicted_y).ravel()
        if actual.shape != predicted.shape:
            raise ValueError("predicted_y and test_y must have the same length")

        correct = actual == predicted
        is_pos = actual == 1
        is_neg = actual == 0

        true_pos = int(np.sum(correct & is_pos))
        true_neg = int(np.sum(correct & is_neg))
        false_pos = int(np.sum(~correct & is_neg))
        false_neg = int(np.sum(~correct & is_pos))

        def ratio(numerator, denominator):
            return numerator / denominator if denominator else 0.0

        return [
            true_pos,
            true_neg,
            false_pos,
            false_neg,
            ratio(true_pos + true_neg, actual.size),    # ACCURACY
            ratio(true_pos, true_pos + false_pos),      # PRECISION
            ratio(true_pos, true_pos + false_neg),      # RECALL
        ]

    def predict(self, x_test="self_test_x", test_y="self_test_y"):
        """Score every model on a test set.

        Args:
            x_test: test features; when left at its default (or ``None``) the
                internal test split is used.
            test_y: true labels; same defaulting rule as ``x_test``.

        Returns:
            DataFrame with one row per model and the seven columns produced by
            ``statistics`` (TP, TN, FP, FN, accuracy, precision, recall).
        """
        if self._is_default(x_test, "self_test_x"):
            x_test = self.x_test
        if self._is_default(test_y, "self_test_y"):
            test_y = self.y_test

        prediction = [
            self.statistics(np.asarray(model.predict(x_test)).ravel(), test_y)
            for model in self.models
        ]
        return pd.DataFrame(prediction)
    
      

In [3]:
# TEST

# Import data.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
d = pd.read_pickle('../data/data.pickle')

# Create the working dataset: every persistent case plus a fixed-size random
# sample (without replacement) of the paroxysmal cases.
# NOTE(review): n=332 is hard-coded; presumably chosen to balance the two
# classes -- confirm against the size of Data_pers.
Data_pers = d[d['AFclass'] == 'persistierend (>7 Tage, EKV)']
Data_paro = d[d['AFclass'] == 'paroxysmal']
Data_paro_1 = Data_paro.sample(n=332, random_state=1234, replace=False)
balanced_1 = pd.concat([Data_pers, Data_paro_1])


# Recover missing values.

target = "AFclass"
balanced_1 = balanced_1.drop(['Soggetti', 'PCneg', 'IPG'], axis=1)
# Encode the categorical columns as 0/1.  Values that are not in a mapping
# become NaN and are therefore imputed together with the other missing values.
balanced_1['patsex'] = balanced_1['patsex'].map({'männlich': 1, 'weiblich': 0})
balanced_1[target] = balanced_1[target].map({'persistierend (>7 Tage, EKV)': 1, 'paroxysmal': 0})
features = balanced_1.columns[balanced_1.columns != target]
x_incomplete = balanced_1[features]
# .values replaces Series.as_matrix(), which was removed in pandas 1.0.
y = balanced_1[target].values
# Impute missing values with a KNN strategy (k=15).  fit_transform() is the
# current fancyimpute entry point; the older complete() no longer exists.
x_knn_a = fancyimpute.KNN(15).fit_transform(x_incomplete)
x_knn = pd.DataFrame(x_knn_a, columns=features)
# Both frames get a fresh 0..n-1 index, so the column-wise concat aligns
# features and labels row by row.
y_new = pd.DataFrame({target: y})
Data_KNN = pd.concat([x_knn, y_new], axis=1)
balanced_1 = Data_KNN


# Separate features and target variable.
X_bal_1 = balanced_1[balanced_1.columns[balanced_1.columns != target]]
Y_bal_1 = balanced_1[target]


# Create test and training sets.
train_x, test_x, train_y, test_y = model_selection.train_test_split(X_bal_1, Y_bal_1, test_size=0.33, random_state=1242)

trainData = pd.concat([train_x, train_y], axis=1)
# Flatten the labels to a plain 1-D array (Series.ravel() is deprecated).
test_y = np.asarray(test_y).ravel()



Imputing row 1/684 with 1 missing, elapsed time: 0.253
Imputing row 101/684 with 0 missing, elapsed time: 0.426
Imputing row 201/684 with 60 missing, elapsed time: 0.436
Imputing row 301/684 with 0 missing, elapsed time: 0.451
Imputing row 401/684 with 0 missing, elapsed time: 0.470
Imputing row 501/684 with 0 missing, elapsed time: 0.481
Imputing row 601/684 with 0 missing, elapsed time: 0.502


In [4]:
# Build the evaluator from the full imputed feature table and its labels;
# dataTest performs its own train/test split, so the pre-split
# train_x/train_y/test_x/test_y are not passed in.
g = dataTest(X = X_bal_1,target = Y_bal_1)

In [5]:
# Fit every classifier on the evaluator's internal training split.
g.train()

START TRAINING
END TRAINING


In [6]:
# Score every fitted classifier on the evaluator's internal test split.
hh = g.predict()

In [7]:
# Each row corresponds to a classifier. Columns 0-3 are the counts of true
# positives, true negatives, false positives and false negatives; columns
# 4, 5 and 6 are (respectively) the ACCURACY, PRECISION and RECALL.


hh.head(20)

Unnamed: 0,0,1,2,3,4,5,6
0,69,55,55,47,0.548673,0.556452,0.594828
1,60,59,51,56,0.526549,0.540541,0.517241
2,83,49,61,33,0.584071,0.576389,0.715517
3,62,46,64,54,0.477876,0.492063,0.534483
4,68,61,49,48,0.570796,0.581197,0.586207
5,60,59,51,56,0.526549,0.540541,0.517241
6,63,56,54,53,0.526549,0.538462,0.543103
7,59,68,42,57,0.561947,0.584158,0.508621
8,51,78,32,65,0.570796,0.614458,0.439655
9,69,54,56,47,0.544248,0.552,0.594828


In [8]:
# Stratified k-fold cross validation of every classifier on the full dataset.
cv = g.crossValidation()

begin cross validation
end cross validation


In [9]:
# One row per classifier: column 0 is the mean cross-validation score and
# column 1 its standard deviation, both in percent.
cv

Unnamed: 0,0,1
0,57.16,1.82
1,55.43,3.25
2,56.56,2.92
3,51.89,3.6
4,58.34,1.02
5,55.28,3.21
6,55.71,2.93
7,54.38,2.81
8,54.97,2.18
9,56.72,1.07
