In [283]:
import sys
import pandas as pd
import numpy as np
from sklearn.preprocessing import Imputer
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import KFold
from sklearn.metrics import classification_report

In [284]:
def load_data(filename):
    data = []
    with open(filename) as f:
        lines = f.readlines()
        for line in lines:
            line = line.replace("y", "1").replace('republican', '1')
            line = line.replace("n", "0").replace('democrat', '0')
            line = line.replace("?", "None")
            data.append(line.strip().split(','))
    
    data = pd.DataFrame(data)
    data = data.apply(pd.to_numeric, errors='coerce')
    
    return data 



In [285]:
data = load_data("house-votes-84.data")
columns = list(data.columns)

noMissing_data = data.dropna()
noMissing_data = noMissing_data.reset_index(drop=True)

fill_NaN = Imputer(missing_values=np.nan, strategy='mean', axis=1)
imputed_data = pd.DataFrame(fill_NaN.fit_transform(data))
imputed_data.columns = data.columns
imputed_data.index = data.index

yesMissing_data = data.fillna(3)

dt = tree.DecisionTreeClassifier()
gnb = GaussianNB()

datas = {"noMissing_data":noMissing_data, 'imputed_data':imputed_data, 'yesMissing_data':yesMissing_data}
models = {'Decision Tree':dt, 'Naïve Bayes':gnb}
for data_name, data in datas.items():
    for model_name, model in models.items():
#         scores = cross_val_score(estimator= model,     # Model to test
#                         X= data[columns[1:]],  
#                         y = data[0],      # Target variable
#                         scoring = "accuracy",               # Scoring metric    
#                         cv=5)
        fold_accuracy = []
        cv = KFold(n=len(data),  # Number of elements
           n_folds=5,            # Desired number of cv folds
           random_state=12)
        for train_fold, valid_fold in cv:
            train = data.loc[train_fold] # Extract train data with cv indices
            valid = data.loc[valid_fold] # Extract valid data with cv indices

            trained_model = model.fit(X = train[columns[1:]], 
                                   y = train[0])
            valid_acc = trained_model.score(X = valid[columns[1:]], 
                                    y = valid[0])
            fold_accuracy.append(valid_acc)    
        
        print ('Report of %s on %s:\n' %(model_name, data_name))
        pred = model.predict(data[columns[1:]])
        print(classification_report(data[0], pred, target_names=['republican', 'democrat']))
        print ("\n\n")
#         print ('Accuracy of %s on %s : %f' %(model_name, data_name, sum(fold_accuracy)/len(fold_accuracy)))
        



Report of Decision Tree on imputed_data:

             precision    recall  f1-score   support

 republican       1.00      0.99      0.99       267
   democrat       0.98      0.99      0.99       168

avg / total       0.99      0.99      0.99       435




Report of Naïve Bayes on imputed_data:

             precision    recall  f1-score   support

 republican       0.97      0.95      0.96       267
   democrat       0.92      0.95      0.93       168

avg / total       0.95      0.95      0.95       435




Report of Decision Tree on noMissing_data:

             precision    recall  f1-score   support

 republican       0.98      0.98      0.98       124
   democrat       0.97      0.98      0.98       108

avg / total       0.98      0.98      0.98       232




Report of Naïve Bayes on noMissing_data:

             precision    recall  f1-score   support

 republican       0.96      0.96      0.96       124
   democrat       0.95      0.95      0.95       108

avg / total      