In [46]:
import numpy as np
import collections as collections
from sklearn import datasets
import graphviz
import sklearn.tree
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score, cross_val_predict, KFold
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics
from StringIO import StringIO

In [47]:
#Name of CSV file containing the data
filename = "house-votes-84.data.txt"

#Names of the columns
colnames = "Class Name,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,\
physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,\
mx-missile,immigration,synfuels-corporation-cutback,education-spending,superfund-right-to-sue,\
crime,duty-free-exports,export-administration-act-south-africa"

#Array of column names
colarr = colnames.split(",")

#Feature names
feature_names = np.array(colarr[1:])

#Target names
target_names = ['republican', 'democrat']

#Gets the raw data from the CSV and return a list of instances loaded into arrays
def _rawList():
    rows = list()
    #Iterate over the CSV line by line
    with open(filename) as f:
        for line in f:
            arr=line.strip().split(",")#split line into array
            rows.append(arr)
    return rows

#Store the rawList() output
_rawListOutput = _rawList()

#Gets the raw data as a numpy array
def _rawData():
    return np.asarray(_rawListOutput)

#Replace the missing values with the most frequent value for the corresponding feature
#Return the updated numpy array
def _replaceMissing():
    #get the unaltered numpy array 
    d = _rawData()
    #Replace missing values for all features (excluding target) with most frequent value
    for i in range(1, len(colarr)-1,1):
        l = d[:, i] #get column
        counted = collections.Counter(l) #get frequency of each value in column
        #remove the missing ('?') key from the frequency dictionary
        counted = {k: v for k, v in counted.iteritems() if v != '?'}
        #Get the key with the greatest frequency
        maxValue = max(counted, key=counted.get)
        #Replace the missing value in the column with the most frequent
        l[:] = [x if x != '?' else maxValue for x in l]
    return d #return updated numpy array

#Remove instances containing missing values
#Return the updated numpy array
def _excludeData():
    rows = _rawListOutput
    return np.asarray([elem for elem in rows if '?' not in elem])

#Converts values to binary and changes numpyarray to type int
def _toBinary(npa):
    binaryDict = {'republican':0, 'democrat':1, 'y':1, 'n':0, '?':2}
    for i in range(0, len(colarr),1):
        l = npa[:, i]
        l[:] = [binaryDict[x] for x in l]
    return npa.astype(int)

<h1>Scenario 1 - Raw Tree</h1>

In [48]:
#Scenario 1 dataset - treats missing/? as if it is a value
s1Data = _toBinary(_rawData())

#Features matrix
s1X_Matrix = s1Data[:, 1:]

#Target column
s1Y = s1Data[:, :1]

#Split dataset into training and test data
s1X_train, s1X_test, s1y_train, s1y_test = train_test_split(s1X_Matrix, s1Y, test_size=0.2, random_state=0)

### Initiate classifier
s1InfoGain_clf = sklearn.tree.DecisionTreeClassifier(criterion='entropy')

#K folds score
s1Cv = KFold(s1X_train.shape[0], 5, shuffle=True, random_state=33)
print cross_val_score(s1InfoGain_clf, s1X_train, s1y_train, cv=s1Cv)

#Fit and evaluate against test set
s1InfoGain_clf.fit(s1X_train, s1y_train)
s1pred=s1InfoGain_clf.predict(s1X_test)
print metrics.classification_report(s1y_test, s1pred, target_names=target_names)

[ 0.88571429  0.9         0.88571429  0.94202899  0.94202899]
             precision    recall  f1-score   support

 republican       0.94      0.94      0.94        35
   democrat       0.96      0.96      0.96        52

avg / total       0.95      0.95      0.95        87



<h1>Scenario 2 - Exclude Missing Tree</h1>

In [49]:
#Scenario 2 dataset - exclude instances where features have missing values
s2Data = _toBinary(_excludeData())

#Features matrix
s2X_Matrix = s2Data[:, 1:]

#Target column
s2Y = s2Data[:, :1]

#Split dataset into training and test data
s2X_train, s2X_test, s2y_train, s2y_test = train_test_split(s2X_Matrix, s2Y, test_size=0.2, random_state=0)

### Initiate classifier
s2InfoGain_clf = sklearn.tree.DecisionTreeClassifier(criterion='entropy')

#K folds score
s2Cv = KFold(s2X_train.shape[0], 5, shuffle=True, random_state=33)
print cross_val_score(s2InfoGain_clf, s2X_train, s2y_train, cv=s2Cv)

#Fit and evaluate against test set
s2InfoGain_clf.fit(s2X_train, s2y_train)
s2pred=s2InfoGain_clf.predict(s2X_test)
print metrics.classification_report(s2y_test, s2pred, target_names=target_names)

[ 0.97297297  0.97297297  0.97297297  1.          1.        ]
             precision    recall  f1-score   support

 republican       0.92      0.92      0.92        24
   democrat       0.91      0.91      0.91        23

avg / total       0.91      0.91      0.91        47



<h1>Scenario 3 - Replace Missing Tree</h1>

In [50]:
#Scenario 3 dataset - impute missing values
s3Data = _toBinary(_replaceMissing())

#Features matrix
s3X_Matrix = s3Data[:, 1:]

#Target column
s3Y = s3Data[:, :1]

#Split dataset into training and test data
s3X_train, s3X_test, s3y_train, s3y_test = train_test_split(s3X_Matrix, s3Y, test_size=0.2, random_state=0)

### Initiate classifier
s3InfoGain_clf = sklearn.tree.DecisionTreeClassifier(criterion='entropy')

#K folds score
s3Cv = KFold(s3X_train.shape[0], 5, shuffle=True, random_state=33)
print cross_val_score(s3InfoGain_clf, s3X_train, s3y_train, cv=s3Cv)

#Fit and evaluate against test set
s3InfoGain_clf.fit(s3X_train, s3y_train)
s3pred=s3InfoGain_clf.predict(s3X_test)
print metrics.classification_report(s3y_test, s3pred, target_names=target_names)

[ 0.94285714  0.98571429  0.94285714  0.91304348  0.95652174]
             precision    recall  f1-score   support

 republican       0.89      0.94      0.92        35
   democrat       0.96      0.92      0.94        52

avg / total       0.93      0.93      0.93        87



<h1>Scenario 4 - Raw NBC</h1>

In [51]:
#Scenario 4 dataset - treats missing/? as if it is a value
s4Data = _toBinary(_rawData())

#Features matrix
s4X_Matrix = s4Data[:, 1:]

#Target column
s4Y = s4Data[:, :1]

#Split dataset into training and test data
s4X_train, s4X_test, s4y_train, s4y_test = train_test_split(s4X_Matrix, s4Y, test_size=0.2, random_state=0)

### Initiate classifier
s4NBClassifier = BernoulliNB()

#K folds score
s4Cv = KFold(s4X_train.shape[0], 5, shuffle=True, random_state=33)
print cross_val_score(s4NBClassifier, s4X_train, s4y_train, cv=s4Cv)

#Fit and evaluate against test set
s4NBClassifier.fit(s4X_train, s4y_train)
s4pred=s4NBClassifier.predict(s4X_test)
print metrics.classification_report(s4y_test, s4pred, target_names=target_names)

[ 0.88571429  0.94285714  0.9         0.92753623  0.86956522]
             precision    recall  f1-score   support

 republican       0.79      0.94      0.86        35
   democrat       0.96      0.83      0.89        52

avg / total       0.89      0.87      0.87        87



<h1>Scenario 5 - Exlude Missing NBC</h1>

In [52]:
#Scenario 5 dataset - exclude instances where features have missing values
s5Data = _toBinary(_excludeData())

#Features matrix
s5X_Matrix = s5Data[:, 1:]

#Target column
s5Y = s5Data[:, :1]

#Split dataset into training and test data
s5X_train, s5X_test, s5y_train, s5y_test = train_test_split(s5X_Matrix, s5Y, test_size=0.2, random_state=0)

### Initiate classifier
s5NBClassifier = BernoulliNB()

#K folds score
s5Cv = KFold(s5X_train.shape[0], 5, shuffle=True, random_state=33)
print cross_val_score(s5NBClassifier, s5X_train, s5y_train, cv=s5Cv)

#Fit and evaluate against test set
s5NBClassifier.fit(s5X_train, s5y_train)
s5pred=s5NBClassifier.predict(s5X_test)
print metrics.classification_report(s5y_test, s5pred, target_names=target_names)

[ 1.          0.91891892  0.86486486  0.94594595  0.86486486]
             precision    recall  f1-score   support

 republican       0.96      0.92      0.94        24
   democrat       0.92      0.96      0.94        23

avg / total       0.94      0.94      0.94        47



<h1>Scenario 6 - Replace Missing NBC</h1>

In [53]:
#Scenario 6 dataset - impute missing values
s6Data = _toBinary(_replaceMissing())

#Features matrix
s6X_Matrix = s6Data[:, 1:]

#Target column
s6Y = s6Data[:, :1]

#Split dataset into training and test data
s6X_train, s6X_test, s6y_train, s6y_test = train_test_split(s6X_Matrix, s6Y, test_size=0.2, random_state=0)

### Initiate classifier
s6NBClassifier = BernoulliNB()

#K folds score
s6Cv = KFold(s6X_train.shape[0], 5, shuffle=True, random_state=33)
print cross_val_score(s6NBClassifier, s6X_train, s6y_train, cv=s6Cv)

#Fit and evaluate against test set
s6NBClassifier.fit(s6X_train, s6y_train)
s6pred=s6NBClassifier.predict(s6X_test)
print metrics.classification_report(s6y_test, s6pred, target_names=target_names)

[ 0.9         0.94285714  0.88571429  0.92753623  0.86956522]
             precision    recall  f1-score   support

 republican       0.80      0.94      0.87        35
   democrat       0.96      0.85      0.90        52

avg / total       0.90      0.89      0.89        87

