In [1]:
# Initial imports

import numpy as np
import pandas as pd 
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

%matplotlib inline

In [2]:
# Imports for better visualization

from collections import defaultdict
import json

import scipy as sp

from matplotlib import rcParams
import matplotlib.cm as cm
import matplotlib as mpl

#colorbrewer2 Dark2 qualitative color table
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843)]

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'white'
rcParams['patch.facecolor'] = dark2_colors[0]
rcParams['font.family'] = 'StixGeneral'



In [3]:
# Load training data
train_data = pd.read_csv('Data/Training_Dataset.csv')

In [4]:
# Load testing data
test_data = pd.read_csv('Data/Leaderboard_Dataset.csv')

In [5]:
train_data.head()

Unnamed: 0,citizen_id,party_voted_past,mvar1,mvar2,mvar3,mvar4,mvar5,mvar6,mvar7,mvar8,...,mvar25,mvar26,mvar27,mvar28,mvar29,mvar30,mvar31,mvar32,mvar33,actual_vote
0,C1,Tokugawa,0,0,18,0,0,0,0,9,...,0,3,25-35,1.0,0.0,Degree,6,AEC,AEC,Tokugawa
1,C2,Ebony,2,6,0,2,0,2,5,0,...,0,3,18-24,1.0,0.0,Masters,5,AEC,AEC,Ebony
2,C3,Cosmos,0,0,0,0,6,0,0,0,...,15,1,46-55,1.0,1.0,Masters,6,AMS,AMS,Cosmos
3,C4,Centaur,2,0,0,0,0,2,0,0,...,0,1,36-45,1.0,0.0,Degree,6,ACL,ANQ,Centaur
4,C5,Centaur,9,2,3,0,2,1,0,1,...,7,4,25-35,1.0,0.0,Primary,7,ARK,ARK,Centaur


In [6]:
test_data.head()

Unnamed: 0,citizen_id,party_voted_past,mvar1,mvar2,mvar3,mvar4,mvar5,mvar6,mvar7,mvar8,...,mvar24,mvar25,mvar26,mvar27,mvar28,mvar29,mvar30,mvar31,mvar32,mvar33
0,C70001,Centaur,13,1,0,0,0,13,1,0,...,0,0,4,55+,1.0,0.0,Masters,5,ATI,ATI
1,C70002,Ebony,0,6,7,0,0,0,3,2,...,0,0,3,46-55,1.0,0.0,Masters,5,AVJ,ANQ
2,C70003,Centaur,5,0,0,0,0,3,0,0,...,0,0,3,18-24,0.0,1.0,Degree,7,AHS,AHS
3,C70004,Ebony,14,7,3,15,0,4,4,1,...,15,0,3,25-35,0.0,0.0,Primary,6,AZG,ASQ
4,C70005,Cosmos,0,0,0,0,7,0,0,0,...,0,15,3,18-24,1.0,1.0,Masters,8,AXT,AWX


In [7]:
data = train_data.append(test_data, ignore_index=True)

In [8]:
data.head()

Unnamed: 0,actual_vote,citizen_id,mvar1,mvar10,mvar11,mvar12,mvar13,mvar14,mvar15,mvar16,...,mvar31,mvar32,mvar33,mvar4,mvar5,mvar6,mvar7,mvar8,mvar9,party_voted_past
0,Tokugawa,C1,0,0,0,0,3,0,0,0,...,6,AEC,AEC,0,0,0,0,9,0,Tokugawa
1,Ebony,C2,2,0,0,0,0,0,0,0,...,5,AEC,AEC,2,0,2,5,0,2,Ebony
2,Cosmos,C3,0,6,0,0,0,0,0,0,...,6,AMS,AMS,0,6,0,0,0,0,Cosmos
3,Centaur,C4,2,0,0,0,0,0,0,0,...,6,ACL,ANQ,0,0,2,0,0,0,Centaur
4,Centaur,C5,9,0,2,0,0,0,0,2,...,7,ARK,ARK,0,2,1,0,1,0,Centaur


In [9]:
data.shape

(81336, 36)

In [10]:
data['actual_vote'].fillna(0, inplace=True)

In [11]:
def change_column_names(df):
    new_columns = df.columns.values

    new_columns[2] = 'donation_centaur'
    new_columns[3] = 'donation_ebony'
    new_columns[4] = 'donation_toku'
    new_columns[5] = 'donation_odyssey'
    new_columns[6] = 'donation_cosmos'

    new_columns[7] = 'ind_rally_centaur'
    new_columns[8] = 'ind_rally_ebony'
    new_columns[9] = 'ind_rally_toku'
    new_columns[10] = 'ind_rally_odyssey'
    new_columns[11] = 'ind_rally_cosmos'

    new_columns[12] = 'group_rally_centaur'
    new_columns[13] = 'group_rally_ebony'
    new_columns[14] = 'group_rally_toku'
    new_columns[15] = 'group_rally_odyssey'
    new_columns[16] = 'group_rally_cosmos'

    new_columns[17] = 'fundraiser_centaur'
    new_columns[18] = 'fundraiser_ebony'
    new_columns[19] = 'fundraiser_toku'
    new_columns[20] = 'fundraiser_odyssey'
    new_columns[21] = 'fundraiser_cosmos'

    new_columns[22] = 'volunteer_centaur'
    new_columns[23] = 'volunteer_ebony'
    new_columns[24] = 'volunteer_toku'
    new_columns[25] = 'volunteer_odyssey'
    new_columns[26] = 'volunteer_cosmos'

    new_columns[27] = 'hh_size'
    new_columns[28] = 'age_bucket'
    new_columns[29] = 'married'
    new_columns[30] = 'home_ownership'
    new_columns[31] = 'education_level'
    new_columns[32] = 'newspapers'
    new_columns[33] = 'prev_residence'
    new_columns[34] = 'curr_residence'

    df.columns = new_columns
    return df

In [12]:
change_column_names(train_data)
change_column_names(test_data)
change_column_names(data)

train_data.head()

Unnamed: 0,citizen_id,party_voted_past,donation_centaur,donation_ebony,donation_toku,donation_odyssey,donation_cosmos,ind_rally_centaur,ind_rally_ebony,ind_rally_toku,...,volunteer_cosmos,hh_size,age_bucket,married,home_ownership,education_level,newspapers,prev_residence,curr_residence,actual_vote
0,C1,Tokugawa,0,0,18,0,0,0,0,9,...,0,3,25-35,1.0,0.0,Degree,6,AEC,AEC,Tokugawa
1,C2,Ebony,2,6,0,2,0,2,5,0,...,0,3,18-24,1.0,0.0,Masters,5,AEC,AEC,Ebony
2,C3,Cosmos,0,0,0,0,6,0,0,0,...,15,1,46-55,1.0,1.0,Masters,6,AMS,AMS,Cosmos
3,C4,Centaur,2,0,0,0,0,2,0,0,...,0,1,36-45,1.0,0.0,Degree,6,ACL,ANQ,Centaur
4,C5,Centaur,9,2,3,0,2,1,0,1,...,7,4,25-35,1.0,0.0,Primary,7,ARK,ARK,Centaur


In [13]:
train_data.dtypes

citizen_id              object
party_voted_past        object
donation_centaur         int64
donation_ebony           int64
donation_toku            int64
donation_odyssey         int64
donation_cosmos          int64
ind_rally_centaur        int64
ind_rally_ebony          int64
ind_rally_toku           int64
ind_rally_odyssey        int64
ind_rally_cosmos         int64
group_rally_centaur      int64
group_rally_ebony        int64
group_rally_toku         int64
group_rally_odyssey      int64
group_rally_cosmos       int64
fundraiser_centaur       int64
fundraiser_ebony         int64
fundraiser_toku          int64
fundraiser_odyssey       int64
fundraiser_cosmos        int64
volunteer_centaur        int64
volunteer_ebony          int64
volunteer_toku           int64
volunteer_odyssey        int64
volunteer_cosmos         int64
hh_size                  int64
age_bucket              object
married                float64
home_ownership         float64
education_level         object
newspape

In [14]:
train_data.shape

(60129, 36)

In [15]:
test_data.shape

(21207, 35)

In [16]:
train_data.isnull().any()

citizen_id             False
party_voted_past       False
donation_centaur       False
donation_ebony         False
donation_toku          False
donation_odyssey       False
donation_cosmos        False
ind_rally_centaur      False
ind_rally_ebony        False
ind_rally_toku         False
ind_rally_odyssey      False
ind_rally_cosmos       False
group_rally_centaur    False
group_rally_ebony      False
group_rally_toku       False
group_rally_odyssey    False
group_rally_cosmos     False
fundraiser_centaur     False
fundraiser_ebony       False
fundraiser_toku        False
fundraiser_odyssey     False
fundraiser_cosmos      False
volunteer_centaur      False
volunteer_ebony        False
volunteer_toku         False
volunteer_odyssey      False
volunteer_cosmos       False
hh_size                False
age_bucket             False
married                 True
home_ownership          True
education_level         True
newspapers             False
prev_residence         False
curr_residence

In [17]:
test_data.isnull().any()

citizen_id             False
party_voted_past       False
donation_centaur       False
donation_ebony         False
donation_toku          False
donation_odyssey       False
donation_cosmos        False
ind_rally_centaur      False
ind_rally_ebony        False
ind_rally_toku         False
ind_rally_odyssey      False
ind_rally_cosmos       False
group_rally_centaur    False
group_rally_ebony      False
group_rally_toku       False
group_rally_odyssey    False
group_rally_cosmos     False
fundraiser_centaur     False
fundraiser_ebony       False
fundraiser_toku        False
fundraiser_odyssey     False
fundraiser_cosmos      False
volunteer_centaur      False
volunteer_ebony        False
volunteer_toku         False
volunteer_odyssey      False
volunteer_cosmos       False
hh_size                False
age_bucket             False
married                 True
home_ownership          True
education_level         True
newspapers             False
prev_residence         False
curr_residence

In [18]:
train_data['actual_vote'].value_counts()

Odyssey     13223
Centaur     12964
Cosmos      12959
Ebony       12937
Tokugawa     8046
Name: actual_vote, dtype: int64

In [19]:
train_data[train_data['married'].isnull()]

Unnamed: 0,citizen_id,party_voted_past,donation_centaur,donation_ebony,donation_toku,donation_odyssey,donation_cosmos,ind_rally_centaur,ind_rally_ebony,ind_rally_toku,...,volunteer_cosmos,hh_size,age_bucket,married,home_ownership,education_level,newspapers,prev_residence,curr_residence,actual_vote
3238,C3239,Ebony,1,20,0,0,0,1,8,0,...,0,4,18-24,,,,7,AHL,ABX,Ebony
3553,C3554,Odyssey,1,1,0,8,0,1,1,0,...,15,4,55+,,,,7,ASF,ASF,Odyssey
5373,C5374,Tokugawa,1,0,1,0,0,1,0,1,...,0,4,55+,,,,4,ACL,AUM,Centaur
8232,C8233,Cosmos,0,3,0,0,4,0,0,0,...,6,4,18-24,,,,7,AJI,AOV,Cosmos
12001,C12002,Odyssey,0,0,0,19,0,0,0,0,...,0,1,36-45,,,,7,AMH,AWK,Odyssey
12279,C12280,Centaur,18,0,0,0,0,12,0,0,...,0,3,55+,,,,7,AEC,AEC,Centaur
14554,C14555,Cosmos,0,2,0,0,2,0,2,0,...,15,1,25-35,,,,5,AJI,ATI,Centaur
21256,C21257,Ebony,0,4,0,0,0,0,3,0,...,0,4,18-24,,,,5,ACL,AGD,Ebony
21537,C21538,Centaur,7,0,0,0,0,4,0,0,...,0,1,55+,,,,3,AMS,ARK,Centaur
23666,C23667,Odyssey,0,0,0,13,0,0,0,0,...,0,4,55+,,,,6,AET,AWT,Odyssey


In [20]:
train_data[train_data['home_ownership'].isnull()]

Unnamed: 0,citizen_id,party_voted_past,donation_centaur,donation_ebony,donation_toku,donation_odyssey,donation_cosmos,ind_rally_centaur,ind_rally_ebony,ind_rally_toku,...,volunteer_cosmos,hh_size,age_bucket,married,home_ownership,education_level,newspapers,prev_residence,curr_residence,actual_vote
3238,C3239,Ebony,1,20,0,0,0,1,8,0,...,0,4,18-24,,,,7,AHL,ABX,Ebony
3553,C3554,Odyssey,1,1,0,8,0,1,1,0,...,15,4,55+,,,,7,ASF,ASF,Odyssey
5373,C5374,Tokugawa,1,0,1,0,0,1,0,1,...,0,4,55+,,,,4,ACL,AUM,Centaur
8232,C8233,Cosmos,0,3,0,0,4,0,0,0,...,6,4,18-24,,,,7,AJI,AOV,Cosmos
12001,C12002,Odyssey,0,0,0,19,0,0,0,0,...,0,1,36-45,,,,7,AMH,AWK,Odyssey
12279,C12280,Centaur,18,0,0,0,0,12,0,0,...,0,3,55+,,,,7,AEC,AEC,Centaur
14554,C14555,Cosmos,0,2,0,0,2,0,2,0,...,15,1,25-35,,,,5,AJI,ATI,Centaur
21256,C21257,Ebony,0,4,0,0,0,0,3,0,...,0,4,18-24,,,,5,ACL,AGD,Ebony
21537,C21538,Centaur,7,0,0,0,0,4,0,0,...,0,1,55+,,,,3,AMS,ARK,Centaur
23666,C23667,Odyssey,0,0,0,13,0,0,0,0,...,0,4,55+,,,,6,AET,AWT,Odyssey


In [21]:
train_data[train_data['education_level'].isnull()]

Unnamed: 0,citizen_id,party_voted_past,donation_centaur,donation_ebony,donation_toku,donation_odyssey,donation_cosmos,ind_rally_centaur,ind_rally_ebony,ind_rally_toku,...,volunteer_cosmos,hh_size,age_bucket,married,home_ownership,education_level,newspapers,prev_residence,curr_residence,actual_vote
3238,C3239,Ebony,1,20,0,0,0,1,8,0,...,0,4,18-24,,,,7,AHL,ABX,Ebony
3553,C3554,Odyssey,1,1,0,8,0,1,1,0,...,15,4,55+,,,,7,ASF,ASF,Odyssey
5373,C5374,Tokugawa,1,0,1,0,0,1,0,1,...,0,4,55+,,,,4,ACL,AUM,Centaur
8232,C8233,Cosmos,0,3,0,0,4,0,0,0,...,6,4,18-24,,,,7,AJI,AOV,Cosmos
12001,C12002,Odyssey,0,0,0,19,0,0,0,0,...,0,1,36-45,,,,7,AMH,AWK,Odyssey
12279,C12280,Centaur,18,0,0,0,0,12,0,0,...,0,3,55+,,,,7,AEC,AEC,Centaur
14554,C14555,Cosmos,0,2,0,0,2,0,2,0,...,15,1,25-35,,,,5,AJI,ATI,Centaur
21256,C21257,Ebony,0,4,0,0,0,0,3,0,...,0,4,18-24,,,,5,ACL,AGD,Ebony
21537,C21538,Centaur,7,0,0,0,0,4,0,0,...,0,1,55+,,,,3,AMS,ARK,Centaur
23666,C23667,Odyssey,0,0,0,13,0,0,0,0,...,0,4,55+,,,,6,AET,AWT,Odyssey


In [22]:
print stats.mode(data['married'].dropna())[0]
print stats.mode(data['home_ownership'].dropna())[0]
print stats.mode(data['education_level'].dropna())[0]

[0]
[0]
[0]


In [23]:
train_data['married'].fillna(stats.mode(train_data['married'].dropna())[0][0], inplace=True)
train_data['home_ownership'].fillna(stats.mode(train_data['home_ownership'].dropna())[0][0], inplace=True)
train_data['education_level'].fillna(stats.mode(train_data['education_level'].dropna())[0][0], inplace=True)

test_data['married'].fillna(stats.mode(test_data['married'].dropna())[0][0], inplace=True)
test_data['home_ownership'].fillna(stats.mode(test_data['home_ownership'].dropna())[0][0], inplace=True)
test_data['education_level'].fillna(stats.mode(test_data['education_level'].dropna())[0][0], inplace=True)



In [24]:
train_data['age_bucket'].value_counts()

25-35    18347
18-24    11803
46-55    11244
55+       9705
36-45     9030
Name: age_bucket, dtype: int64

In [25]:
def get_age_bucket_dummies(df):
    df.ix[(df['age_bucket'] == '18-24'), 'age_bucket'] = 1
    df.ix[(df['age_bucket'] == '25-35'), 'age_bucket'] = 2
    df.ix[(df['age_bucket'] == '36-45'), 'age_bucket'] = 3
    df.ix[(df['age_bucket'] == '46-55'), 'age_bucket'] = 4
    df.ix[(df['age_bucket'] == '55+'), 'age_bucket'] = 5
    age_dummies = pd.get_dummies(df['age_bucket'], prefix='age')
    df = pd.concat([df.drop(['age_bucket'], axis=1), age_dummies.drop(['age_5'], axis=1)], axis=1)
    return df

In [26]:
train_data = get_age_bucket_dummies(train_data)
test_data = get_age_bucket_dummies(test_data)
train_data.head()

Unnamed: 0,citizen_id,party_voted_past,donation_centaur,donation_ebony,donation_toku,donation_odyssey,donation_cosmos,ind_rally_centaur,ind_rally_ebony,ind_rally_toku,...,home_ownership,education_level,newspapers,prev_residence,curr_residence,actual_vote,age_1,age_2,age_3,age_4
0,C1,Tokugawa,0,0,18,0,0,0,0,9,...,0.0,Degree,6,AEC,AEC,Tokugawa,0.0,1.0,0.0,0.0
1,C2,Ebony,2,6,0,2,0,2,5,0,...,0.0,Masters,5,AEC,AEC,Ebony,1.0,0.0,0.0,0.0
2,C3,Cosmos,0,0,0,0,6,0,0,0,...,1.0,Masters,6,AMS,AMS,Cosmos,0.0,0.0,0.0,1.0
3,C4,Centaur,2,0,0,0,0,2,0,0,...,0.0,Degree,6,ACL,ANQ,Centaur,0.0,0.0,1.0,0.0
4,C5,Centaur,9,2,3,0,2,1,0,1,...,0.0,Primary,7,ARK,ARK,Centaur,0.0,1.0,0.0,0.0


In [27]:
train_data['education_level'].value_counts()

Masters    23943
Degree     18561
Diploma    11077
Primary     6548
Name: education_level, dtype: int64

In [28]:
def get_education_level_dummies(df):
    edu_level_dummies = pd.get_dummies(df['education_level'], prefix='edu_level')
    df = pd.concat([df.drop(['education_level'], axis=1), edu_level_dummies.drop(['edu_level_Primary'], axis=1)], axis=1)
    return df

In [29]:
train_data = get_education_level_dummies(train_data)
test_data = get_education_level_dummies(test_data)
train_data.head()

Unnamed: 0,citizen_id,party_voted_past,donation_centaur,donation_ebony,donation_toku,donation_odyssey,donation_cosmos,ind_rally_centaur,ind_rally_ebony,ind_rally_toku,...,prev_residence,curr_residence,actual_vote,age_1,age_2,age_3,age_4,edu_level_Degree,edu_level_Diploma,edu_level_Masters
0,C1,Tokugawa,0,0,18,0,0,0,0,9,...,AEC,AEC,Tokugawa,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,C2,Ebony,2,6,0,2,0,2,5,0,...,AEC,AEC,Ebony,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,C3,Cosmos,0,0,0,0,6,0,0,0,...,AMS,AMS,Cosmos,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,C4,Centaur,2,0,0,0,0,2,0,0,...,ACL,ANQ,Centaur,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,C5,Centaur,9,2,3,0,2,1,0,1,...,ARK,ARK,Centaur,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [30]:
train_data['prev_residence'].value_counts()

ACL    5240
AEC    3307
ATI    2463
ANQ    2430
AOV    2338
AJI    2322
ABX    2136
APH    1984
AHS    1928
AWX    1899
AHR    1601
AJV    1141
AMS    1126
AET    1084
AHO    1067
ASE     935
ARK     916
AII     869
AWI     831
ACD     805
AMR     786
AQU     778
AUE     761
AQX     718
AZG     665
AXT     661
ADV     630
AHJ     613
AXO     555
AYE     519
       ... 
AJY       1
AJX       1
AVR       1
AYF       1
AYP       1
AVZ       1
ACW       1
ACJ       1
ACB       1
ACC       1
AMQ       1
AMY       1
AMG       1
AMN       1
AWF       1
ATR       1
ANV       1
AEQ       1
ANJ       1
ANH       1
ANM       1
BAS       1
ANE       1
AEZ       1
AXI       1
AXY       1
AEH       1
AXQ       1
AXP       1
AZO       1
Name: prev_residence, dtype: int64

In [31]:
train_data['prev_residence'].value_counts().count()

456

In [32]:
test_data['prev_residence'].value_counts()

ACL    1791
AEC    1181
ANQ     884
AOV     874
ATI     855
AJI     837
ABX     753
AHS     697
APH     656
AWX     637
AHR     545
AHO     414
AJV     377
AMS     368
AET     358
ARK     319
ASE     317
AII     305
AWI     298
ACD     277
AQU     268
AMR     267
AUE     260
ADV     260
AQX     257
AHJ     239
AZG     235
AXT     223
AXO     212
AYE     199
       ... 
ANJ       1
ANF       1
ALO       1
AEX       1
AEU       1
AEQ       1
AWJ       1
AXK       1
AXD       1
AXB       1
AVI       1
AVK       1
AVF       1
ACK       1
AUY       1
AUW       1
AUS       1
AUH       1
AUB       1
AUC       1
AUA       1
AKB       1
AKH       1
AKP       1
ADA       1
ADW       1
ACY       1
ACR       1
ACJ       1
AGO       1
Name: prev_residence, dtype: int64

In [33]:
test_data['prev_residence'].value_counts().count()

373

In [34]:
train_data['curr_residence'].value_counts()

ACL    3869
AEC    3356
ATI    2586
ANQ    2549
AJI    2482
ABX    2370
AOV    2360
AHS    1978
APH    1818
AWX    1749
AHR    1596
AJV    1292
AMS    1191
AHO    1130
AET    1127
ARK    1006
ASE     938
AII     862
ACD     852
AUE     847
AWI     844
AMR     825
AQU     795
AQX     704
AZG     689
AHJ     670
AXT     654
ADV     644
ATU     588
AXO     572
       ... 
AUO       1
AUB       1
AKI       1
AKV       1
ADQ       1
AZS       1
ADT       1
AVQ       1
AVP       1
AEM       1
AED       1
AEX       1
BAF       1
BAC       1
ATD       1
ATB       1
ATN       1
ATQ       1
AJX       1
AJY       1
AIY       1
AIR       1
AIM       1
AIL       1
AIO       1
AIK       1
AIC       1
AZM       1
AZR       1
AGO       1
Name: curr_residence, dtype: int64

In [35]:
train_data['curr_residence'].value_counts().count()

465

In [36]:
test_data['curr_residence'].value_counts()

ACL    1399
AEC    1167
ANQ     929
AOV     913
ATI     910
AJI     885
ABX     814
AHS     741
APH     602
AWX     590
AHR     587
AJV     444
AMS     392
AHO     380
AET     379
ARK     342
AII     325
ACD     323
ASE     296
AWI     290
AQU     289
AUE     273
AMR     268
AHJ     260
ADV     255
AZG     248
AQX     231
AXO     212
AXT     204
ATU     200
       ... 
ACJ       1
AVK       1
AMG       1
AMD       1
AMB       1
ANJ       1
ANF       1
AUP       1
AUS       1
APQ       1
ALZ       1
ATK       1
ATM       1
BAH       1
BAS       1
BAU       1
ALS       1
ALU       1
ALW       1
ALC       1
AXW       1
ALD       1
ALI       1
ALK       1
ALL       1
BBC       1
AUY       1
BBQ       1
AUT       1
AQJ       1
Name: curr_residence, dtype: int64

In [37]:
test_data['curr_residence'].value_counts().count()

355

In [38]:
# sns.boxplot(y=train_data['actual_vote'], x=train_data['party_voted_past'])

In [39]:
from xgboost.sklearn import XGBClassifier
from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score

In [40]:
xgb = XGBClassifier(n_estimators=100, objective='multi:softmax', silent=False)
print np.mean(cross_val_score(xgb, train_data.drop(['citizen_id', 'party_voted_past', 'prev_residence', 'curr_residence', 'actual_vote'], axis=1), train_data['actual_vote']))

0.753879199651


In [41]:
train_data.isnull().any()

citizen_id             False
party_voted_past       False
donation_centaur       False
donation_ebony         False
donation_toku          False
donation_odyssey       False
donation_cosmos        False
ind_rally_centaur      False
ind_rally_ebony        False
ind_rally_toku         False
ind_rally_odyssey      False
ind_rally_cosmos       False
group_rally_centaur    False
group_rally_ebony      False
group_rally_toku       False
group_rally_odyssey    False
group_rally_cosmos     False
fundraiser_centaur     False
fundraiser_ebony       False
fundraiser_toku        False
fundraiser_odyssey     False
fundraiser_cosmos      False
volunteer_centaur      False
volunteer_ebony        False
volunteer_toku         False
volunteer_odyssey      False
volunteer_cosmos       False
hh_size                False
married                False
home_ownership         False
newspapers             False
prev_residence         False
curr_residence         False
actual_vote            False
age_1         

In [42]:
xgb = XGBClassifier(n_estimators=500, objective='multi:softmax', silent=False)
x_train, x_test, y_train, y_test = train_test_split(train_data.drop(['citizen_id', 'party_voted_past', 'prev_residence', 'curr_residence', 'actual_vote'], axis=1), train_data['actual_vote'], random_state=2)
xgb.fit(x_train, y_train)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=500, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=False, subsample=1)

In [43]:
predicted_train = xgb.predict(x_test)

In [44]:
predicted_train

array(['Odyssey', 'Centaur', 'Odyssey', ..., 'Ebony', 'Centaur', 'Odyssey'], dtype=object)

In [45]:
print accuracy_score(y_test, predicted_train)

0.749550987827


In [48]:
predicted_test = xgb.predict(test_data.drop(['citizen_id', 'party_voted_past', 'prev_residence', 'curr_residence'], axis=1))

In [50]:
predicted_test

array(['Centaur', 'Tokugawa', 'Centaur', ..., 'Cosmos', 'Odyssey', 'Cosmos'], dtype=object)

In [51]:
solution = pd.concat([DataFrame(test_data['citizen_id']), DataFrame(predicted_test, columns=['actual_vote'])], axis=1)
solution.head()

Unnamed: 0,citizen_id,actual_vote
0,C70001,Centaur
1,C70002,Tokugawa
2,C70003,Centaur
3,C70004,Odyssey
4,C70005,Cosmos


In [54]:
# Export the results to CSV file
solution.to_csv('solution.csv', index=False, header=False)