In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from xgboost.sklearn import XGBClassifier
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import accuracy_score

In [2]:
train_features = pd.read_csv("Pump_it_Up_Data_Mining_the_Water_Table_-_Training_set_values.csv")
train_features["Count"] = 1
train_labels = pd.read_csv("Pump_it_Up_Data_Mining_the_Water_Table_-_Training_set_labels.csv")
train = pd.merge(train_features,train_labels,on="id",how="outer")

In [3]:
train["DifferenceFromMeanSchemeHeight"] = train.groupby("scheme_name")["gps_height"].transform(lambda x: x - x.mean())


In [4]:
train.columns

Index(['id', 'amount_tsh', 'date_recorded', 'funder', 'gps_height',
       'installer', 'longitude', 'latitude', 'wpt_name', 'num_private',
       'basin', 'subvillage', 'region', 'region_code', 'district_code', 'lga',
       'ward', 'population', 'public_meeting', 'recorded_by',
       'scheme_management', 'scheme_name', 'permit', 'construction_year',
       'extraction_type', 'extraction_type_group', 'extraction_type_class',
       'management', 'management_group', 'payment', 'payment_type',
       'water_quality', 'quality_group', 'quantity', 'quantity_group',
       'source', 'source_type', 'source_class', 'waterpoint_type',
       'waterpoint_type_group', 'Count', 'status_group',
       'DifferenceFromMeanSchemeHeight'],
      dtype='object')

In [5]:
train.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,Count,status_group,DifferenceFromMeanSchemeHeight
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,1,functional,-57.316547
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,1,functional,1399.0
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,1,functional,2.222222
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,...,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,1,non functional,263.0
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,...,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,1,functional,0.0


In [6]:
def encodeDiscrete(df,discrete_features):
    dummies = []
    for f in discrete_features:
        dummy = pd.get_dummies(df[f])
        colnames = [f+"_"+str(c) for c in dummy.columns]
        dummy.columns = colnames
        dummies.append(dummy)
    encoded_data = pd.concat(dummies,axis=1)
    return encoded_data

In [7]:
def prepareData(df,continuous_features,discrete_features,target_col):
    discrete_data = encodeDiscrete(df,discrete_features)
    continuous_data = df[continuous_features]
    df_x = (discrete_data.join(continuous_data))
    x = df_x.values
    y = df[target_col].values
    return x,y,df_x.columns

In [21]:
X,y,feature_names = prepareData(train,
                  ["amount_tsh","latitude","longitude","gps_height","population","DifferenceFromMeanSchemeHeight"],
                  ["waterpoint_type","source","quantity","quality_group","basin","installer"],"status_group")

In [22]:
x_train,x_valid, y_train,y_valid = train_test_split(X,y,test_size=0.2)

In [23]:
xgc = XGBClassifier()

In [24]:
def accuracyScorer(clf,x,Y):
    return accuracy_score(Y,clf.predict(x))

In [25]:
md_mcw_g = {"max_depth":[3,4],"min_child_weight":[2,3]}
gs_clf = GridSearchCV(xgc,md_mcw_g,cv=5,scoring=accuracyScorer)
gs_clf.fit(x_train,y_train)

KeyboardInterrupt: 

In [None]:
for params, mean_score, scores in gs_clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, scores.std() * 2, params))

In [19]:
b = gs_clf.best_estimator_

In [20]:
im = pd.DataFrame({"feature":feature_names,"importance":b.feature_importances_}).sort_values("importance",ascending=False)
im

Unnamed: 0,feature,importance
29,latitude,0.223962
30,longitude,0.210202
33,DifferenceFromMeanSchemeHeight,0.096085
31,gps_height,0.069276
32,population,0.052195
28,amount_tsh,0.04745
17,quantity_dry,0.034638
12,source_rainwater harvesting,0.03013
4,waterpoint_type_hand pump,0.029656
6,waterpoint_type_other,0.028233
