In [67]:
import numpy as np
import pandas as pd
import mord
from sklearn import linear_model, metrics
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.feature_selection import RFE

In [4]:
path = '../../data/cleaned'
training = pd.read_csv(path + '/training_cleaned_v2.csv')

In [5]:
training.head(5)

Unnamed: 0,id,amount_tsh,year_recorded,month_recorded,day_recorded,gps_height,basin,basin_encoded,region,region_encoded,...,source,source_encoded,source_type,source_type_encoded,source_class,source_class_encoded,waterpoint_type_new,waterpoint_type_new_encoded,status_group,status_group_encoded
0,69572,6000.0,2011,3,14,1390,Lake Nyasa,1,Iringa,3,...,spring,8,spring,6,groundwater,0,communal standpipe,0,functional,3
1,8776,0.0,2013,3,6,1399,Lake Victoria,4,Mara,9,...,rainwater harvesting,5,rainwater harvesting,3,surface,1,communal standpipe,0,functional,3
2,34310,25.0,2013,2,25,686,Pangani,5,Manyara,8,...,dam,0,dam,1,surface,1,communal standpipe multiple,1,functional,3
3,67743,0.0,2013,1,28,263,Ruvuma / Southern Coast,7,Mtwara,12,...,machine dbh,3,borehole,0,groundwater,0,communal standpipe multiple,1,non functional,1
4,19728,0.0,2011,7,13,0,Lake Victoria,4,Kagera,4,...,rainwater harvesting,5,rainwater harvesting,3,surface,1,communal standpipe,0,functional,3


In [8]:
training.dtypes

id                                 int64
amount_tsh                       float64
year_recorded                      int64
month_recorded                     int64
day_recorded                       int64
gps_height                         int64
basin                             object
basin_encoded                      int64
region                            object
region_encoded                     int64
population                         int64
public_meeting_new               float64
permit_new                       float64
age                              float64
extraction_type                   object
extraction_type_encoded            int64
extraction_type_group             object
extraction_type_group_encoded      int64
extraction_type_class             object
extraction_type_class_encoded      int64
management                        object
management_encoded                 int64
management_group_new              object
management_group_new_encoded       int64
payment         

In [12]:
df_train = training.select_dtypes(exclude = 'object')
df_train.columns

Index(['id', 'amount_tsh', 'year_recorded', 'month_recorded', 'day_recorded',
       'gps_height', 'basin_encoded', 'region_encoded', 'population',
       'public_meeting_new', 'permit_new', 'age', 'extraction_type_encoded',
       'extraction_type_group_encoded', 'extraction_type_class_encoded',
       'management_encoded', 'management_group_new_encoded', 'payment_encoded',
       'quantity_group_encoded', 'quality_group_new_encoded', 'source_encoded',
       'source_type_encoded', 'source_class_encoded',
       'waterpoint_type_new_encoded', 'status_group_encoded'],
      dtype='object')

In [16]:
df_train = df_train.drop(['id','year_recorded', 'month_recorded', 'day_recorded','public_meeting_new', 'permit_new', 'age'], axis = 1)

In [21]:
x, y = df_train.iloc[:, 0:len(df_train.columns)-2], df_train.iloc[:,len(df_train.columns)-1]

In [42]:
fit1 = linear_model.LogisticRegression(solver='lbfgs',multi_class='multinomial')
fit1.fit(x, y)
metrics.mean_absolute_error(fit1.predict(x), y)

0.7941245791245791

In [39]:
x_train, x_test, y_train, y_test =  train_test_split(x, y, test_size=0.1, random_state=0)

In [41]:
fit1 = linear_model.LogisticRegression(solver='lbfgs',multi_class='multinomial').fit(x_train, y_train)
fit1.score(x_test, y_test)
## only use the first fold??

0.5732323232323232

In [72]:
for i in range(5):
    print(cross_val_score(fit1, x, y, cv=KFold(n_splits=3, shuffle=True, random_state=i)))

[0.58035354 0.57040404 0.56111111]
[0.56691919 0.56479798 0.56984848]
[0.56439394 0.5709596  0.56737374]
[0.56717172 0.56838384 0.56828283]
[0.56919192 0.56131313 0.56156566]


In [47]:
fit2 = mord.LogisticIT(alpha=1.).fit(x,y)
cross_val_score(fit2, x, y, cv = 10)

array([0.63282828, 0.63164983, 0.62912458, 0.62861953, 0.63164983,
       0.61818182, 0.62003367, 0.63468013, 0.63619529, 0.64107744])

## stepwise

In [52]:
estimator = mord.LogisticIT(alpha=1.)
selector = RFE(estimator,step = 1)
selector = selector.fit(x, y)

In [60]:
x.columns[selector.support_]

Index(['extraction_type_group_encoded', 'extraction_type_class_encoded',
       'management_encoded', 'management_group_new_encoded',
       'quantity_group_encoded', 'quality_group_new_encoded', 'source_encoded',
       'source_type_encoded'],
      dtype='object')

In [61]:
fit_step = mord.LogisticIT(alpha=1.).fit(x.iloc[:,selector.support_],y)
cross_val_score(fit2, x.iloc[:,selector.support_], y, cv = 3)

array([0.60565657, 0.59247475, 0.61439394])

In [63]:
for i in range(3,len(x.columns)):
    estimator = mord.LogisticIT(alpha=1.)
    selector = RFE(estimator,i,step = 1).fit(x, y)
    fit_step = mord.LogisticIT(alpha=1.).fit(x.iloc[:,selector.support_],y)
    print(cross_val_score(fit2, x.iloc[:,selector.support_], y, cv = 3))

[0.59757576 0.59050505 0.60666667]
[0.5989899  0.59227273 0.61166667]
[0.5929798  0.58954545 0.60353535]
[0.59818182 0.58893939 0.60585859]
[0.59661616 0.59282828 0.60540404]
[0.60565657 0.59247475 0.61439394]
[0.61050505 0.60181818 0.62171717]
[0.62782828 0.6210101  0.6380303 ]


KeyboardInterrupt: 