# Initial Logistic Regression

## Import packages

In [1]:
import numpy as np
import pandas as pd
import mord
from sklearn import linear_model, metrics, preprocessing

## Read data

In [2]:
path = '../../data/cleaned'
training = pd.read_csv(path + '/training_cleaned_v1.csv')
testing = pd.read_csv(path + '/testing_cleaned_v1.csv')

In [3]:
print (training.isnull().sum())
training.columns

id                           0
amount_tsh                   0
year_recorded                0
month_recorded               0
day_recorded                 0
gps_height                   0
basin                        0
region                       0
population                   0
public_meeting            3334
permit                    3056
age                      20709
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group_new         0
payment                      0
quantity_group               0
quality_group_new            0
source                       0
source_type                  0
source_class                 0
waterpoint_type_new          0
status_group                 0
status_group_new             0
dtype: int64


Index(['id', 'amount_tsh', 'year_recorded', 'month_recorded', 'day_recorded',
       'gps_height', 'basin', 'region', 'population', 'public_meeting',
       'permit', 'age', 'extraction_type', 'extraction_type_group',
       'extraction_type_class', 'management', 'management_group_new',
       'payment', 'quantity_group', 'quality_group_new', 'source',
       'source_type', 'source_class', 'waterpoint_type_new', 'status_group',
       'status_group_new'],
      dtype='object')

## Ordinal Logistic Regression

In [4]:
le = preprocessing.LabelEncoder()

In [5]:
# encode region
le.fit(training['region'])
training['region_new'] = le.transform(training['region'])
training[['region','region_new']] # can label if not NA

Unnamed: 0,region,region_new
0,Iringa,3
1,Mara,9
2,Manyara,8
3,Mtwara,12
4,Kagera,4
5,Tanga,20
6,Shinyanga,17
7,Shinyanga,17
8,Tabora,19
9,Kagera,4


In [6]:
# extraction_type_class
le.fit(training['extraction_type_class'])
training['extraction_type_class_new'] = le.transform(training['extraction_type_class'])
training[['extraction_type_class','extraction_type_class_new']]

Unnamed: 0,extraction_type_class,extraction_type_class_new
0,gravity,0
1,gravity,0
2,gravity,0
3,submersible,5
4,gravity,0
5,submersible,5
6,handpump,1
7,handpump,1
8,handpump,1
9,handpump,1


In [7]:
le.fit(training['public_meeting'])
training['pm_encoded'] = le.transform(training['public_meeting'])
training[['public_meeting','pm_encoded']] # cannot label if NA

ValueError: y contains new labels: [nan nan nan ... nan nan nan]

In [8]:
sum(training['status_group_new'] == 1)

22824

In [9]:
# just try
X, y = training.loc[:,('amount_tsh', 'gps_height', 'region_new', 'population','extraction_type_class_new')], training['status_group_new']

In [14]:
reg0 = linear_model.LogisticRegression(
    solver='lbfgs',
    multi_class='multinomial')
reg0.fit(X, y)
print (reg0.predict(X))
print('Mean Absolute Error of LogisticSE %s' %
      metrics.mean_absolute_error(reg0.predict(X), y))
print (sum(reg0.predict(X)==1))

[3 3 3 ... 1 1 3]
Mean Absolute Error of LogisticSE 0.7626094276094276
15995


In [10]:
reg1 = mord.LogisticAT(alpha=1.)
reg1.fit(X, y)
print (reg1.predict(X))
print('Mean Absolute Error of LogisticSE %s' %
      metrics.mean_absolute_error(reg1.predict(X), y))
print (sum(reg1.predict(X)==1))

[3 3 3 ... 3 3 3]
Mean Absolute Error of LogisticSE 0.7808417508417509
8487


In [11]:
reg2 = mord.LogisticIT(alpha=1.)
reg2.fit(X, y)
print (reg2.predict(X))
print('Mean Absolute Error of LogisticSE %s' %
      metrics.mean_absolute_error(reg2.predict(X), y))
print (sum(reg2.predict(X)==1))

[3 3 3 ... 3 3 3]
Mean Absolute Error of LogisticSE 0.7463804713804714
12174


In [12]:
reg3 = mord.LogisticSE(alpha=1.)
reg3.fit(X, y)
print (reg3.predict(X))
print('Mean Absolute Error of LogisticSE %s' %
      metrics.mean_absolute_error(reg3.predict(X), y))
print (sum(reg3.predict(X)==1))

[3 2 2 ... 2 2 2]
Mean Absolute Error of LogisticSE 0.9088888888888889
0


In [15]:
# use reg2 on test set:

In [16]:
# encode region
le.fit(testing['region'])
testing['region_new'] = le.transform(testing['region'])
testing[['region','region_new']] # can label if not NA

Unnamed: 0,region,region_new
0,Manyara,8
1,Arusha,0
2,Singida,18
3,Lindi,7
4,Ruvuma,16
5,Arusha,0
6,Iringa,3
7,Mtwara,12
8,Mtwara,12
9,Kilimanjaro,6


In [17]:
# extraction_type_class
le.fit(testing['extraction_type_class'])
testing['extraction_type_class_new'] = le.transform(testing['extraction_type_class'])

In [18]:
X_test = testing.loc[:,('amount_tsh', 'gps_height', 'region_new', 'population','extraction_type_class_new')]
prediction_test = reg2.predict(X_test)
print (prediction_test)
print (sum(prediction_test==1))

[3 3 3 ... 3 3 3]
3170


In [19]:
# Let's get a submission!
testing['prediction_label'] = prediction_test
testing['status_group'] = np.where((testing['prediction_label'] == 1), 
                                                 'non functional', np.where((testing['prediction_label'] == 3), 'functional', 'functional needs repair'))
out = testing[['id', 'status_group']]

In [20]:
path_out = '../../data/submissions'
out.to_csv(path_out + '/out_v1.csv', index = False)

# To Do

## 1. Update data cleaning to make all categorical to numeric

## 2. Deal with NAs (mord does not seem like it), maybe fill? Then try mord more carefully and figure out what is going on for each method

## 3. stepwise try to figure out

## 4. try out other models

## 5. Maybe Write a CV function