In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_selection import mutual_info_classif, SelectKBest

from category_encoders import LeaveOneOutEncoder
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier

from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

In [32]:
# import data
dftrain = pd.read_csv('data/train_target.csv')
dftest = pd.read_csv('data/train_values.csv')

# merge on id
df = pd.merge(dftrain, dftest, how='inner', on='id')

# drop scheme name column because it is 50% NaNs - determined unimportant
df.drop('scheme_name', axis=1, inplace=True)

keep_columns = ['status_group', 'amount_tsh', 'gps_height', 'num_private', 'region', 'district_code', 'lga',
                'public_meeting', 'scheme_management', 'permit', 'construction_year', 'extraction_type',
                'management', 'management_group', 'water_quality', 'quantity', 'source', 'waterpoint_type', 
                'population']
# 'ward', 'installer', 'funder', 

df = df[keep_columns]

# now can drop NaNs without losing half the columns
df.dropna(inplace=True)
# note that funder, installer, public_meeting, scheme_management, permit still contain a couple thous NaNs 

# can now select X and y from cleaned data
X = df.drop('status_group', axis=1)
y = df['status_group']

In [33]:
X['district_code'].astype('object')

0         5
2         4
3        63
5         8
6         3
         ..
59395     5
59396     4
59397     7
59398     4
59399     2
Name: district_code, Length: 49841, dtype: object

In [34]:
X['amount_tsh'].describe()

count     49841.000000
mean        352.215854
std        2793.448736
min           0.000000
25%           0.000000
50%           0.000000
75%          30.000000
max      250000.000000
Name: amount_tsh, dtype: float64

In [35]:
cont_columns = ['amount_tsh', 'gps_height', 'num_private', 'construction_year', 'population']


loo_columns = ['region', 'district_code', 'lga', 'management', 'management_group',
               'water_quality', 'quantity', 'source', 'waterpoint_type', 'scheme_management', 'extraction_type']

# test without
# 'funder', 'ward', 'installer', 

binary_columns = ['public_meeting', 'permit']

In [36]:
X['installer'].value_counts()

KeyError: 'installer'

In [37]:
dummies = pd.get_dummies(X, columns = loo_columns, drop_first=True)

In [38]:
dummies.head()

Unnamed: 0,amount_tsh,gps_height,num_private,public_meeting,permit,construction_year,region_Dar es Salaam,region_Dodoma,region_Iringa,region_Kagera,...,extraction_type_nira/tanira,extraction_type_other,extraction_type_other - mkulima/shinyanga,extraction_type_other - play pump,extraction_type_other - rope pump,extraction_type_other - swn 81,extraction_type_submersible,extraction_type_swn 80,extraction_type_walimi,extraction_type_windmill
0,6000.0,1390,0,True,False,1999,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,25.0,686,0,True,True,2009,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,263,0,True,True,1986,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
5,20.0,0,0,True,True,2009,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
6,0.0,0,0,True,True,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [39]:
X_train, X_test, y_train, y_test = train_test_split(dummies, y, test_size=0.25, random_state=123)

In [40]:
imputer = SimpleImputer(missing_values=0, strategy='median')
column_imputer = ColumnTransformer([('med_imputer', imputer, ['construction_year'])])

X_train_year_imp = column_imputer.fit_transform(X_train)
X_test_year_imp = column_imputer.transform(X_test)

In [41]:
X_train['construction_year'] = X_train_year_imp
X_test['construction_year'] = X_test_year_imp

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['construction_year'] = X_train_year_imp
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['construction_year'] = X_test_year_imp


In [10]:
# LeaveOneOut encoding, trying against OHE

lab_enc = LabelEncoder()
y_train_enc = lab_enc.fit_transform(y_train)

loo_enc = LeaveOneOutEncoder(cols = loo_columns)
X_train_enc = loo_enc.fit_transform(X_train, y_train_enc)
X_test_enc = loo_enc.transform(X_test)

  elif pd.api.types.is_categorical(cols):


In [42]:
forest = RandomForestClassifier(n_estimators=200)
forest.fit(X_train, y_train)

train_preds = forest.predict(X_train)
test_preds = forest.predict(X_test)

print(classification_report(y_train, train_preds))
print(classification_report(y_test, test_preds))

                         precision    recall  f1-score   support

             functional       0.92      0.97      0.95     20470
functional needs repair       0.90      0.74      0.81      2750
         non functional       0.96      0.92      0.94     14160

               accuracy                           0.93     37380
              macro avg       0.93      0.88      0.90     37380
           weighted avg       0.93      0.93      0.93     37380

                         precision    recall  f1-score   support

             functional       0.80      0.84      0.82      6911
functional needs repair       0.44      0.32      0.37       919
         non functional       0.78      0.76      0.77      4631

               accuracy                           0.77     12461
              macro avg       0.67      0.64      0.65     12461
           weighted avg       0.77      0.77      0.77     12461



In [43]:
forest = RandomForestClassifier(class_weight='balanced')
forest.fit(X_train, y_train)

train_preds = forest.predict(X_train)
test_preds = forest.predict(X_test)

print(classification_report(y_train, train_preds))
print(classification_report(y_test, test_preds))

                         precision    recall  f1-score   support

             functional       0.97      0.91      0.93     20470
functional needs repair       0.55      0.95      0.70      2750
         non functional       0.96      0.91      0.94     14160

               accuracy                           0.91     37380
              macro avg       0.83      0.92      0.86     37380
           weighted avg       0.93      0.91      0.92     37380

                         precision    recall  f1-score   support

             functional       0.82      0.78      0.80      6911
functional needs repair       0.32      0.49      0.39       919
         non functional       0.78      0.76      0.77      4631

               accuracy                           0.75     12461
              macro avg       0.64      0.68      0.65     12461
           weighted avg       0.77      0.75      0.76     12461



In [70]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)

train_preds = gnb.predict(X_train)
test_preds = gnb.predict(X_test)

print(classification_report(y_train, train_preds))
print(classification_report(y_test, test_preds))

                         precision    recall  f1-score   support

             functional       0.83      0.38      0.52     20000
functional needs repair       0.14      0.93      0.24      2639
         non functional       0.67      0.46      0.54     13849

               accuracy                           0.45     36488
              macro avg       0.54      0.59      0.43     36488
           weighted avg       0.72      0.45      0.51     36488

                         precision    recall  f1-score   support

             functional       0.82      0.37      0.51      6720
functional needs repair       0.13      0.90      0.22       860
         non functional       0.64      0.43      0.52      4583

               accuracy                           0.43     12163
              macro avg       0.53      0.57      0.42     12163
           weighted avg       0.70      0.43      0.49     12163



In [71]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

train_preds = knn.predict(X_train)
test_preds = knn.predict(X_test)

print(classification_report(y_train, train_preds))
print(classification_report(y_test, test_preds))

                         precision    recall  f1-score   support

             functional       0.78      0.90      0.83     20000
functional needs repair       0.65      0.38      0.48      2639
         non functional       0.82      0.72      0.77     13849

               accuracy                           0.79     36488
              macro avg       0.75      0.66      0.69     36488
           weighted avg       0.79      0.79      0.78     36488

                         precision    recall  f1-score   support

             functional       0.72      0.83      0.77      6720
functional needs repair       0.46      0.25      0.33       860
         non functional       0.71      0.62      0.66      4583

               accuracy                           0.71     12163
              macro avg       0.63      0.57      0.59     12163
           weighted avg       0.70      0.71      0.70     12163



In [45]:
from xgboost import XGBClassifier

In [46]:
X_train['public_meeting_bool'] = None
X_test['public_meeting_bool'] = None

X_train['public_meeting_bool'] = X_train['public_meeting'].map(lambda x: 1 if x == True else 0)
X_test['public_meeting_bool'] = X_test['public_meeting'].map(lambda x: 1 if x == True else 0)

X_train.drop('public_meeting', axis=1, inplace=True)
X_test.drop('public_meeting', axis=1, inplace=True)


X_train['permit_bool'] = None
X_test['permit_bool'] = None

X_train['permit_bool'] = X_train['permit'].map(lambda x: 1 if x == True else 0)
X_test['permit_bool'] = X_test['permit'].map(lambda x: 1 if x == True else 0)

X_train.drop('permit', axis=1, inplace=True)
X_test.drop('permit', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['public_meeting_bool'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['public_meeting_bool'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['public_meeting_bool'] = X_train['public_meeting'].map(lambda x: 1 if x == True else 0)
A value is trying to be set on a c

In [31]:
X_train['public_meeting_bool'].value_counts()

1    19446
0     1413
Name: public_meeting_bool, dtype: int64

In [54]:
## need to rename columns so they dont have shit in them

for x in [',', ' ', '[', ']', '<']:
    X_train.rename(lambda c: c.replace(x, ""), axis='columns', inplace=True)
    X_test.rename(lambda c: c.replace(x, ""), axis='columns', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [47]:
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

train_preds = xgb.predict(X_train)
test_preds = xgb.predict(X_test)

print(classification_report(y_train, train_preds))
print(classification_report(y_test, test_preds))

                         precision    recall  f1-score   support

             functional       0.79      0.93      0.85     20470
functional needs repair       0.71      0.24      0.36      2750
         non functional       0.87      0.74      0.80     14160

               accuracy                           0.81     37380
              macro avg       0.79      0.64      0.67     37380
           weighted avg       0.81      0.81      0.80     37380

                         precision    recall  f1-score   support

             functional       0.77      0.91      0.84      6911
functional needs repair       0.63      0.21      0.31       919
         non functional       0.82      0.72      0.76      4631

               accuracy                           0.78     12461
              macro avg       0.74      0.61      0.64     12461
           weighted avg       0.78      0.78      0.77     12461



In [11]:
X_train_enc

Unnamed: 0,funder,gps_height,installer,num_private,region,district_code,lga,ward,public_meeting,scheme_management,permit,construction_year,extraction_type,management,management_group,water_quality,quantity,source,waterpoint_type
40145,0.843357,1417,0.410853,0,0.383308,0.608347,0.671875,0.421053,True,0.847104,True,1984.0,0.686279,0.866608,0.775795,0.750489,0.536061,0.707641,0.649172
29867,1.295455,522,0.820000,0,0.758532,0.822565,0.763636,0.857143,True,0.971545,True,1975.0,0.686143,1.257447,0.760430,0.750386,1.500000,0.707287,0.649041
40505,0.151515,2134,0.253521,0,0.383308,0.652115,0.346740,0.124402,True,0.847104,False,2004.0,0.686279,0.866608,0.775795,0.750489,0.536061,0.676045,0.649172
16909,1.059952,1373,1.571429,0,0.712500,0.720642,1.028169,2.000000,True,0.846957,False,1982.0,0.686143,0.866472,0.775688,0.750386,0.884250,0.707287,1.599119
4560,0.841958,0,1.493304,0,1.205793,0.881448,1.213622,1.750000,True,0.846957,True,2000.0,0.686143,0.866472,0.775688,0.750386,1.958974,0.675843,0.649041
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32878,0.091954,277,0.220183,0,0.583431,0.823047,0.461140,0.000000,True,0.507647,True,2008.0,0.815578,0.520557,0.775795,0.750489,0.536061,0.707641,0.649172
46019,1.057047,0,0.711715,0,1.027384,0.822806,0.975610,1.054054,True,0.847030,False,2000.0,0.686211,0.866540,0.775741,0.750438,0.535982,0.675944,0.649107
37785,1.172065,1088,0.711715,0,0.835720,0.881733,0.896887,0.944444,True,0.847030,True,1994.0,0.686211,0.866540,0.775741,0.750438,0.535982,0.707464,1.150393
33581,1.060392,961,0.711872,0,0.580247,0.721205,0.576212,0.618421,True,0.847104,True,1980.0,0.686279,0.866608,0.775795,0.750489,0.884630,0.676045,0.649172


In [15]:
rf_clf = RandomForestClassifier(n_estimators=200, max_features=3, min_samples_leaf=2)
rf_clf.fit(X_train_enc, y_train)

RandomForestClassifier(max_features=3, min_samples_leaf=2, n_estimators=200)

In [16]:
train_preds = rf_clf.predict(X_train_enc)
test_preds = rf_clf.predict(X_test_enc)

In [17]:
print(classification_report(y_train, train_preds))

                         precision    recall  f1-score   support

             functional       1.00      1.00      1.00     11997
functional needs repair       1.00      1.00      1.00      1550
         non functional       1.00      1.00      1.00      7312

               accuracy                           1.00     20859
              macro avg       1.00      1.00      1.00     20859
           weighted avg       1.00      1.00      1.00     20859



In [18]:
print(classification_report(y_test, test_preds))

                         precision    recall  f1-score   support

             functional       0.50      0.00      0.00      4059
functional needs repair       0.07      1.00      0.13       463
         non functional       0.80      0.01      0.02      2432

               accuracy                           0.07      6954
              macro avg       0.46      0.34      0.05      6954
           weighted avg       0.58      0.07      0.02      6954



In [19]:
svm_clf = SVC(kernel='poly')
svm_clf.fit(X_train_enc, y_train)

train_preds = svm_clf.predict(X_train_enc)
test_preds = svm_clf.predict(X_test_enc)

In [20]:
print(classification_report(y_train, train_preds))

  _warn_prf(average, modifier, msg_start, len(result))


                         precision    recall  f1-score   support

             functional       0.58      1.00      0.73     11997
functional needs repair       0.00      0.00      0.00      1550
         non functional       0.00      0.00      0.00      7312

               accuracy                           0.58     20859
              macro avg       0.19      0.33      0.24     20859
           weighted avg       0.33      0.58      0.42     20859



In [21]:
print(classification_report(y_test, test_preds))

                         precision    recall  f1-score   support

             functional       0.58      1.00      0.74      4059
functional needs repair       0.00      0.00      0.00       463
         non functional       0.00      0.00      0.00      2432

               accuracy                           0.58      6954
              macro avg       0.19      0.33      0.25      6954
           weighted avg       0.34      0.58      0.43      6954

