In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns', 999)

In [2]:
df_training_labels = pd.read_csv('Phase_3_Project/data/Training_set_labels.csv')
df_training_labels['id_2'] = df_training_labels['id']
df_training_labels.drop('id', axis = 1, inplace = True)
df_training_labels['status_group'].value_counts()

functional                 32259
non functional             22824
functional needs repair     4317
Name: status_group, dtype: int64

In [3]:
df_training_values = pd.read_csv('Phase_3_Project/data/Training_set_values.csv')

In [4]:
df_training_labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   status_group  59400 non-null  object
 1   id_2          59400 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 928.2+ KB


In [5]:
df_training_values.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 40 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     59400 non-null  int64  
 1   amount_tsh             59400 non-null  float64
 2   date_recorded          59400 non-null  object 
 3   funder                 55765 non-null  object 
 4   gps_height             59400 non-null  int64  
 5   installer              55745 non-null  object 
 6   longitude              59400 non-null  float64
 7   latitude               59400 non-null  float64
 8   wpt_name               59400 non-null  object 
 9   num_private            59400 non-null  int64  
 10  basin                  59400 non-null  object 
 11  subvillage             59029 non-null  object 
 12  region                 59400 non-null  object 
 13  region_code            59400 non-null  int64  
 14  district_code          59400 non-null  int64  
 15  lg

In [6]:
df = pd.concat([df_training_values, df_training_labels], join = 'inner', axis = 1)
df.drop('scheme_name', axis = 1, inplace = True)

In [7]:
df.isna().sum()

id                          0
amount_tsh                  0
date_recorded               0
funder                   3635
gps_height                  0
installer                3655
longitude                   0
latitude                    0
wpt_name                    0
num_private                 0
basin                       0
subvillage                371
region                      0
region_code                 0
district_code               0
lga                         0
ward                        0
population                  0
public_meeting           3334
recorded_by                 0
scheme_management        3877
permit                   3056
construction_year           0
extraction_type             0
extraction_type_group       0
extraction_type_class       0
management                  0
management_group            0
payment                     0
payment_type                0
water_quality               0
quality_group               0
quantity                    0
quantity_g

In [8]:
df = df.fillna(df['funder'].value_counts().index[0])

In [9]:
df = df.fillna(df['installer'].value_counts().index[0])

In [10]:
df = df.fillna(df['subvillage'].value_counts().index[0])

In [11]:
df = df.fillna(df['public_meeting'].value_counts().index[0])

In [12]:
df = df.fillna(df['scheme_management'].value_counts().index[0])

In [13]:
df = df.fillna(df['permit'].value_counts().index[0])

In [14]:
mask = df.applymap(type) != bool
d = {True: 'True', False: 'False'}

df = df.where(mask, df.replace(d))

In [15]:
# pf = PolynomialFeatures(degree=2)

# X = df_no_cats

# pf.fit(X)
# pdf = pd.DataFrame(pf.transform(X), columns=pf.get_feature_names(X.columns))
# pdf
# corr_df = pdf.corr()
# corr_df[corr_df['x0'] > .75]['x0']

In [16]:
df = df[df['id'] == df['id_2']]

In [17]:
df.drop(['id_2'], axis = 1, inplace = True)
df.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,109,True,GeoData Consultants Ltd,VWC,False,1999,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,Lake Victoria,Nyamara,Mara,20,2,Serengeti,Natta,280,Government Of Tanzania,GeoData Consultants Ltd,Other,True,2010,gravity,gravity,gravity,wug,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,Pangani,Majengo,Manyara,21,4,Simanjiro,Ngorika,250,True,GeoData Consultants Ltd,VWC,True,2009,gravity,gravity,gravity,vwc,user-group,pay per bucket,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,functional
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,Ruvuma / Southern Coast,Mahakamani,Mtwara,90,63,Nanyumbu,Nanyumbu,58,True,GeoData Consultants Ltd,VWC,True,1986,submersible,submersible,submersible,vwc,user-group,never pay,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,Lake Victoria,Kyanyamisa,Kagera,18,1,Karagwe,Nyakasimbi,0,True,GeoData Consultants Ltd,Government Of Tanzania,True,0,gravity,gravity,gravity,other,other,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional


In [18]:
df.drop(['id', 'date_recorded', 'funder', 'longitude', 'latitude'], axis = 1, inplace = True)

In [27]:
df.head()

Unnamed: 0,amount_tsh,gps_height,installer,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
0,6000.0,1390,Roman,none,0,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,109,True,GeoData Consultants Ltd,VWC,False,1999,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
1,0.0,1399,GRUMETI,Zahanati,0,Lake Victoria,Nyamara,Mara,20,2,Serengeti,Natta,280,Government Of Tanzania,GeoData Consultants Ltd,Other,True,2010,gravity,gravity,gravity,wug,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional
2,25.0,686,World vision,Kwa Mahundi,0,Pangani,Majengo,Manyara,21,4,Simanjiro,Ngorika,250,True,GeoData Consultants Ltd,VWC,True,2009,gravity,gravity,gravity,vwc,user-group,pay per bucket,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,functional
3,0.0,263,UNICEF,Zahanati Ya Nanyumbu,0,Ruvuma / Southern Coast,Mahakamani,Mtwara,90,63,Nanyumbu,Nanyumbu,58,True,GeoData Consultants Ltd,VWC,True,1986,submersible,submersible,submersible,vwc,user-group,never pay,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional
4,0.0,0,Artisan,Shuleni,0,Lake Victoria,Kyanyamisa,Kagera,18,1,Karagwe,Nyakasimbi,0,True,GeoData Consultants Ltd,Government Of Tanzania,True,0,gravity,gravity,gravity,other,other,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional


In [36]:
df['source'].value_counts()

spring                  17021
shallow well            16824
machine dbh             11075
river                    9612
rainwater harvesting     2295
hand dtw                  874
lake                      765
dam                       656
other                     212
unknown                    66
Name: source, dtype: int64

In [19]:
for c in df.columns:
    print ("---- %s ---" % c)
    print (df[c].value_counts())

---- amount_tsh ---
0.0         41639
500.0        3102
50.0         2472
1000.0       1488
20.0         1463
            ...  
8500.0          1
6300.0          1
220.0           1
138000.0        1
12.0            1
Name: amount_tsh, Length: 98, dtype: int64
---- gps_height ---
 0       20438
-15         60
-16         55
-13         55
-20         52
         ...  
 2285        1
 2424        1
 2552        1
 2413        1
 2385        1
Name: gps_height, Length: 2428, dtype: int64
---- installer ---
DWE                           17402
Government Of Tanzania         3655
Government                     1825
RWE                            1206
Commu                          1060
                              ...  
AQUA Wat                          1
Friedkin conservation fund        1
LC                                1
KURRP                             1
Colonial government               1
Name: installer, Length: 2146, dtype: int64
---- wpt_name ---
none                3563
Shuleni

In [20]:
df_cats = df.select_dtypes('object')

In [21]:
df['waterpoint_type_group'].value_counts()

communal standpipe    34625
hand pump             17488
other                  6380
improved spring         784
cattle trough           116
dam                       7
Name: waterpoint_type_group, dtype: int64

In [22]:
df.groupby('status_group')['waterpoint_type_group'].value_counts()['functional'].values / df['waterpoint_type_group'].value_counts().values

array([0.57649097, 0.61785224, 0.13166144, 0.71811224, 0.72413793,
       0.85714286])

In [23]:
df.groupby('status_group')['waterpoint_type_group'].value_counts()['functional']

waterpoint_type_group
communal standpipe    19961
hand pump             10805
other                   840
improved spring         563
cattle trough            84
dam                       6
Name: waterpoint_type_group, dtype: int64

In [24]:
df_test = pd.DataFrame(df.groupby('status_group')['waterpoint_type'].value_counts())
df_test['type'] = df_test['waterpoint_type']
df_test.drop('waterpoint_type', axis = 1, inplace = True)
df_test.reset_index(inplace = True)

In [25]:
df.columns.to_list()

['amount_tsh',
 'gps_height',
 'installer',
 'wpt_name',
 'num_private',
 'basin',
 'subvillage',
 'region',
 'region_code',
 'district_code',
 'lga',
 'ward',
 'population',
 'public_meeting',
 'recorded_by',
 'scheme_management',
 'permit',
 'construction_year',
 'extraction_type',
 'extraction_type_group',
 'extraction_type_class',
 'management',
 'management_group',
 'payment',
 'payment_type',
 'water_quality',
 'quality_group',
 'quantity',
 'quantity_group',
 'source',
 'source_type',
 'source_class',
 'waterpoint_type',
 'waterpoint_type_group',
 'status_group']

In [26]:
for column in df:
    for stat in stat_group:
            numer = df.groupby('status_group')[column].value_counts()[stat].values.sum()
            denom = df[column].value_counts().values.sum()
            print(f'{column}, {stat}:', numer/denom)

NameError: name 'stat_group' is not defined

In [None]:
stat_group = df['status_group'].value_counts().index.to_list()

for column in df:
    fig, ax = plt.subplots(figsize = (5,5))
    x = []
    y = []
    for stat in stat_group:
        y.append(df.groupby('status_group')[column].value_counts()[stat].values.sum())
        x.append(stat)
    plt.title(column)
    ax.bar(x, y)
    plt.xticks(rotation=45, ha = 'right')

In [None]:
for i in df:
    print('------- {} -------'.format(i))
    print(df.groupby('status_group')[i].value_counts())

In [None]:
X = df.select_dtypes(['int64', 'float64'])
y = df['status_group']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 42)

dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)

In [None]:
dtc.score(X_train, y_train)

In [None]:
dtc.feature_importances_

In [None]:
dtc.n_features_

In [None]:
dtc.classes_

In [None]:
dtc.score(X_test, y_test)

In [None]:
df.select_dtypes(['int64', 'float64']).columns.to_list()

In [None]:
X = df.drop('status_group', axis = 1)
y = df['status_group']

In [None]:
numeric_features = [
 'amount_tsh',
 'gps_height',
 'num_private',
 'population']

numeric_transformer = Pipeline([('scaler', StandardScaler())])

categorical_features = ['construction_year',
 'region_code',
 'district_code',                          
 'funder',
 'installer',
 'wpt_name',
 'basin',
 'subvillage',
 'region',
 'lga',
 'ward',
 'public_meeting',
 'recorded_by',
 'scheme_management',
 'permit',
 'extraction_type',
 'extraction_type_group',
 'extraction_type_class',
 'management',
 'management_group',
 'payment',
 'payment_type',
 'water_quality',
 'quality_group',
 'quantity',
 'quantity_group',
 'source',
 'source_type',
 'source_class',
 'waterpoint_type',
 'waterpoint_type_group']


categorical_transformer = OneHotEncoder(handle_unknown = 'ignore')

preprocessor = ColumnTransformer([('num', numeric_transformer, numeric_features),
                               ('cat', categorical_transformer, categorical_features)])

clf = Pipeline([('preprocessor', preprocessor), 
               ('classifier', LogisticRegression(max_iter = 10000))])

X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 42)

clf.fit(X_train, y_train)

In [None]:
clf.score(X_train, y_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_validate

clfs = []
clfs.append(DecisionTreeClassifier())
clfs.append(RandomForestClassifier(n_jobs = 8))
clfs.append(GradientBoostingClassifier())

for classifier in clfs:
    pipeline.set_params(clf = classifier)
    scores = cross_validate(pipeline, X_train, y_train)
    print('---------------------------------')
    print(str(classifier))
    print('-----------------------------------')
    for key, values in scores.items():
            print(key,' mean ', values.mean())
            print(key,' std ', values.std())