# FSM for Tanzania Water Well Project

In [1]:
#imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('whitegrid')

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.dummy import DummyRegressor, DummyClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.metrics import plot_confusion_matrix, recall_score,\
    accuracy_score, precision_score, f1_score

from imblearn.pipeline import Pipeline as ImPipeline



In [2]:
# Load the water-well dataset from final_df.csv (path is relative to the working directory)
tanzania = pd.read_csv(filepath_or_buffer='final_df.csv')

In [3]:
# Keep an independent copy that still has the year/date columns removed from `tanzania` in the next cell
tanzania_with_dates = tanzania.copy(deep=True)

In [4]:
# Drop the construction-year and date-recorded columns in a single pass
tanzania = tanzania.drop(columns=['construction_year', 'date_recorded'])

In [5]:
# Check the class balance of the target column (counts per status_group value)
tanzania['status_group'].value_counts()

functional                 32259
non functional             22824
functional needs repair     4317
Name: status_group, dtype: int64

In [6]:
# Create a function to transform booleans to 0s and 1s (currently commented out)
#def bool_transform(col):
#    tanzania[col] = tanzania[col].map({True : "1" , False  :"0"})
#    return tanzania[col].value_counts()

In [7]:
# transform public_meeting and permit

#bool_transform('permit')

In [8]:
# Separate the target from the predictors
y = tanzania['status_group']  # target
X = tanzania.drop(columns=['status_group'])  # every remaining column is a feature

In [9]:
# Train/test split: hold out 25% of the rows (train_test_split's default, spelled out here)
# NOTE(review): the split is not stratified although the status_group counts
# shown earlier are imbalanced -- consider stratify=y; confirm with the team.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [10]:

# Inspect column dtypes and non-null counts
tanzania.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   amount_tsh             59400 non-null  float64
 1   basin                  59400 non-null  object 
 2   subvillage             59400 non-null  object 
 3   region                 59400 non-null  object 
 4   population             59400 non-null  int64  
 5   public_meeting         59400 non-null  object 
 6   permit                 59400 non-null  object 
 7   extraction_type_class  59400 non-null  object 
 8   management             59400 non-null  object 
 9   payment_type           59400 non-null  object 
 10  quality_group          59400 non-null  object 
 11  quantity               59400 non-null  object 
 12  source                 59400 non-null  object 
 13  waterpoint_type        59400 non-null  object 
 14  status_group           59400 non-null  object 
 15  Ag

In [11]:
# Cast amount_tsh to integers, treating any missing amount as 0.
# (A blanket conversion of every numeric column to nullable Int64 is left
# disabled; only this one column is converted.)
# NOTE(review): X / X_train / X_test were derived from `tanzania` in earlier
# cells, so this cast does not reach the data the model is fitted on --
# confirm whether it should run before the split.
tanzania['amount_tsh'] = (
    tanzania['amount_tsh']
    .fillna(0)
    .astype(int)
)

In [33]:
# Sanity check: rows whose amount_tsh equals the string 'Wami / Ruvu' (output below is empty)
problem = tanzania.loc[tanzania['amount_tsh'] == 'Wami / Ruvu']
problem

Unnamed: 0,amount_tsh,basin,subvillage,region,population,public_meeting,permit,extraction_type_class,management,payment_type,quality_group,quantity,source,waterpoint_type,status_group,Age


In [34]:
# Sanity check: rows whose population equals the string 'Wami / Ruvu' (output below is empty)
problem1 = tanzania.loc[tanzania['population'] == 'Wami / Ruvu']
problem1

Unnamed: 0,amount_tsh,basin,subvillage,region,population,public_meeting,permit,extraction_type_class,management,payment_type,quality_group,quantity,source,waterpoint_type,status_group,Age


In [35]:
# Sanity check: rows whose Age equals the string 'Wami / Ruvu' (output below is empty)
problem2 = tanzania.loc[tanzania['Age'] == 'Wami / Ruvu']
problem2

Unnamed: 0,amount_tsh,basin,subvillage,region,population,public_meeting,permit,extraction_type_class,management,payment_type,quality_group,quantity,source,waterpoint_type,status_group,Age


In [49]:
# Preprocessing pipelines, one per feature type

# Numeric features: impute with SimpleImputer's defaults, then standardise
num_pipe = Pipeline([
    ('num_impute', SimpleImputer()),
    ('ss', StandardScaler()),
])

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode into a dense array; categories unseen during fit are ignored at
# transform time.
# NOTE(review): `subvillage` is among the columns sent through this pipeline;
# if it has many distinct values the dense one-hot output could be very
# large -- confirm its cardinality.
cat_pipe = Pipeline([
    ('cat_impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False)),
])

In [50]:
# Re-check dtypes after the cast: amount_tsh is reported as int64 in the output below
tanzania.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   amount_tsh             59400 non-null  int64 
 1   basin                  59400 non-null  object
 2   subvillage             59400 non-null  object
 3   region                 59400 non-null  object
 4   population             59400 non-null  int64 
 5   public_meeting         59400 non-null  object
 6   permit                 59400 non-null  object
 7   extraction_type_class  59400 non-null  object
 8   management             59400 non-null  object
 9   payment_type           59400 non-null  object
 10  quality_group          59400 non-null  object
 11  quantity               59400 non-null  object
 12  source                 59400 non-null  object
 13  waterpoint_type        59400 non-null  object
 14  status_group           59400 non-null  object
 15  Age                

In [51]:
# Column names routed to the numeric and the categorical pipelines
num_cols = ['amount_tsh', 'population', 'Age']
cat_cols = [
    'basin',
    'subvillage',
    'region',
    'public_meeting',
    'permit',
    'extraction_type_class',
    'management',
    'payment_type',
    'quality_group',
    'quantity',
    'source',
    'waterpoint_type',
]

In [52]:
# Numeric columns only, pulled out for inspection
trial = tanzania.loc[:, ['amount_tsh', 'population', 'Age']]

In [53]:
# Preview the numeric columns.
# NOTE(review): the output shows Age == 2023 on rows where population is 0 --
# looks like a placeholder rather than a real age; confirm and handle it
# before these columns are scaled.
trial

Unnamed: 0,amount_tsh,population,Age
0,6000,109,24
1,0,280,13
2,25,250,14
3,0,58,37
4,0,0,2023
...,...,...,...
59395,10,125,24
59396,4700,56,27
59397,0,0,2023
59398,0,0,2023


In [54]:
# Count the rows in which any numeric column contains the string 'Wami / Ruvu'
wami_ruvu = (
    trial
    .isin(['Wami / Ruvu'])
    .any(axis='columns')
    .sum()
)
wami_ruvu

0

In [55]:
# Column transformer: numeric columns go through num_pipe, categorical columns through cat_pipe
CT = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipe, num_cols),
        ('cat_pipeline', cat_pipe, cat_cols),
    ]
)

In [56]:
# Model evaluation helper
def eval(model, X_train, y_train, X_test, y_test):
    """Print train/test classification metrics and plot the test confusion matrix.

    Accuracy plus weighted-average precision, recall and F1 are printed for
    both splits, each rounded to 4 decimal places. A row-normalised
    (``normalize='true'``) confusion matrix for the test split is then drawn
    on a 12x12-inch figure.

    Note: the name shadows Python's built-in ``eval``; it is kept so existing
    calls in the notebook keep working.

    Parameters
    ----------
    model : fitted estimator or pipeline exposing ``predict``.
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels.

    Returns
    -------
    None
    """
    # Predict once per split rather than once per metric: cheaper, and all
    # metrics for a split are guaranteed to be computed on the same predictions.
    train_preds = model.predict(X_train)
    test_preds = model.predict(X_test)

    print(f"""
    train accuracy score: {round(accuracy_score(y_train, train_preds),4)} 
    test accuracy score: {round(accuracy_score(y_test, test_preds),4)}
    
    train precision score: {round(precision_score(y_train, train_preds, average= 'weighted'),4)}
    test precision score: {round(precision_score(y_test, test_preds, average= 'weighted'),4)} 
    
    train recall score: {round(recall_score(y_train, train_preds, average= 'weighted'),4)}
    test recall score: {round(recall_score(y_test, test_preds, average= 'weighted'),4)} 
    
    train F1 score: {round(f1_score(y_train, train_preds, average= 'weighted'),4)}
    test F1 score: {round(f1_score(y_test, test_preds, average= 'weighted'),4)} 
    
    Confusion Matrix:""")
    fig, ax = plt.subplots(figsize=(12, 12))
    # Pass ax explicitly: without it the matrix is drawn on a new default-sized
    # figure and the 12x12 figure created above is left empty.
    # display_labels are given in the sorted order of the status_group values
    # ('functional', 'functional needs repair', 'non functional').
    plot_confusion_matrix(
        model,
        X_test,
        y_test,
        normalize='true',
        display_labels=['Functional',
                        'Functional Needs Repair',
                        'Non Functional'],
        ax=ax,
    )

## Dummy Baseline Model

In [57]:
# Baseline: shared preprocessing followed by a DummyClassifier with default settings
dummy_pipe = Pipeline(steps=[
    ("ct", CT),
    ("dc", DummyClassifier()),
])


In [58]:
# Fit the baseline pipeline on the training split (the fitted pipeline is echoed as the cell output)
dummy_pipe.fit(X_train, y_train)



Pipeline(steps=[('ct',
                 ColumnTransformer(transformers=[('num_pipeline',
                                                  Pipeline(steps=[('num_impute',
                                                                   SimpleImputer()),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  ['amount_tsh', 'population',
                                                   'Age']),
                                                 ('cat_pipeline',
                                                  Pipeline(steps=[('cat_impute',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
         