In [61]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

## **Read in CSV Files**:

In [62]:
train_values_df = pd.read_csv('data/training_set_values.csv')
train_values_df.head(2) 

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


In [63]:
test_values_df = pd.read_csv('data/test_set_values.csv')
test_values_df.head(2)

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,50785,0.0,2013-02-04,Dmdd,1996,DMDD,35.290799,-4.059696,Dinamu Secondary School,0,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,other,other
1,51630,0.0,2013-02-04,Government Of Tanzania,1569,DWE,36.656709,-3.309214,Kimnyak,0,...,never pay,soft,good,insufficient,insufficient,spring,spring,groundwater,communal standpipe,communal standpipe


In [64]:
submission_format_df = pd.read_csv('data/submission_format.csv')
submission_format_df.head()

Unnamed: 0,id,status_group
0,50785,predicted label
1,51630,predicted label
2,17168,predicted label
3,45559,predicted label
4,49871,predicted label


In [65]:
train_labels_df = pd.read_csv('data/training_set_labels.csv')
train_labels_df.head()

Unnamed: 0,id,status_group
0,69572,functional
1,8776,functional
2,34310,functional
3,67743,non functional
4,19728,functional


## **Merging Dataframes Above**

In [66]:
# merging train_values_df & train_labels_df
train_values_df = pd.merge(train_values_df, train_labels_df, on='id', how='left')
# merging test_values_df & submission_format_df
test_values_df = pd.merge(test_values_df, submission_format_df, on='id', how='left')

---

## **EDA**

In [67]:
print(f'train_values_df columns: \n{list(train_values_df.columns)}\n{train_values_df.shape}\n')
print(f'test_values_df columns: \n{list(test_values_df.columns)}\n{test_values_df.shape}\n')
print(f'submission_format_df columns: \n{list(submission_format_df.columns)}\n{submission_format_df.shape}\n')
print(f'train_labels_df columns: \n{list(train_labels_df.columns)}\n{train_labels_df.shape}\n')

train_values_df columns: 
['id', 'amount_tsh', 'date_recorded', 'funder', 'gps_height', 'installer', 'longitude', 'latitude', 'wpt_name', 'num_private', 'basin', 'subvillage', 'region', 'region_code', 'district_code', 'lga', 'ward', 'population', 'public_meeting', 'recorded_by', 'scheme_management', 'scheme_name', 'permit', 'construction_year', 'extraction_type', 'extraction_type_group', 'extraction_type_class', 'management', 'management_group', 'payment', 'payment_type', 'water_quality', 'quality_group', 'quantity', 'quantity_group', 'source', 'source_type', 'source_class', 'waterpoint_type', 'waterpoint_type_group', 'status_group']
(59400, 41)

test_values_df columns: 
['id', 'amount_tsh', 'date_recorded', 'funder', 'gps_height', 'installer', 'longitude', 'latitude', 'wpt_name', 'num_private', 'basin', 'subvillage', 'region', 'region_code', 'district_code', 'lga', 'ward', 'population', 'public_meeting', 'recorded_by', 'scheme_management', 'scheme_name', 'permit', 'construction_year',

---

## **Identify Missing Values**

#### (7) columns with missing values in **train_values_df**:

In [68]:
train_values_df.isna().sum().sort_values(ascending=False).head(7)

scheme_name          28166
scheme_management     3877
installer             3655
funder                3635
public_meeting        3334
permit                3056
subvillage             371
dtype: int64

#### (7) columns with missing values in **test_values_df**:

In [69]:
test_values_df.isna().sum().sort_values(ascending=False).head(7)

scheme_name          7092
scheme_management     969
installer             877
funder                869
public_meeting        821
permit                737
subvillage             99
dtype: int64

#### No values missing in **submission_format_df**:

In [70]:
submission_format_df.isna().sum().sort_values(ascending=False)

id              0
status_group    0
dtype: int64

#### No values missing in **train_labels_df**:

In [71]:
train_labels_df.isna().sum().sort_values(ascending=False)

id              0
status_group    0
dtype: int64

---

## **Data Limitations & Cleaning**

What we do to **train_values_df** we will do to **test_values_df**:
1. **Dropping Columns**:
- For example, **'scheme_name'** is missing 28,166 values out of 59,400 in train_values_df. As a result, we will drop this column and others we wont need.
2. **Dropping rows with missing values**:
- For rows with a few missing values, we will drop the rows so we can preserve the columns.
3. **Data Type Conversion**:
- The 'date_recorded' column was the only column that needed to be changes to datetime
4. **Renaming Columns**:
- Many of the columns have names that are confusing or dont represent the data. These have been changed.

In [72]:

# Dropping unneeded columns
dropped_columns = ['scheme_name', 'num_private', 'region_code', 'district_code',
                   'public_meeting', 'recorded_by', 'extraction_type',
                   'extraction_type_group', 'management', 'payment_type',
                   'water_quality', 'quantity_group', 'source_type', 'waterpoint_type' ,'payment_type' , 'funder' , 'subvillage', 'lga', 'construction_year', 'date_recorded',
                   'scheme_management', 'installer', 'id', 'population', 'longitude', 'latitude', 'waterpoint_type_group'
                   ]
train_values_df = train_values_df.drop(columns=dropped_columns)
test_values_df = test_values_df.drop(columns=dropped_columns)


# Dropping rows with missing values
dropped_rows = ['permit'
                ]
train_values_df.dropna(subset=dropped_rows, inplace=True)
test_values_df.dropna(subset=dropped_rows, inplace=True)

# Renaming Columns
renamed_col = {'amount_tsh': 'total_static_head(ft)', 'gps_height': 'height',
               'wpt_name': 'waterpoint_name', 'basin': 'basin_location',
                'permit': 'permit_approved',
               'extraction_type_class': 'extraction_method',
               'management_group': 'management_type',
                'quality_group': 'quality_of_water',
               'quantity': 'quantity_of_water', 'source': 'water_source'
               }
train_values_df = train_values_df.rename(columns=renamed_col)
test_values_df = test_values_df.rename(columns=renamed_col)


---

In [73]:
train_values_df['status_group'].value_counts()

functional                 30586
non functional             21741
functional needs repair     4017
Name: status_group, dtype: int64

In [74]:
# Adding Functional needs repairs values to Functional
train_values_df[train_values_df['status_group'] == 'functional needs repair'] = "functional"
train_values_df['status_group'].value_counts()

functional        34603
non functional    21741
Name: status_group, dtype: int64

## **Sorting Dataframes**

In [75]:
train_values_df.columns[0:8]


Index(['total_static_head(ft)', 'height', 'waterpoint_name', 'basin_location',
       'region', 'ward', 'permit_approved', 'extraction_method'],
      dtype='object')

In [76]:
#if unknown makes up less than 1%, drop unknown rows from our table 
def drop_unknown(df, column):
    counts = df[column].value_counts(normalize=True)
    if 'unknown' in counts.index and counts['unknown'] < 0.01:
        df.drop(df[df[column] == 'unknown'].index, inplace=True)

#loop through columns to drop unknowns
for column in train_values_df.columns: 
    drop_unknown(train_values_df, column)

## Modeling

## Decision Tree Modeling


In [77]:
from sklearn.tree import DecisionTreeClassifier


In [78]:
train_values_df['status_group'] = train_values_df['status_group'].map({'functional': 1, 'non functional': 0})

In [79]:
train_values_df['status_group'].value_counts()

1    34143
0    21022
Name: status_group, dtype: int64

In [80]:
from sklearn.preprocessing import OneHotEncoder
X = train_values_df.drop('status_group', axis=1)
y = train_values_df['status_group']

X_cats = X.select_dtypes(include='object')

ohe = OneHotEncoder(drop='first')
ohe.fit(X_cats)
X_cats_encoded = pd.DataFrame(ohe.transform(X_cats).todense(), columns=ohe.get_feature_names_out())

TypeError: Encoders require their input to be uniformly strings or numbers. Got ['float', 'str']

In [None]:
X_cats_encoded.shape

In [None]:
#concat with numerical columns 
X_num = X.select_dtypes(exclude='object')
X_cats_encoded = X_cats_encoded.reset_index(drop=True)
X_num = X_num.reset_index(drop=True)

In [None]:

X_processed = pd.concat([X_num, X_cats_encoded], axis=1)

In [None]:
X_processed.shape

In [None]:
#perform train-test split
X = X_processed
y = train_values_df['status_group']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
#fit decision tree
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

In [None]:
#predict from X_test
y_preds = dt.predict(X_test)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

ConfusionMatrixDisplay.from_estimator(dt, X_test, y_test)

In [None]:
from sklearn.metrics import recall_score, accuracy_score, precision_score, f1_score

recall = recall_score(y_test, y_preds)
accuracy = accuracy_score(y_test, y_preds)
f1 = f1_score(y_test, y_preds)

print(f'recall: {recall}')
print(f'accuracy: {accuracy}')
print(f'f1: {f1}')