In [463]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

## **Read in CSV Files**:

In [464]:
# Load the training feature table and preview its first two rows.
train_values_path = 'data/training_set_values.csv'
train_values_df = pd.read_csv(train_values_path)
train_values_df.head(n=2)

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


In [465]:
# Load the test feature table and preview its first two rows.
test_values_path = 'data/test_set_values.csv'
test_values_df = pd.read_csv(test_values_path)
test_values_df.head(n=2)

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,50785,0.0,2013-02-04,Dmdd,1996,DMDD,35.290799,-4.059696,Dinamu Secondary School,0,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,other,other
1,51630,0.0,2013-02-04,Government Of Tanzania,1569,DWE,36.656709,-3.309214,Kimnyak,0,...,never pay,soft,good,insufficient,insufficient,spring,spring,groundwater,communal standpipe,communal standpipe


In [466]:
# Load the submission template and preview its first five rows.
submission_format_path = 'data/submission_format.csv'
submission_format_df = pd.read_csv(submission_format_path)
submission_format_df.head(n=5)

Unnamed: 0,id,status_group
0,50785,predicted label
1,51630,predicted label
2,17168,predicted label
3,45559,predicted label
4,49871,predicted label


In [467]:
# Load the training labels and preview the first five rows.
train_labels_path = 'data/training_set_labels.csv'
train_labels_df = pd.read_csv(train_labels_path)
train_labels_df.head(n=5)

Unnamed: 0,id,status_group
0,69572,functional
1,8776,functional
2,34310,functional
3,67743,non functional
4,19728,functional


## **Merging Dataframes Above**

In [478]:
# Attach the 'status_group' column to each feature table by joining on 'id'.
# how='left' keeps every row of the values frame. validate='one_to_one' makes
# pandas raise a MergeError if 'id' is duplicated on either side, instead of
# silently duplicating rows in the merged result.
#
# Note: both names are reassigned here, so running this cell a second time
# merges 'status_group' in again and pandas suffixes the clashing columns
# (status_group_x / status_group_y). Re-run the loading cells first.

# merging train_values_df & train_labels_df
train_values_df = pd.merge(
    train_values_df, train_labels_df,
    on='id', how='left', validate='one_to_one',
)
# merging test_values_df & submission_format_df
test_values_df = pd.merge(
    test_values_df, submission_format_df,
    on='id', how='left', validate='one_to_one',
)

---

## **EDA**

In [477]:
# Print the column names and shape of each frame. A single loop replaces four
# copy-pasted print calls so the report format cannot drift between frames;
# the printed text is the same as before.
frames_to_describe = {
    'train_values_df': train_values_df,
    'test_values_df': test_values_df,
    'submission_format_df': submission_format_df,
    'train_labels_df': train_labels_df,
}
for frame_name, frame in frames_to_describe.items():
    print(f'{frame_name} columns: \n{list(frame.columns)}\n{frame.shape}\n')

train_values_df columns: 
['id', 'total_static_head(ft)', 'date_recorded', 'funder', 'height', 'installer', 'longitude', 'latitude', 'waterpoint_name', 'basin_location', 'subvillage', 'region', 'local_gov_area', 'ward', 'population', 'scheme_management', 'permit_approved', 'construction_year', 'extraction_method', 'management_type', 'payment_frequency', 'quality_of_water', 'quantity_of_water', 'water_source', 'source_class', 'waterpoint_type_group', 'status_group']
(50918, 27)

test_values_df columns: 
['id', 'total_static_head(ft)', 'date_recorded', 'funder', 'height', 'installer', 'longitude', 'latitude', 'waterpoint_name', 'basin_location', 'subvillage', 'region', 'local_gov_area', 'ward', 'population', 'scheme_management', 'permit_approved', 'construction_year', 'extraction_method', 'management_type', 'payment_frequency', 'quality_of_water', 'quantity_of_water', 'water_source', 'source_class', 'waterpoint_type_group', 'status_group']
(12749, 27)

submission_format_df columns: 
['id

---

## **Identify Missing Values**

#### (7) columns with missing values in **train_values_df**:

In [476]:
# Missing-value count per column, largest first; show the top seven.
train_null_counts = train_values_df.isna().sum()
train_null_counts.sort_values(ascending=False).head(7)

status_group             0
local_gov_area           0
total_static_head(ft)    0
date_recorded            0
funder                   0
height                   0
installer                0
dtype: int64

#### (7) columns with missing values in **test_values_df**:

In [475]:
# Missing-value count per column, largest first; show the top seven.
test_null_counts = test_values_df.isna().sum()
test_null_counts.sort_values(ascending=False).head(7)

status_group             0
local_gov_area           0
total_static_head(ft)    0
date_recorded            0
funder                   0
height                   0
installer                0
dtype: int64

#### No values missing in **submission_format_df**:

In [472]:
# Missing-value count per column of the submission template, largest first.
submission_null_counts = submission_format_df.isna().sum()
submission_null_counts.sort_values(ascending=False)

status_group    0
id              0
dtype: int64

#### No values missing in **train_labels_df**:

In [473]:
# Missing-value count per column of the training labels, largest first.
labels_null_counts = train_labels_df.isna().sum()
labels_null_counts.sort_values(ascending=False)

status_group    0
id              0
dtype: int64

---

## **Data Limitations & Cleaning**

What we do to **train_values_df** we will do to **test_values_df**:
1. **Dropping Columns**:
- For example, **'scheme_name'** is missing 28,166 values out of 59,400 in train_values_df. As a result, we will drop this column and others we won't need.
2. **Dropping rows with missing values**:
- For columns with only a few missing values, we will drop the affected rows so we can preserve the columns.
3. **Data Type Conversion**:
- The 'date_recorded' column was the only column that needed to be changed to datetime.
4. **Renaming Columns**:
- Many of the columns have names that are confusing or don't represent the data. These have been changed.

In [474]:
# Columns removed from both frames.
dropped_columns = ['scheme_name', 'num_private', 'region_code', 'district_code',
                   'public_meeting', 'recorded_by', 'extraction_type',
                   'extraction_type_group', 'management', 'payment_type',
                   'water_quality', 'quantity_group', 'source_type', 'waterpoint_type'
                   ]

# Columns that must be non-null; rows with a missing value in any of them
# are dropped.
dropped_rows = ['scheme_management', 'installer', 'funder',
                'permit', 'subvillage'
                ]

# Old column name -> new column name.
renamed_col = {'amount_tsh': 'total_static_head(ft)', 'gps_height': 'height',
               'wpt_name': 'waterpoint_name', 'basin': 'basin_location',
               'lga': 'local_gov_area', 'permit': 'permit_approved',
               'extraction_type_class': 'extraction_method',
               'management_group': 'management_type',
               'payment': 'payment_frequency', 'quality_group': 'quality_of_water',
               'quantity': 'quantity_of_water', 'source': 'water_source'
               }


def clean_values_df(values_df):
    """Return a cleaned copy of a values frame; the input is not modified.

    Steps, in order:
      1. drop the columns listed in ``dropped_columns``;
      2. drop rows with a missing value in any ``dropped_rows`` column;
      3. convert 'date_recorded' to datetime;
      4. rename columns according to ``renamed_col``.

    The row filter runs before the rename on purpose: ``dropped_rows`` uses
    the original column names (e.g. 'permit', not 'permit_approved').

    Raises:
        KeyError: if a column named in ``dropped_columns`` or ``dropped_rows``
            is not present in ``values_df``.
    """
    return (
        values_df
        .drop(columns=dropped_columns)
        .dropna(subset=dropped_rows)
        .assign(date_recorded=lambda frame: pd.to_datetime(frame['date_recorded']))
        .rename(columns=renamed_col)
    )


# The same helper is applied to both frames so the train and test cleaning
# steps cannot drift apart.
#
# Note: both names are reassigned, so this cell is meant to run once per
# kernel. A second run raises KeyError because the columns were already
# dropped and renamed.
# NOTE(review): dropping rows from test_values_df removes those ids from the
# frame - confirm the final submission does not need a prediction for every id.
train_values_df = clean_values_df(train_values_df)
test_values_df = clean_values_df(test_values_df)


---

## **Sorting Dataframes**