In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

import functions_used as func

In [2]:
file_train = "~/ds/projects/mod_3_project/tanzania-water-wells/data/raw/training-set-values.csv" #x_train, training set values data
file_target_train = "~/ds/projects/mod_3_project/tanzania-water-wells/data/raw/training-labels.csv" #y_train, training set labels
file_test = "~/ds/projects/mod_3_project/tanzania-water-wells/data/raw/test-set.csv" #x_test, test set data

In [3]:
features = pd.read_csv(file_train)
targets = pd.read_csv(file_target_train)
X_test = pd.read_csv(file_test)

## Functions 

In [33]:
# Be sure to train/test split before processing DFs
def model_preprocessing(df, feature_list, ohe, train=True):
    print('Beginning numerical cleaning...')
    df = numerical_clean(df, feature_list)
    print('Completed numerical cleaning.\n')
    
    print('Removing the target from the cleaned data frame...')
    target = df['status_group']
    print("---Length of target: ", len(target))
    df = df.drop(columns='status_group', axis = 1)
    print("---Shape of dataframe: ", df.shape)
    
    print("Reading the remaining columns as independent features\n")
    obj_list = obj_lister(df)
    
    print('Begining "object" cleaning...')
    ohe_df = obj_preprocessing(df, obj_list, ohe, train)
    print("---Shape of ohe_df: ", ohe_df.shape)
    print('...ending "object" cleaning.')
    
    print("Joining the cleaned numerical and object dataframes together.")
    # dropping the independent features from X
    df = df.drop(obj_list, axis=1)
    # joining the OHE dataframe to X
    model_df = df.join(ohe_df)
    print('Returning the main (independent features, X) and target (y) data frames...')
    return model_df, target


def numerical_clean(df, feature_list):
    #this takes the df and the list of numerical features to clean
    df = df[feature_list]
    print("check: df shape = ", df.shape)
    print('---Dropping 0 longitudes...')
    df = drop_zero_long(df)
    print("check: df shape = ", df.shape)
    print("---Replace 0's with average constructor year...")
    df = con_year_avg(df)
    print("check: df shape = ", df.shape)
    print('...returning a cleaned dataframe of numerical values.')
    return df

def drop_zero_long(df):
    return df.drop(df[df.longitude==0].index)

def con_year_avg(df):
    con_year_nonzero = df.replace(0, np.nan)
    avg_con_years = pd.DataFrame(con_year_nonzero.groupby(['extraction_type']).mean()['construction_year'])
    df = df.join(avg_con_years, rsuffix = '_avg', on = 'extraction_type')
    df = df.reset_index()
    df = df.drop(['index'], axis = 1)
#    df['construction_year'] = df.apply(con_year, axis=1)
    df = df.drop(['construction_year_avg'], axis = 1)
    return df

def obj_lister(df):
    # returns a list of columns that contain Objects
    obj_list = []
    for col in df.select_dtypes([np.object]):
        obj_list.append(col)
    return obj_list

def obj_preprocessing(df, obj_list, ohe, train = True):
    '''
    
    '''
    df_current = df[obj_list]
    # Clean the df if there are NaNs
    df = NaN_cleaning(df_current)
    #OHE data
    array_current = ohe_data(df, ohe, train)
    #return a dataframe of the OHE data
    return pd.DataFrame(array_current)


def NaN_cleaning(df):
    # Replace NaN with "unknown" bin
    print('---Replacing NaN with "unknown" bin...')
    df = df.replace(np.nan, 'unknown')
    print(f'---Check: Number of rows with nulls: {len(df[df.isna().any(axis=1)])}...\n')
    return df.reset_index(drop=True)

def ohe_data(df, ohe, train):
    #OHE the data
    print('Begin one hot encoding data...')
    if train:
        array_current = ohe.fit_transform(df).toarray()
    else:
        array_current = ohe.transform(df).toarray()
    print('Finish one hot encoding data...\n')
    return array_current

## Model 1: Decision Tree

In [16]:
#note the target is listed in here

features_list = ['basin', 'region', 'scheme_management', 'scheme_name',
       'extraction_type', 'management', 'payment', 'water_quality', 'quantity',
       'source', 'waterpoint_type','gps_height', 'longitude', 'latitude', 
       'region_code', 'district_code', 'population', 'construction_year', 'status_group']   


In [17]:
ohe = OneHotEncoder(handle_unknown = 'ignore')

- Perform a train test spit of the "training data" given in the problem. 
- Join the training data (X and y) together.


In [18]:
X_train, X_test, y_train, y_test = train_test_split(features, targets, random_state=42)

In [19]:
joined_train = X_train.join(y_train, lsuffix='_l', rsuffix='_r')

- Perform the pre-processing. Clean the numerical data. Perform One Hot Encoding.

In [34]:
joined_train_processed, y_train = model_preprocessing(joined_train, features_list, ohe, train = True)

Beginning numerical cleaning...
check: df shape =  (44550, 19)
---Dropping 0 longitudes...
check: df shape =  (43211, 19)
---Replace 0's with average constructor year...
check: df shape =  (43211, 19)
...returning a cleaned dataframe of numerical values.
Completed numerical cleaning.

Removing the target from the cleaned data frame...
---Length of target:  43211
---Shape of dataframe:  (43211, 18)
Reading the remaining columns as independent features

Begining "object" cleaning...
---Replacing NaN with "unknown" bin...
---Check: Number of rows with nulls: 0...

Begin one hot encoding data...
Finish one hot encoding data...

---Shape of ohe_df:  (43211, 2572)
...ending "object" cleaning.
Joining the cleaned numerical and object dataframes together.
Returning the main (independent features, X) and target (y) data frames...


- Train the decision tree with training data.

In [26]:
dtc = DecisionTreeClassifier(random_state=42, max_depth=5) 
dtc.fit(joined_train_processed, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')

- Repeat the pre-processing on the test data.
- Join the testing data (X and y) together.

In [29]:
joined_test = X_test.join(y_test, lsuffix='_l', rsuffix='_r')

- Preprocess the testing data.

In [30]:
joined_test_processed, y_test = model_preprocessing(joined_test, features_list, ohe, train=False)

Beginning numerical cleaning...
check: df shape =  (14850, 19)
---Dropping 0 longitudes...
check: df shape =  (14377, 19)
---Replace 0's with average constructor year...
check: df shape =  (14377, 19)
...returning a cleaned dataframe of numerical values.

...completed numerical cleaning.

Removing the target from the cleaned data frame...
---Length of target:  14377
---Shape of dataframe:  (14377, 18)
Reading the remaining columns as independent features

Begining "object" cleaning...
---Replacing NaN with "unknown" bin...
---Check: Number of rows with nulls: 0...

Begin one hot encoding data...
Finish one hot encoding data...

---Shape of ohe_df:  (14377, 2572)
...ending "object" cleaning.
Joining the cleaned numerical and object dataframes together.
Returning the main (independent features, X) and target (y) data frames...


- Make predictions with the test data.

In [31]:
predicts = dtc.predict(joined_test_processed)

- Check the score of the model.

In [32]:
dtc.score(joined_test_processed, y_test) #x_test, y_test from split

0.7052931766015164