## Data cleaning integrated with pipeline
Because we've got a relatively small number of variables, we've been picky with using them. Below, I use define a class to clean the data. That didn't really work, so I put it in a function. Once that cell has been run, the `FunctionTransformer` can use it in the pipeline using `('data_transformer', my_transformer),`. I really wanted to get into variable importance, but I can't find the `coef_` method. I'll get more into it Monday. Probably.


In [223]:
# usual imports
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
# data methods
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
# misc
import joblib

In [285]:
# create a class that we can pass to the pipeline
class DataCleaner:

    def __init__(self):
        print('Cleaning ...')

    def clean_numeric(self, df):
        #
        # Construction year
        df['construction_year'] = df['construction_year'].replace(0, np.nan)
        #Impute using region + installer
        df['construction_year'] = df.groupby(['region', 'installer'])['construction_year'].transform(
            lambda x: x.fillna(x.median())
        )
        #Impute using region only (for rows still missing)
        df['construction_year'] = df.groupby('region')['construction_year'].transform(
            lambda x: x.fillna(x.median())
        )
        #Use recorded year - 13
        df['date_recorded'] = pd.to_datetime(df['date_recorded'])
        df['recorded_year'] = df['date_recorded'].dt.year
        df['construction_year'] = df['construction_year'].fillna(df['recorded_year'] - 13)
        #
        # gps_height
        df['gps_height'] = df['gps_height'].apply(lambda x: np.nan if x <= 0 else x)
        # Fill using median per lga
        df['gps_height'] = df.groupby('lga')['gps_height'].fillna(df['gps_height'].median())
        # Fill any still missing using region median
        df['gps_height'] = df.groupby(['region'])['gps_height'].fillna(df['gps_height'].median())
        # Longitude and latitude
        df['longitude'] = df['longitude'].replace(0, np.nan)
        df['latitude'] = df['latitude'].where(df['latitude'] < -0.5, np.nan) # too close to the equator
        for i in ['latitude','longitude']: # loop to fill by lga, region
            df[i] = df.groupby(['lga'])[i].fillna(df[i].median())
            df[i] = df.groupby(['region'])[i].fillna(df[i].median())
        #
        # population
        # Fill population using median by district_code
        df['population'] = df.groupby('lga')['population'].transform(
            lambda x: x.fillna(x.median())
        )
        # Fill any still missing with median by region, then overall median
        df['population'] = df.groupby('region')['population'].transform(
            lambda x: x.fillna(x.median())
        )
        df['population'] = df['population'].fillna(df.population.median)
        # Bin the outcome, see how it behaves
        df['population'] = pd.cut(df['population'], [-1,1,25,90,160,260,9999999], labels=[0,0.2,0.3,0.4,0.6,1])
        df['population'] = df['population'].astype(float)
        #
        # amount_tsh
        df['amount_tsh'] = df['amount_tsh'].apply(lambda x: min(x, 15000))
        df['amount_tsh'] = df['amount_tsh'].apply(lambda x: np.power(x,0.3))
        return df

    def clean_categorical(self, df):
            ### Encode categorical variables
        # Encode 'quantity' (and typo fix: 'insufficent' -> 'insufficient')
        df['quantity'] = df['quantity'].replace({
            'enough': 1,
            'seasonal': 0.6,
            'insufficient': 0.4,
            'dry': 0,
            'unknown': 0
        })
        df.quantity = pd.to_numeric(df.quantity, errors='coerce')

        # Encode 'water_quality' as binary: good = 1, else 0
        df['water_quality'] = np.where(df['water_quality'] == 'soft', 1, 0)
        # Encode 'waterpoint_type' (1 = preferred type, 0 = everything else)
        preferred_waterpoint = ['communal standpipe multiple', 'communal standpipe']
        df['waterpoint_type'] = df['waterpoint_type'].apply(lambda x: 1 if x in preferred_waterpoint else 0)
        # Encode 'permit' as binary: True = 1, False, missing = 0
        df['permit'] = np.where(df['permit'] == 'True', 1, 0)
        # Encode 'payment' as binary: never pay = 0, else = 1
        df['payment'] = np.where(df['payment'] == 'never pay', 0, 1)
        # Encode 'source' (1 = preferred sources, 0 = everything else)
        preferred_sources = ['spring', 'river', 'rainwater harvesting']
        df['source'] = df['source'].apply(lambda x: 1 if x in preferred_sources else 0)
        # Encode 'payment' as binary: never pay = 0, else = 1
        df['extraction_type_class'] = np.where(df['extraction_type_class'] == 'gravity', 0, 1)
        # Encode 'scheme_management' (1 = VWC, others 0)
        df['scheme_management'] = np.where(df['scheme_management'] == 'VWC', 0, 1)
        # one hot encoder for basin 
        df = pd.get_dummies(data=df, columns=['basin'], drop_first=True, dtype=int)
        return df

    def selection(self, df):
         #  Drop other columns and only keep these:
        # df_small = df[['amount_tsh',
        #     'gps_height',
        #     'population',
        #     'construction_year',
        #     'extraction_type_class',
        #     'payment',
        #     'water_quality',
        #     'quantity',
        #     'source',
        #     'waterpoint_type'
        #    ]]
        #  #  Drop other columns and only keep these:
        # df_medium = df[['amount_tsh',
        #          'gps_height',
        #          'longitude',
        #          'latitude',
        #          'population',
        #          'construction_year',
        #          'extraction_type_class',
        #          'payment',
        #         'water_quality',
        #         'quantity',
        #         'source',
        #         'waterpoint_type',, 'basin_Lake Nyasa', 'basin_Lake Rukwa',
        #         'basin_Lake Tanganyika', 'basin_Lake Victoria', 'basin_Pangani',
        #         'basin_Rufiji', 'basin_Ruvuma / Southern Coast', 'basin_Wami / Ruvu'
        #         'scheme_management'
        #        ]]
        df = df[['amount_tsh',
                 'gps_height',
                 'longitude',
                 'latitude',
                 'population',
                 'construction_year',
                 'extraction_type_class',
                 'payment',
                'water_quality',
                'quantity',
                'source',
                'waterpoint_type', 
                'scheme_management', 'basin_Lake Nyasa', 'basin_Lake Rukwa',
                'basin_Lake Tanganyika', 'basin_Lake Victoria', 'basin_Pangani',
                'basin_Rufiji', 'basin_Ruvuma / Southern Coast', 'basin_Wami / Ruvu'
               ]]
        df['tshXpayment'] = df.amount_tsh * df.payment
        df['extractXsource'] = df.extraction_type_class * df.source
        df['popXtsh'] = df.population * df.amount_tsh
        df['popXquant'] = df.population * df.quantity
        df['popXsource'] = df.population * df.source
        df['extractXheight'] = df.extraction_type_class * df.gps_height
        df['typeXsource'] = df.waterpoint_type * df.source
        df['typeXyear'] = df.waterpoint_type * df.construction_year
        df['yearXpop'] = df.construction_year * df.population
        df['quantXsource'] = df.quantity * df.source
        df['yearsq'] = np.sqrt(df.construction_year + 1)
        df_large = df

        return df#_small, df_medium, df_large

    def clean_data(self, df): 
        df = self.clean_numeric(df) 
        df = self.clean_categorical(df)
        df = self.selection(df)
        return df
        
print('cool')

cool


In [286]:
# Testing the cleaning function on the original training data
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
data_path = os.path.join(parent_dir, 'data')
out_path = os.path.join(parent_dir, 'outputs')

# Read the files
train = pd.read_csv(os.path.join(data_path, 'train.csv'))
labels = pd.read_csv(os.path.join(data_path, 'train_labels.csv')) 

# Try the function
cleaner = DataCleaner()
X = cleaner.clean_data(train)

X.isna().sum() # returns all floats and integers, which makes me happier than it should


Cleaning ...


  df['gps_height'] = df.groupby('lga')['gps_height'].fillna(df['gps_height'].median())
  df[i] = df.groupby(['lga'])[i].fillna(df[i].median())
  df[i] = df.groupby(['region'])[i].fillna(df[i].median())
  df[i] = df.groupby(['lga'])[i].fillna(df[i].median())
  df[i] = df.groupby(['region'])[i].fillna(df[i].median())


amount_tsh                       0
gps_height                       0
longitude                        0
latitude                         0
population                       0
construction_year                0
extraction_type_class            0
payment                          0
water_quality                    0
quantity                         0
source                           0
waterpoint_type                  0
scheme_management                0
basin_Lake Nyasa                 0
basin_Lake Rukwa                 0
basin_Lake Tanganyika            0
basin_Lake Victoria              0
basin_Pangani                    0
basin_Rufiji                     0
basin_Ruvuma / Southern Coast    0
basin_Wami / Ruvu                0
tshXpayment                      0
extractXsource                   0
popXtsh                          0
popXquant                        0
popXsource                       0
extractXheight                   0
typeXsource                      0
typeXyear           

In [291]:
X.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
id,59400.0,37115.131768,0.0,18519.75,37061.5,55656.5,74247.0,21453.128371
amount_tsh,59400.0,1.705425,0.0,0.0,0.0,2.456456,17.898943,3.16077
date_recorded,59400.0,2012-03-29 09:11:33.818181888,2002-10-14 00:00:00,2011-04-01 00:00:00,2012-10-10 00:00:00,2013-02-09 00:00:00,2013-12-03 00:00:00,
gps_height,59400.0,1109.69633,1.0,995.0,1194.0,1319.25,2770.0,471.186468
longitude,59400.0,35.145285,29.607122,33.354079,35.005943,37.178387,40.345193,2.567468
latitude,59400.0,-5.863826,-11.64944,-8.540621,-5.172704,-3.398151,-0.998464,2.769401
num_private,59400.0,0.474141,0.0,0.0,0.0,0.0,1776.0,12.23623
region_code,59400.0,15.297003,1.0,5.0,12.0,17.0,99.0,17.587406
district_code,59400.0,5.629747,0.0,2.0,3.0,5.0,80.0,9.633649
population,59400.0,0.333443,0.0,0.0,0.2,0.6,1.0,0.38698


In [290]:
# create a class that we can pass to the pipeline
# run this cell to initiate the function, then the pipeline *should* run smoothly
def clean_func(df):
    #
    # Construction year
    df['construction_year'] = df['construction_year'].replace(0, np.nan)
    #Impute using region + installer
    df['construction_year'] = df.groupby(['region', 'installer'])['construction_year'].transform(
        lambda x: x.fillna(x.median())
    )
    #Impute using region only (for rows still missing)
    df['construction_year'] = df.groupby('region')['construction_year'].transform(
        lambda x: x.fillna(x.median())
    )
    #Use recorded year - 13
    df['date_recorded'] = pd.to_datetime(df['date_recorded'])
    df['recorded_year'] = df['date_recorded'].dt.year
    df['construction_year'] = df['construction_year'].fillna(df['recorded_year'] - 13)
    #
    # gps_height 
    df['gps_height'] = df['gps_height'].apply(lambda x: np.nan if x <= 0 else x)
    # Fill using median per lga
    df['gps_height'] = df.groupby('lga')['gps_height'].fillna(df['gps_height'].median())
    # Fill any still missing using region median
    df['gps_height'] = df.groupby(['region'])['gps_height'].fillna(df['gps_height'].median())
    # Longitude and latitude
    df['longitude'] = df['longitude'].replace(0, np.nan)
    df['latitude'] = df['latitude'].where(df['latitude'] < -0.5, np.nan) # too close to the equator
    for i in ['latitude','longitude']: # loop to fill by lga, region
        df[i] = df.groupby(['lga'])[i].fillna(df[i].median())
        df[i] = df.groupby(['region'])[i].fillna(df[i].median())
    # population
    # Fill population using median by district_code
    df['population'] = df.groupby('lga')['population'].transform(
        lambda x: x.fillna(x.median())
    )
    # Fill any still missing with median by region, then overall median
    df['population'] = df.groupby('region')['population'].transform(
        lambda x: x.fillna(x.median())
    )
    df['population'] = df['population'].fillna(df.population.median)
    # Bin the outcome, see how it behaves
    df['population'] = pd.cut(df['population'], [-1,1,25,90,160,260,9999999], labels=[0,0.2,0.3,0.4,0.6,1])
    df['population'] = df['population'].astype(float)
    #
    # amount_tsh
    df['amount_tsh'] = df['amount_tsh'].apply(lambda x: min(x, 15000))
    df['amount_tsh'] = df['amount_tsh'].apply(lambda x: np.power(x,0.3))
        ### Encode categorical variables
    # Encode 'quantity' (and typo fix: 'insufficent' -> 'insufficient')
    df['quantity'] = df['quantity'].replace({
        'enough': 1,
        'seasonal': 0.6,
        'insufficient': 0.4,
        'dry': 0,
        'unknown': 0
    })
    df.quantity = pd.to_numeric(df.quantity, errors='coerce')

    # Encode 'water_quality' as binary: good = 1, else 0
    df['water_quality'] = np.where(df['water_quality'] == 'soft', 1, 0)
    # Encode 'waterpoint_type' (1 = preferred type, 0 = everything else)
    preferred_waterpoint = ['communal standpipe multiple', 'communal standpipe']
    df['waterpoint_type'] = df['waterpoint_type'].apply(lambda x: 1 if x in preferred_waterpoint else 0)
    # Encode 'permit' as binary: True = 1, False, missing = 0
    df['permit'] = np.where(df['permit'] == 'True', 1, 0)
    # Encode 'payment' as binary: never pay = 0, else = 1
    df['payment'] = np.where(df['payment'] == 'never pay', 0, 1)
    # Encode 'source' (1 = preferred sources, 0 = everything else)
    preferred_sources = ['spring', 'river', 'rainwater harvesting']
    df['source'] = df['source'].apply(lambda x: 1 if x in preferred_sources else 0)
    # Encode 'payment' as binary: never pay = 0, else = 1
    df['extraction_type_class'] = np.where(df['extraction_type_class'] == 'gravity', 0, 1)
    # Encode 'scheme_management' (1 = VWC, others 0)
    df['scheme_management'] = np.where(df['scheme_management'] == 'VWC', 0, 1)
    # one hot encoder for basin 
    df = pd.get_dummies(data=df, columns=['basin'], drop_first=True, dtype=int)
    ### Select what's good
     #  Drop other columns and only keep these:
    # df_small = df[['amount_tsh',
    #     'gps_height',
    #     'population',
    #     'construction_year',
    #     'extraction_type_class',
    #     'payment',
    #     'water_quality',
    #     'quantity',
    #     'source',
    #     'waterpoint_type'
    #    ]]
    #  #  Drop other columns and only keep these:
    # df_medium = df[['amount_tsh',
    #          'gps_height',
    #          'longitude',
    #          'latitude',
    #          'population',
    #          'construction_year',
    #          'extraction_type_class',
    #          'payment',
    #         'water_quality',
    #         'quantity',
    #         'source',
    #         'waterpoint_type',, 'basin_Lake Nyasa', 'basin_Lake Rukwa',
    #         'basin_Lake Tanganyika', 'basin_Lake Victoria', 'basin_Pangani',
    #         'basin_Rufiji', 'basin_Ruvuma / Southern Coast', 'basin_Wami / Ruvu'
    #         'scheme_management'
    #        ]]
    df = df[['amount_tsh',
             'gps_height',
             'longitude',
             'latitude',
             'population',
             'construction_year',
             'extraction_type_class',
             'payment',
            'water_quality',
            'quantity',
            'source',
            'waterpoint_type', 
            'scheme_management', 'basin_Lake Nyasa', 'basin_Lake Rukwa',
            'basin_Lake Tanganyika', 'basin_Lake Victoria', 'basin_Pangani',
            'basin_Rufiji', 'basin_Ruvuma / Southern Coast', 'basin_Wami / Ruvu'
           ]]
    df['tshXpayment'] = df.amount_tsh * df.payment
    df['extractXsource'] = df.extraction_type_class * df.source
    df['popXtsh'] = df.population * df.amount_tsh
    df['popXquant'] = df.population * df.quantity
    df['popXsource'] = df.population * df.source
    df['extractXheight'] = df.extraction_type_class * df.gps_height
    df['typeXsource'] = df.waterpoint_type * df.source
    df['typeXyear'] = df.waterpoint_type * df.construction_year
    df['yearXpop'] = df.construction_year * df.population
    df['quantXsource'] = df.quantity * df.source
    df['yearsq'] = np.sqrt(df.construction_year + 1)
    df_large = df
    return df#_small, df_medium, df_large

#def clean_data(self, df): 
#    df = self.clean_numeric(df) 
#    df = self.clean_categorical(df)
#    df = self.selection(df)
#    return df

print('ready for piping')

my_transformer = FunctionTransformer(clean_func)

ready for piping


In [294]:


import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


In [310]:
from sklearn.preprocessing import FunctionTransformer

# data_transformer = FunctionTransformer(cleaner.clean_data)
# Encode target labels
le = LabelEncoder()
y_encoded = le.fit_transform(labels['status_group']) 
X = train

# Define features and target 
y = y_encoded
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


pipeline = Pipeline([
    ('data_transformer', my_transformer), # this is how we clean the data
    ('scaler', StandardScaler()), 
    ('logreg', LogisticRegression(max_iter=700)) 
])

# Parameter grid
param_grid = {
    'logreg__C': [0.01, 0.1,0.5,1],
    'logreg__penalty': ['l1','l2'],
    'logreg__solver': ['saga']
}

# Grid search
grid_search = GridSearchCV(pipeline, param_grid, cv=2, scoring='accuracy')
grid_search.fit(X_train, y_train)  # this is essential

# Check best_estimator_
print(" Type:", type(grid_search))
print(" Best Estimator:", grid_search.best_estimator_)


AttributeError: 'GridSearchCV' object has no attribute 'named_steps'

In [296]:
y_test_pred = grid_search.best_estimator_.predict(X_test)

print(" Classification Report on Test Set:")
print(classification_report(y_test, y_test_pred, target_names=le.classes_))

print(" Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))

 Classification Report on Test Set:
                         precision    recall  f1-score   support

             functional       0.67      0.81      0.74      6452
functional needs repair       0.00      0.00      0.00       863
         non functional       0.64      0.57      0.60      4565

               accuracy                           0.66     11880
              macro avg       0.44      0.46      0.45     11880
           weighted avg       0.61      0.66      0.63     11880

 Confusion Matrix:
[[5235    0 1217]
 [ 591    0  272]
 [1960    0 2605]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [303]:
print(grid_search)

GridSearchCV(cv=2,
             estimator=Pipeline(steps=[('data_transformer',
                                        FunctionTransformer(func=<function clean_func at 0x7e97d45ce340>)),
                                       ('scaler', StandardScaler()),
                                       ('logreg',
                                        LogisticRegression(max_iter=700))]),
             param_grid={'logreg__C': [0.01, 0.1], 'logreg__penalty': ['l2'],
                         'logreg__solver': ['lbfgs', 'saga']},
             scoring='accuracy')


In [316]:
classifier_coef = pipeline['best_estimator_'].coef_

KeyError: 'best_estimator_'