### Comparation of 3 base models


In [43]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import joblib

In [187]:
# create a class that we can pass to the pipeline
class DataCleaner:

    def __init__(self):
        print('Cleaning ...')

    def clean_numeric(self, df):
        #
        # Construction year
        df['construction_year'] = df['construction_year'].replace(0, np.nan)
        #Impute using region + installer
        df['construction_year'] = df.groupby(['region', 'installer'])['construction_year'].transform(
            lambda x: x.fillna(x.median())
        )
        #Impute using region only (for rows still missing)
        df['construction_year'] = df.groupby('region')['construction_year'].transform(
            lambda x: x.fillna(x.median())
        )
        #Use recorded year - 13
        df['date_recorded'] = pd.to_datetime(df['date_recorded'])
        df['recorded_year'] = df['date_recorded'].dt.year
        df['construction_year'] = df['construction_year'].fillna(df['recorded_year'] - 13)
        #
        # gps_height
        df['gps_height'] = df['gps_height'].apply(lambda x: np.nan if x <= 0 else x)
        # Fill using median per lga
        df['gps_height'] = df.groupby('lga')['gps_height'].transform(
            lambda x: x.fillna(x.median())
        )
        # Fill any still missing using region median
        df['gps_height'] = df.groupby('region')['gps_height'].transform(
            lambda x: x.fillna(x.median())
        )
        #
        #location
        df['longitude'] = df['longitude'].replace(0, np.nan)
        df['latitude'] = df['latitude'].where(df['latitude'] < -0.5, np.nan) # too close to the equator
        for i in ['latitude','longitude']:
            df[i] = df.groupby('lga')[i].transform(lambda x: x.fillna(x.median))
            df[i] = df.groupby('region')[i].transform(lambda x: x.fillna(x.median))
        df.longitude = pd.to_numeric(df.longitude, errors='coerce')
        df.latitude = pd.to_numeric(df.latitude, errors='coerce')
        #
        # population
        # Fill population using median by district_code
        df['population'] = df.groupby('lga')['population'].transform(
            lambda x: x.fillna(x.median())
        )
        # Fill any still missing with median by region, then overall median
        df['population'] = df.groupby('region')['population'].transform(
            lambda x: x.fillna(x.median())
        )
        df['population'] = df['population'].fillna(df.population.median)
        # Bin the outcome, see how it behaves
        df['population'] = pd.cut(df['population'], [-1,1,25,90,160,260,9999999], labels=[0,0.2,0.3,0.4,0.6,1])
        df['population'] = df['population'].astype(float)
        #
        # amount_tsh
        df['amount_tsh'] = df['amount_tsh'].apply(lambda x: min(x, 15000))
        return df

    def clean_categorical(self, df):
            ### Encode categorical variables
        # Encode 'quantity' (and typo fix: 'insufficent' -> 'insufficient')
        df['quantity'] = df['quantity'].replace({
            'enough': 1,
            'seasonal': 0.6,
            'insufficient': 0.4,
            'dry': 0,
            'unknown': 0
        })
        df.quantity = pd.to_numeric(df.quantity, errors='coerce')

        # Encode 'water_quality' as binary: good = 1, else 0
        df['water_quality'] = np.where(df['water_quality'] == 'soft', 1, 0)
        # Encode 'waterpoint_type' (1 = preferred type, 0 = everything else)
        preferred_waterpoint = ['hand pump', 'communal standpipe']
        df['waterpoint_type'] = df['waterpoint_type'].apply(lambda x: 1 if x in preferred_waterpoint else 0)
        # Encode 'permit' as binary: True = 1, False, missing = 0
        df['permit'] = np.where(df['permit'] == 'True', 1, 0)
        # Encode 'payment' as binary: never pay = 0, else = 1
        df['payment'] = np.where(df['payment'] == 'never pay', 0, 1)
        # Encode 'source' (1 = preferred sources, 0 = everything else)
        preferred_sources = ['spring', 'river', 'rainwater harvesting']
        df['source'] = df['source'].apply(lambda x: 1 if x in preferred_sources else 0)
        # Encode 'payment' as binary: never pay = 0, else = 1
        df['extraction_type_class'] = np.where(df['extraction_type_class'] == 'gravity', 0, 1)
        # Encode 'scheme_management' (1 = VWC, others 0)
        df['scheme_management'] = np.where(df['scheme_management'] == 'VWC', 0, 1)
        # one hot encoder for basin 
        df = pd.get_dummies(data=df, columns=['basin'], drop_first=True, dtype=int)
        return df

    def selection(self, df):
         #  Drop other columns and only keep these:
        # df_small = df[['amount_tsh',
        #     'gps_height',
        #     'population',
        #     'construction_year',
        #     'extraction_type_class',
        #     'payment',
        #     'water_quality',
        #     'quantity',
        #     'source',
        #     'waterpoint_type'
        #    ]]
        #  #  Drop other columns and only keep these:
        # df_medium = df[['amount_tsh',
        #          'gps_height',
        #          'longitude',
        #          'latitude',
        #          'population',
        #          'construction_year',
        #          'extraction_type_class',
        #          'payment',
        #         'water_quality',
        #         'quantity',
        #         'source',
        #         'waterpoint_type',, 'basin_Lake Nyasa', 'basin_Lake Rukwa',
        #         'basin_Lake Tanganyika', 'basin_Lake Victoria', 'basin_Pangani',
        #         'basin_Rufiji', 'basin_Ruvuma / Southern Coast', 'basin_Wami / Ruvu'
        #         'scheme_management'
        #        ]]
        df = df[['amount_tsh',
                 'gps_height',
                 'longitude',
                 'latitude',
                 'population',
                 'construction_year',
                 'extraction_type_class',
                 'payment',
                'water_quality',
                'quantity',
                'source',
                'waterpoint_type', 
                'scheme_management', 'basin_Lake Nyasa', 'basin_Lake Rukwa',
                'basin_Lake Tanganyika', 'basin_Lake Victoria', 'basin_Pangani',
                'basin_Rufiji', 'basin_Ruvuma / Southern Coast', 'basin_Wami / Ruvu'
               ]]
#        df['tshXpayment'] = df.amount_tsh * df.payment
#        df['extractXsource'] = df.extraction_type_class * df.source
#        df['popXtsh'] = df.population * df.amount_tsh
#        df['popXquant'] = df.population * df.quantity
#        df['popXsource'] = df.population * df.source
#        df['extractXheight'] = df.extraction_type_class * df.gps_height
#        df['typeXsource'] = df.waterpoint_type * df.source
#        df['typeXyear'] = df.waterpoint_type * df.construction_year
#        df['yearXpop'] = df.construction_year * df.population
#        df['quantXsource'] = df.quantity * df.source
        df['yearsq'] = np.sqrt(df.construction_year + 1)
        df_large = df

        return df#_small, df_medium, df_large

    def clean_data(self, df): 
        df = self.clean_numeric(df) 
        df = self.clean_categorical(df)
        df = self.selection(df)
        return df
        
print('cool')

cool


In [189]:
# Using the cleaning function on the original training data

cleaner = DataCleaner()
X = cleaner.clean_data(train)
X.info()

Cleaning ...


  df[i] = df.groupby('region')[i].transform(lambda x: x.fillna(x.median))
  df[i] = df.groupby('region')[i].transform(lambda x: x.fillna(x.median))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 22 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   amount_tsh                     59400 non-null  float64
 1   gps_height                     47285 non-null  float64
 2   longitude                      57588 non-null  float64
 3   latitude                       57588 non-null  float64
 4   population                     59400 non-null  float64
 5   construction_year              59400 non-null  float64
 6   extraction_type_class          59400 non-null  int64  
 7   payment                        59400 non-null  int64  
 8   water_quality                  59400 non-null  int64  
 9   quantity                       59400 non-null  float64
 10  source                         59400 non-null  int64  
 11  waterpoint_type                59400 non-null  int64  
 12  scheme_management              59400 non-null 

In [157]:
from sklearn.preprocessing import power_transform


pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pt', PowerTransformer())
    ('logreg', LogisticRegression(max_iter=1000))
])

from sklearn import svm, neighbors, tree
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 22 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   amount_tsh                     59400 non-null  float64
 1   gps_height                     59400 non-null  object 
 2   longitude                      59400 non-null  object 
 3   latitude                       59400 non-null  object 
 4   population                     59400 non-null  float64
 5   construction_year              59400 non-null  float64
 6   extraction_type_class          59400 non-null  int64  
 7   payment                        59400 non-null  int64  
 8   water_quality                  59400 non-null  int64  
 9   quantity                       59400 non-null  float64
 10  source                         59400 non-null  int64  
 11  waterpoint_type                59400 non-null  int64  
 12  scheme_management              59400 non-null 

In [75]:
df.columns 

Index(['Lake Nyasa', 'Lake Rukwa', 'Lake Tanganyika', 'Lake Victoria',
       'Pangani', 'Rufiji', 'Ruvuma / Southern Coast', 'Wami / Ruvu'],
      dtype='object')

In [178]:

df = pd.read_csv(os.path.join(data_path, 'train.csv')) 
df = df[['construction_year','date_recorded','gps_height','longitude','latitude','lga','region']]
print(df.dtypes)


# gps_height
df['gps_height'] = df['gps_height'].apply(lambda x: np.nan if x <= 0 else x)
# Fill using median per lga
df['gps_height'] = df.groupby('lga')['gps_height'].transform(
    lambda x: x.fillna(x.median())
)
# Fill any still missing using region median
df['gps_height'] = df.groupby('region')['gps_height'].transform(
    lambda x: x.fillna(x.median())
)
#
#location
df['longitude'] = df['longitude'].replace(0, np.nan)
df['latitude'] = df['latitude'].where(df['latitude'] < -0.5, np.nan) # too close to the equator
for i in ['latitude','longitude']:
    df[i] = df.groupby('lga')[i].transform(lambda x: x.fillna(x.median))
    df[i] = df.groupby('region')[i].transform(lambda x: x.fillna(x.median))
#df.longitude = pd.to_numeric(df.longitude, errors='coerce')
#df.latitude = pd.to_numeric(df.latitude, errors='coerce')
#
print('~~~~~~~~~\n',df.dtypes)

construction_year      int64
date_recorded         object
gps_height             int64
longitude            float64
latitude             float64
lga                   object
region                object
dtype: object
~~~~~~~~~
 construction_year      int64
date_recorded         object
gps_height           float64
longitude             object
latitude              object
lga                   object
region                object
dtype: object


  df[i] = df.groupby('region')[i].transform(lambda x: x.fillna(x.median))
  df[i] = df.groupby('region')[i].transform(lambda x: x.fillna(x.median))


In [201]:
import numpy as np
from sklearn.preprocessing import PowerTransformer
pt = PowerTransformer()
pt.lambdas_ = {0:1,1:0}
data = [[-1, 0], [1, 2], [3, 4], [5,16]]
print('pt.fit(data): ', pt.fit(data))
print('pt.lambdas_: ', pt.lambdas_)
print('pt.transform(data) ', pt.transform(data))

pt.fit(data):  PowerTransformer()
pt.lambdas_:  [ 0.81761727 -0.05022969]
pt.transform(data)  [[-1.41108034 -1.38481521]
 [-0.36251621 -0.25831859]
 [ 0.49653431  0.24467264]
 [ 1.27706224  1.39846116]]
