In [1]:
# alpha: initial import statements
import pandas as pd
from sklearn.model_selection import train_test_split as tts
# from sklearn.preprocessing import minmax_scale as mms
!pip install category_encoders
import category_encoders as ce
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score



In [2]:
# bravo: give background, then load and inspect data
# here's a URL for background: https://www.kaggle.com/t/32b89e93d8a44743983a0ab1c19c85f3
# Q: "Can you predict which [Tanzanian] water pumps are faulty?"
# H_null: something like "all features are equally likely to make H2O unclean and/or non-potable"
# H_one: akin to "one or a group of features enables high-accuracy prediction of faulty pumps"

kaggle_path = 'C:\\Users\\jhump\\Desktop\\Desktop_professional\\LSDS\\Full_Course\\train_features.csv'

df = pd.read_csv(kaggle_path)
print(df.shape)
df.head()

(59400, 40)


Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,...,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 40 columns):
id                       59400 non-null int64
amount_tsh               59400 non-null float64
date_recorded            59400 non-null object
funder                   55765 non-null object
gps_height               59400 non-null int64
installer                55745 non-null object
longitude                59400 non-null float64
latitude                 59400 non-null float64
wpt_name                 59400 non-null object
num_private              59400 non-null int64
basin                    59400 non-null object
subvillage               59029 non-null object
region                   59400 non-null object
region_code              59400 non-null int64
district_code            59400 non-null int64
lga                      59400 non-null object
ward                     59400 non-null object
population               59400 non-null int64
public_meeting           56066 non-null object
r

In [None]:
# charlie: check data for nan's
df.isna().sum()

In [4]:
df1 = df.copy()

# source for following: https://stackoverflow.com/questions/22470690/get-list-of-pandas-dataframe-columns-based-on-data-type
df1_dtype_groups = df1.columns.to_series().groupby(df1.dtypes).groups
df1_dtype_groups

{dtype('int64'): Index(['id', 'gps_height', 'num_private', 'region_code', 'district_code',
        'population', 'construction_year'],
       dtype='object'),
 dtype('float64'): Index(['amount_tsh', 'longitude', 'latitude'], dtype='object'),
 dtype('O'): Index(['date_recorded', 'funder', 'installer', 'wpt_name', 'basin',
        'subvillage', 'region', 'lga', 'ward', 'public_meeting', 'recorded_by',
        'scheme_management', 'scheme_name', 'permit', 'extraction_type',
        'extraction_type_group', 'extraction_type_class', 'management',
        'management_group', 'payment', 'payment_type', 'water_quality',
        'quality_group', 'quantity', 'quantity_group', 'source', 'source_type',
        'source_class', 'waterpoint_type', 'waterpoint_type_group'],
       dtype='object')}

In [5]:
# what features will show whether or not H2O is clean and potable?
# let's investigate 'water_quality'
df1.water_quality.unique()  # questions: merge both 'abandoned' strings into one? what to do with 'unknown'?

array(['soft', 'salty', 'milky', 'unknown', 'fluoride', 'coloured',
       'salty abandoned', 'fluoride abandoned'], dtype=object)

In [6]:
# delta: preprocess data
# CREDIT for this function, to Ryan Herr/LSDS

def train_validation_test_split(
    X, y, train_size=0.8, val_size=0.1, test_size=0.1, 
    random_state=None, shuffle=True):
        
    assert train_size + val_size + test_size == 1
    
    X_train_val, X_test, y_train_val, y_test = tts(X, y, test_size=test_size,
                                                   random_state=random_state, shuffle=shuffle)
    
    X_train, X_val, y_train, y_val = tts(X_train_val, y_train_val,
                                         test_size=val_size/(train_size+val_size), 
                                         random_state=random_state, shuffle=shuffle)
    print('X_train is:', X_)
    return X_train, X_val, X_test, y_train, y_val, y_test

In [7]:
# for baseline model, X might include 'region_code', 'population', 'construction_year'
X = df[['region_code', 'population', 'construction_year']]
# call 'mms' on X after creating X namespace
# mms(X)
# best target for y seems to be 'water_quality'
y = df.water_quality

train_validation_test_split(X, y)
# pd.get_dummies(df)

(       region_code  population  construction_year
 43298           16         500               2007
 51822           10         100               1983
 24149            4         400               1990
 3026             8         300                  0
 204             18           0                  0
 41626           20         350               1999
 20952           80         269               2004
 40886           99         400               1978
 12270            1           0                  0
 11429           12           0                  0
 34635           20         300               2009
 8439            20         253               1978
 54077           15          30               1981
 12508           16         355               1992
 12550           13           1               1990
 27978           17           0                  0
 56717           21         200               2000
 28642            1           0                  0
 48131           12           0

In [9]:
# echo: fit, validate, and present model
# 2019-02-04 comment: I believe that some sort of advanced regression will be best here, but TBD
pipeline = make_pipeline(
    ce.OneHotEncoder(use_cat_names=True), 
    StandardScaler(), 
    LogisticRegression(solver='lbfgs')
)

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_val)
accuracy_score(y_val, y_pred)

NameError: name 'X_train' is not defined

In [None]:
# foxtrot: analyze and interpret model predictions