# Import Libraries and Files

In [1]:
# Import necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.metrics import plot_confusion_matrix


from scipy import stats
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_curve, auc, confusion_matrix, classification_report
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn import tree
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

In [2]:
# Import test values

df_test = pd.read_csv('DrivenData Files/testsetvalues.csv')
df_test.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,50785,0.0,2013-02-04,Dmdd,1996,DMDD,35.290799,-4.059696,Dinamu Secondary School,0,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,other,other
1,51630,0.0,2013-02-04,Government Of Tanzania,1569,DWE,36.656709,-3.309214,Kimnyak,0,...,never pay,soft,good,insufficient,insufficient,spring,spring,groundwater,communal standpipe,communal standpipe
2,17168,0.0,2013-02-01,,1567,,34.767863,-5.004344,Puma Secondary,0,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,other,other
3,45559,0.0,2013-01-22,Finn Water,267,FINN WATER,38.058046,-9.418672,Kwa Mzee Pange,0,...,unknown,soft,good,dry,dry,shallow well,shallow well,groundwater,other,other
4,49871,500.0,2013-03-27,Bruder,1260,BRUDER,35.006123,-10.950412,Kwa Mzee Turuka,0,...,monthly,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe


## Preprocess Well Age and Drop Columns

In [3]:
df_test['year_recorded'] = pd.to_datetime(df_test['date_recorded']).dt.year
df_test['construction_year'] = df_test['construction_year'].replace({0:2012})
df_test['well_age'] = df_test['year_recorded'] - df_test['construction_year']
df_test['well_age'] = df_test['well_age'].replace({-1:0, -2:0, -4:0, -7:0, -8:0})
df_test.drop(columns = ['date_recorded', 'year_recorded', 'construction_year'], axis = 1, inplace = True)
print(df_test['well_age'].value_counts())


0     4855
1     1149
3      650
2      594
5      510
13     436
4      434
7      405
6      351
11     324
14     293
8      280
33     255
15     249
16     237
23     203
19     203
10     192
9      192
27     187
31     186
18     184
28     182
17     164
25     161
21     149
29     147
12     147
35     140
41     133
26     133
37     131
20     130
39     127
38      90
36      86
30      78
22      77
43      70
40      62
24      61
34      49
32      42
42      23
50      21
53      20
45      14
44      11
51       7
48       7
46       7
52       5
47       4
49       3
Name: well_age, dtype: int64


In [4]:
# Deal with null/missing values

df_test.drop(columns = ['scheme_name', 'subvillage', 'public_meeting'], axis = 1, inplace = True);
df_test['funder'].fillna('other', inplace = True)
df_test['installer'].fillna('other', inplace = True)
df_test['scheme_management'].fillna('other', inplace = True)
df_test['permit'].fillna('other', inplace = True)

In [5]:
# Drop columns that were determined to be dropped from training data

df_test.drop(columns = ['num_private', 'wpt_name', 'ward', 'recorded_by', 'permit', 
                       'management_group', 'payment_type', 'quality_group',
                       'quantity_group', 'source_type', 'source_class',
                       'waterpoint_type_group', 'extraction_type_group', 'extraction_type'], axis = 1, inplace = True)

## Organize Top 50 Values for Funder, Installer, and Lga's

In [6]:
#Clean 'Funder' column and reduce values counts

def top_funders(var):
    if var in df_test['funder'].value_counts(normalize = True).head(50).index:
        return var
    else:
        return 'other'
    
df_test['top_funded'] = df_test['funder'].map(top_funders)

In [7]:
# Inspect changes to value counts

df_test['top_funded'].value_counts()

other                             5331
Government Of Tanzania            2215
Danida                             793
Hesawa                             580
World Bank                         352
Kkkt                               336
Rwssp                              329
World Vision                       316
Unicef                             267
Tasaf                              259
Dhv                                236
Private Individual                 208
0                                  203
Dwsp                               201
District Council                   194
Norad                              184
Water                              156
Germany Republi                    155
Ministry Of Water                  138
Tcrs                               133
Hifab                              127
Netherlands                        122
Dwe                                121
Lga                                104
Adb                                103
Amref                    

In [8]:
# Correct data entry grammatical errors

df_test['top_funded'].replace('0', 'other', inplace = True)
df_test['top_funded'].replace('Finw', 'Fini Water', inplace = True)
df_test['top_funded'].replace('Germany Republi', 'Germany Republic', inplace = True)
df_test['top_funded'].replace('Private', 'Private Individual', inplace = True)
df_test['top_funded'].replace('Jica', 'Jaica', inplace = True)
df_test['top_funded'].replace('Water', 'Ministry Of Water', inplace = True)
#df_test['top_funded'].replace('Ministry of Water', 'Ministry Of Water', inplace = True)
df_test['top_funded'].replace('Rural Water Supply And Sanitat', 'Rural Water Supply And Sanitation', inplace = True)

In [9]:
df_test['top_funded'].replace('Magadini-makiwaru Water', 'other', inplace = True)
df_test['top_funded'].replace('Wvt', 'other', inplace = True)
df_test['top_funded'].replace('W.B', 'other', inplace = True)

In [10]:
# Clean 'Installer' column and reduce value counts

def top_installer(var):
    if var in df_test['installer'].value_counts(normalize = True).head(50).index:
        return var
    else:
        return 'other'
    
df_test['top_installers'] = df_test['installer'].map(top_installer)

In [11]:
# Inspect changes to value counts

df_test['top_installers'].value_counts()

other                         5380
DWE                           4349
Government                     457
RWE                            292
Commu                          287
DANIDA                         255
Hesawa                         230
KKKT                           222
0                              203
TCRS                           180
CES                            155
Central government             142
HESAWA                         140
DANID                          138
Community                      134
District Council               112
World vision                   109
TASAF                          108
Gover                          100
WEDECO                          99
District council                98
LGA                             93
TWESA                           79
WU                              76
Dmdd                            69
AMREF                           69
ACRA                            68
UNICEF                          63
Villagers           

In [12]:
# Correct data entry grammatical errors

df_test['top_installers'].replace('0', 'other', inplace = True)
df_test['top_installers'].replace('District council', 'District Council', inplace = True)
df_test['top_installers'].replace('Gover', 'Government', inplace = True)
df_test['top_installers'].replace('Commu', 'Community', inplace = True)
df_test['top_installers'].replace('World vision', 'World Vision', inplace = True)
df_test['top_installers'].replace('HESAWA', 'Hesawa', inplace = True)
df_test['top_installers'].replace('DANID', 'DANIDA', inplace = True)
#df_test['top_installers'].replace('Centr', 'Central government', inplace = True)
df_test['top_installers'].replace('Gove', 'Government', inplace = True)
#df_test['top_installers'].replace('Distri', 'District Council', inplace = True)
df_test['top_installers'].replace('KKKT _ Konde and DWE', 'KKKT', inplace = True)
df_test['top_installers'].replace('Da', 'DANIDA', inplace = True)
df_test['top_installers'].replace('Magadini-Makiwaru wa', 'Magadini-Makiwaru Water Supply', inplace = True)
df_test['top_installers'].replace('Lawatefuka water sup', 'Lawatefuka Water Supply', inplace = True)
df_test['top_installers'].replace('Handeni Trunk Main(', 'Handeni Trunk Main', inplace = True)
df_test['top_installers'].replace('Kuwait', 'other', inplace = True)
df_test['top_installers'].replace('Mission', 'other', inplace = True)
df_test['top_installers'].replace('FinW', 'Fini Water', inplace = True)
df_test['top_installers'].replace('Handeni Trunk Main', 'other', inplace = True)
df_test['top_installers'].replace('JAICA', 'other', inplace = True)
df_test['top_installers'].replace('Distri', 'other', inplace = True)

In [13]:
# Clean 'lga' column and reduce value counts

def top_lgas(var):
    if var in df_test['lga'].value_counts(normalize = True).head(50).index:
        return var
    else:
        return 'other'
    
df_test['top_lga'] = df_test['lga'].map(top_lgas)

In [14]:
# Inspect changes to value counts

df_test['top_lga'].value_counts()

other               5024
Njombe               625
Moshi Rural          315
Bariadi              308
Rungwe               275
Kasulu               275
Kilosa               274
Arusha Rural         269
Bagamoyo             266
Mbozi                252
Kilombero            248
Meru                 235
Same                 229
Kibondo              227
Magu                 225
Kahama               222
Maswa                215
Kyela                211
Singida Rural        207
Karagwe              196
Mbinga               193
Kigoma Rural         191
Serengeti            189
Iringa Rural         186
Ngara                185
Songea Rural         183
Ulanga               176
Lushoto              176
Mvomero              175
Mpanda               173
Shinyanga Rural      163
Kwimba               161
Makete               159
Nzega                155
Mbarali              155
Hai                  155
Rombo                153
Namtumbo             151
Bukombe              139
Mkuranga             138


In [15]:
df_test['top_lga'].replace('Bukoba Rural', 'other', inplace = True)
df_test['top_lga'].replace('Rufiji', 'other', inplace = True)

In [16]:
df_test['scheme_management'].replace('Other', 'other', inplace = True)

## Preprocess remaining data

In [17]:
# Drop id because it is unnecessary, drop funder and installer because new features were added with their clean data.

df_test.drop(columns = ['id', 'funder', 'installer', 'lga'], axis = 1, inplace = True)

In [18]:
# Change data type of district_code and region_code to 'string' in order to be one-hot encoded

df_test['district_code'] = df_test['district_code'].astype('str')
df_test['region_code'] = df_test['region_code'].astype('str')

In [19]:
# Inspect remaining columns for any null values. 

df_test.isnull().sum()

amount_tsh               0
gps_height               0
longitude                0
latitude                 0
basin                    0
region                   0
region_code              0
district_code            0
population               0
scheme_management        0
extraction_type_class    0
management               0
payment                  0
water_quality            0
quantity                 0
source                   0
waterpoint_type          0
well_age                 0
top_funded               0
top_installers           0
top_lga                  0
dtype: int64

### Save pre-OHE processed file to import for CatBoost modeling

In [20]:
# Save preprocessing for modeling

df_test.to_csv('Top50_catboost', index = False)

# Model Preparation

In [None]:
# Isolate continuous variables 

test_cont = df_test[['amount_tsh', 'gps_height', 'longitude', 'latitude', 'population', 'well_age']]

In [None]:
# Isolate categorical variables 

test_cat = df_test.drop(columns = ['amount_tsh', 'gps_height', 'longitude', 'latitude', 'population',
                                        'well_age'], axis = 1)

In [None]:
# One-hot encode categorical variables using pd.get_dummies

test_dummies = pd.get_dummies(test_cat, drop_first = True)
test_dummies.head()

In [None]:
# Concatenate the OHE categorical and continuous variables back together 

processed_test = pd.concat([test_dummies, test_cont], axis = 1)
processed_test.head()

In [None]:
processed_test.head()

In [None]:
# Save preprocessing for modeling

processed_test.to_csv('Top50_processed_test', index = False)

In [2]:
# Import preprocessed file

processed_test = pd.read_csv('Saved Files/Top50_processed_test')
processed_test.head()

Unnamed: 0,basin_Lake Nyasa,basin_Lake Rukwa,basin_Lake Tanganyika,basin_Lake Victoria,basin_Pangani,basin_Rufiji,basin_Ruvuma / Southern Coast,basin_Wami / Ruvu,region_Dar es Salaam,region_Dodoma,...,top_lga_Songea Rural,top_lga_Sumbawanga Rural,top_lga_Ulanga,top_lga_other,amount_tsh,gps_height,longitude,latitude,population,well_age
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0.0,1996,35.290799,-4.059696,321,1
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0.0,1569,36.656709,-3.309214,300,13
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.0,1567,34.767863,-5.004344,500,3
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0.0,267,38.058046,-9.418672,250,26
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,500.0,1260,35.006123,-10.950412,60,13


# Import and Process Training Data

In [3]:
# Import training data 

df_train = pd.read_csv('Saved Files/experimental50_training_set')
df_train.head()

Unnamed: 0,amount_tsh,gps_height,longitude,latitude,basin,region,region_code,district_code,population,scheme_management,...,payment,water_quality,quantity,source,waterpoint_type,well_age,status_group,top_funded,top_installers,top_lga
0,6000.0,1390,34.938093,-9.856322,Lake Nyasa,Iringa,11,5,109,VWC,...,pay annually,soft,enough,spring,communal standpipe,12,functional,Roman,other,Ludewa
1,0.0,1399,34.698766,-2.147466,Lake Victoria,Mara,20,2,280,other,...,never pay,soft,insufficient,rainwater harvesting,communal standpipe,3,functional,other,other,Serengeti
2,25.0,686,37.460664,-3.821329,Pangani,Manyara,21,4,250,VWC,...,pay per bucket,soft,enough,dam,communal standpipe multiple,4,functional,other,World Vision,other
3,0.0,263,38.486161,-11.155298,Ruvuma / Southern Coast,Mtwara,90,63,58,VWC,...,never pay,soft,dry,machine dbh,communal standpipe multiple,27,non functional,Unicef,UNICEF,other
4,0.0,0,31.130847,-1.825359,Lake Victoria,Kagera,18,1,0,other,...,never pay,soft,seasonal,rainwater harvesting,communal standpipe,12,functional,other,other,Karagwe


In [4]:
df_train['district_code'] = df_train['district_code'].astype('str')
df_train['region_code'] = df_train['region_code'].astype('str')

# Split Continuous and Categorical Variables, OHE Categorical Variables

train_cont = df_train[['amount_tsh', 'gps_height', 'longitude', 'latitude', 'population', 'well_age']]

train_cat = df_train.drop(columns = ['amount_tsh', 'gps_height', 'longitude', 'latitude', 'population',
                                        'well_age', 'status_group'], axis = 1)

train_dummies = pd.get_dummies(train_cat, drop_first = True)

processed_train = pd.concat([train_dummies, train_cont], axis = 1)
processed_train.head()

Unnamed: 0,basin_Lake Nyasa,basin_Lake Rukwa,basin_Lake Tanganyika,basin_Lake Victoria,basin_Pangani,basin_Rufiji,basin_Ruvuma / Southern Coast,basin_Wami / Ruvu,region_Dar es Salaam,region_Dodoma,...,top_lga_Songea Rural,top_lga_Sumbawanga Rural,top_lga_Ulanga,top_lga_other,amount_tsh,gps_height,longitude,latitude,population,well_age
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,6000.0,1390,34.938093,-9.856322,109,12
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0.0,1399,34.698766,-2.147466,280,3
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,25.0,686,37.460664,-3.821329,250,4
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0.0,263,38.486161,-11.155298,58,27
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0.0,0,31.130847,-1.825359,0,12


## Compare Top 50 Column Values in Train and Test Datasets

In [5]:
# View all columns present in Test set that aren't in Train set

set(processed_test.columns).difference(processed_train.columns)

set()

In [6]:
# View all columns present in Train set that aren't in Test set

set(processed_train.columns).difference(processed_test.columns)

set()

In [None]:
for col in processed_train.columns:
    print(col)

In [None]:
for col in processed_test.columns:
    print(col)

# Modeling

In [7]:
# Split data into target (y) and predictors (X)

X = processed_train
y = df_train['status_group']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

In [8]:
# Scale training data to make sure that all of our data is represented at the same scale
scale = MinMaxScaler()

# Fit
X_train = scale.fit_transform(X) 
X_test = scale.transform(processed_test) 

In [9]:
print(X_train.shape)
print(X_test.shape)

(59400, 261)
(14850, 261)


### Classifier 1

In [None]:
# Instantiate XGBClassifier
clf = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=9, min_child_weight=1, missing=None, n_estimators=250,
       n_jobs=-1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.5)

# Fit XGBClassifier
clf.fit(X, y)

In [None]:
# Predict on training and test sets

test_preds = clf.predict(processed_test)

In [None]:
# Accuracy of training and test sets for classifier 1

training_accuracy = clf.score(X, y)

print('Training Accuracy: {:.4}%'.format(training_accuracy * 100))

### Classifier 2

In [None]:
# Instantiate XGBClassifier
clf2 = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=9, min_child_weight=1, missing=None, n_estimators=300,
       n_jobs=-1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.5)

# Fit XGBClassifier
clf2.fit(X, y)

In [None]:
# Predict on training and test sets with classifier 2

test_preds2 = clf2.predict(processed_test)

In [None]:
# Accuracy of training and test sets for classifier 2

training_accuracy = clf2.score(X, y)

print('Training Accuracy: {:.4}%'.format(training_accuracy * 100))

#### Classifier 2.1, clf2 with MinMaxScaling

In [44]:
# Instantiate XGBClassifier
clf2 = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=9, min_child_weight=1, missing=None, n_estimators=450,
       n_jobs=-1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.5)

# Fit XGBClassifier
clf2.fit(X_train, y)



Parameters: { scale_pos_weight, silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=9,
              min_child_weight=1, missing=None, monotone_constraints='()',
              n_estimators=450, n_jobs=-1, nthread=4, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, seed=0, silent=True,
              subsample=0.5, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [45]:
# Predict on training and test sets with classifier 2.1

test_preds21 = clf2.predict(X_test)

In [46]:
# Accuracy of training and test sets for classifier 2.1

training_accuracy = clf2.score(X_train, y)

print('Training Accuracy: {:.4}%'.format(training_accuracy * 100))

# CLF2.1 w/ 300 Estimators - 89.75% Accuracy
# CLF2.1 w/ 250 Estimators - 88.72% Accuracy
# CLF2.1 w/ 400 Estimators - 91.51% Accuracy
# CLF2.1 w/ 500 Estimators - 93.05% Accuracy
# CLF2.1 w/ 600 Estimators - 94.02% Accuracy
# CLF2.1 w/ 350 Estimators - 90.67% Accuracy
# CLF2.1 w/ 450 Estimators - 92.31% Accuracy

Training Accuracy: 92.31%


### Classifier 3

In [None]:
# Instantiate XGBClassifier
clf3 = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=300,
       n_jobs=-1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.5)

# Fit XGBClassifier
clf3.fit(X, y)

In [None]:
# Predict on training and test sets with classifier 3

test_preds3 = clf3.predict(processed_test)

In [None]:
# Accuracy of training and test sets for classifier 3

training_accuracy = clf3.score(X, y)

print('Training Accuracy: {:.4}%'.format(training_accuracy * 100))

## Import and Process Submission File

In [13]:
# Import submission format file

model_submit = pd.read_csv('DrivenData Files/SubmissionFormat.csv')
model_submit.head()

Unnamed: 0,id,status_group
0,50785,predicted label
1,51630,predicted label
2,17168,predicted label
3,45559,predicted label
4,49871,predicted label


In [14]:
model_submit.drop(['status_group'], axis = 1, inplace = True)
model_submit.head()

Unnamed: 0,id
0,50785
1,51630
2,17168
3,45559
4,49871


### Classifier 1

In [None]:
# Format the test predictions into a pandas dataframe

test_pred_df = pd.DataFrame(data = test_preds)

# Name the outcome label column

test_pred_df = test_pred_df.rename(columns = {0: 'status_group'})

# Concatenate the test labels and submission format file

test_pred_df = pd.concat([model_submit, test_pred_df], axis = 1)

In [None]:
test_pred_df['status_group'] = test_pred_df['status_group'].astype(str)

In [None]:
# Convert the outcome column 'status_group' into numeric values

test_pred_df['status_group'].replace('1', 'functional', inplace = True)
test_pred_df['status_group'].replace('0', 'non_functional', inplace = True)
test_pred_df['status_group'].replace('2', 'functional_needs_repair', inplace = True)

In [None]:
test_pred_df['status_group'].value_counts()

### Classifier 2

In [None]:
# Classifier 2

# Format the test predictions into a pandas dataframe

test_pred_df2 = pd.DataFrame(data = test_preds2)

# Name the outcome label column

test_pred_df2 = test_pred_df2.rename(columns = {0: 'status_group'})

# Concatenate the test labels and submission format file

test_pred_df2 = pd.concat([model_submit, test_pred_df2], axis = 1)

In [None]:
test_pred_df2['status_group'] = test_pred_df2['status_group'].astype(str)

In [None]:
# Convert the outcome column 'status_group' into numeric values

test_pred_df2['status_group'].replace('1', 'functional', inplace = True)
test_pred_df2['status_group'].replace('0', 'non_functional', inplace = True)
test_pred_df2['status_group'].replace('2', 'functional_needs_repair', inplace = True)

In [None]:
test_pred_df2['status_group'].value_counts()

#### Classifier 2.1

In [47]:
# Classifier 2.1

# Format the test predictions into a pandas dataframe

test_pred_df21 = pd.DataFrame(data = test_preds21)

# Name the outcome label column

test_pred_df21 = test_pred_df21.rename(columns = {0: 'status_group'})

# Concatenate the test labels and submission format file

test_pred_df21 = pd.concat([model_submit, test_pred_df21], axis = 1)

In [48]:
test_pred_df21['status_group'] = test_pred_df21['status_group'].astype(str)

In [49]:
# Convert the outcome column 'status_group' into numeric values

test_pred_df21['status_group'].replace('1', 'functional', inplace = True)
test_pred_df21['status_group'].replace('0', 'non_functional', inplace = True)
test_pred_df21['status_group'].replace('2', 'functional_needs_repair', inplace = True)

In [50]:
test_pred_df21['status_group'].value_counts()

functional                 9491
non functional             4961
functional needs repair     398
Name: status_group, dtype: int64

### Classifier 3

In [None]:
# Classifier 2

# Format the test predictions into a pandas dataframe

test_pred_df3 = pd.DataFrame(data = test_preds3)

# Name the outcome label column

test_pred_df3 = test_pred_df3.rename(columns = {0: 'status_group'})

# Concatenate the test labels and submission format file

test_pred_df3 = pd.concat([model_submit, test_pred_df3], axis = 1)

In [None]:
test_pred_df3['status_group'] = test_pred_df3['status_group'].astype(str)

In [None]:
# Convert the outcome column 'status_group' into numeric values

test_pred_df3['status_group'].replace('1', 'functional', inplace = True)
test_pred_df3['status_group'].replace('0', 'non_functional', inplace = True)
test_pred_df3['status_group'].replace('2', 'functional_needs_repair', inplace = True)

In [None]:
test_pred_df3['status_group'].value_counts()

## Save File for Submission

In [None]:
# Top 50 Model, Classifier 1: Score = 0.8055

#test_pred_df.to_csv('test_set_predictions_6', index = False)

In [None]:
# Top 50 Model, Classifier 2: Score = 0.8055

#test_pred_df2.to_csv('test_set_predictions_7', index = False)

In [None]:
# Top 50 Model, Classifier 3: Score = 0.8055

#test_pred_df3.to_csv('test_set_predictions_8', index = False)

In [None]:
# Top 50 Model, CatBoost Classifier: Score = 0.7596

#cat_preds.to_csv('test_set_predictions_13', index = False)

In [None]:
# Top 50 Model, Classifier 2.1(MinMax, n_est = 250): Score = 0.8043

#test_pred_df21.to_csv('test_set_predictions_12', index = False)

In [None]:
# Top 50 Model, Classifier 2.1(MinMax, n_est = 300): Score = 0.8063

#test_pred_df21.to_csv('test_set_predictions_11', index = False)

In [43]:
# Top 50 Model, Classifier 2.1(MinMax, n_est = 350): Score = 0.8075

#test_pred_df21.to_csv('test_set_predictions_19', index = False)

In [19]:
# Top 50 Model, Classifier 2.1(MinMax, n_est = 400): Score = 0.8083

#test_pred_df21.to_csv('test_set_predictions_16', index = False)

In [27]:
# Top 50 Model, Classifier 2.1(MinMax, n_est = 500): Score = 0.8078

#test_pred_df21.to_csv('test_set_predictions_17', index = False)

In [35]:
# Top 50 Model, Classifier 2.1(MinMax, n_est = 600): Score = 0.8068

#test_pred_df21.to_csv('test_set_predictions_18', index = False)

In [51]:
# Top 50 Model, Classifier 2.1(MinMax, n_est = 450): Score = 0.8076

#test_pred_df21.to_csv('test_set_predictions_21', index = False)