### Importing libraries

In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier


### Importing data

In [None]:
# Cleaned train/test frames are read from the project's outputs folder.
# The unnamed leading CSV column is dropped from the training frame and
# kept as `id` on the test frame.
f_path = '/Users/charlotteleysen/Google Drive/*PROJECTS/IE/Term 2/Machine Leaning 2/Tanzania Water Assignment/outputs/'
df = pd.read_csv(f_path + 'train_clean.csv').drop(columns=['Unnamed: 0'])
df_test = pd.read_csv(f_path + 'test_clean.csv').rename(columns={'Unnamed: 0': 'id'})

# The raw test set comes from the data folder; after sorting by id, its index
# still holds each row's position in the original file.
f_path = '/Users/charlotteleysen/Google Drive/*PROJECTS/IE/Term 2/Machine Leaning 2/Tanzania Water Assignment/data/'
df_test_origin = pd.read_csv(f_path + 'test_set.csv').sort_values(by='id')

# Attach those original positions to the cleaned test frame row by row and
# sort on them, which puts the cleaned rows back in raw-file order.
# NOTE(review): assumes the cleaned test frame has the same number of rows as
# the raw one and is currently ordered by ascending id - TODO confirm.
df_test['order'] = df_test_origin.index.tolist()
df_test = df_test.sort_values(by='order')

# Work on a copy so the frame as loaded (`df`) stays available for comparison.
df_all = df.copy()


### Preparing  Data for Modelling by converting object variables to categories

In [None]:
# Columns stored as strings (object dtype) in the training frame; these are
# the ones turned into categoricals here and into integer codes afterwards.
names = list(df_all.select_dtypes(['object']).columns)
df_all[names] = df_all[names].apply(lambda col: col.astype('category'))

# Build the test categoricals on the TRAINING category lists. Converting the
# test frame on its own would number its categories independently, so the same
# label could end up with a different integer code in train and test.
# Labels that never occur in training become missing values (code -1).
for name in names:
    df_test[name] = pd.Categorical(
        df_test[name], categories=df_all[name].cat.categories)

### Changing categories to numeric codes
Every variable should be numeric - the model works better with numeric variables

In [None]:
# Swap every categorical column for its integer category code, in both the
# training and the test frame (pandas uses -1 for missing values).
# Codes index into each column's own category list, so train and test codes
# are only comparable when both frames carry the same categories for a column.
for frame in (df_all, df_test):
    frame[names] = frame[names].apply(lambda col: col.cat.codes)


### Check the data

In [2]:
# A cell only renders its last bare expression, so each check is displayed
# explicitly: the encoded dtypes, then the target before and after encoding.
display(df_all.dtypes)
display(df.status_group.head(30))
display(df_all.status_group.head(30))

NameError: name 'df_all' is not defined

### Note
`status_group` has been encoded with the following numeric codes:
functional = 0, needs repair = 1, non functional = 2

### Create train and test

In [3]:
# Seeded generator: without it every run draws a different hold-out set and
# none of the scores below can be reproduced.
split_rng = np.random.RandomState(0)

# Flag roughly 75% of the rows for training; the rest form the hold-out set.
df_all['is_train'] = split_rng.uniform(0, 1, len(df_all)) <= .75
train, test = df_all[df_all['is_train']], df_all[~df_all['is_train']]

# Every column except the target and the split flag is a model input.
features = df_all.drop(['status_group', 'is_train'], axis=1).columns
target = train['status_group']

NameError: name 'df_all' is not defined

## Create the Models

### Random Forest Classifier

In [None]:
# Baseline random forest with hand-picked settings, fitted on the training
# split. The fit call stays last so the fitted estimator is echoed.
rf_params = dict(
    n_estimators=25,
    n_jobs=15,
    random_state=0,
    min_samples_split=8,
    min_samples_leaf=4,
    max_depth=25,
)
clf = RandomForestClassifier(**rf_params)
clf.fit(train[features], target)

In [None]:
# Sanity check of the baseline forest on the hold-out rows.
# Predicted class codes (evaluated, but not rendered: not the last expression).
clf.predict(test.loc[:, features])
# Class-probability rows for the first ten hold-out rows.
clf.predict_proba(test.loc[:, features])[:10]

In [None]:
# Scores
# Predict once and reuse the result for both metrics. Both are displayed
# explicitly, because a cell only renders its last bare expression and the
# accuracy would otherwise never be shown.
holdout_pred = clf.predict(test[features])
display(accuracy_score(test.status_group, holdout_pred))
display(confusion_matrix(test.status_group, holdout_pred))

### Random Forest Hyperparameter Optimising

In [None]:
# Search space for the random forest. RandomizedSearchCV is imported in the
# notebook's import cell.

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=20, stop=200, num=20)]
# Number of features to consider at every split.
# 'auto' used to be an alias of 'sqrt' for classifiers and is rejected by
# scikit-learn >= 1.3, so it is replaced by a genuinely different option.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree (None = grow until the leaves are pure)
max_depth = [int(x) for x in np.linspace(10, 110, num=21)] + [None]
# Minimum number of samples required to split a node
min_samples_split = [2, 3, 5, 8, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Seed the estimator as well as the search; otherwise the forests differ
# between runs even though the sampled parameter sets are the same.
clf2 = RandomForestClassifier(random_state=0)
# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
clf2_random = RandomizedSearchCV(
    estimator=clf2,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1)

#### Fit the random search model

Initial model parameters result: {'n_estimators': 143, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 100, 'bootstrap': False}


New model result: {'n_estimators': 181, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 95, 'bootstrap': False}


In [6]:
# Run the randomized search on the training split.
clf2_random.fit(train[features], target)

# Display the winning parameters and the hold-out accuracy explicitly; as
# bare expressions in the middle of a cell they would not be rendered.
display(clf2_random.best_params_)
display(clf2_random.score(test[features], test.status_group))

# Hold-out confusion matrix of the best estimator found by the search.
confusion_matrix(test.status_group, clf2_random.predict(test[features]))

NameError: name 'clf2_random' is not defined

### XGBoost

In [None]:
# Baseline gradient-boosted model; `xgb` is imported in the notebook's import
# cell. The target has three classes, so a multiclass objective is declared
# instead of 'binary:logistic'.
# NOTE(review): XGBClassifier is understood to switch to a multiclass
# objective on its own when it sees more than two classes, so this is
# expected to make the declared setting honest rather than change the fitted
# model - confirm against the installed xgboost version.
xgmodel = xgb.XGBClassifier(
    max_depth=10,
    objective='multi:softprob',
    subsample=1,
    colsample_bytree=0.8,
    learning_rate=0.4,
    min_child_weight=1.7)
xgmodel.fit(train[features], target)

# Hold-out accuracy (displayed explicitly so it is not lost mid-cell),
# followed by the hold-out confusion matrix.
display(xgmodel.score(test[features], test.status_group))
confusion_matrix(test.status_group, xgmodel.predict(test[features]))

### XGBoost Hyperparameter Optimising

In [None]:
# Search space for the XGBoost model.
n_estimators = [int(x) for x in np.linspace(start=20, stop=200, num=20)]
colsample_bytree = [float(x) for x in np.linspace(start=0.5, stop=1, num=5)]
subsample = [float(x) for x in np.linspace(start=0.5, stop=1, num=5)]
gamma = [float(x) for x in np.linspace(start=0, stop=10, num=16)]
# The evenly spaced grid used to start at 0.0. A learning rate of zero means
# the boosted trees contribute nothing, so about a quarter of the sampled
# candidates were wasted. The grid keeps four values so the number of options
# per parameter is unchanged.
learning_rate = [0.05, 0.1, 0.2, 0.3]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'colsample_bytree': colsample_bytree,
               'subsample': subsample,
               'gamma': gamma,
               'learning_rate': learning_rate}

# 100 random draws from the grid, each scored with 3-fold cross validation
# on all available cores.
xgmodel2 = xgb.XGBClassifier()
xgmodel_random = RandomizedSearchCV(
    estimator=xgmodel2,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1)

#### Fit the random search model
Initial model parameters result: {'subsample': 0.75, 'n_estimators': 171, 'learning_rate': 0.3, 'gamma': 0.6666666666666666, 'colsample_bytree': 1.0}

New model results: {'subsample': 0.5, 'n_estimators': 181, 'learning_rate': 0.3, 'gamma': 2.0, 'colsample_bytree': 0.5}

In [None]:
# Run the randomized search on the training split.
xgmodel_random.fit(train[features], target)

# Display the winning parameters explicitly; as a bare expression in the
# middle of the cell they would not be rendered.
display(xgmodel_random.best_params_)

# Hold-out confusion matrix of the best estimator found by the search.
confusion_matrix(test.status_group, xgmodel_random.predict(test[features]))

### Combine Predictions

In [None]:
# Hold-out class probabilities from the two tuned models, computed once each
# (they were previously evaluated twice, with the first results discarded).
# NOTE(review): blending the two arrays later assumes both models order their
# probability columns the same way; both were fitted on the same target, so
# this is expected to hold - confirm via `classes_` if in doubt.
pred1 = clf2_random.predict_proba(test[features])
pred2 = xgmodel_random.predict_proba(test[features])

#### Weight the models: 80% random forest, 20% XGBoost

In [None]:
# Weighted average of the two probability arrays (random forest 0.8,
# XGBoost 0.2), wrapped in a frame with one column per class position.
preds_comb = pd.DataFrame(pred1 * 0.8 + pred2 * 0.2)
# Column label with the highest blended probability for each hold-out row.
preds_comb.idxmax(axis='columns')

#### Test the combination

In [None]:
# Show the hold-out confusion matrix of the blend next to those of the two
# individual models. Each one is displayed explicitly: as bare expressions
# only the last would be rendered, and the blend itself would never be shown.
# NOTE(review): the blend's predictions are column positions of `preds_comb`;
# they match the class codes only if the models' classes are 0, 1, 2 in that
# order - TODO confirm.
holdout_truth = test.status_group
for label, predicted in [
        ('Weighted blend', preds_comb.idxmax(axis=1)),
        ('XGBoost (random search)', xgmodel_random.predict(test[features])),
        ('Random forest (random search)', clf2_random.predict(test[features]))]:
    print(label)
    display(confusion_matrix(holdout_truth, predicted))

### Retrain on whole data set

In [None]:
# Final random forest with the parameters selected by the random search,
# refitted on every labelled row (training and hold-out split together).
# Seeded like the baseline forest so the submitted predictions can be
# reproduced.
clf_final = RandomForestClassifier(
    n_estimators=181,
    min_samples_split=10,
    min_samples_leaf=2,
    max_features='sqrt',
    max_depth=95,
    bootstrap=False,
    random_state=0)
clf_final.fit(df_all[features], df_all['status_group'])

# Final XGBoost model with the parameters selected by its random search,
# refitted on the same full data set.
xgmodel_final = xgb.XGBClassifier(
    subsample=0.5,
    n_estimators=181,
    learning_rate=0.3,
    gamma=2.0,
    colsample_bytree=0.5)
xgmodel_final.fit(df_all[features], df_all['status_group'])

### Predict on real test set

#### Single Model

In [None]:

# Predict the class code of every real test row with the final random forest.
df_test['status_group'] = clf_final.predict(df_test[features])

# Translate the numeric codes back to their labels in one step.
# The earlier chained form (`submission.status_group[mask] = ...`) writes
# through a temporary object: it raises SettingWithCopyWarning and changes
# nothing at all once pandas copy-on-write is active.
status_labels = {0: 'functional', 1: 'needs repair', 2: 'non functional'}
submission = df_test[['id', 'status_group']].copy()
submission['status_group'] = submission['status_group'].map(status_labels)

# `map` turns any code without a label into NaN; fail loudly instead of
# writing an incomplete submission.
assert submission['status_group'].notna().all(), 'unexpected class code in predictions'


#### Combination of model predictions

In [None]:
# Class probabilities of both final models on the real test set, blended with
# the same 80% random forest / 20% XGBoost weights used on the hold-out split.
test_preds1 = clf_final.predict_proba(df_test[features])
test_preds2 = xgmodel_final.predict_proba(df_test[features])
preds_comb_real = pd.DataFrame(0.8 * test_preds1 + 0.2 * test_preds2)

# Column position with the highest blended probability, taken as the class code.
# NOTE(review): positions equal class codes only if the models' classes are
# 0, 1, 2 in that order - TODO confirm.
# `.values` assigns by position, since `df_test` no longer has a 0..n-1 index.
df_test['status_group'] = preds_comb_real.idxmax(axis=1).values

# NOTE(review): this rebinds `submission`, replacing the frame built from the
# single random forest model when both cells are run in order.
# Codes are translated to labels in one step; the earlier chained form
# (`submission.status_group[mask] = ...`) raises SettingWithCopyWarning and
# changes nothing once pandas copy-on-write is active.
status_labels = {0: 'functional', 1: 'needs repair', 2: 'non functional'}
submission = df_test[['id', 'status_group']].copy()
submission['status_group'] = submission['status_group'].map(status_labels)

# `map` turns any code without a label into NaN; fail loudly instead of
# writing an incomplete submission.
assert submission['status_group'].notna().all(), 'unexpected class code in predictions'

In [None]:
# Correct the id labels in prediction

In [8]:
# Shift every id down by one.
# NOTE(review): not idempotent - each extra run of this cell subtracts
# another 1 from the ids.
submission['id'] = submission['id'] - 1

NameError: name 'submission' is not defined

### Write to submission


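For the final submission I used the random forest model (`clf_final`) on its own, not the weighted combination. Note that running the notebook from top to bottom leaves `submission` holding the combined predictions, because the combination cell runs after the single-model cell; re-run the single-model cell before writing the file to reproduce the submitted result.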

In [9]:
# Write the id / label pairs as a comma-separated file without the row index.
submission_path = '/Users/charlotteleysen/Google Drive/*PROJECTS/IE/Term 2/Machine Leaning 2/Tanzania Water Assignment/submission.csv'
submission.to_csv(submission_path, sep=',', decimal='.', index=False)

NameError: name 'submission' is not defined