## Importing Analytical Packages

In [155]:
# Load the library with the iris dataset
from sklearn.datasets import load_iris

# Load scikit's random forest classifier library
from sklearn.ensemble import RandomForestClassifier

# Load pandas
import pandas as pd

# Load numpy
import numpy as np

from sklearn.utils import shuffle

import matplotlib.pyplot as plt


# Set random seed
np.random.seed(0)

## Data Preparation Done by Dataiku

## Loading Data

In [156]:
#loading the data for independent variables
whole_set = pd.read_csv('C:/Users/yangj/Documents/GitHub/pumpitup_1/data/data_to_go_max/whole_data.csv')

#loading target_set
target_train = pd.read_csv('C:/Users/yangj/Documents/GitHub/pumpitup_1/data/target_vars.csv')

#submission format
sub_form = pd.read_csv('C:/Users/yangj/Documents/GitHub/pumpitup_1/data/SubmissionFormat.csv')

In [157]:
object = whole_set.dtypes[whole_set.dtypes == 'object'].index
object

Index(['funder', 'installer', 'wpt_name', 'basin', 'subvillage', 'region',
       'lga', 'ward', 'scheme_management', 'scheme_name', 'extraction_type',
       'extraction_type_group', 'extraction_type_class', 'management',
       'management_group', 'payment', 'payment_type', 'water_quality',
       'quality_group', 'quantity', 'source', 'source_type', 'source_class',
       'waterpoint_type', 'waterpoint_type_group'],
      dtype='object')

## Transformation Train_set and Test_set into array

In [159]:
whole_set_prc = whole_set
whole_set_prc[object] = whole_set[object].apply(lambda x: pd.factorize(x)[0])
whole_set_prc[:10]

Unnamed: 0,id,amount_tsh,since_date_recorded_days,funder,installer,wpt_name,basin,subvillage,region,lga,...,quantity,source,source_type,source_class,waterpoint_type,waterpoint_type_group,gps_height,longitude,latitude,train_set
0,69572,6000.0,2556,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1390.0,34.938093,-9.856322,1
1,8776,0.0,1833,0,0,1,1,0,1,1,...,1,1,1,1,0,0,1399.0,34.698766,-2.147466,1
2,34310,25.0,1842,0,1,1,2,1,2,0,...,0,2,2,1,1,0,686.0,37.460664,-3.821329,1
3,67743,0.0,1870,1,0,1,3,0,3,0,...,2,3,3,0,1,0,263.0,38.486161,-11.155298,1
4,19728,0.0,2435,0,0,1,1,0,4,1,...,3,1,1,1,0,0,1658.0,31.130847,-1.825359,1
5,9944,20.0,2557,0,2,1,2,0,5,0,...,0,4,4,2,1,0,4.0,39.172796,-4.765587,1
6,19816,0.0,1989,0,0,1,4,0,6,0,...,0,3,3,0,2,1,1147.0,33.36241,-3.766365,1
7,54551,0.0,1981,2,2,1,5,0,6,1,...,0,5,5,0,2,1,1261.0,32.620617,-4.226198,1
8,53934,0.0,1956,0,0,1,5,0,7,0,...,3,3,3,0,2,1,1154.0,32.7111,-5.146712,1
9,46144,0.0,2414,0,0,1,1,0,4,1,...,0,5,5,0,2,1,1366.0,30.626991,-1.257051,1


## Split dataset

In [175]:
x_train = whole_set_prc[whole_set_prc['train_set'] == 1].drop(['train_set'], axis =1)
x_train = x_train.fillna(0)

x_test = whole_set_prc[whole_set_prc['train_set'] == 0].drop(['train_set'], axis =1)
x_test = x_test.fillna(0)

## Take Random Sample(90%) to train the data, and leave 10% to test

In [176]:
x_train = x_train.merge(how='left', right = target_train, on = 'id')

In [177]:
x_train = shuffle(x_train,random_state = 1)
y_index = pd.factorize(x_train['status_group'].values)[1]
y = x_train[['status_group']].apply(lambda x: pd.factorize(x)[0])
y = y['status_group']
x_train = x_train.drop(['id','status_group'], axis = 1)
x_train_test = x_train[:5940]
x_train = x_train[5940:]
y_test = y[:5940]
y = y[5940:]

## Train Random Forest Classifier

### n_estimators: @Falcon is wrong, in general the more trees the less likely the algorithm is to overfit. So try increasing this. The lower this number, the closer the model is to a decision tree, with a restricted feature set.

### max_features: try reducing this number (try 30-50% of the number of features). This determines how many features each tree is randomly assigned. The smaller, the less likely to overfit, but too small will start to introduce under fitting.


### max_depth: Experiment with this. This will reduce the complexity of the learned models, lowering over fitting risk. Try starting small, say 5-10, and increasing you get the best result.


### min_samples_leaf: Try setting this to values greater than one. This has a similar effect to the max_depth parameter, it means the branch will stop splitting once the leaves have that number of samples each.

## find best max_depth = 23

In [180]:
precisions = pd.DataFrame(columns=['i','precision'])
for i in range(20,30):
# Create a random forest Classifier. By convention, clf means 'Classifier'
    clf = RandomForestClassifier(n_jobs=4, n_estimators = 30, random_state=0, criterion='gini',\
                                 max_features = 0.4, max_depth = i, oob_score = 1)

# Train the Classifier to take the training features and learn how they relate
# to the training y (the species)
    clf.fit(x_train, y)

    feature_importances = pd.DataFrame(list(zip(x_train, clf.feature_importances_)))
    feature_importances.columns = ['feature','importance']
    feature_importances = feature_importances.sort_values(by = 'importance', axis = 0, ascending = 0)
    feature_importances

    y_pred = clf.predict(x_train_test)
    a = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
    precision = np.diag(a).sum()/a.sum().sum()
    precisions = precisions.append({'i': i, 'precision': precision}, ignore_index = True)
precisions

Unnamed: 0,i,precision
0,20.0,0.808249
1,21.0,0.8133
2,22.0,0.807071
3,23.0,0.811279
4,24.0,0.808418
5,25.0,0.803535
6,26.0,0.806397
7,27.0,0.806734
8,28.0,0.806397
9,29.0,0.808418


## max_depth = 0, find best max_features = 0.4

In [182]:
precisions = pd.DataFrame(columns=['i','precision'])
for i in np.arange(0.3,0.7,0.01):
# Create a random forest Classifier. By convention, clf means 'Classifier'
    clf = RandomForestClassifier(n_jobs=4, n_estimators = 30, random_state=0, criterion='gini',\
                                 max_features = i, max_depth = 23,  oob_score = 1)

# Train the Classifier to take the training features and learn how they relate
# to the training y (the species)
    clf.fit(x_train, y)

    feature_importances = pd.DataFrame(list(zip(x_train, clf.feature_importances_)))
    feature_importances.columns = ['feature','importance']
    feature_importances = feature_importances.sort_values(by = 'importance', axis = 0, ascending = 0)
    feature_importances

    y_pred = clf.predict(x_train_test)
    a = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
    precision = np.diag(a).sum()/a.sum().sum()
    precisions = precisions.append({'i': i, 'precision': precision}, ignore_index = True)
precisions

Unnamed: 0,i,precision
0,0.3,0.805556
1,0.31,0.805556
2,0.32,0.805556
3,0.33,0.807407
4,0.34,0.807407
5,0.35,0.807407
6,0.36,0.809091
7,0.37,0.809091
8,0.38,0.809091
9,0.39,0.811279


## max_depth = 23 max_features = 0.40 find best min_sample_leaf = 1

In [184]:
precisions = pd.DataFrame(columns=['i','precision'])
for i in np.arange(1,300,30):
# Create a random forest Classifier. By convention, clf means 'Classifier'
    clf = RandomForestClassifier(n_jobs=2, n_estimators = 30, random_state=0, criterion='gini',\
                                 max_depth = 23, max_features = 0.4, min_samples_leaf = i, oob_score = 1)

# Train the Classifier to take the training features and learn how they relate
# to the training y (the species)
    clf.fit(x_train, y)

    feature_importances = pd.DataFrame(list(zip(x_train, clf.feature_importances_)))
    feature_importances.columns = ['feature','importance']
    feature_importances = feature_importances.sort_values(by = 'importance', axis = 0, ascending = 0)
    feature_importances

    y_pred = clf.predict(x_train_test)
    a = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
    precision = np.diag(a).sum()/a.sum().sum()
    precisions = precisions.append({'i': i, 'precision': precision}, ignore_index = True)
precisions

Unnamed: 0,i,precision
0,1.0,0.811279
1,31.0,0.782828
2,61.0,0.771549
3,91.0,0.758923
4,121.0,0.751852
5,151.0,0.745455
6,181.0,0.742593
7,211.0,0.741246
8,241.0,0.737542
9,271.0,0.735522


## Try big n_estimator

In [187]:
precisions = pd.DataFrame(columns=['i','precision'])

# Create a random forest Classifier. By convention, clf means 'Classifier'
clf = RandomForestClassifier(n_jobs=2, n_estimators = 500, random_state=0, criterion='gini',\
                                 max_depth = 23, max_features = 0.4, min_samples_leaf = 1, oob_score = 1)

# Train the Classifier to take the training features and learn how they relate
# to the training y (the species)
clf.fit(x_train, y)

feature_importances = pd.DataFrame(list(zip(x_train, clf.feature_importances_)))
feature_importances.columns = ['feature','importance']
feature_importances = feature_importances.sort_values(by = 'importance', axis = 0, ascending = 0)
feature_importances

y_pred = clf.predict(x_train_test)
a = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
precision = np.diag(a).sum()/a.sum().sum()
precisions = precisions.append({'i': i, 'precision': precision}, ignore_index = True)
precisions

Unnamed: 0,i,precision
0,271.0,0.812458


## Train with the whole set

In [188]:
x_train = whole_set_prc[whole_set_prc['train_set'] == 1].drop(['train_set'], axis =1)
x_train = x_train.fillna(0)

x_test = whole_set_prc[whole_set_prc['train_set'] == 0].drop(['train_set'], axis =1)
x_test = x_test.fillna(0)

x_train = x_train.merge(how='left', right = target_train, on = 'id')

x_train = shuffle(x_train,random_state = 1)
y_index = pd.factorize(x_train['status_group'].values)[1]
y = x_train[['status_group']].apply(lambda x: pd.factorize(x)[0])
y = y['status_group']
x_train = x_train.drop(['id','status_group'], axis = 1)

In [189]:
precisions = pd.DataFrame(columns=['i','precision'])

# Create a random forest Classifier. By convention, clf means 'Classifier'
clf = RandomForestClassifier(n_jobs=2, n_estimators = 500, random_state=0, criterion='gini',\
                                 max_depth = 23, max_features = 0.4, min_samples_leaf = 1, oob_score = 1)

# Train the Classifier to take the training features and learn how they relate
# to the training y (the species)
clf.fit(x_train, y)

feature_importances = pd.DataFrame(list(zip(x_train, clf.feature_importances_)))
feature_importances.columns = ['feature','importance']
feature_importances = feature_importances.sort_values(by = 'importance', axis = 0, ascending = 0)
feature_importances

y_pred = clf.predict(x_train)
a = pd.crosstab(y, y_pred, rownames=['Actual'], colnames=['Predicted'])
precision = np.diag(a).sum()/a.sum().sum()
precisions = precisions.append({'i': i, 'precision': precision}, ignore_index = True)
precisions

Unnamed: 0,i,precision
0,271.0,0.979882


## Prediction Using Classifier Trained

In [190]:
x_test['predict'] = clf.predict(x_test.drop(['id'], axis = 1))

In [194]:
x_test['predict'] = x_test['predict'].replace(to_replace = [0,1,2], value = y_index)
to_sub = x_test[['id','predict']]

In [195]:
submission = pd.merge(sub_form, to_sub, on = 'id')
submission = submission.drop('status_group', axis = 1)
submission.columns = ['id', 'status_group']
submission.to_csv('submission_2.csv',header = True, index = False)