## Importing Analytical Packages

In [49]:
# Load the library with the iris dataset
from sklearn.datasets import load_iris

# Load scikit's random forest classifier library
from sklearn.ensemble import RandomForestClassifier

# Load pandas
import pandas as pd

# Load numpy
import numpy as np

from sklearn.utils import shuffle

# Seed NumPy's global random number generator so anything drawing from it is repeatable.
# The shuffle and RandomForestClassifier calls in this notebook additionally pass random_state=0.
np.random.seed(0)

## Data Preparation Done by Dataiku

## Loading Data

In [50]:
# Folder that holds the competition CSV files. Change this single value to run
# the notebook on another machine instead of editing every path below.
DATA_DIR = 'C:/Users/yangj/Documents/GitHub/pumpitup_1/data'

# Independent variables for the training and the test rows
train_set = pd.read_csv(DATA_DIR + '/data_to_go_max/train_set_to_go.csv')
test_set = pd.read_csv(DATA_DIR + '/data_to_go_max/test_set_to_go.csv')

# Target table; its 'status_group' column is factorized further down
target_train = pd.read_csv(DATA_DIR + '/target_vars.csv')

# Submission format template
sub_form = pd.read_csv(DATA_DIR + '/SubmissionFormat.csv')

## Factorize Target

In [51]:
# Encode the status labels as integer codes in one pass: `y` holds one code per
# row and `y_index` the distinct labels, so y_index[code] recovers the label.
y, y_index = pd.factorize(target_train['status_group'])
y_index

Index(['functional', 'non functional', 'functional needs repair'], dtype='object')

In [52]:
# Show the column names of the training frame (includes 'id' and 'train_set', which are dropped before modelling).
train_set.columns

Index(['id', 'amount_tsh', 'since_date_recorded_days', 'funder', 'installer',
       'wpt_name', 'basin', 'subvillage', 'region', 'region_code',
       'district_code', 'lga', 'population', 'public_meeting', 'scheme_name',
       'permit', 'construction_year', 'extraction_type_class', 'management',
       'user_group', 'payment', 'water_soft', 'quantity', 'source_type',
       'source_class', 'waterpoint_type_group', 'gps_height', 'longitude',
       'latitude', 'train_set'],
      dtype='object')

## Transforming train_set and test_set into numeric feature tables

In [53]:
# Columns that are replaced by integer codes (pd.factorize) before modelling.
categorical_cols = ['funder', 'installer', 'basin', 'region_code', 'district_code', 'public_meeting',
                    'construction_year', 'extraction_type_class', 'management', 'payment',
                    'source_type', 'waterpoint_type_group', 'region', 'water_soft', 'permit',
                    'scheme_name', 'source_class', 'lga', 'user_group', 'wpt_name', 'subvillage']

# Drop the identifier and the train/test flag, then stack both sets so every
# categorical value receives ONE integer code shared by train and test.
# Factorizing the two frames separately numbers values by order of first
# appearance within each frame, so the same category could end up with
# different codes in train and test and the fitted model would misread the
# test features.
# Assumes both files carry the same feature columns -- TODO confirm.
n_train_rows = len(train_set)
features_all = pd.concat([train_set.drop(labels=['id', 'train_set'], axis=1),
                          test_set.drop(labels=['id', 'train_set'], axis=1)])

for col in categorical_cols:
    # pd.factorize returns (codes, uniques); only the codes are kept.
    # The codes array is assigned positionally, so the repeated index labels
    # of the stacked frame are not a problem.
    features_all[col] = pd.factorize(features_all[col])[0]

# Missing values left in the columns that were not factorized become 0.
features_all = features_all.fillna(0)

# Training rows come first in the stacked frame, so their codes are the same
# ones a train-only factorize would have produced.
x_train = features_all.iloc[:n_train_rows].copy()
x_test = features_all.iloc[n_train_rows:].copy()

## Take a random 90% sample to train the model, and hold out the remaining 10% for testing

In [54]:
# Number of shuffled rows set aside for validation; everything after them is
# used for training.
n_holdout = 5940

# Shuffle features and labels with the same seed so rows and labels stay paired.
x_train, y = shuffle(x_train, y, random_state=0)

# NOTE: this cell overwrites x_train and y, so re-running it without first
# re-running the cells that build them would split the already reduced data again.
x_train_test, x_train = x_train.iloc[:n_holdout], x_train.iloc[n_holdout:]
y_test, y = y[:n_holdout], y[n_holdout:]

## Train Random Forest Classifier

### n_estimators: @Falcon is wrong; in general, the more trees, the less likely the algorithm is to overfit, so try increasing this. The lower this number, the closer the model is to a single decision tree with a restricted feature set.

### max_features: try reducing this number (try 30-50% of the number of features). This determines how many features each tree is randomly assigned. The smaller it is, the less likely the model is to overfit, but too small a value will start to introduce underfitting.


### max_depth: Experiment with this. This will reduce the complexity of the learned models, lowering the risk of overfitting. Try starting small, say 5-10, and increasing until you get the best result.


### min_samples_leaf: Try setting this to values greater than one. This has a similar effect to the max_depth parameter, it means the branch will stop splitting once the leaves have that number of samples each.

In [None]:
# Hold-out accuracy for each candidate max_depth. The result columns keep the
# names 'i' (the value tried) and 'precision'; the score is overall accuracy,
# i.e. the share of hold-out rows that were predicted correctly.
records = []
for i in range(20, 30):
    # Create a random forest classifier. By convention, clf means 'Classifier'.
    clf = RandomForestClassifier(n_jobs=2, n_estimators=50, random_state=0, criterion='entropy',
                                 max_features=0.4, max_depth=i)

    # Fit on the training features and their factorized status_group codes.
    clf.fit(x_train, y)

    # Score on the hold-out rows. Comparing predictions with labels directly
    # stays correct even if some class is never predicted; the diagonal of a
    # non-square crosstab would no longer line up with the correct cells.
    y_pred = clf.predict(x_train_test)
    a = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
    precision = np.mean(y_pred == y_test)
    records.append({'i': i, 'precision': precision})

# Feature importances of the last forest fitted above, most important first.
feature_importances = pd.DataFrame({'feature': list(x_train.columns),
                                    'importance': clf.feature_importances_})
feature_importances = feature_importances.sort_values(by='importance', ascending=False)

# Build the table once from the collected records: DataFrame.append was removed
# in pandas 2.0, and growing a frame row by row is quadratic anyway.
precisions = pd.DataFrame(records, columns=['i', 'precision'])
precisions

## max_depth = 10, find best max_features

In [60]:
# Hold-out accuracy for each candidate max_features (fraction of the features
# considered at each split), with max_depth fixed at 10. The result columns keep
# the names 'i' (the value tried) and 'precision'; the score is overall accuracy.
records = []
for i in np.arange(0.5, 0.6, 0.01):
    # Create a random forest classifier. By convention, clf means 'Classifier'.
    clf = RandomForestClassifier(n_jobs=2, n_estimators=50, random_state=0, criterion='entropy',
                                 max_features=i, max_depth=10)

    # Fit on the training features and their factorized status_group codes.
    clf.fit(x_train, y)

    # Score on the hold-out rows. Comparing predictions with labels directly
    # stays correct even if some class is never predicted; the diagonal of a
    # non-square crosstab would no longer line up with the correct cells.
    y_pred = clf.predict(x_train_test)
    a = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
    precision = np.mean(y_pred == y_test)
    records.append({'i': i, 'precision': precision})

# Feature importances of the last forest fitted above, most important first.
feature_importances = pd.DataFrame({'feature': list(x_train.columns),
                                    'importance': clf.feature_importances_})
feature_importances = feature_importances.sort_values(by='importance', ascending=False)

# Build the table once from the collected records: DataFrame.append was removed
# in pandas 2.0, and growing a frame row by row is quadratic anyway.
precisions = pd.DataFrame(records, columns=['i', 'precision'])
precisions

Unnamed: 0,i,precision
0,0.5,0.757407
1,0.51,0.757407
2,0.52,0.757407
3,0.53,0.757407
4,0.54,0.758923
5,0.55,0.758923
6,0.56,0.758923
7,0.57,0.758923
8,0.58,0.757407
9,0.59,0.757407


In [None]:
## max_depth = 10, max_features = 0.55, find best min_samples_leaf

In [62]:
# Hold-out accuracy for each candidate min_samples_leaf, with max_depth = 10 and
# max_features = 0.55 fixed. The result columns keep the names 'i' (the value
# tried) and 'precision'; the score is overall accuracy.
records = []
for i in np.arange(1, 300, 30):
    # Create a random forest classifier. By convention, clf means 'Classifier'.
    # The candidate value must actually be passed in: with min_samples_leaf
    # hard-coded to 1 every iteration would fit the very same forest.
    clf = RandomForestClassifier(n_jobs=2, n_estimators=50, random_state=0, criterion='entropy',
                                 max_features=0.55, max_depth=10, min_samples_leaf=int(i))

    # Fit on the training features and their factorized status_group codes.
    clf.fit(x_train, y)

    # Score on the hold-out rows. Comparing predictions with labels directly
    # stays correct even if some class is never predicted (more likely with
    # large leaves); the diagonal of a non-square crosstab would no longer line
    # up with the correct cells.
    y_pred = clf.predict(x_train_test)
    a = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
    precision = np.mean(y_pred == y_test)
    records.append({'i': int(i), 'precision': precision})

# Feature importances of the last forest fitted above, most important first.
feature_importances = pd.DataFrame({'feature': list(x_train.columns),
                                    'importance': clf.feature_importances_})
feature_importances = feature_importances.sort_values(by='importance', ascending=False)

# Build the table once from the collected records: DataFrame.append was removed
# in pandas 2.0, and growing a frame row by row is quadratic anyway.
precisions = pd.DataFrame(records, columns=['i', 'precision'])
precisions

Unnamed: 0,i,precision
0,1.0,0.758923
1,31.0,0.750505
2,61.0,0.747475
3,91.0,0.742088
4,121.0,0.737205
5,151.0,0.73771
6,181.0,0.738721
7,211.0,0.736364
8,241.0,0.735185
9,271.0,0.734848


## max_depth = 10, max_features = 0.55, min_samples_leaf = 1; try a large n_estimators

In [65]:
# Reset the results table to an empty frame with the usual two columns.
precisions = pd.DataFrame(columns=['i', 'precision'])

# Forest with many trees, using the depth / feature-fraction / leaf-size values
# explored in the searches above.
# NOTE(review): this cell uses criterion='gini' while the searches used
# 'entropy' -- confirm that the switch is intentional.
clf = RandomForestClassifier(
    n_jobs=2,
    n_estimators=2000,
    random_state=0,
    criterion='gini',
    max_features=0.55,
    max_depth=10,
    min_samples_leaf=1,
)

# Fit on the training features and their integer-coded labels.
clf.fit(x_train, y)

# Pair each feature name with its importance and rank from most to least important.
# (Only the last expression of a cell is displayed, so this table is not shown here.)
importance_pairs = list(zip(x_train, clf.feature_importances_))
feature_importances = pd.DataFrame(importance_pairs, columns=['feature', 'importance'])
feature_importances = feature_importances.sort_values(by='importance', axis=0, ascending=False)

# Confusion table on the hold-out rows; the score is its diagonal sum over the grand total.
y_pred = clf.predict(x_train_test)
a = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
n_on_diagonal = np.diag(a).sum()
n_rows_total = a.sum().sum()
precision = n_on_diagonal / n_rows_total
precision

0.7627946127946128

## Prediction Using Classifier Trained

In [77]:
# Hold-out score from the confusion table `a`: sum of its diagonal divided by the total row count.
# This equals accuracy only when rows and columns of `a` cover the same classes in the same order.
precision = np.diag(a).sum()/a.sum().sum()

In [49]:
# Predict integer class codes for the test rows, then decode every code by
# indexing the label array returned by pd.factorize. This works for however
# many classes the target has, instead of relying on a hard-coded [0, 1, 2].
predicted_codes = clf.predict(x_test)
test_set['predict'] = np.asarray(y_index)[predicted_codes]

# Keep only the columns needed for the submission file.
to_sub = test_set[['id', 'predict']]

In [50]:
# Join the predictions onto the submission template by id and discard the
# template's own status_group column.
submission = (
    pd.merge(sub_form, to_sub, on='id')
    .drop('status_group', axis=1)
)

# The two remaining columns are the id and the predicted label; give them the
# header names used by the template, then write the file without the index.
submission.columns = ['id', 'status_group']
submission.to_csv('submission.csv', header=True, index=False)