In [163]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import recall_score, precision_score, log_loss
from sklearn.metrics.scorer import make_scorer

In [217]:
train = pd.read_csv("train_files/train_with_feature_engineering.csv")
test = pd.read_csv("test_files/test_with_feature_engineering.csv")

print(train.columns)
print(test.columns)

In [180]:
selected_columns = ["Street.Length", "day_of_week","num_restaurants", "time_to_int", 
 "Mon_to_Sat_6", 'Mon_to_Sat_7', 'Mon_to_Sat_8', 'Mon_to_Sat_9', 'Mon_to_Sat_10', 
 'Mon_to_Sat_11', 'Mon_to_Sat_12', 'Mon_to_Sat_13', 'Mon_to_Sat_14', 'Mon_to_Sat_15', 
 'Mon_to_Sat_16', 'Mon_to_Sat_17', 'Mon_to_Sat_18', 'Mon_to_Sat_19', 'Mon_to_Sat_20',
'Mon_to_Sat_21', 'Mon_to_Sat_22', 'Sun_7', 'Sun_8', 'Sun_9', 'Sun_10', 'Sun_11', 
'Sun_12', 'Sun_13', 'Sun_14', 'Sun_15', 'Sun_16', 'Sun_17', 'Sun_18', 'Sun_19', 'Sun_20', 
'Sun_21', "restaurants_avg_rating","block_cluster", 'address_encoded', 'from_encoded', 
'to_encoded','fromto_encoded']

In [181]:
X_train = train[selected_columns]
y_train = train["any_spot"]

X_test = test[selected_columns]

In [219]:
rf_2 = RandomForestClassifier(n_jobs=-1, n_estimators=700)
params = {
    'min_samples_leaf': [5,10,15],
    'max_depth' : [9,11,13,15,20,25]
}

def my_custom_loss_func(ground_truth, predictions):
    precision = precision_score(ground_truth, predictions)
    recall = recall_score(ground_truth, predictions)
    return 1.25*precision*recall/(0.25*precision + recall)

custom_scorer = make_scorer(my_custom_loss_func, greater_is_better=True)

gs = GridSearchCV(cv=2, param_grid=params, estimator=rf_2, scoring=custom_scorer, verbose=2)
gs.fit(X_train, y_train)
cv_df = pd.DataFrame(gs.cv_results_)[['param_min_samples_leaf', 'param_max_depth', 'mean_test_score', 'rank_test_score']]
cv_df

Fitting 2 folds for each of 18 candidates, totalling 36 fits
[CV] max_depth=9, min_samples_leaf=5 .................................
[CV] .................. max_depth=9, min_samples_leaf=5, total=   1.6s
[CV] max_depth=9, min_samples_leaf=5 .................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.9s remaining:    0.0s


[CV] .................. max_depth=9, min_samples_leaf=5, total=   1.4s
[CV] max_depth=9, min_samples_leaf=10 ................................
[CV] ................. max_depth=9, min_samples_leaf=10, total=   1.5s
[CV] max_depth=9, min_samples_leaf=10 ................................
[CV] ................. max_depth=9, min_samples_leaf=10, total=   1.5s
[CV] max_depth=9, min_samples_leaf=15 ................................
[CV] ................. max_depth=9, min_samples_leaf=15, total=   1.5s
[CV] max_depth=9, min_samples_leaf=15 ................................
[CV] ................. max_depth=9, min_samples_leaf=15, total=   1.3s
[CV] max_depth=11, min_samples_leaf=5 ................................
[CV] ................. max_depth=11, min_samples_leaf=5, total=   1.3s
[CV] max_depth=11, min_samples_leaf=5 ................................
[CV] ................. max_depth=11, min_samples_leaf=5, total=   1.3s
[CV] max_depth=11, min_samples_leaf=10 ...............................
[CV] .

[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:  1.1min finished


Unnamed: 0,param_min_samples_leaf,param_max_depth,mean_test_score,rank_test_score
0,5,9,0.564954,14
1,10,9,0.568547,4
2,15,9,0.565805,10
3,5,11,0.569748,2
4,10,11,0.566947,6
5,15,11,0.573209,1
6,5,13,0.55746,18
7,10,13,0.566104,8
8,15,13,0.56553,12
9,5,15,0.560796,17


In [236]:
rf = RandomForestClassifier(n_estimators=500, max_depth=25, n_jobs=-1, min_samples_leaf=5)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=25, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [237]:
threshold = 0.5
train_prediction_prob = rf.predict_proba(X_train)
train_predictions = (train_prediction_prob[:,0] < threshold).astype(int)

print('Training F0.5: %s' % str(my_custom_loss_func(y_train, train_predictions)))

Training F0.5: 0.748502994012


In [238]:
recall_score(y_train, train_predictions)

0.68578553615960103

In [239]:
precision_score(y_train, train_predictions)

0.76601671309192199

In [240]:
log_loss(y_train, train_prediction_prob)

0.43194996008763392

In [241]:
pd.DataFrame({"cols": X_train.columns, "feature_importance": rf.feature_importances_}) \
    .sort_values("feature_importance", ascending=False)

Unnamed: 0,cols,feature_importance
3,time_to_int,0.139604
38,address_encoded,0.137561
41,fromto_encoded,0.124913
40,to_encoded,0.102137
39,from_encoded,0.081304
1,day_of_week,0.073525
0,Street.Length,0.047678
2,num_restaurants,0.037909
36,restaurants_avg_rating,0.025528
8,Mon_to_Sat_10,0.017582


In [242]:
test_prediction_prob = rf.predict_proba(X_test)
test_predictions = (test_prediction_prob[:,0] < threshold).astype(int)

test['any_spot'] = test_predictions

In [243]:
test_with_id = pd.read_csv("test_files/test-no-labels-with-id.csv")

In [244]:
test_with_id["any_spot"] = test_predictions

In [245]:
test_with_id[["id", "any_spot"]].to_csv("results/result_25depth_5_minsample.csv", index=False)