In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import cohen_kappa_score

In [2]:
# Load the data
train_randf = pd.read_csv('train.csv')
test_randf = pd.read_csv('test.csv')

# Prep the train and test sets for model fitting
train_x = train_randf.drop(['Name', 'Description', 'RescuerID', 'PetID', 'AdoptionSpeed'], axis=1)
train_y = train_randf['AdoptionSpeed']
test_x = test_randf.drop(['Name', 'Description', 'RescuerID', 'PetID'], axis=1)


### Decision Tree

In [3]:
from sklearn.tree import DecisionTreeClassifier

tree_clf = DecisionTreeClassifier()
tree_clf_params = {
    'criterion' : ['gini','entropy'],
    'max_features' : np.arange(15,19,1),
    'max_depth' : [6,7],
    'min_samples_split' : np.arange(150,200,10),
}
tree_gs = GridSearchCV(estimator=tree_clf,
                       param_grid=tree_clf_params,
                       n_jobs=-1, 
                       cv=5, 
                       verbose=True)

tree_gs.fit(train_x,train_y)

Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  59 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:   10.3s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'criterion': ['gini', 'entropy'], 'max_features': array([15, 16, 17, 18]), 'max_depth': [6, 7], 'min_samples_split': array([150, 160, 170, 180, 190])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=True)

In [4]:
tree_gs.best_params_

{'criterion': 'gini',
 'max_depth': 7,
 'max_features': 17,
 'min_samples_split': 190}

### Random Forest

In [5]:
from sklearn.ensemble import RandomForestClassifier


In [6]:
%%time
rnd_clf = RandomForestClassifier(bootstrap=True, n_estimators=500, max_depth=50)
#rnd_clf_params = {
#    'bootstrap': [True],
#    'max_depth': [75, 80],
#    'max_features': ['auto'],
#    'min_samples_leaf': [20, 25]
#}

# Pre-calculated params
rnd_clf_params_calculated = {'bootstrap': [True],
 'max_depth': [80],
 'max_features': ['auto'],
 'min_samples_leaf': [20]} 

# Use the pre-calculated params to run the notebook faster
rnd_gs = GridSearchCV(rnd_clf,rnd_clf_params_calculated, verbose=True)
rnd_gs.fit(train_x,train_y)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   12.8s finished


CPU times: user 18 s, sys: 195 ms, total: 18.2 s
Wall time: 18.5 s


In [7]:
rnd_gs.best_params_

{'bootstrap': True,
 'max_depth': 80,
 'max_features': 'auto',
 'min_samples_leaf': 20}

### AdaBoost classifier

In [8]:
from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier()
ada_clf_params = {
    'n_estimators' : [100,110,120],
    'learning_rate' : [0.2,0.5,0.8]
}

ada_gs = GridSearchCV(ada_clf, ada_clf_params, verbose=True)
ada_gs.fit(train_x, train_y)

In [9]:
ada_gs.best_params_

### Quadratic Weighted Kappa Score
Sci-kit learn's cohen kappa calculates QWKS when the 'weights' are set to 'quadratic'.<br>
-1 is the least possible score and 1 the best possible. The random forest score is far higher than the others and it's almost certainly overfitting.

In [10]:
#ree_score = cohen_kappa_score(tree_gs.predict(train_x), train_y, weights='quadratic')
#rnd_score = cohen_kappa_score(rnd_gs.predict(train_x), train_y, weights='quadratic')
#rnd_score = cohen_kappa_score(rnd_clf.predict(X_test2), y_test2, weights='quadratic')
#ada_score = cohen_kappa_score(ada_gs.predict(train_x), train_y, weights='quadratic')

#print('Decision tree score:', tree_score)
#print('Random forest score:', rnd_score)
#print('Ada boost score,', ada_score)

In [25]:
final_preds = rnd_gs.predict(test_x)
test_orig = pd.read_csv('test.csv')
subm_df = pd.DataFrame({
    'PetID' : test_orig['PetID'],
    'AdoptionSpeed' : final_preds})

In [None]:
subm_df['AdoptionSpeed'] = subm_df['AdoptionSpeed'].astype('int32');
subm_df.to_csv("submission.csv", index=False)