In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import train_test_split

In [33]:
# Read the raw training and test files.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Columns removed from both feature matrices before model fitting.
dropped_cols = ['Name', 'Description', 'RescuerID', 'PetID']

# Training features exclude the target; the target is kept as its own Series.
train_x = train.drop(columns=dropped_cols + ['AdoptionSpeed'])
train_y = train['AdoptionSpeed']
test_x = test.drop(columns=dropped_cols)

# Hold out 28% of the training rows for validation; the fixed seed keeps the
# split repeatable across runs.
X_train2, X_cross, y_train2, y_cross = train_test_split(
    train_x, train_y, test_size=0.28, random_state=42
)

### Random Forest

In [11]:
from sklearn.ensemble import RandomForestClassifier


In [31]:
%%time
#rnd_clf = RandomForestClassifier(bootstrap=True, n_estimators=500, max_depth=50)
#rnd_clf_params = {
#    'bootstrap': [True],
#    'max_depth': [75, 80],
#    'max_features': ['auto'],
#    'min_samples_leaf': [20, 25]
#}

# Pre-calculated params
#rnd_clf_params_calculated = {'bootstrap': [True],
# 'max_depth': [80],
# 'max_features': ['auto'],
# 'min_samples_leaf': [20]} 

# Use the pre-calculated params to run the notebook faster
#rnd_gs = GridSearchCV(rnd_clf,rnd_clf_params_calculated, verbose=True)
#rnd_gs.fit(train_x,train_y)

rnd_clf = RandomForestClassifier(bootstrap=True, 
                                 n_estimators=500, 
                                 max_depth=75,
                                 max_features='auto',
                                 min_samples_split=5,
                                 min_samples_leaf=20)
rnd_clf.fit(X_train2, y_train2)

CPU times: user 2.85 s, sys: 6.88 ms, total: 2.85 s
Wall time: 2.85 s


### Quadratic Weighted Kappa Score
scikit-learn's `cohen_kappa_score` computes the quadratic weighted kappa score (QWKS) when `weights` is set to `'quadratic'`.<br>
The score ranges from -1 (the worst possible) to 1 (the best possible). The random forest score is far higher than the others, and it is almost certainly overfitting.

In [32]:
# Score the forest on the held-out split using quadratic-weighted kappa.
cross_preds = rnd_clf.predict(X_cross)
rnd_score = cohen_kappa_score(cross_preds, y_cross, weights='quadratic')

print('Random forest score:', rnd_score)

Random forest score: 0.3620303608994654


In [45]:
# Predict adoption speed for the test features.
final_preds = rnd_clf.predict(test_x)

# Re-read the raw test file to get the PetID column, which was dropped
# from test_x, and pair each ID with its prediction.
test_orig = pd.read_csv('test.csv')
submission_columns = {
    'PetID': test_orig['PetID'],
    'AdoptionSpeed': final_preds,
}
subm_df = pd.DataFrame(submission_columns)

In [46]:
# Store the predicted class as a 32-bit integer column, then write the
# submission file without the DataFrame index.
adoption_speed_int = subm_df['AdoptionSpeed'].astype('int32')
subm_df['AdoptionSpeed'] = adoption_speed_int
subm_df.to_csv("submission.csv", index=False)