In [1]:
# import the library
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn :: utils
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix

# sklearn :: models
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# sklearn :: evaluation metrics
from sklearn.metrics import cohen_kappa_score
sns.set_style('whitegrid')

# Problem Statement

Predict when the pet will be adopted

# Load data

In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
print(df_train.shape, df_test.shape)

(10000, 24) (4993, 23)


# Feature Engineering

In [3]:
df_train.head()

Unnamed: 0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,Health,Quantity,Fee,State,RescuerID,VideoAmt,Description,PetID,PhotoAmt,AdoptionSpeed
0,1,â¥â¥â¥ Lily â¥â¥â¥,36,307,0,2,2,7,0,2,...,1,1,0,41326,337914b09c2fa5460e195197e994ef98,0,Adorable 3 year old Lily looking for a forever...,3f8824a3b,1.0,4
1,2,Cookie,3,266,0,1,6,7,0,2,...,1,1,0,41327,4bb1ebb92158078ad54a6bb23c10dffc,0,i rescue this stary kitten from market near my...,9238eb7fc,1.0,2
2,2,Favour Speedy Abundance And Courage,7,250,252,1,1,2,0,2,...,1,4,0,41327,99ba8ce53b4d8515e417e7921563d923,0,The mother was a Burmese cross and had since p...,f0a1f2b90,2.0,4
3,1,,3,307,0,1,2,0,0,3,...,1,1,0,41327,3f3ef74c486beba3bc87f6dbaee772bf,0,This puppy is: 1. Male 2. 3 months old 3. Brow...,7d028bdea,4.0,2
4,2,Abandoned Kitty,1,266,0,1,1,6,7,1,...,1,1,0,41401,844f03ab8054007d4be6686f3a9702b9,0,Mother cat gave birth to a litter of 3 and too...,8377bfe97,0.0,2


In [4]:
for col in ['Type', 'Gender', 'Color1', 'Color2','Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed','Sterilized', 'Health','State']:
    df_dummies = pd.get_dummies(df_train[col])
    df_dummies.columns = [str(col)+'_'+str(c) for c in df_dummies.columns]
    df_train= pd.concat([df_train, df_dummies], axis=1)
  
    del df_train[col]

df_train.columns

Index(['Name', 'Age', 'Breed1', 'Breed2', 'Quantity', 'Fee', 'RescuerID',
       'VideoAmt', 'Description', 'PetID', 'PhotoAmt', 'AdoptionSpeed',
       'Type_1', 'Type_2', 'Gender_1', 'Gender_2', 'Gender_3', 'Color1_1',
       'Color1_2', 'Color1_3', 'Color1_4', 'Color1_5', 'Color1_6', 'Color1_7',
       'Color2_0', 'Color2_2', 'Color2_3', 'Color2_4', 'Color2_5', 'Color2_6',
       'Color2_7', 'Color3_0', 'Color3_3', 'Color3_4', 'Color3_5', 'Color3_6',
       'Color3_7', 'MaturitySize_1', 'MaturitySize_2', 'MaturitySize_3',
       'MaturitySize_4', 'FurLength_1', 'FurLength_2', 'FurLength_3',
       'Vaccinated_1', 'Vaccinated_2', 'Vaccinated_3', 'Dewormed_1',
       'Dewormed_2', 'Dewormed_3', 'Sterilized_1', 'Sterilized_2',
       'Sterilized_3', 'Health_1', 'Health_2', 'Health_3', 'State_41324',
       'State_41325', 'State_41326', 'State_41327', 'State_41330',
       'State_41332', 'State_41335', 'State_41336', 'State_41342',
       'State_41345', 'State_41361', 'State_41367', 'St

In [5]:
for col in ['Type',  'Gender', 'Color1', 'Color2', 
       'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized', 'Health', 'State']:
    df_dummies = pd.get_dummies(df_test[col])
    df_dummies.columns=[str(col)+'_'+str(c) for c in df_dummies.columns]
    df_test = pd.concat([df_test, df_dummies], axis=1)
    del df_test[col]


In [6]:
#adding the missing columns to the train and set their values to 0
for column in df_test.columns:
    if not column in df_train.columns:
            df_train[column]=0
            df_trial[column]=0
print(df_train.shape, df_test.shape)

(10000, 70) (4993, 69)


In [7]:
#adding the missing columns to the test and set their values to 0
for column in df_train.columns:
    if not column in df_test.columns:
        if column != 'AdoptionSpeed':
            df_test[column]=0
print(df_train.shape, df_test.shape)

(10000, 70) (4993, 69)


In [8]:
X_columns = ['Age', 'Breed1', 'Breed2', 'Quantity', 'Fee', 
       'VideoAmt', 'PhotoAmt', 'Type_1', 'Type_2', 'Gender_1', 'Gender_2', 'Gender_3', 'Color1_1',
       'Color1_2', 'Color1_3', 'Color1_4', 'Color1_5', 'Color1_6', 'Color1_7',
       'Color2_0', 'Color2_2', 'Color2_3', 'Color2_4', 'Color2_5', 'Color2_6',
       'Color2_7', 'Color3_0', 'Color3_3', 'Color3_4', 'Color3_5', 'Color3_6',
       'Color3_7', 'MaturitySize_1', 'MaturitySize_2', 'MaturitySize_3',
       'MaturitySize_4', 'FurLength_1', 'FurLength_2', 'FurLength_3',
       'Vaccinated_1', 'Vaccinated_2', 'Vaccinated_3', 'Dewormed_1',
       'Dewormed_2', 'Dewormed_3', 'Sterilized_1', 'Sterilized_2',
       'Sterilized_3', 'Health_1', 'Health_2', 'Health_3', 'State_41324',
       'State_41325', 'State_41326', 'State_41327', 'State_41330',
       'State_41332', 'State_41335', 'State_41336', 'State_41342',
       'State_41345', 'State_41361', 'State_41367', 'State_41401',
       'State_41415']
y_column = ['AdoptionSpeed']

In [10]:
# split the data

threshold = 0.8
X = df_train[X_columns]
y = df_train[y_column]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1.0-threshold, shuffle=True)

print('X_train', X_train.shape)
print('y_train', y_train.shape)
print('X_test', X_test.shape)
print('y_test', y_test.shape)

X_train (8000, 65)
y_train (8000, 1)
X_test (2000, 65)
y_test (2000, 1)


In [11]:
models = [
    ('Naive Bayes', GaussianNB()),
    ('RandomForestClassifier10', RandomForestClassifier(n_estimators=10)),
    ('RandomForestClassifier100', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('KNeighborsClassifier', KNeighborsClassifier()),
    ('DecisionTreeClassifier', DecisionTreeClassifier())    
]
results = []
for m in models:
    print('MODEL', m[0])
    model = m[1]
    model.fit(X_train, y_train.values.ravel())
    y_pred = model.predict(X_test)
    kappa = cohen_kappa_score(y_test, y_pred, weights ='quadratic')
    print('kappa', round(kappa, 4))
    print(confusion_matrix(y_test, y_pred))

    results.append([m[0], kappa])
    
    # if there is a feature importance, print top 5
    importance = []
    if hasattr(model, 'feature_importances_'):
        print('Feature Importance')
        importance = []
        for i in range(len(X_columns)):
            importance.append([X_columns[i], model.feature_importances_[i]])
        print(pd.DataFrame(importance).sort_values(by=1, ascending=False).tail(10))
    elif hasattr(model, 'coef_'):
        print('Feature Importance')
        for i in range(len(X_columns)):
            importance.append([X_columns[i], model.coef_[i]])
        print(pd.DataFrame(importance).sort_values(by=1, ascending=False).tail(10))
        
    print('')

# sort the results and print as a table
df_results = pd.DataFrame(results)
df_results.columns = ['model', 'kappa']
df_results = df_results.sort_values(by='kappa', ascending=False)
df_results

MODEL Naive Bayes
kappa 0.1071
[[ 34  16   3   3   3]
 [179 156  38  33  17]
 [202 145  55  67  32]
 [147 106  50 107  27]
 [228 115  55 109  73]]

MODEL RandomForestClassifier10
kappa 0.2566
[[  5  20  20   7   7]
 [  8 135 145  65  70]
 [  3 138 169  90 101]
 [  6  93 118 121  99]
 [  8  99 119  78 276]]
Feature Importance
                 0         1
28        Color3_4  0.002536
52     State_41325  0.001891
57     State_41335  0.001312
60     State_41345  0.000690
35  MaturitySize_4  0.000577
61     State_41361  0.000514
50        Health_3  0.000486
59     State_41342  0.000433
62     State_41367  0.000170
64     State_41415  0.000053

MODEL RandomForestClassifier100
kappa 0.3306
[[  3  24  18   5   9]
 [  7 132 141  54  89]
 [  1 108 160  92 140]
 [  1  64 124 130 118]
 [  1  60 107  56 356]]
Feature Importance
                 0         1
51     State_41324  0.002138
52     State_41325  0.001903
57     State_41335  0.001527
60     State_41345  0.000811
35  MaturitySize_4  0.000600

Unnamed: 0,model,kappa
2,RandomForestClassifier100,0.330612
3,KNeighborsClassifier,0.266082
1,RandomForestClassifier10,0.256632
4,DecisionTreeClassifier,0.205417
0,Naive Bayes,0.107147


In [12]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train.values.ravel())
y_pred = model.predict(X_test)

In [None]:
k = 10
results = []
kf = KFold(n_splits=k)
for train_index, test_index in kf.split(X):
    X_train, X_test = X.values[train_index], X.values[test_index]
    y_train, y_test = y.values[train_index], y.values[test_index]
    model.fit(X_train, y_train.ravel())
    y_pred = model.predict(X_test)
    kappa = cohen_kappa_score(y_test, y_pred, weights ='quadratic')
    results.append(round(kappa, 4))

print('Kappa for each fold:', results)
print('AVG(kappa)', round(np.mean(results), 4))
print('STD(kappa)', round(np.std(results), 4))

In [None]:
df_prediction = df_test[X_columns]
df_test['AdoptionSpeed'] = model.predict(df_prediction)
df_test[['PetID', 'AdoptionSpeed']]

In [None]:
df_test[['PetID', 'AdoptionSpeed']].to_csv('submission_knn.csv', index=False)