# Use sklearn classification methods to predict if a pet will be adopted

## Load the packages set

In [213]:
# import the library
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn :: utils
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# sklearn :: models
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# sklearn :: evaluation metrics
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import mean_squared_error

sns.set_style('whitegrid')

# Load the data set

In [214]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
print(df_train.shape, df_test.shape)

(10000, 24) (4993, 23)


# Feature Engineering

In [215]:
print(df_train.columns)
df_train.head()

Index(['Type', 'Name', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
       'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized', 'Health', 'Quantity', 'Fee', 'State', 'RescuerID',
       'VideoAmt', 'Description', 'PetID', 'PhotoAmt', 'AdoptionSpeed'],
      dtype='object')


Unnamed: 0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,Health,Quantity,Fee,State,RescuerID,VideoAmt,Description,PetID,PhotoAmt,AdoptionSpeed
0,1,â¥â¥â¥ Lily â¥â¥â¥,36,307,0,2,2,7,0,2,...,1,1,0,41326,337914b09c2fa5460e195197e994ef98,0,Adorable 3 year old Lily looking for a forever...,3f8824a3b,1.0,4
1,2,Cookie,3,266,0,1,6,7,0,2,...,1,1,0,41327,4bb1ebb92158078ad54a6bb23c10dffc,0,i rescue this stary kitten from market near my...,9238eb7fc,1.0,2
2,2,Favour Speedy Abundance And Courage,7,250,252,1,1,2,0,2,...,1,4,0,41327,99ba8ce53b4d8515e417e7921563d923,0,The mother was a Burmese cross and had since p...,f0a1f2b90,2.0,4
3,1,,3,307,0,1,2,0,0,3,...,1,1,0,41327,3f3ef74c486beba3bc87f6dbaee772bf,0,This puppy is: 1. Male 2. 3 months old 3. Brow...,7d028bdea,4.0,2
4,2,Abandoned Kitty,1,266,0,1,1,6,7,1,...,1,1,0,41401,844f03ab8054007d4be6686f3a9702b9,0,Mother cat gave birth to a litter of 3 and too...,8377bfe97,0.0,2


In [216]:
# Check for missing values
df_train.isnull().sum(axis = 0)

Type               0
Name             842
Age                0
Breed1             0
Breed2             0
Gender             0
Color1             0
Color2             0
Color3             0
MaturitySize       0
FurLength          0
Vaccinated         0
Dewormed           0
Sterilized         0
Health             0
Quantity           0
Fee                0
State              0
RescuerID          0
VideoAmt           0
Description        8
PetID              0
PhotoAmt           0
AdoptionSpeed      0
dtype: int64

In [217]:
#Lets get rid of 'names' and description as it contains many NaN's and doesnt provide any useful information
df_train.drop(['Name', 'Description'], axis=1)

Unnamed: 0,Type,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,...,Sterilized,Health,Quantity,Fee,State,RescuerID,VideoAmt,PetID,PhotoAmt,AdoptionSpeed
0,1,36,307,0,2,2,7,0,2,2,...,1,1,1,0,41326,337914b09c2fa5460e195197e994ef98,0,3f8824a3b,1.0,4
1,2,3,266,0,1,6,7,0,2,1,...,2,1,1,0,41327,4bb1ebb92158078ad54a6bb23c10dffc,0,9238eb7fc,1.0,2
2,2,7,250,252,1,1,2,0,2,1,...,2,1,4,0,41327,99ba8ce53b4d8515e417e7921563d923,0,f0a1f2b90,2.0,4
3,1,3,307,0,1,2,0,0,3,1,...,2,1,1,0,41327,3f3ef74c486beba3bc87f6dbaee772bf,0,7d028bdea,4.0,2
4,2,1,266,0,1,1,6,7,1,1,...,2,1,1,0,41401,844f03ab8054007d4be6686f3a9702b9,0,8377bfe97,0.0,2
5,1,3,218,0,1,3,5,0,2,1,...,2,1,1,0,41326,aa66486163b6cbc25ea62a34b11c9b91,0,965b31ba7,2.0,1
6,1,2,307,0,2,1,7,0,1,2,...,2,1,1,0,41326,d21f689eab9b3faa1b738ecc836b4b36,0,3760c73b1,1.0,4
7,1,8,307,0,2,6,0,0,2,1,...,1,1,1,0,41336,8f955b588a9e571d8e267cd73cdd8a45,0,f41a7de83,2.0,4
8,2,1,243,245,2,1,2,7,1,1,...,2,2,1,0,41326,2587c9957372fc186d3b95cfd12cf322,0,7b660c6af,4.0,3
9,1,6,307,0,1,2,7,0,1,1,...,2,1,1,0,41327,b84a2dd96249074fc4b276e55f608d21,0,f94c2a347,2.0,4


In [218]:
# apply dummies on the training set
col = 'Health'
df_dummies = pd.get_dummies(df_train[col])
df_dummies.columns = [str(col)+'_'+str(c) for c in df_dummies.columns]
df_train = pd.concat([df_train, df_dummies], axis=1)

# apply the same dummies on the test set
col = 'Health'
df_dummies = pd.get_dummies(df_test[col])
df_dummies.columns = [str(col)+'_'+str(c) for c in df_dummies.columns]
df_test = pd.concat([df_test, df_dummies], axis=1)

In [219]:
# select the columns
X_columns = ['Age', 'Fee', 'Health_1', 'Health_2', 'Health_3', 'PhotoAmt', 'Quantity', 'Breed1', 'Breed2']
y_column = ['AdoptionSpeed']

# Model Training

In [220]:
# split the data using sklearn

threshold = 0.8
X = df_train[X_columns]
y = df_train[y_column]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1.0-threshold, shuffle=True)

print('X_train', X_train.shape)
print('y_train', y_train.shape)
print('X_test', X_test.shape)
print('y_test', y_test.shape)

X_train (8000, 9)
y_train (8000, 1)
X_test (2000, 9)
y_test (2000, 1)


In [221]:
# train a random forest classifier 
model = RandomForestClassifier(n_estimators=100, random_state = 80)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

  This is separate from the ipykernel package so we can avoid doing imports until


# Model Evaluation

In [222]:
kappa = cohen_kappa_score(y_test, y_pred, weights ='quadratic')
print('kappa', round(kappa, 4))
print(confusion_matrix(y_test, y_pred))

kappa 0.2206
[[  5   9  15   8   9]
 [  2 111 136  51 107]
 [  5  75 222  59 132]
 [  7  78 177 110 115]
 [  5  52 133  95 282]]


In [223]:
k = 10
results = []
kf = KFold(n_splits=k)
for train_index, test_index in kf.split(X):
    X_train, X_test = X.values[train_index], X.values[test_index]
    y_train, y_test = y.values[train_index], y.values[test_index]
    model.fit(X_train, y_train.ravel())
    y_pred = model.predict(X_test)
    kappa = cohen_kappa_score(y_test, y_pred, weights ='quadratic')
    results.append(round(kappa, 4))

print('Kappa for each fold:', results)
print('AVG(kappa)', round(np.mean(results), 4))
print('STD(kappa)', round(np.std(results), 4))

Kappa for each fold: [0.2582, 0.2602, 0.2976, 0.2928, 0.2325, 0.2372, 0.2408, 0.3125, 0.2667, 0.2648]
AVG(kappa) 0.2663
STD(kappa) 0.0256


# Prepare for Submission

In [224]:
df_prediction = df_test[X_columns]
df_test['AdoptionSpeed'] = model.predict(df_prediction)
df_test[['PetID', 'AdoptionSpeed']]

Unnamed: 0,PetID,AdoptionSpeed
0,f42161740,1
1,0118db3a8,3
2,e5164d828,2
3,5335bfb38,1
4,ff2cf88a0,4
5,1d13441b9,1
6,7d835cf7c,2
7,577d15fea,4
8,91736f444,4
9,db194aec8,2


In [225]:
df_test[['PetID', 'AdoptionSpeed']].to_csv('submission_knn.csv', index=False)