In [1]:
# import the library
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn :: utils
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold

# sklearn :: models
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

# sklearn :: evaluation metrics
from sklearn.metrics import cohen_kappa_score

sns.set_style('whitegrid')
pd.set_option('display.max_columns',50)

# Problem definition

Predict when a pet will be adopted

# Load the data

In [2]:
# load data - source: (https://www.kaggle.com/c/cebd-1260-spring-2019-classification/data)
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')
print(df_train.shape, df_test.shape)

(10000, 24) (4993, 23)


In [3]:
# load external datasets

# colour and breed info - source: https://www.kaggle.com/c/petfinder-adoption-prediction/data
df_colours = pd.read_csv('data/color_labels.csv')
df_breeds = pd.read_csv('data/breed_labels.csv')

# breed characteristics - source: https://www.kaggle.com/rturley/pet-breed-characteristics
df_cats = pd.read_csv('data/cat_breed_characteristics.csv')
df_dogs = pd.read_csv('data/dog_breed_characteristics.csv')

# Feature Engineering

In [4]:
# training data
print(df_train.columns)
df_train.head()

Index(['Type', 'Name', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
       'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized', 'Health', 'Quantity', 'Fee', 'State', 'RescuerID',
       'VideoAmt', 'Description', 'PetID', 'PhotoAmt', 'AdoptionSpeed'],
      dtype='object')


Unnamed: 0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,Health,Quantity,Fee,State,RescuerID,VideoAmt,Description,PetID,PhotoAmt,AdoptionSpeed
0,1,â¥â¥â¥ Lily â¥â¥â¥,36,307,0,2,2,7,0,2,2,1,2,1,1,1,0,41326,337914b09c2fa5460e195197e994ef98,0,Adorable 3 year old Lily looking for a forever...,3f8824a3b,1.0,4
1,2,Cookie,3,266,0,1,6,7,0,2,1,2,1,2,1,1,0,41327,4bb1ebb92158078ad54a6bb23c10dffc,0,i rescue this stary kitten from market near my...,9238eb7fc,1.0,2
2,2,Favour Speedy Abundance And Courage,7,250,252,1,1,2,0,2,1,1,1,2,1,4,0,41327,99ba8ce53b4d8515e417e7921563d923,0,The mother was a Burmese cross and had since p...,f0a1f2b90,2.0,4
3,1,,3,307,0,1,2,0,0,3,1,2,2,2,1,1,0,41327,3f3ef74c486beba3bc87f6dbaee772bf,0,This puppy is: 1. Male 2. 3 months old 3. Brow...,7d028bdea,4.0,2
4,2,Abandoned Kitty,1,266,0,1,1,6,7,1,1,2,2,2,1,1,0,41401,844f03ab8054007d4be6686f3a9702b9,0,Mother cat gave birth to a litter of 3 and too...,8377bfe97,0.0,2


In [5]:
# inspect datatypes and check for missing values
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 24 columns):
Type             10000 non-null int64
Name             9158 non-null object
Age              10000 non-null int64
Breed1           10000 non-null int64
Breed2           10000 non-null int64
Gender           10000 non-null int64
Color1           10000 non-null int64
Color2           10000 non-null int64
Color3           10000 non-null int64
MaturitySize     10000 non-null int64
FurLength        10000 non-null int64
Vaccinated       10000 non-null int64
Dewormed         10000 non-null int64
Sterilized       10000 non-null int64
Health           10000 non-null int64
Quantity         10000 non-null int64
Fee              10000 non-null int64
State            10000 non-null int64
RescuerID        10000 non-null object
VideoAmt         10000 non-null int64
Description      9992 non-null object
PetID            10000 non-null object
PhotoAmt         10000 non-null float64
AdoptionSpeed 

In [6]:
# merge cat breed codes with characteristics
df_cat_breeds = df_breeds[df_breeds['Type']==2]
df_cat_breeds = df_cat_breeds.merge(df_cats, how='left')
df_cat_breeds = df_cat_breeds[['BreedID','Type','BreedName','Temperment','MalaysiaPopularity','PopularityUS2017']]
df_cat_breeds=df_cat_breeds.fillna(0)
df_cat_breeds.head()

Unnamed: 0,BreedID,Type,BreedName,Temperment,MalaysiaPopularity,PopularityUS2017
0,241,2,Abyssinian,"Active, Energetic, Independent, Intelligent, G...",0.0,7.0
1,242,2,American Curl,"Affectionate, Curious, Intelligent, Interactiv...",0.0,26.0
2,243,2,American Shorthair,"Active, Curious, Easygoing, Playful, Calm",5.0,6.0
3,244,2,American Wirehair,"Affectionate, Curious, Gentle, Intelligent, In...",0.0,41.0
4,245,2,Applehead Siamese,"Active, Agile, Clever, Sociable, Loving, Energ...",0.0,0.0


In [7]:
# merge dog breed codes with characteristics
df_dog_breeds = df_breeds[df_breeds['Type']==1]
df_dog_breeds = df_dog_breeds.merge(df_dogs, how='left')
df_dog_breeds = df_dog_breeds[['BreedID','Type','BreedName','Temperment','MalaysiaPopularity','PopularityUS2017']]
df_dog_breeds = df_dog_breeds.fillna(0)
df_dog_breeds.head()

Unnamed: 0,BreedID,Type,BreedName,Temperment,MalaysiaPopularity,PopularityUS2017
0,1,1,Affenpinscher,"Active, Adventurous, Curious, Fun-loving, Play...",0.0,147.0
1,2,1,Afghan Hound,"Aloof, Clownish, Dignified, Happy, Independent",0.0,93.0
2,3,1,Airedale Terrier,"Alert, Confident, Courageous, Friendly, Intell...",0.0,55.0
3,4,1,Akbash,"Affectionate, Alert, Courageous, Independent, ...",0.0,0.0
4,5,1,Akita,"Alert, Courageous, Dignified, Docile, Friendly...",0.0,47.0


In [8]:
# merge breeds and one-hot encode temperment
df_all_breeds = pd.concat([df_dog_breeds,df_cat_breeds],ignore_index=True)
one_hot = df_all_breeds['Temperment'].str.replace(' ','').str.split(',').str.join('|').str.get_dummies()
sum = one_hot.sum(axis=0)
top_characteristics = list(sum[sum>=30].index)
# top_characteristics = list(sum.index)
df_all_breeds = pd.concat([df_all_breeds,one_hot[top_characteristics]], axis=1)
df_all_breeds = df_all_breeds.drop(['BreedName','Temperment'],axis=1)

In [9]:
# create temperment features
df_train = df_train.merge(df_all_breeds, how='left', left_on=['Breed1','Type'], right_on=['BreedID','Type'])
df_test = df_test.merge(df_all_breeds, how='left', left_on=['Breed1','Type'], right_on=['BreedID','Type'])

In [10]:
# check for merge errors
errors_train = df_train.loc[df_train['BreedID'].isnull(),['Breed1','Type','BreedID']]
errors_test = df_test.loc[df_test['BreedID'].isnull(),['Breed1','Type','BreedID']]
print(errors_train)
print(errors_test)

# these are dog breeds labeled as cats, or no breed given, so dropping from training set and filling test with zeros
df_train = df_train.drop(errors_train.index)
df_test['BreedID'] = df_test['BreedID'].fillna(0)

      Breed1  Type  BreedID
260      307     2      NaN
1198       0     1      NaN
1206     218     2      NaN
2295      15     2      NaN
2453      70     2      NaN
2921     307     2      NaN
3781     307     2      NaN
4202     307     2      NaN
4540       0     2      NaN
6768      25     2      NaN
7094     205     2      NaN
9442      25     2      NaN
      Breed1  Type  BreedID
1869     114     2      NaN
2101      21     2      NaN
2353       0     1      NaN
4577       0     1      NaN
4855       0     1      NaN


In [11]:
# create features

# description length (if missing set to 0)
df_train['DescriptionLength'] = df_train['Description'].str.split(' ').str.len()
df_train.loc[df_train['Description'].isnull(),'DescriptionLength'] = 0
df_test['DescriptionLength'] = df_test['Description'].str.split(' ').str.len()
df_test.loc[df_test['Description'].isnull(),'DescriptionLength'] = 0

# description - number of characters (if missing set to 0)
df_train['DescriptionChars'] = df_train['Description'].str.len()
df_train.loc[df_train['Description'].isnull(),'DescriptionChars'] = 0
df_test['DescriptionChars'] = df_test['Description'].str.len()
df_test.loc[df_test['Description'].isnull(),'DescriptionChars'] = 0

# description - average word length - total characters minus #words-1 (to account for spaces) / #words
df_train['DescriptionAvgWord'] = (df_train['DescriptionChars']-(df_train['DescriptionLength']-1))/df_train['DescriptionLength']
df_train = df_train.replace(np.inf, np.nan)
df_train['DescriptionAvgWord'] = df_train['DescriptionAvgWord'].fillna(0)
df_test['DescriptionAvgWord'] = (df_test['DescriptionChars']-(df_test['DescriptionLength']-1))/df_test['DescriptionLength']
df_test = df_test.replace(np.inf, np.nan)
df_test['DescriptionAvgWord'] = df_test['DescriptionAvgWord'].fillna(0)

# no name, or includes numbers/non-alpha characters (likely a code rather than name for short ones) - fill missing name value with '0'
df_train['Name'] = df_train['Name'].fillna('0')
df_train['NoName'] = (df_train['Name'].str.len()<=3)&(~df_train['Name'].str.isalpha())
df_test['Name'] = df_test['Name'].fillna('0')
df_test['NoName'] = (df_test['Name'].str.len()<=3)&(~df_test['Name'].str.isalpha())

# pure-bred i.e. breed not 'mixed breed' (307) or 'domestic' (264, 265, 266) and no second breed
df_train['PureBred'] = (df_train['Breed1'] != 307) & (df_train['Breed1'] != 264) & (df_train['Breed1'] != 265) & (df_train['Breed1'] != 266) & (df_train['Breed2'] == 0)
df_test['PureBred'] = (df_test['Breed1'] != 307) & (df_test['Breed1'] != 264) & (df_test['Breed1'] != 265) & (df_test['Breed1'] != 266) & (df_test['Breed2'] == 0)

# free
df_train['Free'] = df_train['Fee']==0
df_test['Free'] = df_test['Fee']==0

# fee ranges
bins = [-1, 0, 20, 50, 100, 200, np.inf]
names = list(range(len(bins)-1))
df_train['FeeRange'] = pd.cut(df_train['Fee'], bins, labels=names)
df_test['FeeRange'] = pd.cut(df_test['Fee'], bins, labels=names)

# more than one
df_train['MoreThanOne'] = df_train['Quantity']>1
df_test['MoreThanOne'] = df_test['Quantity']>1

# # multi-colour
# df_train['MultiColour'] = df_train['Color2'] != 0 
# df_test['MultiColour'] = df_test['Color2'] != 0

# # rare breed
# rare_breeds = list(df_train.Breed1.value_counts()[df_train.Breed1.value_counts()<5].index)
# df_train['RareBreed'] = df_train['Breed1'].isin(rare_breeds)
# df_test['RareBreed'] = df_test['Breed1'].isin(rare_breeds)

In [12]:
# create health factor feature (combo of vaccinated, dewormed, sterilized)

## with health
# df_train['HealthFactor'] = df_train['Vaccinated'].astype('str') + df_train['Dewormed'].astype('str') + df_train['Sterilized'].astype('str') + df_train['Health'].astype('str')
# df_test['HealthFactor'] = df_test['Vaccinated'].astype('str') + df_test['Dewormed'].astype('str') + df_test['Sterilized'].astype('str') + df_test['Health'].astype('str')

## without health
df_train['HealthFactor'] = df_train['Vaccinated'].astype('str') + df_train['Dewormed'].astype('str') + df_train['Sterilized'].astype('str')
df_test['HealthFactor'] = df_test['Vaccinated'].astype('str') + df_test['Dewormed'].astype('str') + df_test['Sterilized'].astype('str')

# map unique factors to numeric (healthfactorID)
all_health_factors = df_train['HealthFactor'].append(df_test['HealthFactor'])

unique_health_factors = list(all_health_factors.value_counts().index)
health_factor_map = {}

for i, ID in enumerate(unique_health_factors):
    health_factor_map.update({ID : (i+1)})
    
new_healthID = {i : k for i, k in health_factor_map.items()}

df_train['HealthFactorID'] = df_train.HealthFactor.replace(new_healthID)
df_test['HealthFactorID'] = df_test.HealthFactor.replace(new_healthID)

In [13]:
# create colourcombo feature

# prepare colours dataframes for merge
df_colour1 = df_colours.copy()
df_colour1.columns = ['Color1','ColorName1']

df_colour2 = df_colours.copy()
df_colour2.columns = ['Color2','ColorName2']

df_colour3 = df_colours.copy()
df_colour3.columns = ['Color3','ColorName3']

# merge colours
for dataframe in [df_colour1, df_colour2, df_colour3]:
    df_train = df_train.merge(dataframe,how='left')
    df_test = df_test.merge(dataframe,how='left')
    
# fill missing values
for col in ['ColorName1', 'ColorName2', 'ColorName3']:
    df_train[col] = df_train[col].fillna('')
    df_test[col] = df_test[col].fillna('')
    
# create colour combo
df_train['ColourCombo'] = df_train['ColorName1'] + df_train['ColorName2'] + df_train['ColorName3']
df_test['ColourCombo'] = df_test['ColorName1'] + df_test['ColorName2'] + df_test['ColorName3']

# map unique colour combos to numeric (colourcomboID)
all_colourcombos = df_train['ColourCombo'].append(df_test['ColourCombo'])

unique_colour_combos = list(all_colourcombos.value_counts().index)

colourcombo_map = {}
for i, ID in enumerate(unique_colour_combos):
    colourcombo_map.update({ID : (i+1)})
    
new_colourID = {i : k for i, k in colourcombo_map.items()}

df_train['ColourComboID'] = df_train.ColourCombo.replace(new_colourID)
df_test['ColourComboID'] = df_test.ColourCombo.replace(new_colourID)

In [14]:
# create breedcombo feature

# prepare breeds dataframes for merge
df_breed1 = df_breeds[['BreedID','BreedName']]
df_breed1.columns = ['Breed1','BreedName1']

df_breed2 = df_breeds[['BreedID','BreedName']]
df_breed2.columns = ['Breed2','BreedName2']

# merge breeds
for dataframe in [df_breed1, df_breed2]:
    df_train = df_train.merge(dataframe,how='left')
    df_test = df_test.merge(dataframe,how='left')

# fill missing values
for col in ['BreedName1', 'BreedName2']:
    df_train[col] = df_train[col].fillna('')
    df_test[col] = df_test[col].fillna('')
    
# create breed combo
df_train['BreedCombo'] = df_train['BreedName1'] + df_train['BreedName2']
df_test['BreedCombo'] = df_test['BreedName1'] + df_test['BreedName2']

# map unique colour combos to numeric (colourcomboID)
all_breedcombos = df_train['BreedCombo'].append(df_test['BreedCombo'])

unique_breed_combos = list(all_breedcombos.value_counts().index)

breedcombo_map = {}
for i, ID in enumerate(unique_breed_combos):
    breedcombo_map.update({ID : (i+1)})
    
new_breedID = {i : k for i, k in breedcombo_map.items()}

df_train['BreedComboID'] = df_train.BreedCombo.replace(new_breedID)
df_test['BreedComboID'] = df_test.BreedCombo.replace(new_breedID)

In [15]:
# # apply dummies

# dummies_cols = ['MaturitySize','FurLength','Vaccinated', 'Dewormed', 'Sterilized', 'Health', 'Color1']
# for col in dummies_cols:
#     dummies_train=pd.get_dummies(df_train[col],prefix=col)
#     dummies_test=pd.get_dummies(df_test[col],prefix=col)
#     df_train = pd.concat([df_train,dummies_train], axis=1)
#     df_test = pd.concat([df_test,dummies_test], axis=1)
    
# df_train=df_train.drop(dummies_cols, axis=1)
# df_test=df_test.drop(dummies_cols, axis=1)

# dummies_cols = ['FeeRange']
# for col in dummies_cols:
#     dummies_train=pd.get_dummies(df_train[col],prefix=col)
#     dummies_test=pd.get_dummies(df_test[col],prefix=col)
#     df_train = pd.concat([df_train,dummies_train], axis=1)
#     df_test = pd.concat([df_test,dummies_test], axis=1)
    
# df_train=df_train.drop(dummies_cols, axis=1)
# df_test=df_test.drop(dummies_cols, axis=1)

In [16]:
# create feature with top rescuers - i.e. rescued > 10 pets (label rest as 0 for other), and feature for # pets rescued per rescuer
all_rescuers = df_train['RescuerID'].append(df_test['RescuerID'])

rescuer_counts = all_rescuers.value_counts()

num_top_rescuers = (rescuer_counts>10).sum()

top_rescuers = list(df_train.RescuerID.value_counts().index[:num_top_rescuers])

rescuerID_map = {}
for i, ID in enumerate(top_rescuers):
    rescuerID_map.update({ID : (i+1)})
    
new_rescuerID = {i : k for i, k in rescuerID_map.items()}

df_train['RescuerIDNumeric'] = df_train.loc[df_train['RescuerID'].isin(top_rescuers),'RescuerID'].replace(new_rescuerID)
df_train['RescuerIDNumeric'] = df_train['RescuerIDNumeric'].fillna(0)
df_test['RescuerIDNumeric'] = df_test.loc[df_test['RescuerID'].isin(top_rescuers),'RescuerID'].replace(new_rescuerID)
df_test['RescuerIDNumeric'] = df_test['RescuerIDNumeric'].fillna(0)

# create rescuer count feature
df_train['RescuerCount'] = df_train['RescuerID'].map(rescuer_counts)
df_test['RescuerCount'] = df_test['RescuerID'].map(rescuer_counts)

In [17]:
print(df_train.columns)
print(df_train.shape)

Index(['Type', 'Name', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
       'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized', 'Health', 'Quantity', 'Fee', 'State', 'RescuerID',
       'VideoAmt', 'Description', 'PetID', 'PhotoAmt', 'AdoptionSpeed',
       'BreedID', 'MalaysiaPopularity', 'PopularityUS2017', 'Active',
       'Affectionate', 'Alert', 'Courageous', 'Curious', 'Energetic',
       'Friendly', 'Gentle', 'Independent', 'Intelligent', 'Lively', 'Loving',
       'Loyal', 'Playful', 'Protective', 'Social', 'DescriptionLength',
       'DescriptionChars', 'DescriptionAvgWord', 'NoName', 'PureBred', 'Free',
       'FeeRange', 'MoreThanOne', 'HealthFactor', 'HealthFactorID',
       'ColorName1', 'ColorName2', 'ColorName3', 'ColourCombo',
       'ColourComboID', 'BreedName1', 'BreedName2', 'BreedCombo',
       'BreedComboID', 'RescuerIDNumeric', 'RescuerCount'],
      dtype='object')
(9988, 64)


In [18]:
# select the columns

# drop non-numeric/redundant columns
X_columns = list(df_train.drop(['Name','RescuerID','Description','PetID','AdoptionSpeed','ColorName1','ColorName2','ColorName3','ColourCombo','HealthFactor','BreedID','BreedCombo','BreedName1','BreedName2'],axis=1).columns)

# X_columns = ['Age', 'Fee', 'Health']
y_column = ['AdoptionSpeed']

# Model Training

In [19]:
# split the data using sklearn

threshold = 0.8
X = df_train[X_columns]
y = df_train[y_column]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1.0-threshold, shuffle=True)

print('X_train', X_train.shape)
print('y_train', y_train.shape)
print('X_test', X_test.shape)
print('y_test', y_test.shape)

X_train (7990, 50)
y_train (7990, 1)
X_test (1998, 50)
y_test (1998, 1)


In [20]:
# TODO: try different models
models = [
    ('GaussianNB', GaussianNB()),
    ('KNeighborsClassifier', KNeighborsClassifier()),
    ('DecisionTreeClassifier', DecisionTreeClassifier()),
    ('RandomForestClassifier 10', RandomForestClassifier(10)),
    ('RandomForestClassifier 100', RandomForestClassifier(100)),
    ('GradientBoostingClassifier', GradientBoostingClassifier())
]
results = []
for m in models:
    print('MODEL', m[0])
    model = m[1]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    kappa = cohen_kappa_score(y_test, y_pred, weights ='quadratic')
    print(confusion_matrix(y_test, y_pred))
    print('kappa', round(kappa, 4))
    print('')
    results.append([m[0], kappa])
    
    # if there is a feature importance, print top 10
    importance = []
    if hasattr(model, 'feature_importances_'):
        print('Feature Importance')
        importance = []
        for i in range(len(X_columns)):
            importance.append([X_columns[i], model.feature_importances_[i]])
        print(pd.DataFrame(importance).sort_values(by=1, ascending=False))
    elif hasattr(model, 'coef_'):
        print('Feature Importance')
        for i in range(len(X_columns)):
            importance.append([X_columns[i], model.coef_[i]])
        print(pd.DataFrame(importance).sort_values(by=1, ascending=False))
        
    print('')

# sort the results and print as a table
df_results = pd.DataFrame(results)
df_results.columns = ['model', 'kappa']
df_results = df_results.sort_values(by='kappa', ascending=False)
df_results

MODEL GaussianNB
[[  8   7   2   3  28]
 [ 54  53  28  23 259]
 [ 56  20  59  56 316]
 [ 39  30  68  63 281]
 [ 32  22  25  39 427]]
kappa 0.1295


MODEL KNeighborsClassifier


  y = column_or_1d(y, warn=True)
  


[[  2  19  11   6  10]
 [ 17 147 120  54  79]
 [ 11 121 183  92 100]
 [ 12 103 162 113  91]
 [  7 113 124  90 211]]
kappa 0.1856


MODEL DecisionTreeClassifier
[[  8  11   5  10  14]
 [ 16 108 116 102  75]
 [ 13 135 163 104  92]
 [ 11  84 128 148 110]
 [ 15  61 150  69 250]]
kappa 0.2047

Feature Importance
                     0         1
39  DescriptionAvgWord  0.112545
38    DescriptionChars  0.084012
49        RescuerCount  0.071140
1                  Age  0.070251
37   DescriptionLength  0.070019
18            PhotoAmt  0.068413
46       ColourComboID  0.068175
47        BreedComboID  0.045541
6               Color2  0.034071
45      HealthFactorID  0.032997
16               State  0.030883
5               Color1  0.030208
48    RescuerIDNumeric  0.025072
2               Breed1  0.024969
8         MaturitySize  0.021298
4               Gender  0.020512
3               Breed2  0.020186
14            Quantity  0.015956
12          Sterilized  0.014852
7               Color3  0.01465

  
  


[[  4  16  13   5  10]
 [  5 155 116  61  80]
 [  7 118 184 100  98]
 [  3  74 168 119 117]
 [  3  62 111  64 305]]
kappa 0.3124

Feature Importance
                     0         1
39  DescriptionAvgWord  0.085365
38    DescriptionChars  0.083286
37   DescriptionLength  0.078563
1                  Age  0.067592
49        RescuerCount  0.062140
18            PhotoAmt  0.059982
46       ColourComboID  0.056966
48    RescuerIDNumeric  0.038548
5               Color1  0.034743
47        BreedComboID  0.034272
6               Color2  0.033361
45      HealthFactorID  0.029793
16               State  0.027702
4               Gender  0.023702
2               Breed1  0.023668
8         MaturitySize  0.021557
9            FurLength  0.020505
3               Breed2  0.019229
14            Quantity  0.018487
7               Color3  0.016956
12          Sterilized  0.016437
11            Dewormed  0.014167
10          Vaccinated  0.013974
15                 Fee  0.012127
43            FeeRange  0.

  y = column_or_1d(y, warn=True)


[[  1  23  13   2   9]
 [  1 141 157  26  92]
 [  1  85 200  81 140]
 [  1  68 170  94 148]
 [  0  46  95  35 369]]
kappa 0.3611

Feature Importance
                     0         1
49        RescuerCount  0.177664
1                  Age  0.151400
18            PhotoAmt  0.075871
2               Breed1  0.074901
47        BreedComboID  0.072044
39  DescriptionAvgWord  0.054403
12          Sterilized  0.048206
38    DescriptionChars  0.042048
48    RescuerIDNumeric  0.029743
20    PopularityUS2017  0.023069
46       ColourComboID  0.022572
16               State  0.022396
45      HealthFactorID  0.021857
14            Quantity  0.021380
37   DescriptionLength  0.021071
3               Breed2  0.018603
8         MaturitySize  0.015111
19  MalaysiaPopularity  0.012983
15                 Fee  0.012602
4               Gender  0.012307
5               Color1  0.010143
9            FurLength  0.007402
10          Vaccinated  0.006263
44         MoreThanOne  0.004318
28              Gentle  0.

Unnamed: 0,model,kappa
4,RandomForestClassifier 100,0.36659
5,GradientBoostingClassifier,0.361057
3,RandomForestClassifier 10,0.312377
2,DecisionTreeClassifier,0.204685
1,KNeighborsClassifier,0.185627
0,GaussianNB,0.129487


In [21]:
# train a classifier based on above results
model = RandomForestClassifier(200)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

  This is separate from the ipykernel package so we can avoid doing imports until


# Model Evaluation

In [22]:
kappa = cohen_kappa_score(y_test, y_pred, weights ='quadratic')
print('kappa', round(kappa, 4))
print(confusion_matrix(y_test, y_pred))

kappa 0.3663
[[  5  17  15   1  10]
 [  1 148 128  33 107]
 [  1 108 194  82 122]
 [  1  59 147 130 144]
 [  0  52  79  33 381]]


Using Cross Validation

In [23]:
k = 10
results = []
kf = KFold(n_splits=k)
for train_index, test_index in kf.split(X):
    X_train, X_test = X.values[train_index], X.values[test_index]
    y_train, y_test = y.values[train_index], y.values[test_index]
    model.fit(X_train, y_train.ravel())
    y_pred = model.predict(X_test)
    kappa = cohen_kappa_score(y_test, y_pred, weights ='quadratic')
    results.append(round(kappa, 4))

print('Kappa for each fold:', results)
print('AVG(kappa)', round(np.mean(results), 4))
print('STD(kappa)', round(np.std(results), 4))

Kappa for each fold: [0.383, 0.4266, 0.4171, 0.4113, 0.3954, 0.3675, 0.3622, 0.419, 0.434, 0.3856]
AVG(kappa) 0.4002
STD(kappa) 0.0238


# Prepare your submission

In [24]:
# model with all training data
model = RandomForestClassifier(200)
model.fit(X, y)
print(X.shape)
print(y.shape)

  This is separate from the ipykernel package so we can avoid doing imports until


(9988, 50)
(9988, 1)


In [25]:
df_prediction = df_test[X_columns].fillna(0)
df_test['AdoptionSpeed'] = model.predict(df_prediction)
df_test[['PetID', 'AdoptionSpeed']]

Unnamed: 0,PetID,AdoptionSpeed
0,f42161740,2
1,0118db3a8,4
2,e5164d828,3
3,5335bfb38,4
4,ff2cf88a0,4
5,1d13441b9,3
6,7d835cf7c,2
7,577d15fea,2
8,91736f444,4
9,db194aec8,2


In [26]:
df_test[['PetID', 'AdoptionSpeed']].to_csv('submission_6.csv', index=False)