In [15]:
# data manipulation libraries
import pandas as pd
import json
import numpy as np

# plotting libraries
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# preprocessing tools
import sklearn
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# feature selection
from sklearn import feature_selection

# classification models
from sklearn import ensemble, linear_model, naive_bayes, neighbors, neural_network, svm, tree, discriminant_analysis
from xgboost import XGBClassifier

# model selection
from sklearn import model_selection

In [5]:
# Read the training and test sets from their CSV files (training file first).
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', './test/test.csv'))
# Preview the first rows of the training data.
train.head()

Unnamed: 0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,Health,Quantity,Fee,State,RescuerID,VideoAmt,Description,PetID,PhotoAmt,AdoptionSpeed
0,2,Nibble,3,299,0,1,1,7,0,1,...,1,1,100,41326,8480853f516546f6cf33aa88cd76c379,0,Nibble is a 3+ month old ball of cuteness. He ...,86e1089a3,1.0,2
1,2,No Name Yet,1,265,0,1,1,2,0,2,...,1,1,0,41401,3082c7125d8fb66f7dd4bff4192c8b14,0,I just found it alone yesterday near my apartm...,6296e909a,2.0,0
2,1,Brisco,1,307,0,1,2,7,0,2,...,1,1,0,41326,fa90fa5b1ee11c86938398b60abc32cb,0,Their pregnant mother was dumped by her irresp...,3422e4906,7.0,3
3,1,Miko,4,307,0,2,1,2,0,2,...,1,1,150,41401,9238e4f44c71a75282e62f7136c6b240,0,"Good guard dog, very alert, active, obedience ...",5842f1ff5,8.0,2
4,1,Hunter,1,307,0,1,1,0,0,2,...,1,1,0,41326,95481e953f8aed9ec3d16fc4509537e8,0,This handsome yet cute boy is up for adoption....,850a43f90,3.0,2


In [7]:
# Summarise the training frame: row count, columns, non-null counts and dtypes.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14993 entries, 0 to 14992
Data columns (total 24 columns):
Type             14993 non-null int64
Name             13736 non-null object
Age              14993 non-null int64
Breed1           14993 non-null int64
Breed2           14993 non-null int64
Gender           14993 non-null int64
Color1           14993 non-null int64
Color2           14993 non-null int64
Color3           14993 non-null int64
MaturitySize     14993 non-null int64
FurLength        14993 non-null int64
Vaccinated       14993 non-null int64
Dewormed         14993 non-null int64
Sterilized       14993 non-null int64
Health           14993 non-null int64
Quantity         14993 non-null int64
Fee              14993 non-null int64
State            14993 non-null int64
RescuerID        14993 non-null object
VideoAmt         14993 non-null int64
Description      14981 non-null object
PetID            14993 non-null object
PhotoAmt         14993 non-null float64
AdoptionSpe

In [9]:
# Average adoption speed per pet type, to compare dogs against cats
train.groupby('Type')[['AdoptionSpeed']].mean()

Unnamed: 0_level_0,AdoptionSpeed
Type,Unnamed: 1_level_1
1,2.615101
2,2.399504


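# The mean adoption speed is similar for the two pet types (about 2.62 for Type 1 vs. 2.40 for Type 2), so there isn't much of a difference between cats and dogs in terms of adoption speed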

In [17]:
# Count the distinct values in the Name column (a missing name counts as one value)
print('Number of unique names: {0:d}'.format(train['Name'].nunique(dropna=False)))

Number of unique names: 9061


In [22]:
# Fill missing names with the placeholder 'No Name Yet'.
# The result is assigned back rather than using fillna(inplace=True) on the
# column selection: that form is a chained in-place call which pandas
# deprecates and which leaves `train` unchanged under Copy-on-Write.
train['Name'] = train['Name'].fillna('No Name Yet')

# Rows carrying the placeholder: the names filled above plus any listing that
# was already entered as 'No Name Yet' in the raw data.
print('Number of pets with no name: {0:d}'.format(int(train['Name'].eq('No Name Yet').sum())))

Number of pets with no name: 1279


# Next we will look at whether certain names are more favourable

In [30]:
# Per-name summary table: one row per distinct name, in order of first appearance.
#   AdoptionSpeed - mean AdoptionSpeed of the listings using the name, or the
#                   placeholder below when the name has too few listings
#   Count         - number of listings that use the name
MIN_NAME_COUNT = 5           # a name needs at least this many listings to get a mean (was: count > 4)
UNRATED_ADOPTION_SPEED = 10  # placeholder for rarer names; above the means seen so far, so they sort last

# Aggregate once with groupby instead of masking `Names` for every single name.
# The per-name loop scaled with (names x rows) and wrote float means into a
# column initialised as int, which newer pandas versions deprecate.
per_name = train.groupby('Name')['AdoptionSpeed'].agg(['mean', 'size'])

Names = pd.DataFrame(data=train['Name'].unique(), columns=['Name'])
# Names the groupby does not report (a missing name, if any remain) get a count of 0.
listing_count = Names['Name'].map(per_name['size']).fillna(0).astype('int64')
Names['AdoptionSpeed'] = (
    Names['Name'].map(per_name['mean'])
    .where(listing_count >= MIN_NAME_COUNT, UNRATED_ADOPTION_SPEED)
    .astype(float)
)
Names['Count'] = listing_count  # count number of occurrences of name

Unnamed: 0,Name,AdoptionSpeed,Count
330,Shih Tzu,1.333333,9
188,Sweety,1.333333,6
4988,Tam Tam,1.400000,5
1329,Boboy,1.400000,5
1366,Tarzan,1.400000,5
1453,Suki,1.500000,6
437,Misty,1.571429,7
2112,Didi,1.600000,5
30,Comel,1.714286,7
283,Baby Girl,1.800000,5


In [31]:
# Rank names from fastest to slowest mean adoption (smallest AdoptionSpeed first)
Names.sort_values('AdoptionSpeed', ascending=True)

Unnamed: 0,Name,AdoptionSpeed,Count
330,Shih Tzu,1.333333,9
188,Sweety,1.333333,6
4988,Tam Tam,1.400000,5
1329,Boboy,1.400000,5
1366,Tarzan,1.400000,5
1453,Suki,1.500000,6
437,Misty,1.571429,7
2112,Didi,1.600000,5
30,Comel,1.714286,7
283,Baby Girl,1.800000,5


In [33]:
# Rank names by how often they occur, most frequent first
Names.sort_values('Count', ascending=False)

Unnamed: 0,Name,AdoptionSpeed,Count
1,No Name Yet,2.594214,1279
170,Baby,2.757576,66
316,Lucky,2.343750,64
14,Brownie,2.574074,54
23,No Name,2.481481,54
96,Mimi,2.442308,52
15,Blackie,2.387755,49
268,Puppy,2.888889,45
331,Kittens,2.589744,39
7,Kitty,2.282051,39


# Many of the names are odd or are placeholders (e.g. "No Name Yet", "Puppy", "Kittens"), which suggests people generally don't care much about what they put in the name field. We conclude that the name is probably not an important feature.