In [1]:
# data manipulation libraries
import pandas as pd
import json
import numpy as np

# plotting libraries
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# preprocessing tools
import sklearn
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# feature selection
from sklearn import feature_selection

# classification models
from sklearn import ensemble, linear_model, naive_bayes, neighbors, neural_network, svm, tree, discriminant_analysis
from xgboost import XGBClassifier

# model selection
from sklearn import model_selection

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('./test/test.csv')
train.head()

Unnamed: 0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,Health,Quantity,Fee,State,RescuerID,VideoAmt,Description,PetID,PhotoAmt,AdoptionSpeed
0,2,Nibble,3,299,0,1,1,7,0,1,...,1,1,100,41326,8480853f516546f6cf33aa88cd76c379,0,Nibble is a 3+ month old ball of cuteness. He ...,86e1089a3,1.0,2
1,2,No Name Yet,1,265,0,1,1,2,0,2,...,1,1,0,41401,3082c7125d8fb66f7dd4bff4192c8b14,0,I just found it alone yesterday near my apartm...,6296e909a,2.0,0
2,1,Brisco,1,307,0,1,2,7,0,2,...,1,1,0,41326,fa90fa5b1ee11c86938398b60abc32cb,0,Their pregnant mother was dumped by her irresp...,3422e4906,7.0,3
3,1,Miko,4,307,0,2,1,2,0,2,...,1,1,150,41401,9238e4f44c71a75282e62f7136c6b240,0,"Good guard dog, very alert, active, obedience ...",5842f1ff5,8.0,2
4,1,Hunter,1,307,0,1,1,0,0,2,...,1,1,0,41326,95481e953f8aed9ec3d16fc4509537e8,0,This handsome yet cute boy is up for adoption....,850a43f90,3.0,2


In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14993 entries, 0 to 14992
Data columns (total 24 columns):
Type             14993 non-null int64
Name             13736 non-null object
Age              14993 non-null int64
Breed1           14993 non-null int64
Breed2           14993 non-null int64
Gender           14993 non-null int64
Color1           14993 non-null int64
Color2           14993 non-null int64
Color3           14993 non-null int64
MaturitySize     14993 non-null int64
FurLength        14993 non-null int64
Vaccinated       14993 non-null int64
Dewormed         14993 non-null int64
Sterilized       14993 non-null int64
Health           14993 non-null int64
Quantity         14993 non-null int64
Fee              14993 non-null int64
State            14993 non-null int64
RescuerID        14993 non-null object
VideoAmt         14993 non-null int64
Description      14981 non-null object
PetID            14993 non-null object
PhotoAmt         14993 non-null float64
AdoptionSpe

In [4]:
# compare adoption speed between dogs and cats
train[['Type', 'AdoptionSpeed']].groupby('Type').mean()

Unnamed: 0_level_0,AdoptionSpeed
Type,Unnamed: 1_level_1
1,2.615101
2,2.399504


# There isn't much of a difference between cats and dogs in terms of adoption speed

In [5]:
# check name
print('Number of unique names: {0:d}'.format(len(train['Name'].unique())))

Number of unique names: 9061


In [6]:
# fill missing names
train['Name'].fillna('No Name Yet', inplace=True)

print('Number of pets with no name: {0:d}'.format(len(train.loc[train['Name']=='No Name Yet', 'Name'])))

Number of pets with no name: 1279


# Next we will look at whether certain names are more favourable

In [7]:
Names = pd.DataFrame(data=train['Name'].unique(), columns=['Name'])
Names['AdoptionSpeed'] = 10
Names['Count'] = 0 # count number of occurences of name

for name, obj in train[['AdoptionSpeed', 'Name']].groupby('Name'):
    Names.loc[Names['Name']==name, 'Count'] = len(obj)
    if (len(obj) > 4):
        Names.loc[Names['Name']==name, 'AdoptionSpeed'] = obj['AdoptionSpeed'].mean()

In [8]:
# names with fastest adoption speed
Names.sort_values(by=['AdoptionSpeed'])

Unnamed: 0,Name,AdoptionSpeed,Count
330,Shih Tzu,1.333333,9
188,Sweety,1.333333,6
4988,Tam Tam,1.400000,5
1329,Boboy,1.400000,5
1366,Tarzan,1.400000,5
1453,Suki,1.500000,6
437,Misty,1.571429,7
2112,Didi,1.600000,5
30,Comel,1.714286,7
283,Baby Girl,1.800000,5


In [9]:
# most popular names
Names.sort_values(by=['Count'], ascending=False)

Unnamed: 0,Name,AdoptionSpeed,Count
1,No Name Yet,2.594214,1279
170,Baby,2.757576,66
316,Lucky,2.343750,64
14,Brownie,2.574074,54
23,No Name,2.481481,54
96,Mimi,2.442308,52
15,Blackie,2.387755,49
268,Puppy,2.888889,45
331,Kittens,2.589744,39
7,Kitty,2.282051,39


# It seems there are many odd names and people generally don't care about what to put in the name section. Conclude that name might not be important

# Check age relevance

In [14]:
Age = pd.DataFrame(data=train['Age'].unique(), columns=['Age'])
Age.head()

Unnamed: 0,Age
0,3
1,1
2,4
3,12
4,0


In [16]:
Age['AdoptionSpeed'] = 10
Age['Count'] = 0 # count number of occurences of name

for age, obj in train[['AdoptionSpeed', 'Age']].groupby('Age'):
    Age.loc[Age['Age']==age, 'Count'] = len(obj)
    if (len(obj) > 4):
        Age.loc[Age['Age']==age, 'AdoptionSpeed'] = obj['AdoptionSpeed'].mean()
        
Age

Unnamed: 0,Age,AdoptionSpeed,Count
0,3,2.458800,1966
1,1,2.192708,2304
2,4,2.630298,1109
3,12,2.883144,967
4,0,2.229050,179
5,2,2.264916,3503
6,78,2.800000,5
7,6,2.801075,558
8,8,2.932039,309
9,10,2.753086,162


In [29]:
train['AgeBin'] = pd.qcut(train['Age'], 7, duplicates='drop')
train[['AgeBin', 'AdoptionSpeed']].groupby('AgeBin').mean()

Unnamed: 0_level_0,AdoptionSpeed
AgeBin,Unnamed: 1_level_1
"(-0.001, 1.0]",2.195328
"(1.0, 2.0]",2.264916
"(2.0, 3.0]",2.4588
"(3.0, 4.0]",2.630298
"(4.0, 8.0]",2.787722
"(8.0, 24.0]",2.844411
"(24.0, 255.0]",2.726801


# Age does have an impact on adoption speed, the older the longer

In [32]:
labelencode = LabelEncoder()
train['AgeBin_Code'] = labelencode.fit_transform(train['AgeBin'])

train[['Age', 'AgeBin_Code']].head(50)

Unnamed: 0,Age,AgeBin_Code
0,3,2
1,1,0
2,1,0
3,4,3
4,1,0
5,3,2
6,12,5
7,0,0
8,2,1
9,12,5


In [36]:
train.loc[train['Breed2']==0, 'Breed2'].count()

10762

# Check adoption rate for mixed vs pure breeds

In [37]:
train['Mix'] = 0
train.loc[train['Breed2']!=0, 'Mix'] = 1

train[['AdoptionSpeed', 'Mix']].groupby('Mix').mean()

Unnamed: 0_level_0,AdoptionSpeed
Mix,Unnamed: 1_level_1
0,2.538469
1,2.460411


# Difference is insignificant

# Adoption rate of different breeds

In [48]:
#train[['AdoptionSpeed', 'Breed1', 'Breed2']].groupby(['Breed1', 'Breed2']).mean().sort_values('AdoptionSpeed')

Breed = train[['Breed1', 'Breed2']]
Breed['Count'] = 0

for name, obj in Breed.groupby(['Breed1', 'Breed2']):
    Breed.loc[(Breed['Breed1']==name[0]) & (Breed['Breed2']==name[1]), 'Count'] = len(obj)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f9171fe36a0>


In [72]:
Breed1 = pd.DataFrame(data=train['Breed1'].copy(deep=True), columns=['Breed1'])
Breed1['Count'] = 0

for name, obj in train[['AdoptionSpeed', 'Breed1']].groupby('Breed1'):
    Breed1.loc[Breed1['Breed1']==name, 'Count'] = len(obj)
    
Breed1['Count'].describe()

#Breed1 = pd.DataFrame(data=train['Breed1'].copy(deep=True), columns=['Breed1'])
#Breed1.groupby(['Breed1']).

count    14993.000000
mean      3368.399853
std       2425.671780
min          1.000000
25%        296.000000
50%       3634.000000
75%       5927.000000
max       5927.000000
Name: Count, dtype: float64

# Decide arbitrarily that if number of occurrences <= 100, count it as a rare breed

In [95]:
train['Rare'] = 1
for name, obj in train.groupby(['Breed1']):
    if (len(obj)>50):
        train.loc[train['Breed1']==name, 'Rare'] = 0

train['Rare'][train['Rare']==1]

22       1
74       1
81       1
86       1
97       1
101      1
122      1
126      1
134      1
143      1
151      1
225      1
235      1
241      1
266      1
286      1
301      1
302      1
385      1
391      1
393      1
419      1
450      1
454      1
459      1
469      1
472      1
480      1
485      1
528      1
        ..
14648    1
14662    1
14664    1
14684    1
14688    1
14704    1
14729    1
14738    1
14749    1
14755    1
14758    1
14796    1
14803    1
14818    1
14830    1
14843    1
14856    1
14862    1
14876    1
14880    1
14896    1
14907    1
14909    1
14914    1
14929    1
14941    1
14959    1
14960    1
14961    1
14963    1
Name: Rare, Length: 1150, dtype: int64

In [96]:
train.loc[train['Rare']==0, 'Rare'].count()

13843

In [97]:
train[['AdoptionSpeed', 'Rare']].groupby('Rare').mean()

Unnamed: 0_level_0,AdoptionSpeed
Rare,Unnamed: 1_level_1
0,2.531315
1,2.337391
