In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

### Get Dataset

In [2]:
foster =  pd.read_csv("train.csv.gz")
foster.head()
test =  pd.read_csv("test.csv.gz")

# Exploratory Data Analysis using Pandas Profiling

In [33]:
from pandas_profiling import ProfileReport

In [34]:
profile = ProfileReport(foster, title="Shelter Animal Outcomes")

In [None]:
profile

In [3]:
foster.head()

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,A683430,,2014-07-11 19:09:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,A667013,,2013-11-15 12:52:00,Transfer,Partner,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


# Data Preprocessing

### Split up into cats and dogs

In [4]:
fostercats = foster.loc[foster['AnimalType'] == 'Cat']
fostercats_test = test.loc[test['AnimalType'] == 'Cat']

fosterdogs = foster.loc[foster['AnimalType'] != 'Cat']
fosterdogs_test = test.loc[test['AnimalType'] != 'Cat']

## Giving columns less categories

### Name Column

In [5]:
name_null = foster[foster['Name'].isnull()]
print("When name is null:", name_null['OutcomeType'].value_counts())
name_not_null = foster[foster['Name'].notnull()]
print("When name is NOT null:", name_not_null['OutcomeType'].value_counts())

When name is null: Transfer           4925
Adoption           1678
Euthanasia          815
Return_to_owner     153
Died                120
Name: OutcomeType, dtype: int64
When name is NOT null: Adoption           9091
Return_to_owner    4633
Transfer           4497
Euthanasia          740
Died                 77
Name: OutcomeType, dtype: int64


#### Since there is a big difference in adoption for when animals have a name or not, we change the name column to check if it has a name and if it's popular (0 if no name, 1 if unpopular, 2 if popular)

In [6]:
# get top 20 names
cattopnames = fostercats['Name'].value_counts()[:20].index.tolist()
fostercats['Name'] = fostercats['Name'].apply(lambda x: 0 if (x is np.nan) else (2 if (x in (cattopnames)) else 1))
fostercats_test['Name'] =fostercats_test['Name'].apply(lambda x: 0 if (x is np.nan) else (2 if (x in (cattopnames)) else 1))
fostercats.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fostercats['Name'] = fostercats['Name'].apply(lambda x: 0 if (x is np.nan) else (2 if (x in (cattopnames)) else 1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fostercats_test['Name'] =fostercats_test['Name'].apply(lambda x: 0 if (x is np.nan) else (2 if (x in (cattopnames)) else 1))


Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
1,A656520,1,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
3,A683430,0,2014-07-11 19:09:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
6,A699218,1,2015-03-28 13:11:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Tabby
7,A701489,0,2015-04-30 17:02:00,Transfer,Partner,Cat,Unknown,3 weeks,Domestic Shorthair Mix,Brown Tabby
10,A668402,0,2013-12-05 15:50:00,Transfer,SCRP,Cat,Unknown,2 years,Domestic Shorthair Mix,Black


In [35]:
cattopnames

['Oliver',
 'Max',
 'Bella',
 'X',
 'Lily',
 'Oreo',
 'Daisy',
 'Charlie',
 'Lucy',
 'Sam',
 'Luna',
 'Kitty',
 'Molly',
 'Sophie',
 'Tiger',
 'George',
 'Jack',
 'Oscar',
 'Lilly',
 'Leo']

In [7]:
# get top 20 names
dogtopnames = fosterdogs['Name'].value_counts()[:20].index.tolist()
fosterdogs['Name'] = fosterdogs['Name'].apply(lambda x: 0 if (x is np.nan) else (2 if (x in (dogtopnames)) else 1))
fosterdogs_test['Name'] =fosterdogs_test['Name'].apply(lambda x: 0 if (x is np.nan) else (2 if (x in (dogtopnames)) else 1))
fosterdogs.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fosterdogs['Name'] = fosterdogs['Name'].apply(lambda x: 0 if (x is np.nan) else (2 if (x in (dogtopnames)) else 1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fosterdogs_test['Name'] =fosterdogs_test['Name'].apply(lambda x: 0 if (x is np.nan) else (2 if (x in (dogtopnames)) else 1))


Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,1,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
2,A686464,1,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
4,A667013,0,2013-11-15 12:52:00,Transfer,Partner,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan
5,A677334,1,2014-04-25 13:04:00,Transfer,Partner,Dog,Intact Female,1 month,Cairn Terrier/Chihuahua Shorthair,Black/Tan
8,A671784,2,2014-02-04 17:17:00,Adoption,,Dog,Spayed Female,5 months,American Pit Bull Terrier Mix,Red/White


### Breed Column

In [8]:
# get top 5 breeds
cattopbreeds = fostercats['Breed'].value_counts()[:5].index.tolist()
dogtopbreeds = fosterdogs['Breed'].value_counts()[:5].index.tolist()

In [36]:
cattopbreeds

['Domestic Shorthair Mix',
 'Domestic Medium Hair Mix',
 'Domestic Longhair Mix',
 'Siamese Mix',
 'Domestic Shorthair']

In [9]:
boolean_series = fostercats.Breed.isin(cattopbreeds)
topfoster = fostercats[boolean_series]
print("When top breed:", topfoster['OutcomeType'].value_counts())

inverse_boolean_series = ~fostercats.Breed.isin(cattopbreeds)
nottopfoster = fostercats[inverse_boolean_series]
print("When NOT top breed:", nottopfoster['OutcomeType'].value_counts())

# pure = foster.loc[foster['Breed'].str.contains('Mix') == False]
# print("When pure breed:", pure['OutcomeType'].value_counts())

When top breed: Transfer           5344
Adoption           4056
Euthanasia          693
Return_to_owner     465
Died                143
Name: OutcomeType, dtype: int64
When NOT top breed: Adoption           216
Transfer           161
Return_to_owner     35
Euthanasia          17
Died                 4
Name: OutcomeType, dtype: int64


#### Since there is a big difference in adoption for when animals are mixed breed or not, we change the breed column to a binary 'Rare' column

In [10]:
fostercats['Breed'] = fostercats['Breed'].apply(lambda x: 1 if (x in cattopbreeds) else 0)
fostercats_test['Breed'] = fostercats_test['Breed'].apply(lambda x: 1 if (x in cattopbreeds) else 0)
fosterdogs['Breed'] = fosterdogs['Breed'].apply(lambda x: 1 if (x in dogtopbreeds) else 0)
fosterdogs_test['Breed'] = fosterdogs_test['Breed'].apply(lambda x: 1 if (x in dogtopbreeds) else 0)
# foster = foster.rename(columns={'Breed': 'Purebred'})

# test['Breed'] = test['Breed'].apply(lambda x: 0 if (x.find('Mix') != -1) else 1)
# test = test.rename(columns={'Breed': 'Purebred'})
# foster.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fostercats['Breed'] = fostercats['Breed'].apply(lambda x: 1 if (x in cattopbreeds) else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fostercats_test['Breed'] = fostercats_test['Breed'].apply(lambda x: 1 if (x in cattopbreeds) else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fosterdogs['B

In [11]:
fostercats.head()

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
1,A656520,1,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,1,Cream Tabby
3,A683430,0,2014-07-11 19:09:00,Transfer,Partner,Cat,Intact Male,3 weeks,1,Blue Cream
6,A699218,1,2015-03-28 13:11:00,Transfer,Partner,Cat,Intact Male,3 weeks,1,Blue Tabby
7,A701489,0,2015-04-30 17:02:00,Transfer,Partner,Cat,Unknown,3 weeks,1,Brown Tabby
10,A668402,0,2013-12-05 15:50:00,Transfer,SCRP,Cat,Unknown,2 years,1,Black


## Separating Sex upon Outcome into 2 Columns
### We separate into isMale and Intact

In [12]:
#drop rows with unknown sex upon outcome
#foster = foster.dropna(subset = ['SexuponOutcome'])

#test = test.dropna(subset = ['SexuponOutcome'])
# isMale = []
# for row in foster['SexuponOutcome']:
#     if row.find('Male') != -1 : isMale.append(1)
#     else:           isMale.append(0)
# intact = []
# for row in foster['SexuponOutcome']:
#     if row.find('Intact') != -1 : intact.append(1)
#     else:           intact.append(0)
# foster['isMale'] = isMale
# foster['intact'] = intact
# foster = foster.drop('SexuponOutcome', axis=1)

fostercats = pd.get_dummies(data=fostercats, columns=['SexuponOutcome'])
fostercats_test = pd.get_dummies(data=fostercats_test, columns=['SexuponOutcome'])

fosterdogs = pd.get_dummies(data=fosterdogs, columns=['SexuponOutcome'])
fosterdogs_test = pd.get_dummies(data=fosterdogs_test, columns=['SexuponOutcome'])

In [13]:
fostercats.head()

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,AgeuponOutcome,Breed,Color,SexuponOutcome_Intact Female,SexuponOutcome_Intact Male,SexuponOutcome_Neutered Male,SexuponOutcome_Spayed Female,SexuponOutcome_Unknown
1,A656520,1,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,1 year,1,Cream Tabby,0,0,0,1,0
3,A683430,0,2014-07-11 19:09:00,Transfer,Partner,Cat,3 weeks,1,Blue Cream,0,1,0,0,0
6,A699218,1,2015-03-28 13:11:00,Transfer,Partner,Cat,3 weeks,1,Blue Tabby,0,1,0,0,0
7,A701489,0,2015-04-30 17:02:00,Transfer,Partner,Cat,3 weeks,1,Brown Tabby,0,0,0,0,1
10,A668402,0,2013-12-05 15:50:00,Transfer,SCRP,Cat,2 years,1,Black,0,0,0,0,1


## Making animal type a binary value

In [14]:
foster['AnimalType'] = foster['AnimalType'].apply(lambda x: 0 if x == 'Cat' else 1)
test['AnimalType'] = test['AnimalType'].apply(lambda x: 0 if x == 'Cat' else 1)

foster.head()

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,1,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,0,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,1,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,A683430,,2014-07-11 19:09:00,Transfer,Partner,0,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,A667013,,2013-11-15 12:52:00,Transfer,Partner,1,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


In [15]:
foster.head()

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,1,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,0,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,1,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,A683430,,2014-07-11 19:09:00,Transfer,Partner,0,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,A667013,,2013-11-15 12:52:00,Transfer,Partner,1,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


# Dropping Unnecessary Columns

In [16]:
#foster = foster.drop(['AnimalID'], axis=1)
fostercats = fostercats.drop('OutcomeSubtype', axis = 1)
#fostercats_test = fostercats_test.drop('OutcomeSubtype', axis = 1)
fosterdogs = fosterdogs.drop('OutcomeSubtype', axis = 1)
#fosterdogs_test = fosterdogs_test.drop('OutcomeSubtype', axis = 1)

#test = test.drop('OutcomeSubtype', axis = 1)

# Converting Date to month/year/day of week columns

#### Converting strings to datetime objects

In [17]:
from datetime import datetime

def StringToDate(stringdate):
    return datetime.strptime(stringdate, "%Y-%m-%d %H:%M:%S")

In [18]:
foster.head()

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,1,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,0,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,1,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,A683430,,2014-07-11 19:09:00,Transfer,Partner,0,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,A667013,,2013-11-15 12:52:00,Transfer,Partner,1,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


In [19]:
def addDateCols(df):
    dayofweek = []
    year_arr = []
    month = []
    for row in df['DateTime']:
        dayofweek.append((StringToDate(row)).weekday())
        year_arr.append((StringToDate(row)).year)
        month.append((StringToDate(row)).month)
    df['weekday'] = dayofweek
    df['year'] = year_arr
    df['month'] = month

In [20]:
# convert dateTime column to day of year
#min_date = StringToDate(foster['DateTime'].min())
#foster = foster.drop('DateTime', axis=1)
addDateCols(fostercats)
addDateCols(fostercats_test)
addDateCols(fosterdogs)
addDateCols(fosterdogs_test)

fostercats = fostercats.drop('DateTime', axis=1)
fostercats_test = fostercats_test.drop('DateTime', axis=1)
fosterdogs = fosterdogs.drop('DateTime', axis=1)
fosterdogs_test = fosterdogs_test.drop('DateTime', axis=1)
# foster['DateTime'] = foster['DateTime'].apply(lambda x: (StringToDate(x).timetuple().tm_yday))
# test['DateTime'] = test['DateTime'].apply(lambda x: (StringToDate(x).timetuple().tm_yday))

In [21]:
fostercats.head()

Unnamed: 0,AnimalID,Name,OutcomeType,AnimalType,AgeuponOutcome,Breed,Color,SexuponOutcome_Intact Female,SexuponOutcome_Intact Male,SexuponOutcome_Neutered Male,SexuponOutcome_Spayed Female,SexuponOutcome_Unknown,weekday,year,month
1,A656520,1,Euthanasia,Cat,1 year,1,Cream Tabby,0,0,0,1,0,6,2013,10
3,A683430,0,Transfer,Cat,3 weeks,1,Blue Cream,0,1,0,0,0,4,2014,7
6,A699218,1,Transfer,Cat,3 weeks,1,Blue Tabby,0,1,0,0,0,5,2015,3
7,A701489,0,Transfer,Cat,3 weeks,1,Brown Tabby,0,0,0,0,1,3,2015,4
10,A668402,0,Transfer,Cat,2 years,1,Black,0,0,0,0,1,3,2013,12


# Converting Age upon Outcome to Numerical

In [22]:
def stringToAge(age):
    age_list = age.split()
    total_months = 0
    for index, word in enumerate(age_list):
        if word.find('month') != -1:
            months = int(age_list[index - 1])
            total_months += months
        elif word.find('year') != -1:
            months = int(age_list[index - 1]) * 12
            total_months += months
        elif word.find('week') != - 1:
            months = int(age_list[index - 1]) * 0.75
            total_months += months
    return total_months
            

In [23]:
fostercats['AgeuponOutcome'].unique()

array(['1 year', '3 weeks', '2 years', '3 months', '1 month', '2 months',
       '7 years', '3 years', '4 months', '4 years', '9 years', '1 weeks',
       '2 weeks', '4 weeks', '6 years', '10 months', '5 months',
       '11 years', '4 days', '8 months', '12 years', '15 years', '1 week',
       '0 years', '5 years', '8 years', '6 months', '9 months', '3 days',
       '7 months', '10 years', '5 days', '6 days', '2 days', '1 day',
       '5 weeks', '14 years', '13 years', '16 years', '11 months', nan,
       '17 years', '18 years', '19 years', '20 years'], dtype=object)

In [24]:
# Null age upon outcome = 0
fostercats['AgeuponOutcome'] = fostercats['AgeuponOutcome'].apply(lambda x: 0 if (x != x) else stringToAge(x))
fostercats_test['AgeuponOutcome'] = fostercats_test['AgeuponOutcome'].apply(lambda x: 0 if (x != x) else stringToAge(x))

fosterdogs['AgeuponOutcome'] = fosterdogs['AgeuponOutcome'].apply(lambda x: 0 if (x != x) else stringToAge(x))
fosterdogs_test['AgeuponOutcome'] = fosterdogs_test['AgeuponOutcome'].apply(lambda x: 0 if (x != x) else stringToAge(x))

In [25]:
fostercats.head()

Unnamed: 0,AnimalID,Name,OutcomeType,AnimalType,AgeuponOutcome,Breed,Color,SexuponOutcome_Intact Female,SexuponOutcome_Intact Male,SexuponOutcome_Neutered Male,SexuponOutcome_Spayed Female,SexuponOutcome_Unknown,weekday,year,month
1,A656520,1,Euthanasia,Cat,12.0,1,Cream Tabby,0,0,0,1,0,6,2013,10
3,A683430,0,Transfer,Cat,2.25,1,Blue Cream,0,1,0,0,0,4,2014,7
6,A699218,1,Transfer,Cat,2.25,1,Blue Tabby,0,1,0,0,0,5,2015,3
7,A701489,0,Transfer,Cat,2.25,1,Brown Tabby,0,0,0,0,1,3,2015,4
10,A668402,0,Transfer,Cat,24.0,1,Black,0,0,0,0,1,3,2013,12


# Applying label encoding to categorical variables with too many categories and target variable

In [26]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
fostercats['Color'] = le.fit_transform(fostercats['Color'])
#foster['Name'] = le.fit_transform(foster['Name'])
fostercats['OutcomeType'] = le.fit_transform(fostercats['OutcomeType'])
#foster['SexuponOutcome'] = le.fit_transform(foster['SexuponOutcome'])
#foster['Breed'] = le.fit_transform(foster['Breed'])
fostercats_test['Color'] = le.fit_transform(fostercats_test['Color'])

fosterdogs['Color'] = le.fit_transform(fosterdogs['Color'])
#foster['Name'] = le.fit_transform(foster['Name'])
fosterdogs['OutcomeType'] = le.fit_transform(fosterdogs['OutcomeType'])
#foster['SexuponOutcome'] = le.fit_transform(foster['SexuponOutcome'])
#foster['Breed'] = le.fit_transform(foster['Breed'])
fosterdogs_test['Color'] = le.fit_transform(fosterdogs_test['Color'])
#test['SexuponOutcome'] = le.fit_transform(test['SexuponOutcome'])
#test['Breed'] = le.fit_transform(test['Breed'])
#test['Name'] = le.fit_transform(test['Name'])
#test['OutcomeType'] = le.fit_transform(test['OutcomeType'])

In [27]:
fostercats = fostercats.drop('AnimalType', axis=1)
fostercats_test = fostercats_test.drop('AnimalType', axis=1)
fostercats.head()


Unnamed: 0,AnimalID,Name,OutcomeType,AgeuponOutcome,Breed,Color,SexuponOutcome_Intact Female,SexuponOutcome_Intact Male,SexuponOutcome_Neutered Male,SexuponOutcome_Spayed Female,SexuponOutcome_Unknown,weekday,year,month
1,A656520,1,2,12.0,1,72,0,0,0,1,0,6,2013,10
3,A683430,0,4,2.25,1,21,0,1,0,0,0,4,2014,7
6,A699218,1,4,2.25,1,29,0,1,0,0,0,5,2015,3
7,A701489,0,4,2.25,1,41,0,0,0,0,1,3,2015,4
10,A668402,0,4,24.0,1,2,0,0,0,0,1,3,2013,12


# Normalizing Data

In [28]:
fostercats_X = fostercats.drop(['OutcomeType', 'AnimalID'],axis=1)
fostercats_Y = fostercats['OutcomeType']
MinMaxScaler = preprocessing.MinMaxScaler()
X_data_minmax = MinMaxScaler.fit_transform(fostercats_X)
fostercats_X = pd.DataFrame(X_data_minmax,columns=fostercats_X.columns)
fostercats_X.head()
cattestID = fostercats_test['ID']
fostercats_test = fostercats_test.drop('ID', axis=1)
fostercats_test_minmax = MinMaxScaler.fit_transform(fostercats_test)
fostercats_test = pd.DataFrame(fostercats_test_minmax,columns=fostercats_test.columns)

In [29]:
fosterdogs = fosterdogs.drop('AnimalType', axis=1)
fosterdogs_test = fosterdogs_test.drop('AnimalType', axis=1)
fosterdogs.head()

fosterdogs_X = fosterdogs.drop(['OutcomeType', 'AnimalID'],axis=1)
fosterdogs_Y = fosterdogs['OutcomeType']
MinMaxScaler = preprocessing.MinMaxScaler()
X_data_minmax = MinMaxScaler.fit_transform(fosterdogs_X)
fosterdogs_X = pd.DataFrame(X_data_minmax,columns=fosterdogs_X.columns)
fosterdogs_X.head()
dogtestID = fosterdogs_test['ID']
fosterdogs_test = fosterdogs_test.drop('ID', axis=1)
fosterdogs_test_minmax = MinMaxScaler.fit_transform(fosterdogs_test)
fosterdogs_test = pd.DataFrame(fosterdogs_test_minmax,columns=fosterdogs_test.columns)

In [30]:
fosterdogs_X.head()

Unnamed: 0,Name,AgeuponOutcome,Breed,Color,SexuponOutcome_Intact Female,SexuponOutcome_Intact Male,SexuponOutcome_Neutered Male,SexuponOutcome_Spayed Female,SexuponOutcome_Unknown,weekday,year,month
0,0.5,0.052632,0.0,0.363985,0.0,0.0,1.0,0.0,0.0,0.333333,0.333333,0.090909
1,0.5,0.105263,1.0,0.229885,0.0,0.0,1.0,0.0,0.0,0.833333,0.666667,0.0
2,0.0,0.105263,0.0,0.754789,0.0,0.0,1.0,0.0,0.0,0.666667,0.0,0.909091
3,0.5,0.004386,0.0,0.099617,1.0,0.0,0.0,0.0,0.0,0.666667,0.333333,0.272727
4,1.0,0.02193,0.0,0.689655,0.0,0.0,0.0,1.0,0.0,0.166667,0.333333,0.090909


# Exploratory Data Modeling

### Splitting into train and test

In [31]:
from sklearn.model_selection import train_test_split
# foster_X = foster.drop(['OutcomeType'],axis=1)
# foster_Y = foster['OutcomeType']
X_traincats, X_testcats, y_traincats, y_testcats = train_test_split(fostercats_X, fostercats_Y, test_size=0.2, random_state=42)
X_traindogs, X_testdogs, y_traindogs, y_testdogs = train_test_split(fosterdogs_X, fosterdogs_Y, test_size=0.2, random_state=42)

### Trying oversampling to help with imbalance

In [301]:
# from imblearn.over_sampling import RandomOverSampler
# ros = RandomOverSampler(random_state=0)
# X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

## Naive Bayes

Based on assumption that features are independent of each other

In [290]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
y_preddogs = gnb.fit(X_traindogs, y_traindogs).predict(X_testdogs)

In [291]:
print("Number of mislabeled points out of a total %d points : %d"
      % (X_testdogs.shape[0], (y_testdogs != y_preddogs).sum()))

Number of mislabeled points out of a total 3119 points : 1479


In [304]:
# from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# result = confusion_matrix(y_test, y_pred)
# print("Confusion Matrix:")
# print(result)
# result1 = classification_report(y_test, y_pred)
# print("Classification Report:",)
# print (result1)
# result2 = accuracy_score(y_test,y_pred)
# print("Accuracy:",result2)
# print("Number of mislabeled points out of a total %d points : %d"
#       % (X_test.shape[0], (y_test != y_pred).sum()))

Confusion Matrix:
[[1668    0   26  197  328]
 [   0    0    4    2   27]
 [  44    0   36   70  148]
 [ 485    0   18  334  124]
 [ 455    0   29  182 1169]]
Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.75      0.68      2219
           1       0.00      0.00      0.00        33
           2       0.32      0.12      0.18       298
           3       0.43      0.35      0.38       961
           4       0.65      0.64      0.64      1835

    accuracy                           0.60      5346
   macro avg       0.40      0.37      0.38      5346
weighted avg       0.58      0.60      0.58      5346

Accuracy: 0.5998877665544332
Number of mislabeled points out of a total 5346 points : 2139


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## KNN

In [272]:
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import seaborn as sns

In [273]:
knn_clf=KNeighborsClassifier()
knn_clf.fit(X_traincats,y_traincats)
ypredcats=knn_clf.predict(X_testcats) #These are the predicted output values

In [274]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
result = confusion_matrix(y_testcats, ypredcats)
print("Confusion Matrix:")
print(result)
result1 = classification_report(y_testcats, ypredcats)
print("Classification Report:",)
print (result1)
result2 = accuracy_score(y_testcats,ypredcats)
print("Accuracy:",result2)
print("Number of mislabeled points out of a total %d points : %d"
      % (X_testcats.shape[0], (y_testcats != ypredcats).sum()))

Confusion Matrix:
[[751   1   1  13 101]
 [  3   1   0   0  16]
 [ 25   1  13   1  96]
 [ 71   0   1   5  18]
 [214   3  19   5 868]]
Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.87      0.78       867
           1       0.17      0.05      0.08        20
           2       0.38      0.10      0.15       136
           3       0.21      0.05      0.08        95
           4       0.79      0.78      0.79      1109

    accuracy                           0.74      2227
   macro avg       0.45      0.37      0.38      2227
weighted avg       0.70      0.74      0.71      2227

Accuracy: 0.73551863493489
Number of mislabeled points out of a total 2227 points : 589


In [275]:
knn_clf=KNeighborsClassifier()
knn_clf.fit(X_traindogs,y_traindogs)
ypreddogs=knn_clf.predict(X_testdogs) #These are the predicted output values

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
result = confusion_matrix(y_testdogs, ypreddogs)
print("Confusion Matrix:")
print(result)
result1 = classification_report(y_testdogs, ypreddogs)
print("Classification Report:",)
print (result1)
result2 = accuracy_score(y_testdogs,ypreddogs)
print("Accuracy:",result2)
print("Number of mislabeled points out of a total %d points : %d"
      % (X_testdogs.shape[0], (y_testdogs != ypreddogs).sum()))

Confusion Matrix:
[[968   0   5 260  89]
 [  1   0   2   3   5]
 [ 44   0  19  42  53]
 [385   0  24 303 114]
 [333   0  28 159 282]]
Classification Report:
              precision    recall  f1-score   support

           0       0.56      0.73      0.63      1322
           1       0.00      0.00      0.00        11
           2       0.24      0.12      0.16       158
           3       0.40      0.37      0.38       826
           4       0.52      0.35      0.42       802

    accuracy                           0.50      3119
   macro avg       0.34      0.31      0.32      3119
weighted avg       0.49      0.50      0.49      3119

Accuracy: 0.504007694773966
Number of mislabeled points out of a total 3119 points : 1547


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Random Forest

In [65]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier

#Create a Gaussian Classifier
clf=RandomForestClassifier(n_estimators=100)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_traincats,y_traincats)

y_predcats=clf.predict(X_testcats)

result = confusion_matrix(y_testcats, y_predcats)
print("Confusion Matrix:")
print(result)
result1 = classification_report(y_testcats, y_predcats)
print("Classification Report:",)
print (result1)
result2 = accuracy_score(y_testcats,y_predcats)
print("Accuracy:",result2)
print("Number of mislabeled points out of a total %d points : %d"
      % (X_testcats.shape[0], (y_testcats != y_predcats).sum()))

Confusion Matrix:
[[745   1   2  17 102]
 [  2   0   0   0  18]
 [ 13   1  19   3 100]
 [ 53   0   1  10  31]
 [136   2  30  10 931]]
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.86      0.82       867
           1       0.00      0.00      0.00        20
           2       0.37      0.14      0.20       136
           3       0.25      0.11      0.15        95
           4       0.79      0.84      0.81      1109

    accuracy                           0.77      2227
   macro avg       0.44      0.39      0.40      2227
weighted avg       0.73      0.77      0.74      2227

Accuracy: 0.7656039515042659
Number of mislabeled points out of a total 2227 points : 522


In [293]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
# rf = RandomForestClassifier()
# # Random search of parameters, using 3 fold cross validation, 
# # search across 100 different combinations, and use all available cores
# rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# # Fit the random search model
# rf_random.fit(X_traincats, y_traincats)

In [297]:
# rf_random.best_params_

{'n_estimators': 1600,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 20,
 'bootstrap': True}

In [32]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier

#Create a Gaussian Classifier
clf=RandomForestClassifier(n_estimators=1600,
 min_samples_split = 10,
 min_samples_leaf = 1,
 max_features = 'sqrt',
 max_depth = 20,
 bootstrap=True)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_traincats,y_traincats)

y_predcats=clf.predict(X_testcats)

result = confusion_matrix(y_testcats, y_predcats)
print("Confusion Matrix:")
print(result)
result1 = classification_report(y_testcats, y_predcats)
print("Classification Report:",)
print (result1)
result2 = accuracy_score(y_testcats,y_predcats)
print("Accuracy:",result2)
print("Number of mislabeled points out of a total %d points : %d"
      % (X_testcats.shape[0], (y_testcats != y_predcats).sum()))

Confusion Matrix:
[[764   0   0   7  96]
 [  2   0   0   0  18]
 [ 15   0  13   1 107]
 [ 55   0   1   4  35]
 [133   0   9   5 962]]
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.88      0.83       867
           1       0.00      0.00      0.00        20
           2       0.57      0.10      0.16       136
           3       0.24      0.04      0.07        95
           4       0.79      0.87      0.83      1109

    accuracy                           0.78      2227
   macro avg       0.48      0.38      0.38      2227
weighted avg       0.74      0.78      0.75      2227

Accuracy: 0.7826672653794342
Number of mislabeled points out of a total 2227 points : 484


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [33]:
query = pd.DataFrame([[0,1, 1, 0.013792, 0, 1, 0, 0, 0, 0.67, 0.5, 0.7]], columns=X_traincats.columns)
query_pred = clf.predict(query)
query_pred[0]

2

In [34]:
import pickle
pickle.dump(clf, open('cat_rf.pkl', 'wb'))

In [70]:
X_traincats.head()

Unnamed: 0,Name,AgeuponOutcome,Breed,Color,SexuponOutcome_Intact Female,SexuponOutcome_Intact Male,SexuponOutcome_Neutered Male,SexuponOutcome_Spayed Female,SexuponOutcome_Unknown,weekday,year,month
10682,0.0,0.0125,1.0,0.013793,1.0,0.0,0.0,0.0,0.0,0.666667,0.333333,0.636364
4698,0.5,0.045833,1.0,0.62069,0.0,0.0,0.0,1.0,0.0,1.0,0.666667,0.545455
10612,0.5,0.041667,1.0,0.868966,0.0,1.0,0.0,0.0,0.0,0.833333,0.0,0.818182
10296,0.0,0.008333,1.0,0.868966,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.818182
3316,0.0,0.25,0.0,0.786207,1.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.818182


In [271]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier

#Create a Gaussian Classifier
clf=RandomForestClassifier(n_estimators=100)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_traindogs,y_traindogs)

y_preddogs=clf.predict(X_testdogs)
result = confusion_matrix(y_testdogs, y_preddogs)
print("Confusion Matrix:")
print(result)
result1 = classification_report(y_testdogs, y_preddogs)
print("Classification Report:",)
print (result1)
result2 = accuracy_score(y_testdogs,y_preddogs)
print("Accuracy:",result2)
print("Number of mislabeled points out of a total %d points : %d"
      % (X_testdogs.shape[0], (y_testdogs != y_preddogs).sum()))

Confusion Matrix:
[[933   0   7 279 103]
 [  0   0   2   5   4]
 [ 26   0  22  54  56]
 [301   0  19 371 135]
 [267   0  21 182 332]]
Classification Report:
              precision    recall  f1-score   support

           0       0.61      0.71      0.65      1322
           1       0.00      0.00      0.00        11
           2       0.31      0.14      0.19       158
           3       0.42      0.45      0.43       826
           4       0.53      0.41      0.46       802

    accuracy                           0.53      3119
   macro avg       0.37      0.34      0.35      3119
weighted avg       0.52      0.53      0.52      3119

Accuracy: 0.5315806348188522
Number of mislabeled points out of a total 3119 points : 1461


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Name,DateTime,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,weekday,year,month
0,0.368842,0.115385,1.0,0.4,0.05,0.885424,0.356164,0.333333,0.333333,0.090909
1,0.289771,0.782967,0.0,0.6,0.05,0.464104,0.457534,1.0,0.0,0.818182
2,0.69658,0.082418,1.0,0.4,0.1,0.773024,0.235616,0.833333,0.666667,0.0
3,1.0,0.524725,0.0,0.2,0.009375,0.464104,0.115068,0.666667,0.333333,0.545455
4,1.0,0.873626,1.0,0.4,0.1,0.662799,0.750685,0.666667,0.0,0.909091


In [311]:
#Retraining with the complete training set
clf.fit(foster_X, foster_Y)
#Getting predicted probabilities
pred = clf.predict_proba(test)
my_submission = pd.DataFrame({'ID':testID, 
                              'Adoption':pred[:,0], 
                              'Died':pred[:,1],'Euthanasia':pred[:,2],
                              'Return_to_owner':pred[:,3],'Transfer':pred[:,4] })

# you could use any filename
my_submission.to_csv('submission.csv', index=False)

## XGBoost

In [285]:
import xgboost
from xgboost import XGBClassifier

# fit model no training data
model = XGBClassifier()
model.fit(X_traindogs, y_traindogs)
y_pred = model.predict(X_testdogs)

In [286]:
result = confusion_matrix(y_testdogs, y_preddogs)
print("Confusion Matrix:")
print(result)
result1 = classification_report(y_testdogs, y_preddogs)
print("Classification Report:",)
print (result1)
result2 = accuracy_score(y_testdogs,y_preddogs)
print("Accuracy:",result2)
print("Number of mislabeled points out of a total %d points : %d"
      % (X_testdogs.shape[0], (y_testdogs != y_preddogs).sum()))

Confusion Matrix:
[[847  62  59 320  34]
 [  0   6   0   4   1]
 [ 17  47  52  34   8]
 [205  82 141 362  36]
 [233 200 101 162 106]]
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.64      0.65      1322
           1       0.02      0.55      0.03        11
           2       0.15      0.33      0.20       158
           3       0.41      0.44      0.42       826
           4       0.57      0.13      0.21       802

    accuracy                           0.44      3119
   macro avg       0.36      0.42      0.30      3119
weighted avg       0.54      0.44      0.45      3119

Accuracy: 0.4402051939724271
Number of mislabeled points out of a total 3119 points : 1746


## Balanced Random Forest

In [283]:
from sklearn.model_selection import StratifiedKFold
from imblearn.ensemble import BalancedRandomForestClassifier 


model = BalancedRandomForestClassifier()
model.fit(X_traindogs,y_traindogs)
y_preddogs = model.predict(X_testdogs)

In [284]:
result = confusion_matrix(y_testdogs, y_preddogs)
print("Confusion Matrix:")
print(result)
result1 = classification_report(y_testdogs, y_preddogs)
print("Classification Report:",)
print (result1)
result2 = accuracy_score(y_testdogs,y_preddogs)
print("Accuracy:",result2)
print("Number of mislabeled points out of a total %d points : %d"
      % (X_testdogs.shape[0], (y_testdogs != y_preddogs).sum()))

Confusion Matrix:
[[847  62  59 320  34]
 [  0   6   0   4   1]
 [ 17  47  52  34   8]
 [205  82 141 362  36]
 [233 200 101 162 106]]
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.64      0.65      1322
           1       0.02      0.55      0.03        11
           2       0.15      0.33      0.20       158
           3       0.41      0.44      0.42       826
           4       0.57      0.13      0.21       802

    accuracy                           0.44      3119
   macro avg       0.36      0.42      0.30      3119
weighted avg       0.54      0.44      0.45      3119

Accuracy: 0.4402051939724271
Number of mislabeled points out of a total 3119 points : 1746


In [318]:
X_train.head()

Unnamed: 0,Name,DateTime,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,weekday,year,month
4490,0.96423,0.75,0.0,0.4,0.041667,0.458303,0.235616,0.333333,0.333333,0.818182
15829,1.0,0.013736,0.0,0.6,0.4,0.458303,0.926027,0.166667,0.666667,0.0
18441,0.002824,0.96978,1.0,0.4,0.05,0.614213,0.90137,1.0,0.666667,1.0
26562,1.0,0.554945,1.0,0.6,0.05,0.264685,0.750685,0.166667,0.333333,0.545455
6880,0.196737,0.093407,0.0,0.6,0.25,0.464104,0.10411,0.166667,0.333333,0.090909


In [319]:
test.shape

(11456, 10)