# Basic classification model of outcome

This notebook performs a simple binary classification of cat outcomes (adopted or not), starting with a `logistic regression` and then comparing it against several other classifiers.

## Imports

In [1]:
import datetime
import pandas as pd
import numpy as np
from AAC_challenge import data
from AAC_challenge import utils

from sklearn.impute import SimpleImputer

import warnings
warnings.filterwarnings('ignore')

%load_ext autoreload
%autoreload 2

## Data loading and cleaning

In [70]:
# Load the cat dataset through the project's `data.get_clean_cat_dataset` helper.
cat_df = data.get_clean_cat_dataset('cats')
# NOTE(review): the preview below already lists a `top_breeds` column, so the former
# in-notebook mapping via `utils.map_top_breeds` appears to be done by the package — confirm.
cat_df.head()

Unnamed: 0,breed,color,date_of_birth,outcome_datetime,outcome_type,sex,sterilized,periods,period_range,outcome_age_(days),...,outcome_weekday,cfa_breed,domestic_breed,coat_pattern,main_color,coat,has_name,top_breeds,adopted_or_not,outcome_year_month
0,domestic shorthair,orange,2014-07-07,2014-07-22,Transfer,0,0,2,7,14,...,Tuesday,0,1,tabby,orange,orange,0,domestic shorthair,0,2014-07-01
1,domestic shorthair,blue /white,2014-06-16,2014-08-14,Adoption,1,0,1,30,30,...,Thursday,0,1,tabby,blue,blue,1,domestic shorthair,1,2014-08-01
2,domestic shorthair,white/black,2014-03-26,2014-06-29,Adoption,1,1,3,30,90,...,Sunday,0,1,tabby,white,white,1,domestic shorthair,1,2014-06-01
3,domestic mediumhair,black/white,2013-03-27,2014-03-28,Return to Owner,1,1,1,365,365,...,Friday,0,1,tabby,black,black,1,domestic mediumhair,0,2014-03-01
4,domestic shorthair,black/white,2013-12-16,2014-01-09,Transfer,0,0,3,7,21,...,Thursday,0,1,tabby,black,black,0,domestic shorthair,0,2014-01-01


## Generating X and y

In [74]:
# Feature matrix `X` and target `y` as returned by the package helper.
X, y = data.get_X_y()

In [75]:
# Preview the first rows of the feature matrix.
X.head()

Unnamed: 0,kitten,sex,sterilized,cfa_breed,domestic_breed,has_name,x0_american shorthair,x0_domestic longhair,x0_domestic mediumhair,x0_domestic shorthair,...,x0_silver,x0_silver.1,x0_silver lynx,x0_silver lynx.1,x0_tan,x0_torbie,x0_tortie,x0_tricolor,x0_white,x0_yellow
0,1,0,0,0,1,0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,1,0,0,1,1,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,1,1,0,1,1,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0,1,1,0,1,1,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,0,0,0,1,0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [79]:
# Display the target series (pandas truncates the long output).
y

0        0.0
1        1.0
2        1.0
3        0.0
4        0.0
        ... 
29406    1.0
29407    1.0
29408    0.0
29409    1.0
29410    1.0
Name: adopted_or_not, Length: 29411, dtype: float64

## One Hot Encoding


* Don't OHE `breed` directly (OHE the reduced `top_breeds` column instead)
* OHE coat_pattern
* OHE coat

The encoding code below has since been moved into the package, which now generates `X` and `y` (see `data.get_X_y()` above).

In [38]:
from sklearn.preprocessing import OneHotEncoder
# Single encoder instance, re-fitted on each categorical column in the cells below.
# `sparse = False` requests a dense array instead of a sparse matrix.
# NOTE(review): newer scikit-learn releases rename this keyword to `sparse_output` —
# confirm against the scikit-learn version pinned for this project.
enc = OneHotEncoder(sparse = False)

### One Hot Encode `top_breeds`


In [40]:
# One-hot encode `top_breeds`.
# NOTE(review): `features_df` is not created in any visible cell — presumably it was
# built from the cleaned cat dataset in a cell that has since been removed; confirm.
enc.fit(features_df[['top_breeds']])  # Fit encoder
top_breeds_encoded = enc.transform(features_df[['top_breeds']])  # One indicator column per category

# Name the indicator columns after the categories the encoder actually learned
# (spaces -> underscores, e.g. "domestic shorthair" -> "domestic_shorthair") instead of
# unpacking into a hard-coded list of six names, which would raise or silently
# mislabel columns as soon as the set of breeds changed.
top_breeds_columns = [str(breed).replace(' ', '_') for breed in enc.categories_[0]]
for column_name, column_values in zip(top_breeds_columns, top_breeds_encoded.T):
    features_df[column_name] = column_values

# The raw categorical column is no longer needed once it has been encoded.
features_df.drop('top_breeds', axis=1, inplace=True)

### Map `sex`

In [42]:
def map_sex(x):
    """Encode a sex label as an integer.

    Returns 0 when `x` equals the exact string 'Male' and 1 for every other
    value (any other label, different capitalisation, None, numbers, ...).
    """
    return 0 if x == 'Male' else 1

In [43]:
# Replace the textual sex labels with their 0/1 encoding.
# NOTE: not idempotent — once encoded, no value equals 'Male' any more, so running
# this cell a second time would map every row to 1.
features_df['sex'] = features_df['sex'].apply(map_sex)

### One Hot Encode `coat_pattern`

In [45]:
# One-hot encode `coat_pattern` with the shared encoder.
enc.fit(features_df[['coat_pattern']])  # Fit encoder
coat_pattern_matrix = enc.transform(features_df[['coat_pattern']])

# Reuse features_df's index so the axis=1 concat that follows pairs rows one-to-one.
# With a default RangeIndex the concat would align on labels and produce misaligned /
# NaN-padded rows whenever features_df had been filtered or re-indexed upstream.
# NOTE(review): `get_feature_names()` yields generic `x0_<category>` names (see the
# `x0_*` columns in the outputs), and the same prefix is reused for `coat`; newer
# scikit-learn releases expose `get_feature_names_out()` instead — confirm the version.
coat_pattern_encoded = pd.DataFrame(
    coat_pattern_matrix,
    columns=enc.get_feature_names(),
    index=features_df.index,
)

In [46]:
# Append the coat-pattern indicator columns and remove the raw categorical column.
features_df = (
    pd.concat([features_df, coat_pattern_encoded], axis=1)
    .drop(columns='coat_pattern')
)

### One Hot Encode `coat`

In [48]:
# One-hot encode `coat` with the shared encoder.
enc.fit(features_df[['coat']])  # Fit encoder
coat_matrix = enc.transform(features_df[['coat']])

# Reuse features_df's index so the axis=1 concat that follows pairs rows one-to-one.
# With a default RangeIndex the concat would align on labels and produce misaligned /
# NaN-padded rows whenever features_df had been filtered or re-indexed upstream.
# NOTE(review): `get_feature_names()` yields generic `x0_<category>` names, the same
# prefix already used for the `coat_pattern` columns, so a category present in both
# would produce duplicate column names — confirm whether that is intended.
coat_encoded = pd.DataFrame(
    coat_matrix,
    columns=enc.get_feature_names(),
    index=features_df.index,
)

In [49]:
# Append the coat indicator columns and remove the raw categorical column.
features_df = (
    pd.concat([features_df, coat_encoded], axis=1)
    .drop(columns='coat')
)

## Baseline model

In [50]:
# Class balance of the target.
# NOTE(review): `target` is not defined in any visible cell — judging by the output it is
# the `adopted_or_not` column (cf. `y` above); confirm.
target.value_counts()

0.0    16682
1.0    12729
Name: adopted_or_not, dtype: int64

In [51]:
# Share of positive (label 1) rows, as a percentage rounded to two decimals.
outcome_counts = target.value_counts()
adoption_share = round(outcome_counts[1] / outcome_counts.sum() * 100, 2)
print(f'The dataset is fairly balanced, with {adoption_share}% of adoptions.')

The dataset is fairly balanced, with 43.28% of adoptions.


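A baseline model that always predicts adoption is expected to have a 43% precision; one that always predicts the majority class (not adopted) would reach about 57% accuracy, which is the figure the cross-validation scores below should beat.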

## Cross validation

### Logistic Regression

In [52]:
# Remove `date_of_birth` before fitting the models.
# NOTE(review): presumably a raw date value the estimators cannot consume — confirm.
# errors='ignore' keeps the cell re-runnable: without it a second execution raises
# KeyError because the column has already been dropped in place.
features_df.drop(columns='date_of_birth', inplace=True, errors='ignore')

Unnamed: 0,sex,sterilized,cfa_breed,domestic_breed,has_name,american_shorthair,domestic_longhair,domestic_mediumhair,domestic_shorthair,other,...,x0_silver,x0_silver.1,x0_silver lynx,x0_silver lynx.1,x0_tan,x0_torbie,x0_tortie,x0_tricolor,x0_white,x0_yellow
0,0,0,0,1,0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0,0,1,1,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,1,0,1,1,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1,1,0,1,1,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,0,1,0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29406,1,0,1,0,1,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29407,0,0,0,1,1,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29408,0,1,0,1,0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29409,1,1,0,1,1,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [53]:
# Inspect the column dtypes of the feature table before fitting the estimators.
features_df.dtypes

sex                 int64
sterilized          int64
cfa_breed           int64
domestic_breed      int64
has_name            int64
                   ...   
x0_torbie         float64
x0_tortie         float64
x0_tricolor       float64
x0_white          float64
x0_yellow         float64
Length: 63, dtype: object

In [54]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of a logistic regression fitted with the liblinear solver;
# prints one score per fold (the estimator's default scoring).
log_reg = LogisticRegression(solver='liblinear')
log_reg_scores = cross_val_score(log_reg, features_df, target, cv=5)
print(log_reg_scores)

[0.76797552 0.76657599 0.7584155  0.76436586 0.77575655]


### SVC

In [55]:
from sklearn.svm import SVC
# Support vector classifier with a linear kernel and regularisation parameter C=10.
svc = SVC(kernel='linear', C=10)

In [56]:
# 5-fold cross-validation of the linear SVC; prints one score per fold.
svc_scores = cross_val_score(svc, features_df, target, cv=5)
print(svc_scores)

[0.76746558 0.76725604 0.75807548 0.76351581 0.77541652]


### KNN

In [58]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier voting over the 3 closest training samples.
knn = KNeighborsClassifier(n_neighbors=3)

In [59]:
# 5-fold cross-validation of the 3-NN classifier; prints one score per fold.
knn_scores = cross_val_score(knn, features_df, target, cv=5)
print(knn_scores)

[0.70899201 0.74039442 0.6984019  0.65317919 0.68225094]


### Random Forest

In [60]:
from sklearn.ensemble import RandomForestClassifier
# Random forest limited to depth-3 trees; random_state=0 fixes the seed so repeated
# runs build the same forest.
rforest = RandomForestClassifier(max_depth=3, random_state=0)

In [61]:
# 5-fold cross-validation of the random forest; prints one score per fold.
rforest_scores = cross_val_score(rforest, features_df, target, cv=5)
print(rforest_scores)

[0.76780554 0.76640598 0.75807548 0.76368582 0.77439646]


### Gradient Boosting

In [63]:
from sklearn.ensemble import GradientBoostingClassifier

# scikit-learn gradient boosting: 100 boosting stages of depth-2 trees with a 0.1
# learning rate; random_state=0 fixes the seed.
boost = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=2,
    random_state=0,
)

In [64]:
# 5-fold cross-validation of the gradient boosting classifier; prints one score per fold.
boost_scores = cross_val_score(boost, features_df, target, cv=5)
print(boost_scores)

[0.76899541 0.76742605 0.7599456  0.76402584 0.77643659]
