# Basic classification model of outcome

This notebook performs a simple binary classification on the outcomes of cats using a `logistic regression`.

## Imports

In [1]:
import datetime
import pandas as pd
import numpy as np
from AAC_challenge import data
from AAC_challenge import utils

from sklearn.impute import SimpleImputer

%load_ext autoreload
%autoreload 2

## Data loading and cleaning

In [2]:
cat_df = data.get_clean_cat_dataset('cats')
cat_df['top_breeds'] = cat_df.breed.apply(utils.map_top_breeds)
cat_df.head()

Unnamed: 0,breed,color,date_of_birth,outcome_datetime,outcome_type,sex,sterilized,periods,period_range,outcome_age_(days),...,outcome_weekday,cfa_breed,domestic_breed,coat_pattern,main_color,coat,has_name,adopted_or_not,outcome_year_month,top_breeds
0,domestic shorthair,orange,2014-07-07,2014-07-22,Transfer,Male,0,2,7,14,...,Tuesday,0,1,tabby,orange,orange,0,0,2014-07-01,domestic shorthair
1,domestic shorthair,blue /white,2014-06-16,2014-08-14,Adoption,Female,0,1,30,30,...,Thursday,0,1,tabby,blue,blue,1,1,2014-08-01,domestic shorthair
2,domestic shorthair,white/black,2014-03-26,2014-06-29,Adoption,Female,1,3,30,90,...,Sunday,0,1,tabby,white,white,1,1,2014-06-01,domestic shorthair
3,domestic mediumhair,black/white,2013-03-27,2014-03-28,Return to Owner,Female,1,1,365,365,...,Friday,0,1,tabby,black,black,1,0,2014-03-01,domestic mediumhair
4,domestic shorthair,black/white,2013-12-16,2014-01-09,Transfer,Male,0,3,7,21,...,Thursday,0,1,tabby,black,black,0,0,2014-01-01,domestic shorthair


In [3]:
cat_df.dtypes

breed                          object
color                          object
date_of_birth          datetime64[ns]
outcome_datetime               object
outcome_type                   object
sex                            object
sterilized                     object
periods                        object
period_range                   object
outcome_age_(days)             object
outcome_age_(years)            object
outcome_month                  object
outcome_year                   object
outcome_weekday                object
cfa_breed                      object
domestic_breed                 object
coat_pattern                   object
main_color                     object
coat                           object
has_name                       object
adopted_or_not                 object
outcome_year_month     datetime64[ns]
top_breeds                     object
dtype: object

In [4]:
cat_df.sterilized = cat_df.sterilized.astype(str).astype(int)
cat_df.cfa_breed = cat_df.cfa_breed.astype(str).astype(int)
cat_df.domestic_breed = cat_df.domestic_breed.astype(str).astype(int)
cat_df.has_name = cat_df.has_name.astype(str).astype(int)
cat_df.adopted_or_not = cat_df.adopted_or_not.astype(str).astype(int)

In [5]:
cat_df.columns

Index(['breed', 'color', 'date_of_birth', 'outcome_datetime', 'outcome_type',
       'sex', 'sterilized', 'periods', 'period_range', 'outcome_age_(days)',
       'outcome_age_(years)', 'outcome_month', 'outcome_year',
       'outcome_weekday', 'cfa_breed', 'domestic_breed', 'coat_pattern',
       'main_color', 'coat', 'has_name', 'adopted_or_not',
       'outcome_year_month', 'top_breeds'],
      dtype='object')

In [25]:
features_df = cat_df[['top_breeds', 'date_of_birth', 'sex', 'sterilized',
                      'cfa_breed', 'domestic_breed', 'coat_pattern',
                     'coat', 'has_name']]

target = cat_df['adopted_or_not'].astype(str).astype(float)

## One Hot Encoding


* Don't OHE breed (keep top_breeds instead)
* OHE coat_pattern
* OHE coat

In [26]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(sparse = False)

### One Hot Encode `top_breeds`


In [27]:
features_df.top_breeds.unique()

array(['domestic shorthair', 'domestic mediumhair', 'siamese', 'other',
       'domestic longhair', 'american shorthair'], dtype=object)

In [28]:
enc.fit(features_df[['top_breeds']]) # Fit encoder

top_breeds_encoded = enc.transform(features_df[['top_breeds']]) # Encode alley

features_df["american_shorthair"],features_df["domestic_longhair"],features_df['domestic_mediumhair'],features_df['domestic_shorthair'],features_df['other'],features_df['siamese']  = top_breeds_encoded.T # Transpose encoded Alley back into dataframe

features_df.drop('top_breeds', axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features_df["american_shorthair"],features_df["domestic_longhair"],features_df['domestic_mediumhair'],features_df['domestic_shorthair'],features_df['other'],features_df['siamese']  = top_breeds_encoded.T # Transpose encoded Alley back into dataframe
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features_df["american_shorthair"],features_df["domestic_longhair"],features_df['domestic_mediumhair'],features_df['domestic_shorthair'],features_df['other'],features_df['siamese']  = top_breeds_encoded.T # Transpose enco

In [29]:
features_df.head(5)

Unnamed: 0,date_of_birth,sex,sterilized,cfa_breed,domestic_breed,coat_pattern,coat,has_name,american_shorthair,domestic_longhair,domestic_mediumhair,domestic_shorthair,other,siamese
0,2014-07-07,Male,0,0,1,tabby,orange,0,0.0,0.0,0.0,1.0,0.0,0.0
1,2014-06-16,Female,0,0,1,tabby,blue,1,0.0,0.0,0.0,1.0,0.0,0.0
2,2014-03-26,Female,1,0,1,tabby,white,1,0.0,0.0,0.0,1.0,0.0,0.0
3,2013-03-27,Female,1,0,1,tabby,black,1,0.0,0.0,1.0,0.0,0.0,0.0
4,2013-12-16,Male,0,0,1,tabby,black,0,0.0,0.0,0.0,1.0,0.0,0.0


### Map `sex`

In [30]:
def map_sex(x):
    if x == 'Male':
        return 0
    return 1

In [31]:
features_df.sex = features_df.sex.apply(map_sex)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features_df.sex = features_df.sex.apply(map_sex)


### One Hot Encode `coat_pattern`

In [32]:
features_df.coat_pattern.unique()

array(['tabby', 'point', 'torbie', 'calico', 'tortie', 'smoke', 'agouti',
       'brindle', 'tricolor'], dtype=object)

In [33]:
enc.fit(features_df[['coat_pattern']]) # Fit encoder

coat_pattern_encoded = pd.DataFrame(enc.transform(features_df[['coat_pattern']]), columns=enc.get_feature_names())

#features_df.drop('top_breeds', axis = 1, inplace = True)

In [34]:
features_df = pd.concat([features_df, coat_pattern_encoded], axis=1)
features_df.drop(columns='coat_pattern', inplace = True)

### One Hot Encode `coat`

In [35]:
features_df.coat.unique()

array(['orange', 'blue ', 'white', 'black', 'brown', 'seal', 'torbie',
       'blue', 'cream ', 'calico', 'chocolate', 'orange ', 'silver',
       'flame', 'tortie', 'cream', 'lynx', 'seal ', 'lilac', 'buff',
       'blue cream', 'black ', 'silver lynx', 'gray', 'gray ', 'yellow',
       'apricot', 'lynx ', 'chocolate ', 'silver ', 'lilac ',
       'brown tiger', 'black tiger', 'tan', 'orange tiger', 'agouti',
       'flame ', 'silver lynx ', 'tricolor', 'sable', 'pink',
       'brown merle', 'fawn'], dtype=object)

In [36]:
enc.fit(features_df[['coat']]) # Fit encoder

coat_encoded = pd.DataFrame(enc.transform(features_df[['coat']]), columns=enc.get_feature_names())

#features_df.drop('top_breeds', axis = 1, inplace = True)

In [37]:
features_df = pd.concat([features_df, coat_encoded], axis=1)
features_df.drop(columns='coat', inplace = True)

## Baseline model and cross validation

### Baseline model

In [38]:
target.value_counts()

0.0    16682
1.0    12729
Name: adopted_or_not, dtype: int64

In [39]:
print(f'The dataset is fairly balanced, with {round(target.value_counts()[1]/target.value_counts().sum()*100, 2)}% of adoptions.')

The dataset is fairly balanced, with 43.28% of adoptions.


A baseline model is expected to have a 43% precision.

### Cross validation

In [40]:
features_df.drop(columns='date_of_birth', inplace=True)
features_df

Unnamed: 0,sex,sterilized,cfa_breed,domestic_breed,has_name,american_shorthair,domestic_longhair,domestic_mediumhair,domestic_shorthair,other,...,x0_silver,x0_silver.1,x0_silver lynx,x0_silver lynx.1,x0_tan,x0_torbie,x0_tortie,x0_tricolor,x0_white,x0_yellow
0,0,0,0,1,0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0,0,1,1,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,1,0,1,1,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1,1,0,1,1,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,0,1,0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29406,1,0,1,0,1,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29407,0,0,0,1,1,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29408,0,1,0,1,0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29409,1,1,0,1,1,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [45]:
features_df.dtypes

sex                 int64
sterilized          int64
cfa_breed           int64
domestic_breed      int64
has_name            int64
                   ...   
x0_torbie         float64
x0_tortie         float64
x0_tricolor       float64
x0_white          float64
x0_yellow         float64
Length: 63, dtype: object

In [47]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

log_reg = LogisticRegression(solver='liblinear')
print(cross_val_score(log_reg, features_df, target, cv=5))

[0.76797552 0.76657599 0.7584155  0.76436586 0.77575655]
