# Modeling for Repeat Animals

___  
Over this section the data will be explored:  

* Pre-processing - Feature Engineering and Selection 
* Model Fitting - 
* Model Evaluation - 
* Production Steps - 
___

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression

In [2]:
# read in the data
df = pd.read_csv('../data/austin-data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156664 entries, 0 to 156663
Data columns (total 20 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   animal_id         156664 non-null  object 
 1   outcome_time      156664 non-null  object 
 2   date_of_birth     156664 non-null  object 
 3   outcome_type      156664 non-null  object 
 4   outcome_gender    156664 non-null  object 
 5   outcome_age       156664 non-null  float64
 6   intake_time       156664 non-null  object 
 7   found_location    156664 non-null  object 
 8   intake_type       156664 non-null  object 
 9   intake_condition  156664 non-null  object 
 10  animal_type       156664 non-null  object 
 11  intake_gender     156664 non-null  object 
 12  intake_age        156664 non-null  float64
 13  breed             156664 non-null  object 
 14  color             156664 non-null  object 
 15  stay              156664 non-null  int64  
 16  repeat            15

In [3]:
df.head()

Unnamed: 0,animal_id,outcome_time,date_of_birth,outcome_type,outcome_gender,outcome_age,intake_time,found_location,intake_type,intake_condition,animal_type,intake_gender,intake_age,breed,color,stay,repeat,animal_stay,stay_duration,spay_neuter
0,A912799,2024-10-17 13:07:00,2024-07-21,Adoption,Spayed Female,2.0,2024-09-05 14:57:00,7201 Levander Loop in Austin (TX),Abandoned,Normal,Cat,Intact Female,1.0,Domestic Shorthair,Brown Tabby,1,0,A912799-1,41,1
1,A912055,2024-10-17 12:25:00,2023-10-25,Adoption,Neutered Male,11.0,2024-08-25 08:20:00,1800 Fairlawn Lane in Austin (TX),Stray,Injured,Cat,Intact Male,10.0,Domestic Shorthair,Brown Tabby/White,1,0,A912055-1,53,1
2,A915002,2024-10-17 12:21:00,2023-10-10,Return to Owner,Intact Male,12.0,2024-10-10 12:10:00,Austin (TX),Public Assist,Normal,Dog,Intact Male,12.0,German Shepherd Mix,Tan,1,0,A915002-1,7,0
3,A912548,2024-10-17 11:45:00,2021-09-02,Adoption,Neutered Male,36.0,2024-09-02 22:31:00,6900 Bryn Mawr in Austin (TX),Stray,Normal,Dog,Intact Male,36.0,Siberian Husky Mix,Black/White,1,0,A912548-1,44,1
4,A915279,2024-10-17 00:00:00,2022-10-14,Transfer,Intact Female,24.0,2024-10-14 11:47:00,14514 Highsmith Street in Austin (TX),Stray,Normal,Cat,Intact Female,24.0,Domestic Shorthair,Black,1,0,A915279-1,2,0


In [20]:
# Examine features for selection
df.columns

Index(['animal_id', 'outcome_time', 'date_of_birth', 'outcome_type',
       'outcome_gender', 'outcome_age', 'intake_time', 'found_location',
       'intake_type', 'intake_condition', 'animal_type', 'intake_gender',
       'intake_age', 'breed', 'color', 'stay', 'repeat', 'animal_stay',
       'stay_duration', 'spay_neuter'],
      dtype='object')

In [5]:
# Select relevant features for modeling
df_lr = df[['outcome_type', 'outcome_gender', 'intake_type', 'intake_condition', \
'animal_type', 'intake_gender', 'intake_age', 'breed', 'color', 'stay_duration', \
            'spay_neuter', 'repeat']]

In [6]:
# Setup feature matrix and target variable
X = df_lr.drop(columns='repeat')
y = df_lr['repeat']

In [7]:
# Check for distribution of target
df_lr['repeat'].value_counts(normalize=True)

repeat
0    0.81099
1    0.18901
Name: proportion, dtype: float64

In [8]:
# Train, test, split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=27, stratify=y)

In [9]:
# Converting categorical features to numerical
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
ohe.fit(X_train)

In [10]:
# Transforming training and testing data
X_train_ohe = ohe.transform(X_train)
X_test_ohe = ohe.transform(X_test)
X_train_ohe.shape, X_test_ohe.shape

((125331, 3535), (31333, 3535))

In [11]:
# Putting data back into Dataframes
X_train_ohe = pd.DataFrame(X_train_ohe, columns=ohe.get_feature_names_out())
X_test_ohe = pd.DataFrame(X_test_ohe, columns=ohe.get_feature_names_out())
X_train_ohe.head()

Unnamed: 0,outcome_type_Adoption,outcome_type_Died,outcome_type_Disposal,outcome_type_Euthanasia,outcome_type_Lost,outcome_type_Missing,outcome_type_Relocate,outcome_type_Return to Owner,outcome_type_Rto-Adopt,outcome_type_Stolen,...,stay_duration_357,stay_duration_358,stay_duration_360,stay_duration_361,stay_duration_362,stay_duration_363,stay_duration_364,stay_duration_365,spay_neuter_0,spay_neuter_1
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [45]:
# Instantiate and Fit model
lr = LogisticRegression(penalty='l1',
                        solver='saga',
                        #C=0.5,
                        max_iter=200)
lr.fit(X_train_ohe, y_train)

In [46]:
# Make predictions
lr.predict(X_test_ohe)

# Create probabilities
np.round(lr.predict_proba(X_test_ohe), decimals=4)[:10]

array([[0.9545, 0.0455],
       [0.4652, 0.5348],
       [0.9926, 0.0074],
       [0.9888, 0.0112],
       [0.857 , 0.143 ],
       [0.5395, 0.4605],
       [0.7082, 0.2918],
       [0.7262, 0.2738],
       [0.9908, 0.0092],
       [0.9653, 0.0347]])

In [43]:
# Evaluate model
f'Training data {round(lr.score(X_train_ohe, y_train) *100,2)}%'

'Training data 84.28%'

In [44]:
f'Testing data {round(lr.score(X_test_ohe, y_test)*100,2)}%'

'Testing data 83.66%'

In [40]:
f'Null Model {round(df_lr['repeat'].value_counts(normalize=True)[0] *100, 2)}%.'

'Null Model 81.1%.'