# Modeling for Repeat Animals

___  
Over this section the data will be explored:  

* Pre-processing - Feature Engineering and Selection 
* Model Fitting - 
* Model Evaluation - 
* Production Steps - 
___

In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

In [2]:
# read in the data
df = pd.read_csv('../data/austin-data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156664 entries, 0 to 156663
Data columns (total 20 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   animal_id         156664 non-null  object 
 1   outcome_time      156664 non-null  object 
 2   date_of_birth     156664 non-null  object 
 3   outcome_type      156664 non-null  object 
 4   outcome_gender    156664 non-null  object 
 5   outcome_age       156664 non-null  float64
 6   intake_time       156664 non-null  object 
 7   found_location    156664 non-null  object 
 8   intake_type       156664 non-null  object 
 9   intake_condition  156664 non-null  object 
 10  animal_type       156664 non-null  object 
 11  intake_gender     156664 non-null  object 
 12  intake_age        156664 non-null  float64
 13  breed             156664 non-null  object 
 14  color             156664 non-null  object 
 15  stay              156664 non-null  int64  
 16  repeat            15

In [3]:
df.head()

Unnamed: 0,animal_id,outcome_time,date_of_birth,outcome_type,outcome_gender,outcome_age,intake_time,found_location,intake_type,intake_condition,animal_type,intake_gender,intake_age,breed,color,stay,repeat,animal_stay,stay_duration,spay_neuter
0,A912799,2024-10-17 13:07:00,2024-07-21,Adoption,Spayed Female,2.0,2024-09-05 14:57:00,7201 Levander Loop in Austin (TX),Abandoned,Normal,Cat,Intact Female,1.0,Domestic Shorthair,Brown Tabby,1,0,A912799-1,41,1
1,A912055,2024-10-17 12:25:00,2023-10-25,Adoption,Neutered Male,11.0,2024-08-25 08:20:00,1800 Fairlawn Lane in Austin (TX),Stray,Injured,Cat,Intact Male,10.0,Domestic Shorthair,Brown Tabby/White,1,0,A912055-1,53,1
2,A915002,2024-10-17 12:21:00,2023-10-10,Return to Owner,Intact Male,12.0,2024-10-10 12:10:00,Austin (TX),Public Assist,Normal,Dog,Intact Male,12.0,German Shepherd Mix,Tan,1,0,A915002-1,7,0
3,A912548,2024-10-17 11:45:00,2021-09-02,Adoption,Neutered Male,36.0,2024-09-02 22:31:00,6900 Bryn Mawr in Austin (TX),Stray,Normal,Dog,Intact Male,36.0,Siberian Husky Mix,Black/White,1,0,A912548-1,44,1
4,A915279,2024-10-17 00:00:00,2022-10-14,Transfer,Intact Female,24.0,2024-10-14 11:47:00,14514 Highsmith Street in Austin (TX),Stray,Normal,Cat,Intact Female,24.0,Domestic Shorthair,Black,1,0,A915279-1,2,0


In [4]:
df.corr(numeric_only=True)

Unnamed: 0,outcome_age,intake_age,stay,repeat,stay_duration,spay_neuter
outcome_age,1.0,0.998809,0.10773,0.107172,0.022438,-0.294358
intake_age,0.998809,1.0,0.107337,0.107376,0.004713,-0.300218
stay,0.10773,0.107337,1.0,0.540634,0.014903,-0.199141
repeat,0.107172,0.107376,0.540634,1.0,0.012847,-0.149212
stay_duration,0.022438,0.004713,0.014903,0.012847,1.0,0.243358
spay_neuter,-0.294358,-0.300218,-0.199141,-0.149212,0.243358,1.0


### Feature Selection Engineering and Selection

In [5]:
df.columns

Index(['animal_id', 'outcome_time', 'date_of_birth', 'outcome_type',
       'outcome_gender', 'outcome_age', 'intake_time', 'found_location',
       'intake_type', 'intake_condition', 'animal_type', 'intake_gender',
       'intake_age', 'breed', 'color', 'stay', 'repeat', 'animal_stay',
       'stay_duration', 'spay_neuter'],
      dtype='object')

In [58]:
df_lr = df[['outcome_type', 'outcome_gender', 'intake_type', 'intake_condition', \
'animal_type', 'intake_gender', 'intake_age', 'breed', 'color', 'stay_duration', \
            'spay_neuter', 'repeat']]

In [59]:
df_lr.dtypes

outcome_type         object
outcome_gender       object
intake_type          object
intake_condition     object
animal_type          object
intake_gender        object
intake_age          float64
breed                object
color                object
stay_duration         int64
spay_neuter           int64
repeat                int64
dtype: object

In [60]:
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

In [61]:
ohe.fit_transform(df_lr)

array([[1., 0., 0., ..., 1., 1., 0.],
       [1., 0., 0., ..., 1., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 1., 0., 1.],
       [1., 0., 0., ..., 0., 0., 1.]])

In [62]:
X = pd.DataFrame(ohe.fit_transform(df_lr), columns=ohe.get_feature_names_out())

In [63]:
X.head()

Unnamed: 0,outcome_type_Adoption,outcome_type_Died,outcome_type_Disposal,outcome_type_Euthanasia,outcome_type_Lost,outcome_type_Missing,outcome_type_Relocate,outcome_type_Return to Owner,outcome_type_Rto-Adopt,outcome_type_Stolen,...,stay_duration_360,stay_duration_361,stay_duration_362,stay_duration_363,stay_duration_364,stay_duration_365,spay_neuter_0,spay_neuter_1,repeat_0,repeat_1
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [None]:
# Correlation matrix for features to Sale Price in a heat map

plt.figure(figsize=(8, 5))
plt.title('Top Correlations')

sns.heatmap(X.corr(numeric_only=True)[['Repeat']].sort_values(by = 'Repeat', ascending = False).head(10),
           vmin = -1,
           vmax = 1,
           annot = True,
           cmap = 'inferno');

plt.tight_layout(pad=1.5)
#plt.savefig('./assets/Sale_Price_De_Indicators.jpg');
plt.show();