# Austin Animal Center Shelter - Plotly Summer Challenge

## Imports

In [206]:
import pandas as pd
import numpy as np
from AAC_challenge import data
from AAC_challenge import plots

import plotly as plt
import seaborn as sns
import plotly.express as px

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Data loading and basic cleaning

In [178]:
aac_eng_df = data.get_clean_cat_dataset('cats')

### `aac_eng_df` data 

In [141]:
aac_eng_df.head(3)

Unnamed: 0,breed,color,date_of_birth,outcome_datetime,outcome_type,sex,sterilized,periods,period_range,outcome_age_(days),outcome_age_(years),outcome_weekday,cfa_breed,domestic_breed,coat_pattern,main_color,coat,has_name,adopted_or_not,outcome_month_year
0,domestic shorthair,orange,2014-07-07,2014-07-22,Transfer,Male,0,2,7,14,0.038356,Tuesday,0,1,tabby,orange,orange,0,0,2014-07-01
1,domestic shorthair,blue /white,2014-06-16,2014-08-14,Adoption,Female,0,1,30,30,0.082192,Thursday,0,1,tabby,blue,blue,1,1,2014-08-01
2,domestic shorthair,white/black,2014-03-26,2014-06-29,Adoption,Female,1,3,30,90,0.246575,Sunday,0,1,tabby,white,white,1,1,2014-06-01


In [121]:
# total females vs males
total_females = aac_eng_df.groupby('sex').count()['has_name'][0]
total_males = aac_eng_df.groupby('sex').count()['has_name'][1]
ratio_females = round(total_females/(total_females + total_males)*100,2)
ratio_males = round(total_males/(total_females + total_males)*100,2)

print(f"There is {ratio_females}% of females and {ratio_males}% males.")

There is 55.05% of females and 44.95% males.


In [122]:
# outcome types
outcome_types = pd.DataFrame(aac_eng_df.outcome_type.value_counts())
for index, row in outcome_types.iterrows():
    ratio = round(row.outcome_type/outcome_types.outcome_type.sum()*100,2)
    print(f"There is {ratio}% of {index}")
    print("----------------------")

There is 45.29% of Transfer
----------------------
There is 43.28% of Adoption
----------------------
There is 4.93% of Euthanasia
----------------------
There is 4.87% of Return to Owner
----------------------
There is 1.37% of Died
----------------------
There is 0.11% of Rto-Adopt
----------------------
There is 0.1% of Missing
----------------------
There is 0.05% of Disposal
----------------------


### `aac_eng_adopted`

In [123]:
aac_eng_adopted = aac_eng_df[aac_eng_df['outcome_type'] == 'Adoption']
aac_eng_adopted

Unnamed: 0,breed,color,date_of_birth,outcome_datetime,outcome_type,sex,sterilized,periods,period_range,outcome_age_(days),outcome_age_(years),outcome_weekday,cfa_breed,domestic_breed,coat_pattern,main_color,coat,has_name,adopted_or_not,outcome_month_year
1,domestic shorthair,blue /white,2014-06-16,2014-08-14,Adoption,Female,0,1,30,30,0.082192,Thursday,0,1,tabby,blue,blue,1,1,2014-08
2,domestic shorthair,white/black,2014-03-26,2014-06-29,Adoption,Female,1,3,30,90,0.246575,Sunday,0,1,tabby,white,white,1,1,2014-06
5,domestic shorthair,brown,2014-06-02,2014-08-13,Adoption,Female,1,2,30,60,0.164384,Wednesday,0,1,tabby,brown,brown,1,1,2014-08
7,domestic shorthair,black,2014-03-22,2014-08-31,Adoption,Male,1,5,30,150,0.410959,Sunday,0,1,tabby,black,black,1,1,2014-08
8,domestic shorthair,orange,2014-08-03,2014-10-31,Adoption,Male,1,2,30,60,0.164384,Friday,0,1,tabby,orange,orange,1,1,2014-10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29405,domestic mediumhair,brown /white,2016-07-29,2018-02-01,Adoption,Female,1,1,365,365,1.0,Thursday,0,1,tabby,brown,brown,1,1,2018-02
29406,american shorthair,blue /white,2017-09-14,2017-12-10,Adoption,Female,0,2,30,60,0.164384,Sunday,1,0,tabby,blue,blue,1,1,2017-12
29407,domestic shorthair,brown /white,2017-08-03,2017-09-24,Adoption,Male,0,1,30,30,0.082192,Sunday,0,1,tabby,brown,brown,1,1,2017-09
29409,domestic shorthair,brown,2017-07-28,2018-02-01,Adoption,Female,1,6,30,180,0.493151,Thursday,0,1,tortie,Breed Specific,tortie,1,1,2018-02


In [124]:
aac_eng_df.adopted_or_not.value_counts()

0    16682
1    12729
Name: adopted_or_not, dtype: int64

In [125]:
print(f'Adoptions represent a total of {round(aac_eng_df.adopted_or_not.value_counts()[1]/aac_eng_df.adopted_or_not.value_counts().sum()*100,2)} % of all outcomes ')

Adoptions represent a total of 43.28 % of all outcomes 


## Basic data visualization

### `coat` and `sex`


The four most common coats are `black`, `brown`, `blue`, and `orange` and fairly even between male and female.

The coats `tortie` and `calico` are exclusively females.

In [126]:
fig = px.histogram(aac_eng_df, x="coat", color = 'sex', title= "All outcomes")
fig.show()

In [127]:
fig = px.histogram(aac_eng_adopted, x="coat", color = 'sex', title= "Adoptions only")
fig.show()

In [128]:
fig = px.histogram(aac_eng_df, x="coat_pattern", color = 'adopted_or_not', title= "Adopted or not")
fig.show()

In [129]:
fig = px.histogram(aac_eng_df, x="cfa_breed", color = 'adopted_or_not', title= "Adopted or not")
fig.show()

In [130]:
fig = px.histogram(aac_eng_df, x="domestic_breed", color = 'adopted_or_not', title= "adopted_or_not")
fig.show()

### `age`

Most outcomes are less than `6 months` old and exponentially decrease with age. Across all ages, outcomes are fairly evenly distributed between male and female.

In [207]:
plots.get_age_histogram(aac_eng_df)

In [209]:
plots.get_age_histogram(aac_eng_df, adoptions_only=True)

### `sterilized` 

Outcomes are primarily transfers and adoptions. For adoptions, 90% of the cats are spayed/neutered.

In [210]:
plots.get_sterilized_histogram(aac_eng_df)

### `outcome_weekday`

In [211]:
plots.get_weekday_histogram(aac_eng_df)

**Overall, there are more adoptions on Saturdays (and Sundays) and less adoptions on Thurdays.**

### `breed`

In [45]:
aac_eng_df.breed.value_counts()

domestic shorthair                      23710
domestic mediumhair                      2454
domestic longhair                        1273
siamese                                  1058
american shorthair                        211
                                        ...  
snowshoe/domestic shorthair                 1
domestic longhair/russian blue              1
havana brown                                1
domestic shorthair/british shorthair        1
domestic mediumhair/manx                    1
Name: breed, Length: 65, dtype: int64

In [46]:
# breeds
breeds = pd.DataFrame(aac_eng_df.breed.value_counts()).reset_index()
ratios = []
for index, row in breeds.iterrows():
    ratios.append(round(row.breed/breeds.breed.sum()*100,2))

breeds['breed_ratio'] = ratios
breeds.columns = ['breed', 'count', 'breed_ratio']
breeds.sort_values(by='breed_ratio', ascending=False).head(10)

Unnamed: 0,breed,count,breed_ratio
0,domestic shorthair,23710,80.62
1,domestic mediumhair,2454,8.34
2,domestic longhair,1273,4.33
3,siamese,1058,3.6
4,american shorthair,211,0.72
5,snowshoe,156,0.53
6,maine coon,116,0.39
7,manx,80,0.27
8,russian blue,69,0.23
9,himalayan,38,0.13


In [74]:
# breeds
breeds_adopted = pd.DataFrame(aac_eng_adopted.breed.value_counts()).reset_index()
ratios = []
for index, row in breeds_adopted.iterrows():
    ratios.append(round(row.breed/breeds_adopted.breed.sum()*100,2))

breeds_adopted['breed_ratio'] = ratios
breeds_adopted.columns = ['breed', 'count', 'breed_ratio']
breeds_adopted.sort_values(by='breed_ratio', ascending=False).head(10)

Unnamed: 0,breed,count,breed_ratio
0,domestic shorthair,10077,79.17
1,domestic mediumhair,1112,8.74
2,domestic longhair,605,4.75
3,siamese,492,3.87
4,american shorthair,77,0.6
5,snowshoe,75,0.59
6,maine coon,62,0.49
7,manx,45,0.35
8,russian blue,35,0.27
9,persian,19,0.15


In [47]:
breeds['breed'][breeds.breed_ratio < 1] = 'other'
breeds



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,breed,count,breed_ratio
0,domestic shorthair,23710,80.62
1,domestic mediumhair,2454,8.34
2,domestic longhair,1273,4.33
3,siamese,1058,3.60
4,other,211,0.72
...,...,...,...
60,other,1,0.00
61,other,1,0.00
62,other,1,0.00
63,other,1,0.00


In [75]:
breeds_adopted['breed'][breeds_adopted.breed_ratio < 1] = 'other'
breeds_adopted



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,breed,count,breed_ratio
0,domestic shorthair,10077,79.17
1,domestic mediumhair,1112,8.74
2,domestic longhair,605,4.75
3,siamese,492,3.87
4,other,77,0.6
5,other,75,0.59
6,other,62,0.49
7,other,45,0.35
8,other,35,0.27
9,other,19,0.15


In [48]:
breeds = breeds.groupby('breed', as_index=False).sum()
breeds

Unnamed: 0,breed,count,breed_ratio
0,domestic longhair,1273,4.33
1,domestic mediumhair,2454,8.34
2,domestic shorthair,23710,80.62
3,other,916,3.08
4,siamese,1058,3.6


In [76]:
breeds_adopted = breeds_adopted.groupby('breed', as_index=False).sum()
breeds_adopted

Unnamed: 0,breed,count,breed_ratio
0,domestic longhair,605,4.75
1,domestic mediumhair,1112,8.74
2,domestic shorthair,10077,79.17
3,other,443,3.54
4,siamese,492,3.87


In [49]:
fig = px.pie(breeds, values = 'count', names = 'breed')
fig.show()

In [77]:
fig = px.pie(breeds_adopted, values = 'count', names = 'breed')
fig.show()

### `outcome_datetime`

Adoptions periodically peak in `July` and are the lowest in `March/April`.

In [213]:
plots.get_outcome_timeseries(aac_eng_df)

### `has_name`

**Most cats adopted have names!!**

In [214]:
plots.get_has_name_histogram(aac_eng_df)

## EDA summary

* There are as much males as there are females;
* Adoptions represent around 43% of all outcomes;
* The great majority of adopted cats are spayed/neutered and have a name;
* The most cats (therefore most adopted) have black, brown, and blue coats;
* The great majority of cats (therefore adopted cats) are 0 to 6 months old;
* There are more adoptions on Saturdays (and Sundays) and less on Thurdays;
* The most represented breeds of cats are domestic short hair, medium hair, long hair, and siamese;
* Adoptions peak in June/July and are the lowest in March/April;

**Next 

**Next step -> suggest actions to increase chances of adoption**

**Next step -> predict outcome 1 if adopted else 0**