In [None]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import scipy.stats as st

%matplotlib inline

In [None]:
# Load the intermediate DataFrames that this notebook analyses.
# NOTE(review): presumably these pickles are written by prep.ipynb — confirm.
# NOTE(review): unpickling can execute arbitrary code, so only load .pkl files
# that were produced locally / come from a trusted source.
df_out = pd.read_pickle('df_out.pkl')
df_breeds = pd.read_pickle('df_breeds.pkl')
df_out_with_breeds_info = pd.read_pickle('df_out_with_breeds_info.pkl')
df_breeds_with_info = pd.read_pickle('df_breeds_with_info.pkl')
# Column/dtype summary of df_out, followed by its first rows.
df_out.info()
df_out.head()

In [None]:
# Preview the first rows of the breed-level table used in "Analysis by breed".
df_breeds_with_info.head()

# Analysis by breed

No strong correlations with adoption are apparent at the breed level yet.

In [None]:
# Pairwise correlation matrix of the breed-level columns (DataFrame.corr with its defaults).
df_breeds_with_info_corr = df_breeds_with_info.corr()

# The heatmap shows absolute values, so strong negative and strong positive
# correlations are rendered with the same intensity.
plt.figure(num=None, figsize=(12, 10), dpi=96, facecolor='w', edgecolor='k')
sns.heatmap(data=df_breeds_with_info_corr.abs())

def score(df, var1, var2):
    """Print one entry of a correlation table.

    ``df`` is indexed as ``df[var1][var2]``: column ``var1``, row ``var2``.
    """
    coefficient = df[var1][var2]
    print(f'Corr({var1}, {var2}) ≈ {coefficient}')

# Print the correlation of 'Adopted' with each breed-level feature of interest.
for _feature in [
    'Color 0 B (mean)',
    'Color 0 B (std dev)',
    'Color 0 V (mean)',
    'Color 0 V (std dev)',
    'average height',
    'height_low_inches',
    'height_high_inches',
    'Lifespan Low',
]:
    score(df_breeds_with_info_corr, 'Adopted', _feature)

In [None]:
# Scatter 'Adopted' (x-axis) against the breed-level variables seaborn selects for the rows.
sns.pairplot(data=df_breeds_with_info, x_vars=['Adopted'])

## Height ~ adopted?

Is the average height of a breed correlated with the likelihood that an animal of that breed is adopted? The Pearson correlation coefficient is Corr(Adopted, average height) ≈ 0.2286839421877296.

This section analyzes this by breed and also by individual animal.

In [None]:
# TODO 1: fit a logistic regression on the individual animals in df_out_with_breeds_info,
# with Adopted as the response and 'average height' as the predictor


In [None]:
# TODO 2: make the "Count" column be the weight for the hexbin()
# so that breeds with more animals weigh more heavily than breeds with few animals.
# (One option to evaluate: pass C='Count' together with reduce_C_function=np.sum —
# TODO confirm this gives the intended weighting.)

# As written this is unweighted: each row of df_breeds_with_info contributes one count to its hexagon.
df_breeds_with_info.plot.hexbin(x='average height', y='Adopted', gridsize=8)

In [None]:
# TODO 3: make the "Count" column be the weight for the points and regression here
# TODO 4: include Y error bars for the uncertainty in the true value of Adopted for that breed
# (see how the confidence interval was constructed in the color regression later)

# Scatter plot with a fitted regression line and marginal distributions.
# Both columns are cast to float before being handed to seaborn.
sns.jointplot(
    x=df_breeds_with_info['average height'].astype(dtype=float),
    y=df_breeds_with_info.Adopted.astype(dtype=float),
    kind='reg')

# Analysis by individuals

## Color

(results)

In [None]:
# List the distinct values found in each of the two colour columns.
print('Colors')
for _slot in ('Color 0', 'Color 1'):
    print(df_out_with_breeds_info[_slot].unique())

In [None]:
# Rows with a first colour but no second colour.
df_out_colors_1 = df_out.loc[df_out['Color 0'].notna() & df_out['Color 1'].isna()]
# Rows with both a first and a second colour.
df_out_colors_2 = df_out.loc[df_out['Color 0'].notna() & df_out['Color 1'].notna()]

In [None]:
def bigCorr_bernoulli(df, independent, dependent):
    """Share of rows with ``dependent == True`` within each ``independent`` group.

    Counts of every (independent, dependent) pair are divided by the number of
    rows in the matching ``independent`` group, and only the ``True`` slice is
    returned. Groups that never have ``dependent == True`` are therefore absent
    from the result rather than reported as 0.

    Returns a Series indexed by the values of ``independent``.
    """
    pairs_by_group = df[[independent, dependent]].groupby(independent)
    rows_by_group = df[[independent]].groupby(independent)

    pair_counts = pairs_by_group.value_counts()
    group_sizes = rows_by_group.value_counts()

    shares = pair_counts.div(group_sizes)
    return shares[:, True]

In [None]:
def _color_slot_frames(df, slots):
    """Return one ('Color', 'Adopted') frame per colour column of ``df`` named in ``slots``."""
    return [df[[slot, 'Adopted']].rename(columns={slot: 'Color'}) for slot in slots]


def _pooled_adoption_share(frames):
    """Adoption share per colour, pooling the counts of several ('Color', 'Adopted') frames.

    For every colour, the number of (colour, Adopted) occurrences summed over
    ``frames`` is divided by the number of occurrences of that colour summed
    over ``frames``; only the ``Adopted == True`` slice is returned. Colours
    that never occur with ``Adopted == True`` are absent from the result.
    """
    def pooled(counts):
        # Fold from the right so the additions nest as a.add(b.add(c)),
        # with a missing key in either operand treated as a count of 0.
        total = counts[-1]
        for part in reversed(counts[:-1]):
            total = part.add(total, fill_value=0)
        return total

    numerator = pooled([frame.groupby('Color').value_counts() for frame in frames])
    denominator = pooled([frame[['Color']].groupby('Color').value_counts() for frame in frames])

    return (numerator.div(denominator))[:, True]


def bigCorr_bernoulli_custom_colors_2():
    """Adoption share per colour among two-colour animals (``df_out_colors_2``).

    An animal counts once for its 'Color 0' and once for its 'Color 1'.
    Returns a Series indexed by colour.
    """
    return _pooled_adoption_share(
        _color_slot_frames(df_out_colors_2, ['Color 0', 'Color 1'])
    )


def bigCorr_bernoulli_custom_colors_1_or_2():
    """Adoption share per colour over single- and two-colour animals together.

    Pools 'Color 0' of ``df_out_colors_1`` with 'Color 0' and 'Color 1' of
    ``df_out_colors_2``. Returns a Series indexed by colour.
    """
    return _pooled_adoption_share(
        _color_slot_frames(df_out_colors_1, ['Color 0'])
        + _color_slot_frames(df_out_colors_2, ['Color 0', 'Color 1'])
    )

In [None]:
# This is copied from prep.ipynb

from math import pi

# colors.csv was compiled from these wikipedia articles
# https://en.wikipedia.org/wiki/List_of_colors:_A–F
# https://en.wikipedia.org/wiki/List_of_colors:_G%E2%80%93M
# https://en.wikipedia.org/wiki/List_of_colors:_N%E2%80%93Z
# Then the "—" character was replaced with "0"
df_colors = pd.read_csv('colors.csv').convert_dtypes(infer_objects=True)
df_colors['Name'] = df_colors['Name'].str.lower()

# Each channel column is text with a unit suffix: strip the suffix, parse the
# number, and divide by the listed divisor (100 for percentages, 360 for degrees).
for _column, _suffix, _divisor in [
    ('Red (RGB)', '%', 100),
    ('Green (RGB)', '%', 100),
    ('Blue (RGB)', '%', 100),
    ('Hue (HSL/HSV)', '°', 360),
    ('Satur. (HSL)', '%', 100),
    ('Light (HSL)', '%', 100),
    ('Satur. (HSV)', '%', 100),
    ('Value (HSV)', '%', 100),
]:
    df_colors[_column] = pd.to_numeric(df_colors[_column].str.replace(_suffix, '')).div(_divisor)
df_colors.head()

def colorInfo(color):
    """Look up a colour description in the notebook-level ``df_colors`` table.

    Candidates are tried in order: the full lower-cased description, then each
    of its whitespace-separated words. An exact match on ``Name`` for any
    candidate wins; failing that, the first candidate that occurs as a literal
    substring of some ``Name`` is used.

    Parameters
    ----------
    color : str
        Colour description, e.g. ``'Brown Tabby'``.

    Returns
    -------
    pandas.DataFrame or None
        The matching rows of ``df_colors`` (possibly more than one), or
        ``None`` when nothing matches.
    """
    color = color.lower()
    # dict.fromkeys de-duplicates while keeping order. Blank candidates (empty
    # input, doubled spaces) are dropped: an empty string is a substring of
    # every name and would otherwise "match" the whole table.
    words = [word for word in dict.fromkeys([color, *color.split()]) if word.strip()]

    for word in words:
        items = df_colors.loc[df_colors.Name == word]
        if len(items) > 0:
            return items

    for word in words:
        # regex=False: colour words are plain text, so characters such as '(' or
        # '+' must match literally instead of being parsed as a pattern.
        # na=False: a missing Name never matches.
        items = df_colors.loc[df_colors.Name.str.contains(word, regex=False, na=False)]
        if len(items) > 0:
            return items

    return None

def rgb(color):
    """Return the ``(red, green, blue)`` values of the first row ``colorInfo`` finds.

    Returns ``(None, None, None)`` when ``colorInfo`` finds no match.
    """
    match = colorInfo(color)
    if match is None:
        return (None, None, None)
    channels = ('Red (RGB)', 'Green (RGB)', 'Blue (RGB)')
    return tuple(match[channel].values[0] for channel in channels)

In [None]:
def _wilson_interval(p, n, z):
    """Return the ``(low, high)`` bounds of the Wilson score interval.

    ``p`` is the observed proportion, ``n`` the number of observations and
    ``z`` the normal quantile for the desired confidence level. Uses only
    arithmetic operators, so ``p`` and ``n`` may be scalars or pandas Series.
    https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval
    """
    shrink = 1 / (1 + ((z ** 2) / n))
    center = shrink * (p + ((z ** 2) / (2 * n)))
    half_extent = z * shrink * ((((p * (1 - p)) / n) + ((z ** 2) / (4 * (n ** 2)))) ** (1 / 2))
    return center - half_extent, center + half_extent


def chartColorAdoptionLikelihood(df_colors, color_relation, alpha=0.01):
    """Horizontal bar chart of adoption probability per colour with Wilson error bars.

    Parameters
    ----------
    df_colors : pandas.DataFrame
        Indexed by colour name; must provide ``Adopted`` (observed adoption
        share) and ``Count`` (number of animals) columns.
    color_relation : str
        Wording inserted in the chart title (e.g. ``'single'``, ``'mixed'``).
    alpha : float, default 0.01
        Significance level; the error bars span a ``1 - alpha`` confidence interval.

    Shows the figure and prints the number of colours; returns ``None``.
    """
    z = st.norm.ppf(1 - (alpha / 2))
    p = df_colors.Adopted
    p_low, p_high = _wilson_interval(p, df_colors.Count, z)

    # Matplotlib error bars are distances measured from the bar end, not
    # absolute bounds. Clip at 0 so floating-point rounding at p == 0 or
    # p == 1 cannot produce a (rejected) negative length.
    xerr = [(p - p_low).clip(lower=0), (p_high - p).clip(lower=0)]

    # Paint each bar in its own colour where the name is known; mid-grey otherwise.
    colors = [rgb(color) for color in df_colors.index]
    colors = [color if color[0] is not None else '0.3' for color in colors]

    plt.figure(num=None, figsize=(5, 12), dpi=96, facecolor='w', edgecolor='k')
    plt.title(f'Probability of an animal with this {color_relation} color being adopted ({(1 - alpha):%} confidence)')
    ax = df_colors.Adopted.plot.barh(x='Color', xerr=xerr, ecolor='0.5', color=colors)
    ax.set_xlim(0, 1)
    plt.show()
    print(f'{len(df_colors)} colors')
    print()

def colors_single():
    """Chart adoption likelihood per colour for single-colour animals (``df_out_colors_1``)."""
    adopted_share = bigCorr_bernoulli(df_out_colors_1, 'Color 0', 'Adopted')
    counts = df_out_colors_1['Color 0'].value_counts()
    summary = (
        pd.DataFrame(index=counts.index)
        .assign(Color=counts.index, Count=counts, Adopted=adopted_share)
        .sort_values(by='Adopted', ascending=False)
    )
    chartColorAdoptionLikelihood(summary, 'single')
    
def colors_mixed():
    """Chart adoption likelihood per colour for two-colour animals (``df_out_colors_2``).

    A colour's count is the number of times it appears in either colour column.
    """
    adopted_share = bigCorr_bernoulli_custom_colors_2()
    first_counts = df_out_colors_2['Color 0'].value_counts()
    second_counts = df_out_colors_2['Color 1'].value_counts()
    counts = first_counts.add(second_counts, fill_value=0)
    summary = (
        pd.DataFrame(index=counts.index)
        .assign(Color=counts.index, Count=counts, Adopted=adopted_share)
        .sort_values(by='Adopted', ascending=False)
    )
    chartColorAdoptionLikelihood(summary, 'mixed')
    
def colors_singleOrMixed():
    """Chart adoption likelihood per colour over single- and two-colour animals together.

    A colour's count adds its occurrences in 'Color 0' of ``df_out_colors_1``
    and in 'Color 0' and 'Color 1' of ``df_out_colors_2``.
    """
    adopted_share = bigCorr_bernoulli_custom_colors_1_or_2()
    single_counts = df_out_colors_1['Color 0'].value_counts()
    mixed_first_counts = df_out_colors_2['Color 0'].value_counts()
    mixed_second_counts = df_out_colors_2['Color 1'].value_counts()
    counts = (
        single_counts
        .add(mixed_first_counts, fill_value=0)
        .add(mixed_second_counts, fill_value=0)
    )
    summary = (
        pd.DataFrame(index=counts.index)
        .assign(Color=counts.index, Count=counts, Adopted=adopted_share)
        .sort_values(by='Adopted', ascending=False)
    )
    chartColorAdoptionLikelihood(summary, 'single or mixed')

# Draw the three adoption-likelihood charts: single-colour animals,
# two-colour animals, and both groups pooled.
colors_single()
colors_mixed()
colors_singleOrMixed()

# TODO 5: make an outcome chart like this for mixed and solid+mixed
# like the bar charts were made for just the Adopted percentage earlier
# Outcome breakdown for single-colour animals only (df_out_colors_1).
df_colors_outcomes = df_out_colors_1[['Outcome Type', 'Color 0']]
plt.figure(num=None, figsize=(5, 12), dpi=96, facecolor='w', edgecolor='k')
plt.title('Probability of an animal with this solid color having a certain outcome')
# multiple='fill' normalises every colour's bar to full width, so the segments
# show the share of each outcome type rather than raw counts.
sns.histplot(
    data=df_colors_outcomes,
    y='Color 0',
    hue='Outcome Type',
    multiple='fill',
)

## Sex

In [None]:
# TODO 6: regress sex against adoption likelihood
# Please make 3 bar charts:
#  - "Sex upon Outcome" (neutered male, spayed female, intact male, intact female)
#  - male or female
#  - neutered/spayed or intact
# Also please construct the 95% confidence interval and make it the error bars
# see the earlier cell in the section on color for an example of how to do this

## Breed characteristics

This analysis considers individual animals and looks for correlations between characteristics of their breed and their outcome.

It looks like animals belonging to a breed with an average height of around 20–25 (inches?) are more likely to be adopted than others, while animals from breeds with an average height of 5–12 inches are less likely to be adopted than others.

**TODO 7:** interpret the other graphs. Why are the different lifespan variables distributed the way they are, and why are they distributed differently compared to each other?

In [None]:
# Copies of the individual-animal frames in which a missing 'Adopted' value is treated as False.
df_out_1 = df_out.assign(Adopted=df_out.Adopted.fillna(False))
# NOTE(review): 'Adopted' here is taken from df_out, not from df_out_with_breeds_info;
# assign() aligns on the index, so this is only correct if both frames share the same
# row index — confirm, or use df_out_with_breeds_info's own column if it has one.
df_out_with_breeds_info_1 = df_out_with_breeds_info.assign(Adopted=df_out.Adopted.fillna(False))

def correlo_histogram(df, independent, dependent, binwidth):
    """Show how the mix of ``dependent`` values changes across bins of ``independent``.

    Prints an ``independent ~ dependent`` caption, then draws a filled
    (each bin normalised to full height) histogram of ``independent`` in
    bins of width ``binwidth``, coloured by ``dependent``.
    """
    print(f'{independent} ~ {dependent}')
    # TODO 8: add error bars when the dependent variable is "Adopted"
    # (see how the confidence interval was constructed in the previous cell
    # for regression by color)

    histogram_options = dict(
        x=independent,
        hue=dependent,
        multiple='fill',
        binwidth=binwidth,
    )
    sns.histplot(data=df, **histogram_options)
    plt.show()

# Each entry is [column name, histogram bin width].
# Breed-level characteristics, plotted from df_out_with_breeds_info_1.
independent_vars_breeds_info = [
    ['average height', 2],
    ['Est. lifespan remaining', 1],
    ['average lifespan', 1],
    ['Lifespan Low', 1],
    ['Lifespan High', 1],
]

# Per-animal characteristics, plotted from df_out_1.
independent_vars_individuals = [
    ['Age upon Outcome (years)', 1],
    ['Color 0 H', 0.1],
    ['Color 0 S', 0.1],
    ['Color 0 V', 0.1],
]

# One histogram per (variable, outcome column) pair: breed-level variables first,
# then the per-animal ones.
for _frame, _variables in (
    (df_out_with_breeds_info_1, independent_vars_breeds_info),
    (df_out_1, independent_vars_individuals),
):
    for independent, binwidth in _variables:
        for dependent in ['Adopted', 'Outcome Type']:
            correlo_histogram(_frame, independent, dependent, binwidth)