# <center>Phase 2 - Publication</center>

In [713]:
# General imports.
import sqlite3
import pandas as pd
from matplotlib_venn import venn2, venn3
import scipy.stats as scs
import textwrap
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from  itertools import combinations
import os
from matplotlib.colors import ListedColormap
from matplotlib import ticker
from scipy.stats import ttest_ind
import math
from timeit import default_timer as timer

# Imports from neighbor directories.
import sys
sys.path.append("..")
from src.utilities import field_registry as fieldreg

# IPython magics for this notebook.
%matplotlib inline

# Use latex font for matplotlib
plt.rc('text', usetex=True)
plt.rc('font', family='serif')

In [714]:
# Switches
# When False, exportTable() and exportFigure() return without writing any file.
SAVE_OUTPUT = False

# Data Globals
# Field/label registry from src.utilities; the cells below read its
# .categories, .fields and .labels attributes.
FR = fieldreg.FieldRegistry()
# The four counters start at 0 and are overwritten by the demographics cells
# ("Number of participants", "Number of participating dogs", "Adjusting sample
# for bias").
TOTAL_USERS = 0
REMAINING_USERS = 0
TOTAL_DOGS = 0
REMAINING_DOGS = 0
# Converts a count into a percentage of REMAINING_DOGS. The global is looked up
# at call time, so this divides by zero until the "Adjusting sample for bias"
# cell has assigned REMAINING_DOGS.
PREVALENCE = lambda x: (x / REMAINING_DOGS) * 100
CATEGORY_MATRIX = pd.DataFrame()

# Bootstrap Globals
NITER=10

# Database Globals
USER_TABLE = 'users'
DOG_TABLE = 'dogs'
# SQL fragment appended after "JOIN <other table>" by createStringDataFrame():
# keeps rows where question_reason_for_part_3 is 0, or where it is 1 and
# q01_main is not 1.
BIAS_FILTER = '''
    USING (record_id)
    WHERE question_reason_for_part_3 = 0
    OR (question_reason_for_part_3 = 1 AND q01_main != 1)'''
# Shared connection used by every query in this notebook; the path is relative
# to the notebook's working directory.
CON = sqlite3.connect('../data/processed/processed.db')

In [715]:
def createStringDataFrame(table, fields, labels, filtered=True):
    """Run a SELECT against the notebook database and return the raw result.

    Args:
        table: Name of the table to select from.
        fields: Comma-separated SQL column expressions, as a single string.
        labels: Column names assigned to the resulting frame, in field order.
        filtered: When True, join the other table (users <-> dogs) and apply
            BIAS_FILTER to the query.

    Returns:
        A DataFrame holding the query result with its columns renamed to
        ``labels``.
    """
    statement = ['SELECT ', fields, ' FROM ', table]
    if filtered:
        # The filter references columns of both tables, so join whichever
        # table was not requested.
        partner = DOG_TABLE if table != DOG_TABLE else USER_TABLE
        statement.extend([' JOIN ', partner, ' ', BIAS_FILTER])
    frame = pd.read_sql_query(''.join(statement), CON)
    frame.columns = labels
    return frame

def convertToNumeric(df):
    """Return a copy of ``df`` with every column parsed as numbers.

    Values that cannot be parsed become NaN (``errors='coerce'``).
    """
    return df.apply(pd.to_numeric, errors='coerce')

def createNumericDataFrame(table, fields, labels, filtered=True):
    """Query the database and return the result with all columns numeric.

    Takes the same arguments as createStringDataFrame(); the raw result is
    then passed through convertToNumeric(), so unparseable values become NaN.
    """
    raw_frame = createStringDataFrame(table, fields, labels, filtered)
    numeric_frame = convertToNumeric(raw_frame)
    return numeric_frame

def replaceFields(df, column, replacement_dict):
    """Replace values in one column of ``df`` in place.

    Args:
        df: DataFrame to modify.
        column: Label of the column whose values are replaced.
        replacement_dict: Mapping of old value -> new value.

    Returns:
        None; ``df`` itself is updated.
    """
    # Assign the result back instead of calling replace(inplace=True) on
    # df[column]: that chained form acts on a temporary Series and leaves df
    # untouched when pandas Copy-on-Write is active.
    df[column] = df[column].replace(replacement_dict)

def getValueCountAndPrevalence(df, field):
    """Tabulate how often each value of ``df[field]`` occurs.

    Returns:
        A DataFrame indexed by value with two columns: 'frequency' (the count)
        and 'prevalence' (the count passed through PREVALENCE, rounded to an
        integer).
    """
    counts = df[field].value_counts()
    percentages = counts.apply(PREVALENCE).round().astype(int)
    table = pd.concat([counts, percentages], axis=1)
    table.columns = ['frequency', 'prevalence']
    return table

def _categoryFieldsAndLabels():
    """Return (fields, labels) for the per-category q02_main_* columns.

    One field is produced per entry of FR.labels that has at least one key.
    The field number follows the entry's position, except that number 11 is
    skipped (position 11 maps to q02_main_12, and so on). The label is
    ``key[0]`` of the entry's first key.
    """
    fields = []
    labels = []
    counter = 1
    for cat, subdict in FR.labels.items():
        for key, value in subdict.items():
            if counter == 11:
                counter += 1
            fields.append('q02_main_{}'.format(counter))
            labels.append(key[0])
            # Only the first key of each category is needed.
            break
        counter += 1
    return ', '.join(fields), labels


def _loadCategoryFrame():
    """Load the bias-filtered, numeric per-category columns from the dog table."""
    fields, labels = _categoryFieldsAndLabels()
    return createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)


def _chiSquarePValue(contingency):
    """Return the chi-square p-value of a contingency table (no Yates correction)."""
    c, p, dof, expected = scs.chi2_contingency(contingency, correction=False)
    return p


def _pairwiseMatrix(df, statistic):
    """Apply ``statistic`` to the crosstab of every column pair of ``df``.

    Returns a symmetric DataFrame indexed (sorted) and labelled by the column
    names of ``df``; the diagonal is left empty (NaN).
    """
    results = {col: {} for col in df}
    for first, second in combinations(df.columns, 2):
        contingency = pd.crosstab(df[first], df[second])
        value = statistic(contingency)
        results[first][second] = value
        results[second][first] = value
    return pd.DataFrame(results).sort_index(ascending=True)


def createCategoryMatrix():
    """Return the matrix of pairwise chi-square p-values between categories."""
    return _pairwiseMatrix(_loadCategoryFrame(), _chiSquarePValue)


def createQuestionMatrix():
    """Return the matrix of pairwise chi-square p-values between all fields.

    Fields come from every list in FR.fields and labels from every key in
    FR.labels, both in registry order.
    """
    fields = ', '.join('{}'.format(field)
                       for cat, sublist in FR.fields.items()
                       for field in sublist)
    labels = [key
              for cat, subdict in FR.labels.items()
              for key, value in subdict.items()]
    df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)
    return _pairwiseMatrix(df, _chiSquarePValue)


def createCorrelationMatrix():
    """Return the correlation matrix (DataFrame.corr) of the category columns."""
    return _loadCategoryFrame().corr()


def createOddsRatioMatrix():
    """Return the matrix of pairwise odds ratios between categories.

    Each off-diagonal cell is getOddsRatio() of the two columns' crosstab.
    """
    # The chi-square test that used to run here was never used, so it is
    # skipped; the odds ratio is also computed once per pair instead of twice.
    return _pairwiseMatrix(_loadCategoryFrame(), getOddsRatio)

def displayOddsRatio(df):
    """Print the odds ratio, its 95% confidence interval and n for a 2x2 table."""
    estimate = getOddsRatioAndConfidenceInterval(df)
    odds, ci_low, ci_high, tot = estimate
    rounded = (round(odds, 2), round(ci_low, 2), round(ci_high, 2), tot)
    print('OR = %.2f, 95%% CI: %.2f-%.2f, n = %d' % rounded)

def getOddsRatio(df):
    """Return the odds ratio of a 2x2 table addressed as ``df[column][row]``.

    Computed as (df[1][1] / df[1][0]) / (df[0][1] / df[0][0]).
    """
    odds_in_column_1 = df[1][1] / df[1][0]
    odds_in_column_0 = df[0][1] / df[0][0]
    return odds_in_column_1 / odds_in_column_0

def getOddsRatioAndConfidenceInterval(df):
    """Return the odds ratio of a 2x2 table with its 95% confidence interval.

    Args:
        df: 2x2 table addressed as ``df[column][row]`` with labels 0 and 1.

    Returns:
        Tuple ``(odds, ci_low, ci_high, tot)``. ``tot`` is the sum of the four
        cells. When the odds ratio is 0 its logarithm is undefined, so both
        interval bounds are returned as 0.
    """
    odds = getOddsRatio(df)
    # The sample size does not depend on the odds ratio, so compute it up
    # front; it used to stay 0 whenever the odds ratio was 0.
    tot = df[0][0] + df[0][1] + df[1][0] + df[1][1]
    ci_low = 0
    ci_high = 0
    if odds != 0:
        nl_or = math.log(odds)
        # Standard error of the log odds ratio: sqrt of summed reciprocal cells.
        se_nl_or = math.sqrt((1/df[0][0])+(1/df[0][1])+(1/df[1][0])+(1/df[1][1]))
        ci_low = math.exp(nl_or - (1.96 * se_nl_or))
        ci_high = math.exp(nl_or + (1.96 * se_nl_or))
    return odds, ci_low, ci_high, tot

def get_significance_category(p):
    """Bucket a p-value into a significance category.

    Returns:
        ``p`` unchanged when it is NaN; -1 when p > 10**-3; 0 when
        10**-6 < p <= 10**-3; 1 otherwise.
    """
    if np.isnan(p):
        return p
    weak_threshold = 10**(-3)
    strong_threshold = 10**(-6)
    if p > weak_threshold:
        return -1
    return 0 if p > strong_threshold else 1
    
def displaySeriesMedian(s, units=""):
    """Print the median, standard deviation, min, max and count of a Series."""
    summary = (round(s.median(), 2), units, round(s.std(), 2),
               round(s.min(), 2), round(s.max(), 2), s.count())
    print('MD = %.2f %s (SD = %.2f, min = %.2f, max = %.2f, n = %d)' % summary)
    
def displaySeriesMean(s, units=""):
    """Print the mean, standard deviation, min, max and count of a Series."""
    summary = (round(s.mean(), 2), units, round(s.std(), 2),
               round(s.min(), 2), round(s.max(), 2), s.count())
    print('M = %.2f %s (SD = %.2f, min = %.2f, max = %.2f, n = %d)' % summary)
    
def convert_to_binary_response(x, y=1):
    """Return 0 when ``float(x)`` is below the threshold ``y``, otherwise 1."""
    return 0 if float(x) < y else 1
        
def exportTable(data, title):
    """Write ``data`` as a standalone LaTeX document to ../reports/tables.

    Args:
        data: DataFrame to export.
        title: File name without extension; '.tex' is appended.

    Does nothing when SAVE_OUTPUT is False.
    """
    if not SAVE_OUTPUT:
        return
    file_ = os.path.join('..', 'reports', 'tables', title) + '.tex'
    with open(file_, 'w') as tf:
        tf.write(r'\documentclass[varwidth=\maxdimen]{standalone}\usepackage{booktabs}\begin{document}')
        # Export the argument, not whatever notebook-level `df` happens to hold.
        tf.write(data.to_latex())
        tf.write(r'\end{document}')
        
def exportFigure(figure, title):
    """Save ``figure`` as ../reports/figures/<title>.pdf when SAVE_OUTPUT is set."""
    if SAVE_OUTPUT:
        destination = os.path.join('..', 'reports', 'figures', title) + '.pdf'
        figure.tight_layout()
        figure.savefig(destination, format='pdf')

In [716]:
def get_bootstrap_prevalence_ci(data, count=10, alpha=0.95, random_state=None):
    """Print bootstrap percentile bounds for the prevalence of each column.

    Args:
        data: DataFrame whose column sums are converted with PREVALENCE.
        count: Number of bootstrap resamples.
        alpha: Width of the interval (0.95 -> 2.5th and 97.5th percentiles).
        random_state: Optional integer seed for reproducible resampling. The
            default (None) keeps the previous unseeded behaviour.

    For every column, prints its name followed by the lower and upper bound,
    then the elapsed time.
    """
    start = timer()
    # One generator for the whole run; re-seeding per draw would repeat samples.
    rng = None if random_state is None else np.random.RandomState(random_state)
    draws = []
    for _ in range(count):
        sample_df = data.sample(len(data.index), replace=True, random_state=rng)
        draws.append(sample_df.sum().apply(PREVALENCE).round().astype(int))
    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # growing a frame row by row is quadratic.
    temp_df = pd.DataFrame(draws)
    lower = (1-alpha)/2
    upper = alpha+lower
    for name, values in temp_df.items():
        print(name + ':')
        values = values.sort_values(ascending=True).reset_index(drop=True)
        # Clamp so alpha close to 1 cannot index past the last draw.
        last = len(values) - 1
        print(values[min(int(lower * len(values)), last)])
        print(values[min(int(upper * len(values)), last)])
    end = timer()
    print('\nbootstrap time: %.2f' %(end-start))

## <center>Demographics</center>

### Number of participants:

In [717]:
# Count all rows of the user table; filtered=False skips the bias filter.
df = createNumericDataFrame(USER_TABLE, 'COUNT(*)', ['count'], filtered=False)
# Assign value to global.
TOTAL_USERS = df['count'][0]
print('N = %d owners [unadjusted]' %TOTAL_USERS)

N = 3201 owners [unadjusted]


### Number of participating dogs:

In [718]:
# Count all rows of the dog table; filtered=False skips the bias filter.
df = createNumericDataFrame(DOG_TABLE, 'COUNT(*)', ['count'], filtered=False)
# Assign value to global.
TOTAL_DOGS = df['count'][0]
print('N = %d dogs [unadjusted]' %TOTAL_DOGS)

N = 5018 dogs [unadjusted]


### Suspicion of behavior problems as one of multiple motivating factors:

In [719]:
fields = ('question_reason_for_part_1, question_reason_for_part_2, '
          'question_reason_for_part_3, question_reason_for_part_4, '
          'question_reason_for_part_5')
labels = ['love for dogs', 'you help shelter animals', 'suspicion of behavior problems',
          'work with animals', 'other']
df = createNumericDataFrame(USER_TABLE, fields, labels, filtered=False)
# Keep only owners who selected "suspicion of behavior problems".
df = df[df[labels[2]] == 1]
# Column totals; position 2 is the number of owners who gave that reason.
# (The per-row 'sum' column that used to be added here was never read and
# triggered a SettingWithCopyWarning on the filtered slice.)
s = df.sum(0, skipna=False)

print('n = %d owners (%d%%) [unadjusted]' %(s.iloc[2], round((s.iloc[2]/TOTAL_USERS)*100, 0)))

n = 830 owners (26%) [unadjusted]


### Suspicion of behavior problems as the sole motivating factor:

In [720]:
fields = ('question_reason_for_part_1, question_reason_for_part_2, '
          'question_reason_for_part_3, question_reason_for_part_4, '
          'question_reason_for_part_5')
labels = ['love for dogs', 'you help shelter animals', 'suspicion of behavior problems',
          'work with animals', 'other']
df = createNumericDataFrame(USER_TABLE, fields, labels, filtered=False)
# Work on an explicit copy so adding a column does not write into a slice of
# the query result (SettingWithCopyWarning).
df = df[df[labels[2]] == 1].copy()
# Number of reasons each owner selected; exactly 1 means suspicion was the
# only reason given.
df['sum'] = df.sum(axis=1)
df = df[df['sum'] == 1]
s = df.sum(0, skipna=False)

print('n = %d owners (%d%%) [unadjusted]' %(s.iloc[2], round((s.iloc[2]/TOTAL_USERS)*100, 0)))

n = 98 owners (3%) [unadjusted]


### Adjusting sample for bias:

In [None]:
# Load one row per dog that passes the bias filter (filtered defaults to True)
# and record how many remain.
fields = 'q02_score'
labels = ['Score']
df_adjusted_dogs = createNumericDataFrame(DOG_TABLE, fields, labels)
# PREVALENCE divides by this global, so it only works after this cell has run.
REMAINING_DOGS = len(df_adjusted_dogs.index)
# Owners are counted by distinct e-mail over the same filtered join.
df_adjusted_users = createNumericDataFrame(USER_TABLE, 'COUNT(DISTINCT email)', ['count'])
REMAINING_USERS = df_adjusted_users['count'][0]

# Display the count results.
print('Adjusted study population:')
print('N = %d owners (adjusted)' %REMAINING_USERS)
print('N = %d dogs (adjusted)' %REMAINING_DOGS)

Adjusted study population:
N = 2480 owners (adjusted)
N = 4114 dogs (adjusted)


### Dogs per household:

In [None]:
fields = 'record_id'
labels = ['record index']
df = createStringDataFrame(DOG_TABLE, fields, labels)

# Each row is one dog, so the number of rows per record id is the number of
# dogs in that household. Counting is done in one vectorised pass instead of a
# Python-level iterrows() loop.
s = df[labels[0]].value_counts(sort=False, dropna=False).rename('dogs')
displaySeriesMedian(s, 'dogs')

MD = 1.00 dogs (SD = 0.96, min = 1.00, max = 13.00, n = 2480)


### Age at date of response

In [None]:
# Summarise dog_age_today_months over the bias-filtered dogs.
fields = 'dog_age_today_months'
labels = ['age (months)']
df = createNumericDataFrame(DOG_TABLE, fields, labels)
displaySeriesMedian(df[labels[0]], 'months')

MD = 72.00 months (SD = 47.42, min = 2.00, max = 252.00, n = 4030)


### Gender and neutered status:

In [None]:
fields = 'dog_sex, dog_spayed'
labels = ['Gender', 'Neutered']
df = createStringDataFrame(DOG_TABLE, fields, labels)
# Translate the stored codes into readable labels, one column at a time.
for column, replacements in (
        (labels[0], {'':'No response', '1':'Male', '2':'Female'}),
        (labels[1], {'':'No response', '0':'No', '1':'Yes', '2':"I don't know"})):
    replaceFields(df, column, replacements)
# Gender x neutered-status counts, with 'All' margins for the totals.
df = pd.crosstab(df[labels[0]], df[labels[1]], margins=True)

grand_total = df.loc['All', 'All']
for gender, template in (
        ('Male', "males: n = %d (%d%%), neutered: n = %d (%d%%), intact: n = %d (%d%%)"),
        ('Female', "females: n = %d (%d%%), neutered: n = %d (%d%%), intact: n = %d (%d%%)")):
    n_gender = df.loc[gender, 'All']
    n_neutered = df.loc[gender, 'Yes']
    n_intact = df.loc[gender, 'No']
    # The gender share is relative to all dogs; the neutered and intact
    # shares are relative to that gender.
    print(template % (n_gender, round((n_gender/grand_total)*100, 0),
                      n_neutered, round((n_neutered/n_gender)*100, 0),
                      n_intact, round((n_intact/n_gender)*100, 0)))

males: n = 2005 (49%), neutered: n = 1682 (84%), intact: n = 312 (16%)
females: n = 2095 (51%), neutered: n = 1778 (85%), intact: n = 308 (15%)


### Age at neutering:

In [None]:
# Summarise dog_sex_month over the bias-filtered dogs; the section heading
# presents this column as the age at neutering.
fields = 'dog_sex_month'
labels = ['age (months)']
df = createNumericDataFrame(DOG_TABLE, fields, labels)
displaySeriesMedian(df[labels[0]], 'months')

MD = 9.00 months (SD = 23.82, min = 2.00, max = 180.00, n = 2771)


### Number of purebred dogs:

In [None]:
fields = 'purebred'
labels = ['purebred']
df = createStringDataFrame(DOG_TABLE, fields, labels)
replacements = {'':'No response', '0':'No', '1':'Yes'}
replaceFields(df, labels[0], replacements)
# Count each answer. Series.value_counts replaces the deprecated top-level
# pd.value_counts; the stray mid-cell df.head() (which displayed nothing) is
# removed.
df = df.apply(lambda column: column.value_counts())
df = df.rename(index=str, columns={labels[0]: "frequency"})
df.columns.name = labels[0]
display(df)

purebred,frequency
Yes,2335
No,1723
No response,56


### Number of purebred breeds:

In [None]:
fields = 'purebred_breed, purebred'
labels = ['breed', 'purebred']
# Query once and derive both results from it, without in-place edits on
# filtered slices.
df = createStringDataFrame(DOG_TABLE, fields, labels)
purebred_breeds = df.loc[df[labels[1]] == '1', [labels[0]]]

# Purebred dogs with a breed entered.
display(purebred_breeds[purebred_breeds[labels[0]] != ''].describe())

# Number of purebred dogs without a designated breed.
df = purebred_breeds[purebred_breeds[labels[0]] == '']
purebred_missing_breed = len(df.index)
print('Purebreds without breed designated: n = %d dogs' %(purebred_missing_breed))

Unnamed: 0,breed
count,2138
unique,142
top,labrador retriever
freq,382


Purebreds without breed designated: n = 197 dogs


### Numbers of dogs per purebred breed:

In [None]:
fields = 'purebred_breed, purebred'
labels = ['breed', 'purebred']
df = createStringDataFrame(DOG_TABLE, fields, labels)
# Purebred dogs with a breed entered; select in one step rather than dropping
# a column in place on a filtered slice (SettingWithCopyWarning).
df = df.loc[(df[labels[1]] == '1') & (df[labels[0]] != ''), [labels[0]]]

# Calculate sums and prevalences for each breed.
# Both result columns are integers, so no further rounding is needed.
df = getValueCountAndPrevalence(df, labels[0])
df.columns.name = labels[0]
display(df[:10])

### Number of dogs per source of origin:

In [None]:
fields = 'acquisition_source'
labels = ['origin']
df = createStringDataFrame(DOG_TABLE, fields, labels)
# Rescue fields (online: 1, in-person: 4) were combined in the database.
replacements = {'':'no response', '1': 'rescue', '2': 'online (non-rescue)', '3': 'pet store', '5': 'breeder',
                '6': 'self-bred', '7': 'friends/family', '8': 'found', '9': 'other'}
replaceFields(df, labels[0], replacements)
# Series.value_counts replaces the deprecated top-level pd.value_counts.
df = df.apply(lambda column: column.value_counts())
df = df.rename(index=str, columns={labels[0]: 'frequency'})
df.columns.name = labels[0]

# The total is computed once instead of on every iteration, and the column
# is read by name rather than by position.
total_responses = df['frequency'].sum()
for index, count in df['frequency'].items():
    print('%s: n = %d (%d%%)' %(index, count, round((count/total_responses)*100, 0)))

### Age at onset:

In [None]:
# Summarise q01_age_months over the bias-filtered dogs.
fields = 'q01_age_months'
labels = ['age (months)']
df = createNumericDataFrame(DOG_TABLE, fields, labels)
displaySeriesMedian(df[labels[0]], 'months')

# Count dogs whose q01_acq flag equals 1 and report them as a share of
# REMAINING_DOGS (set in the "Adjusting sample for bias" cell).
fields = 'q01_acq'
labels = ['evident when acquired']
df = createNumericDataFrame(DOG_TABLE, fields, labels)
cnt_evident_when_acquired = len(df[df[labels[0]] == 1].index)
print('Behavior problems evident at acquisition: n = %d dogs (%d%%)'
      %(cnt_evident_when_acquired, round((cnt_evident_when_acquired/REMAINING_DOGS)*100, 0)))

## <center>Prevalence of Behavior Problems</center>

### Number of dogs with behavior problems and overall prevalence:

In [None]:
fields = 'q02_score'
labels = ['Score']
df_adjusted_dogs = createNumericDataFrame(DOG_TABLE, fields, labels)
# A dog is counted as having a behavior problem when its score is not 0.
# NOTE(review): a NaN score also satisfies `!= 0`, so missing or unparseable
# scores would be counted as problems - confirm q02_score is never empty.
cnt_total_dogs_w_problems_adjusted = len(
    df_adjusted_dogs[df_adjusted_dogs[labels[0]] != 0].index)

print('Dogs with behavior problems: n = %d dogs' %(cnt_total_dogs_w_problems_adjusted))

# Calculate the adjusted prevalence.
prevalence_adjusted = PREVALENCE(cnt_total_dogs_w_problems_adjusted)

print('Overall prevalence: %d%% (%d/%d dogs)'
      %(round(prevalence_adjusted, 0), cnt_total_dogs_w_problems_adjusted, REMAINING_DOGS))

### Prevalence of behavior problem categories (Table 1):

In [None]:
# One q02_main_<n> flag per category; number 11 is skipped, so positions
# above 10 map to n + 1.
labels = list(FR.categories)
fields = ', '.join('q02_main_{}'.format(number if number <= 10 else number + 1)
                   for number in range(1, len(labels) + 1))
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)
#get_bootstrap_prevalence_ci(df, count=10000)

# Count the dogs flagged in each category, most common first.
sums = df.sum().sort_values(ascending=False)

# Express each count as a whole-number percentage via PREVALENCE.
prevalences = sums.apply(PREVALENCE).round().astype(int)

# Assemble, show and (optionally) export the table.
table_columns = {'Count':sums.values,
                 'Prevalence (%)': prevalences.values.round(2)}
df = pd.DataFrame(index=sums.index, data=table_columns)
df.columns.name = 'Category'
display(df)
exportTable(df, 'table_1')


### Prevalence of behavior problem category subtypes (Table 2):

In [None]:
# Collect the per-category subtype counts and concatenate them once.
# Series.append was removed in pandas 2.0, and an empty pd.Series() without a
# dtype raises a warning.
subtype_sums = []
for i in range(0, 12):
    all_fields = FR.fields[FR.categories[i]]
    all_labels = list(FR.labels[FR.categories[i]].values())
    df = createNumericDataFrame(DOG_TABLE, ', '.join(all_fields), all_labels, filtered=True)
    #get_bootstrap_prevalence_ci(df, count=10000)
    # Within each category, list the most frequent subtype first.
    subtype_sums.append(df.sum().sort_values(ascending=False))
sums = pd.concat(subtype_sums)

# Calculate the prevalence of each behavior problem.
prevalences = sums.apply(PREVALENCE).round().astype(int)

# Create a table.
df = pd.DataFrame(index=sums.index, data={'Frequency':sums.values,
                                          'Prevalence (%)': prevalences.values.round(2)})
df.columns.name = 'Behavior problem'
display(df.head())
print("Note: Only showing dataframe head to conserve notebook space")
exportTable(df, 'table_2')

### Prevalence by gender and neutered status (Table 3):

In [None]:
# One q02_main_<n> flag per category (number 11 is skipped), followed by the
# two grouping columns.
labels = list(FR.categories)
fields = ['q02_main_{}'.format(number if number <= 10 else number + 1)
          for number in range(1, len(labels) + 1)]
fields += ['dog_sex', 'dog_spayed']
labels += ['Gender', 'Neutered']
fields = ', '.join(fields)
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)

# Gender code 1 is treated as male and 2 as female; Neutered 0 as intact and
# 1 as neutered.
group_columns = ['Gender', 'Neutered']
is_male = df['Gender'] == 1
is_female = df['Gender'] == 2
is_intact = df['Neutered'] == 0
is_neutered = df['Neutered'] == 1
df_intact_male = df[is_male & is_intact].drop(columns=group_columns)
df_neutered_male = df[is_male & is_neutered].drop(columns=group_columns)
df_intact_female = df[is_female & is_intact].drop(columns=group_columns)
df_neutered_female = df[is_female & is_neutered].drop(columns=group_columns)

def get_group_prevalence(df):
    """Return each column's sum as a whole-number percentage of the row count."""
    group_size = len(df.index)
    percentages = df.sum().apply(lambda x: (x / group_size) * 100)
    return percentages.round().astype(int)

intact_male_p = get_group_prevalence(df_intact_male)
neutered_male_p = get_group_prevalence(df_neutered_male)
intact_female_p = get_group_prevalence(df_intact_female)
neutered_female_p = get_group_prevalence(df_neutered_female)

table_columns = {'Intact males': intact_male_p,
                 'Castrated males': neutered_male_p,
                 'Intact females': intact_female_p,
                 'Spayed females': neutered_female_p}
df = pd.DataFrame(index=intact_male_p.index, data=table_columns)
df.columns.name = 'Behavior problem'
display(df.head())
print("Note: Only showing dataframe head to conserve notebook space")
exportTable(df, 'table_3')

def get_table_3_bootstrap_ci(data, count=10, alpha=0.95, random_state=None):
    """Print bootstrap percentile bounds of prevalence per sex/neuter group.

    Args:
        data: DataFrame with the behavior columns plus 'Gender' and 'Neutered'.
        count: Number of bootstrap resamples.
        alpha: Width of the interval (0.95 -> 2.5th and 97.5th percentiles).
        random_state: Optional integer seed for reproducible resampling. The
            default (None) keeps the previous unseeded behaviour.

    For each group, prints a heading and then, per behavior column, its name
    followed by the lower and upper bound. Ends with the elapsed time.
    """
    start = timer()
    # One generator for the whole run; re-seeding per draw would repeat samples.
    rng = None if random_state is None else np.random.RandomState(random_state)
    # (heading, Gender code, Neutered code) in reporting order.
    groups = (('\nIntact Males:', 1, 0),
              ('\nNeutered Males:', 1, 1),
              ('\nIntact Females:', 2, 0),
              ('\nNeutered Females:', 2, 1))
    # Collect rows in lists and build each frame once: DataFrame.append was
    # removed in pandas 2.0 and is quadratic when used in a loop.
    draws = {heading: [] for heading, _, _ in groups}
    for _ in range(count):
        sample_df = data.sample(len(data.index), replace=True, random_state=rng)
        for heading, gender, neutered in groups:
            members = sample_df[(sample_df['Gender'] == gender)
                                & (sample_df['Neutered'] == neutered)]
            draws[heading].append(
                get_group_prevalence(members.drop(columns=['Gender', 'Neutered'])))
    lower = (1-alpha)/2
    upper = alpha+lower
    for heading, _, _ in groups:
        print(heading)
        for name, values in pd.DataFrame(draws[heading]).items():
            print(name + ':')
            values = values.sort_values(ascending=True).reset_index(drop=True)
            # Clamp so alpha close to 1 cannot index past the last draw.
            last = len(values) - 1
            print(values[min(int(lower * len(values)), last)])
            print(values[min(int(upper * len(values)), last)])
    end = timer()
    print('\nbootstrap time: %.2f' %(end-start))

# Reload the raw per-dog frame (this overwrites the summary table held in `df`)
# as input for the bootstrap, which is left disabled below.
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)
#get_table_3_bootstrap_ci(df, count=10000)

### Prevalence ranked by age of onset (Table 4):

In [None]:
# One q02_main_<n> flag per category (number 11 is skipped), followed by the
# onset-age column and the evident-at-acquisition flag.
labels = list(FR.categories)
fields = ['q02_main_{}'.format(number if number <= 10 else number + 1)
          for number in range(1, len(labels) + 1)]
fields += ['q01_age_months', 'q01_acq']
labels += ['Age', 'Evident at Acquisition']
fields = ', '.join(fields)
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)

# Groups, in order: onset below 3.5 months, 3.5-6.5, 6.5-12.5, 12.5-36.5,
# 36.5 months and above, and finally dogs flagged as evident at acquisition.
onset_columns = ['Age', 'Evident at Acquisition']
onset_age = df['Age']
onset_masks = [
    onset_age < 3.5,
    (onset_age >= 3.5) & (onset_age < 6.5),
    (onset_age >= 6.5) & (onset_age < 12.5),
    (onset_age >= 12.5) & (onset_age < 36.5),
    onset_age >= 36.5,
    df['Evident at Acquisition'] == 1,
]
rngs = [df[mask].drop(columns=onset_columns) for mask in onset_masks]

def get_group_prevalence(df):
    """Return each column's sum as a whole-number percentage of the row count."""
    group_size = len(df.index)
    percentages = df.sum().apply(lambda x: (x / group_size) * 100)
    return percentages.round().astype(int)

prevs = [get_group_prevalence(group) for group in rngs]

column_titles = ['0-3m', '4-6m', '7-12m', '13-36m', '37m+', 'Evident at Acquisition']
df = pd.DataFrame(index=prevs[0].index, data=dict(zip(column_titles, prevs)))
df.columns.name = 'Behavior problem'
display(df.head())
print("Note: Only showing dataframe head to conserve notebook space")
exportTable(df, 'table_4')

def get_table_4_bootstrap_ci(data, count=10, alpha=0.95, random_state=None):
    """Print bootstrap percentile bounds of prevalence per age-of-onset group.

    Args:
        data: DataFrame with the behavior columns plus 'Age' and
            'Evident at Acquisition'.
        count: Number of bootstrap resamples.
        alpha: Width of the interval (0.95 -> 2.5th and 97.5th percentiles).
        random_state: Optional integer seed for reproducible resampling. The
            default (None) keeps the previous unseeded behaviour.

    For each group, prints a heading and then, per behavior column, its name
    followed by the lower and upper bound. Ends with the elapsed time.
    """
    start = timer()
    # One generator for the whole run; re-seeding per draw would repeat samples.
    rng = None if random_state is None else np.random.RandomState(random_state)
    meta_columns = ['Age', 'Evident at Acquisition']
    # Headings in the same order as the masks built for each resample below.
    headings = ('\n0-3m:', '\n4-6m:', '\n7-12m:', '\n13-36m:', '\n37m:',
                '\nEvident at acquisition:')
    # Collect rows in lists and build each frame once: DataFrame.append was
    # removed in pandas 2.0 and is quadratic when used in a loop.
    draws = [[] for _ in headings]
    for _ in range(count):
        sample_df = data.sample(len(data.index), replace=True, random_state=rng)
        age = sample_df['Age']
        masks = (age < 3.5,
                 (age >= 3.5) & (age < 6.5),
                 (age >= 6.5) & (age < 12.5),
                 (age >= 12.5) & (age < 36.5),
                 age >= 36.5,
                 sample_df['Evident at Acquisition'] == 1)
        for bucket, mask in zip(draws, masks):
            bucket.append(
                get_group_prevalence(sample_df[mask].drop(columns=meta_columns)))
    lower = (1-alpha)/2
    upper = alpha+lower
    for heading, bucket in zip(headings, draws):
        print(heading)
        for name, values in pd.DataFrame(bucket).items():
            print(name + ':')
            values = values.sort_values(ascending=True).reset_index(drop=True)
            # Clamp so alpha close to 1 cannot index past the last draw.
            last = len(values) - 1
            print(values[min(int(lower * len(values)), last)])
            print(values[min(int(upper * len(values)), last)])
    end = timer()
    print('\nbootstrap time: %.2f' %(end-start))
        
# Reload the raw per-dog frame (this overwrites the summary table held in `df`)
# as input for the bootstrap, which is left disabled below.
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)
#get_table_4_bootstrap_ci(df, count=10000)

### Prevalence by origin (Table 5):

In [None]:
# One q02_main_<n> flag per category (number 11 is skipped), followed by the
# acquisition source.
labels = list(FR.categories)
fields = ['q02_main_{}'.format(number if number <= 10 else number + 1)
          for number in range(1, len(labels) + 1)]
fields += ['acquisition_source']
labels += ['origin']
fields = ', '.join(fields)
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)
# Rescue fields (online: 1, in-person: 4) were combined in the database.
#replacements = {'':'no response', '1': 'rescue', '2': 'online (non-rescue)', '3': 'pet store', '5': 'breeder',
#                '6': 'self-bred', '7': 'friends/family', '8': 'found', '9': 'other'}

# Origin codes in column order; code 4 is absent (see the note above).
origin_codes = [1, 2, 3, 5, 6, 7, 8, 9]
rngs = [df[df['origin'] == code].drop(columns=['origin']) for code in origin_codes]

def get_group_prevalence(df):
    """Return each column's sum as a whole-number percentage of the row count."""
    group_size = len(df.index)
    percentages = df.sum().apply(lambda x: (x / group_size) * 100)
    return percentages.round().astype(int)

prevs = [get_group_prevalence(group) for group in rngs]

column_titles = ['Rescue', 'Online', 'Pet store', 'Breeder',
                 'Self-bred', 'Family/Friends', 'Found', 'Other']
df = pd.DataFrame(index=prevs[0].index, data=dict(zip(column_titles, prevs)))
df.columns.name = 'Behavior problem'
display(df.head())
print("Note: Only showing dataframe head to conserve notebook space")
exportTable(df, 'table_5')

def get_table_5_bootstrap_ci(data, count=10, alpha=0.95):
    """Print percentile-bootstrap bounds of the prevalence for each origin group.

    Every iteration resamples ``data`` with replacement (same number of rows),
    splits the resample by its 'origin' code and records the result of
    ``get_group_prevalence`` for each group. For every group and behavior
    problem the values at the lower and upper positions of the sorted
    results are printed.

    Args:
        data: DataFrame with one column per behavior problem plus 'origin'.
        count: Number of bootstrap resamples.
        alpha: Confidence level (0.95 reads the 2.5% and 97.5% positions).

    Returns:
        None. Results and the elapsed time are printed.
    """
    start = timer()
    # (printed heading, origin code); code 4 is not among the origin codes used.
    groups = [('Rescue', 1), ('Online', 2), ('Pet store', 3), ('Breeder', 5),
              ('Self-bred', 6), ('Family/friends', 7), ('Found', 8), ('Other', 9)]
    # Gather the per-resample Series in lists and build each frame once,
    # instead of growing a DataFrame row by row.
    samples = {name: [] for name, _ in groups}
    for _ in range(count):
        sample_df = data.sample(len(data.index), replace=True)
        for name, code in groups:
            subset = sample_df[sample_df['origin'] == code].drop(columns=['origin'])
            samples[name].append(get_group_prevalence(subset))
    lower = (1-alpha)/2
    upper = alpha+lower
    for name, _ in groups:
        print('\n' + name + ':')
        draws = pd.DataFrame(samples[name])
        for problem, values in draws.items():
            print(problem + ':')
            values = values.sort_values(ascending=True).reset_index(drop=True)
            print(values[int(lower * len(values))])
            print(values[int(upper * len(values))])
    end = timer()
    print('\nbootstrap time: %.2f' %(end-start))
        
# `df` now holds the summary table, so query the per-dog rows again as input for the bootstrap.
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)
# The Table 5 bootstrap call is left disabled; uncomment to print its intervals.
#get_table_5_bootstrap_ci(df, count=10000)

### Prevalence by purebred lineage (Table 6):

In [None]:
# Column list for the query: one q02_main_<n> flag per behavior category,
# followed by the purebred flag.
fields = []
labels = []
for counter, cat in enumerate(FR.categories, 1):
    # q02_main_11 is never selected: positions above 10 are shifted up by one.
    column_number = counter + 1 if counter > 10 else counter
    fields.append('q02_main_{}'.format(column_number))
    labels.append(cat)
fields = ', '.join(fields + ['purebred'])
labels = labels + ['purebred']
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)

# Purebred rows first (flag 1), then non-purebred rows (flag 0), both
# without the flag column.
rngs = [df[df['purebred'] == flag].drop(columns=['purebred']) for flag in (1, 0)]

def get_group_prevalence(df):
    """Express every column total as a rounded integer percentage of the rows.

    Same definition as the helper of the same name in the Table 5 cell.

    Args:
        df: DataFrame of numeric columns (one row per dog in the group).

    Returns:
        Series indexed by column name with integer percentages.
    """
    n_rows = len(df.index)

    def to_percent(column_total):
        return (column_total / n_rows) * 100

    return df.sum().apply(to_percent).round().astype(int)

# Prevalence of every behavior problem among purebred and non-purebred dogs.
prevs = [get_group_prevalence(rngs[0]), get_group_prevalence(rngs[1])]

lineage_names = ['Purebred', 'Non-purebred']
df = pd.DataFrame(index=prevs[0].index, data=dict(zip(lineage_names, prevs)))
df.columns.name = 'Behavior problem'
display(df.head())
print("Note: Only showing dataframe head to conserve notebook space")
exportTable(df, 'table_6')

def get_table_6_bootstrap_ci(data, count=10, alpha=0.95):
    """Print percentile-bootstrap bounds of the prevalence by purebred status.

    Every iteration resamples ``data`` with replacement (same number of rows),
    splits the resample on the 'purebred' flag and records the result of
    ``get_group_prevalence`` for each half. For every group and behavior
    problem the values at the lower and upper positions of the sorted
    results are printed.

    Args:
        data: DataFrame with one column per behavior problem plus 'purebred'.
        count: Number of bootstrap resamples.
        alpha: Confidence level (0.95 reads the 2.5% and 97.5% positions).

    Returns:
        None. Results and the elapsed time are printed.
    """
    start = timer()
    # (printed heading, value of the 'purebred' flag)
    groups = [('Purebred', 1), ('Non-purebred', 0)]
    # Gather the per-resample Series in lists and build each frame once,
    # instead of growing a DataFrame row by row.
    samples = {name: [] for name, _ in groups}
    for _ in range(count):
        sample_df = data.sample(len(data.index), replace=True)
        for name, flag in groups:
            subset = sample_df[sample_df['purebred'] == flag].drop(columns=['purebred'])
            samples[name].append(get_group_prevalence(subset))
    lower = (1-alpha)/2
    upper = alpha+lower
    for name, _ in groups:
        print('\n' + name + ':')
        draws = pd.DataFrame(samples[name])
        for problem, values in draws.items():
            print(problem + ':')
            values = values.sort_values(ascending=True).reset_index(drop=True)
            print(values[int(lower * len(values))])
            print(values[int(upper * len(values))])
    end = timer()
    print('\nbootstrap time: %.2f' %(end-start))
        
# `df` now holds the summary table, so query the per-dog rows again as input for the bootstrap.
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)
# The Table 6 bootstrap call is left disabled; uncomment to print its intervals.
#get_table_6_bootstrap_ci(df, count=10000)

### Impact of gender on prevalence:

In [None]:
# Behavior-problem score and sex for every dog that passes the bias filter.
fields = 'q02_score, dog_sex'
labels = ['behavior problems', 'gender']
df = createStringDataFrame(DOG_TABLE, fields, labels)
# Discard dogs whose sex was left blank, then convert every column to numbers.
has_gender = df[labels[1]] != ''
df = df[has_gender].apply(pd.to_numeric)
# Untouched copy for the bootstrap, which recodes each resample itself.
boot_df = df.copy()

def gender_to_binary_response(x):
    """Map a sex code to 1 when its integer value is 1, otherwise to 0."""
    return 1 if int(x) == 1 else 0

# Recode both columns with the binary-response helpers.
outcome_col, factor_col = labels
df[outcome_col] = df[outcome_col].apply(convert_to_binary_response)
df[factor_col] = df[factor_col].apply(gender_to_binary_response)

# Execute a chi-squared test of independence (no continuity correction).
contingency = pd.crosstab(df[outcome_col], df[factor_col], margins=False)
print('Chi-squared Test of Independence for %s and %s:' %(outcome_col, factor_col))
c, p, dof, expected = scs.chi2_contingency(contingency, correction=False)
print('chi2 = %f, p = %.2E, dof = %d' %(c, p, dof))
displayOddsRatio(contingency)

def get_bootstrap_odds_ratio_ci(data, count=10, alpha=0.95):
    """Print a percentile-bootstrap interval for the behavior/gender odds ratio.

    Every iteration resamples ``data`` with replacement (same number of rows),
    recodes both columns with the binary-response helpers, cross-tabulates
    the columns named by the module-level ``labels`` and keeps the odds
    ratio returned by ``getOddsRatioAndConfidenceInterval``.

    Args:
        data: DataFrame with the raw 'behavior problems' and 'gender' columns.
        count: Number of resamples; at least 1 is needed to index the results.
        alpha: Confidence level (0.95 reads the 2.5% and 97.5% positions).

    Returns:
        None. The interval and the elapsed time are printed.
    """
    start = timer()
    # Collect in a list: np.append would copy the whole array on every iteration.
    odds_ratios = []
    for _ in range(count):
        sample_df = data.sample(len(data.index), replace=True)
        sample_df['behavior problems'] = sample_df['behavior problems'].apply(
            convert_to_binary_response)
        sample_df['gender'] = sample_df['gender'].apply(gender_to_binary_response)
        contingency = pd.crosstab(sample_df[labels[0]], sample_df[labels[1]], margins=False)
        odds, _ci_low, _ci_high, _tot = getOddsRatioAndConfidenceInterval(contingency)
        odds_ratios.append(odds)
    arr = np.sort(np.asarray(odds_ratios, dtype=float))
    lower = (1-alpha)/2
    upper = alpha+lower
    # The label follows `alpha` (prints "95%" for the default).
    print('%g%% CI: %.2f-%.2f' %(alpha * 100, arr[int(lower * len(arr))], arr[int(upper * len(arr))]))
    end = timer()
    print('\nbootstrap time: %.2f' %(end-start))

get_bootstrap_odds_ratio_ci(boot_df, count=NITER)

### Impact of neutered status on prevalence:

In [None]:
# Behavior-problem score and neuter status for every dog that passes the bias filter.
fields = 'q02_score, dog_spayed'
labels = ['behavior problems', 'neutered']
df = createStringDataFrame(DOG_TABLE, fields, labels)
# Keep only rows whose neuter status is neither blank nor the code '2'.
status = df[labels[1]]
df = df[(status != '') & (status != '2')]
df = df.apply(pd.to_numeric)
# Untouched copy for the bootstrap, which recodes each resample itself.
boot_df = df.copy()

# Recode both columns with the binary-response helper.
outcome_col, factor_col = labels
df[outcome_col] = df[outcome_col].apply(convert_to_binary_response)
df[factor_col] = df[factor_col].apply(convert_to_binary_response)

# Execute a chi-squared test of independence (no continuity correction).
contingency = pd.crosstab(df[outcome_col], df[factor_col], margins=False)
print('Chi-squared Test of Independence for %s and %s:' %(outcome_col, factor_col))
c, p, dof, expected = scs.chi2_contingency(contingency, correction=False)
print('chi2 = %f, p = %.2E, dof = %d' %(c, p, dof))
displayOddsRatio(contingency)

def get_bootstrap_odds_ratio_ci(data, count=10, alpha=0.95):
    """Print a percentile-bootstrap interval for the behavior/neutered odds ratio.

    Every iteration resamples ``data`` with replacement (same number of rows),
    recodes both columns with ``convert_to_binary_response``, cross-tabulates
    the columns named by the module-level ``labels`` and keeps the odds
    ratio returned by ``getOddsRatioAndConfidenceInterval``.

    Args:
        data: DataFrame with the raw 'behavior problems' and 'neutered' columns.
        count: Number of resamples; at least 1 is needed to index the results.
        alpha: Confidence level (0.95 reads the 2.5% and 97.5% positions).

    Returns:
        None. The interval and the elapsed time are printed.
    """
    start = timer()
    # Collect in a list: np.append would copy the whole array on every iteration.
    odds_ratios = []
    for _ in range(count):
        sample_df = data.sample(len(data.index), replace=True)
        sample_df['behavior problems'] = sample_df['behavior problems'].apply(
            convert_to_binary_response)
        sample_df['neutered'] = sample_df['neutered'].apply(convert_to_binary_response)
        contingency = pd.crosstab(sample_df[labels[0]], sample_df[labels[1]], margins=False)
        odds, _ci_low, _ci_high, _tot = getOddsRatioAndConfidenceInterval(contingency)
        odds_ratios.append(odds)
    arr = np.sort(np.asarray(odds_ratios, dtype=float))
    lower = (1-alpha)/2
    upper = alpha+lower
    # The label follows `alpha` (prints "95%" for the default).
    print('%g%% CI: %.2f-%.2f' %(alpha * 100, arr[int(lower * len(arr))], arr[int(upper * len(arr))]))
    end = timer()
    print('\nbootstrap time: %.2f' %(end-start))

get_bootstrap_odds_ratio_ci(boot_df, count=NITER)

### Impact of origin on prevalence:

In [None]:
# Behavior-problem score and acquisition source for every dog that passes the bias filter.
fields = 'q02_score, acquisition_source'
labels = ['behavior problems', 'origin']
df = createStringDataFrame(DOG_TABLE, fields, labels)
# Rescue fields (online: 1, in-person: 4) were combined in the database.
#replacements = {'':'no response', '1': 'rescue', '2': 'online (non-rescue)', '3': 'pet store', '5': 'breeder',
#                '6': 'self-bred', '7': 'friends/family', '8': 'found', '9': 'other'}
# Discard dogs with a blank origin, then convert every column to numbers.
has_origin = df[labels[1]] != ''
df = df[has_origin].apply(pd.to_numeric)
# Untouched copy for the bootstrap, which recodes each resample itself.
boot_df = df.copy()

def rescue_to_binary_response(x):
    """Return 1 when the integer value of the origin code is 1 (rescue), else 0."""
    return int(int(x) == 1)

# Recode the score with the binary-response helper and the origin to rescue (1) / other (0).
outcome_col, factor_col = labels
df[outcome_col] = df[outcome_col].apply(convert_to_binary_response)
df[factor_col] = df[factor_col].apply(rescue_to_binary_response)

# Execute a chi-squared test of independence (no continuity correction).
contingency = pd.crosstab(df[outcome_col], df[factor_col], margins=False)
print('Chi-squared Test of Independence for %s and %s:' %(outcome_col, factor_col))
c, p, dof, expected = scs.chi2_contingency(contingency, correction=False)
print('chi2 = %f, p = %.2E, dof = %d' %(c, p, dof))
displayOddsRatio(contingency)

def get_bootstrap_odds_ratio_ci(data, count=10, alpha=0.95):
    """Print a percentile-bootstrap interval for the behavior/rescue odds ratio.

    Every iteration resamples ``data`` with replacement (same number of rows),
    recodes both columns to binary responses, cross-tabulates the columns
    named by the module-level ``labels`` and keeps the odds ratio returned
    by ``getOddsRatioAndConfidenceInterval``.

    Args:
        data: DataFrame with the raw 'behavior problems' and 'origin' columns.
        count: Number of resamples; at least 1 is needed to index the results.
        alpha: Confidence level (0.95 reads the 2.5% and 97.5% positions).

    Returns:
        None. The interval and the elapsed time are printed.
    """
    start = timer()
    # Collect in a list: np.append would copy the whole array on every iteration.
    odds_ratios = []
    for _ in range(count):
        sample_df = data.sample(len(data.index), replace=True)
        sample_df['behavior problems'] = sample_df['behavior problems'].apply(
            convert_to_binary_response)
        sample_df['origin'] = sample_df['origin'].apply(rescue_to_binary_response)
        contingency = pd.crosstab(sample_df[labels[0]], sample_df[labels[1]], margins=False)
        odds, _ci_low, _ci_high, _tot = getOddsRatioAndConfidenceInterval(contingency)
        odds_ratios.append(odds)
    arr = np.sort(np.asarray(odds_ratios, dtype=float))
    lower = (1-alpha)/2
    upper = alpha+lower
    # The label follows `alpha` (prints "95%" for the default).
    print('%g%% CI: %.2f-%.2f' %(alpha * 100, arr[int(lower * len(arr))], arr[int(upper * len(arr))]))
    end = timer()
    print('\nbootstrap time: %.2f' %(end-start))

get_bootstrap_odds_ratio_ci(boot_df, count=NITER)

### Impact of purebred lineage on prevalence:

In [None]:
# Behavior-problem score and purebred flag for every dog that passes the bias filter.
fields = 'q02_score, purebred'
labels = ['behavior problems', 'purebred']
df = createStringDataFrame(DOG_TABLE, fields, labels)
# Discard dogs with a blank purebred flag, then convert every column to numbers.
has_flag = df[labels[1]] != ''
df = df[has_flag].apply(pd.to_numeric)
# Untouched copy for the bootstrap, which recodes each resample itself.
boot_df = df.copy()

# Only the score needs recoding; the purebred flag is used as stored.
outcome_col, factor_col = labels
df[outcome_col] = df[outcome_col].apply(convert_to_binary_response)

# Execute a chi-squared test of independence (no continuity correction).
contingency = pd.crosstab(df[outcome_col], df[factor_col], margins=False)
print('Chi-squared Test of Independence for %s and %s:' %(outcome_col, factor_col))
c, p, dof, expected = scs.chi2_contingency(contingency, correction=False)
print('chi2 = %f, p = %.2E, dof = %d' %(c, p, dof))
displayOddsRatio(contingency)

def get_bootstrap_odds_ratio_ci(data, count=10, alpha=0.95):
    """Print a percentile-bootstrap interval for the behavior/purebred odds ratio.

    Every iteration resamples ``data`` with replacement (same number of rows),
    recodes 'behavior problems' with ``convert_to_binary_response``,
    cross-tabulates the columns named by the module-level ``labels`` and
    keeps the odds ratio returned by ``getOddsRatioAndConfidenceInterval``.

    Args:
        data: DataFrame with the raw 'behavior problems' and 'purebred' columns.
        count: Number of resamples; at least 1 is needed to index the results.
        alpha: Confidence level (0.95 reads the 2.5% and 97.5% positions).

    Returns:
        None. The interval and the elapsed time are printed.
    """
    start = timer()
    # Collect in a list: np.append would copy the whole array on every iteration.
    odds_ratios = []
    for _ in range(count):
        sample_df = data.sample(len(data.index), replace=True)
        sample_df['behavior problems'] = sample_df['behavior problems'].apply(
            convert_to_binary_response)
        contingency = pd.crosstab(sample_df[labels[0]], sample_df[labels[1]], margins=False)
        odds, _ci_low, _ci_high, _tot = getOddsRatioAndConfidenceInterval(contingency)
        odds_ratios.append(odds)
    arr = np.sort(np.asarray(odds_ratios, dtype=float))
    lower = (1-alpha)/2
    upper = alpha+lower
    # The label follows `alpha` (prints "95%" for the default).
    print('%g%% CI: %.2f-%.2f' %(alpha * 100, arr[int(lower * len(arr))], arr[int(upper * len(arr))]))
    end = timer()
    print('\nbootstrap time: %.2f' %(end-start))

get_bootstrap_odds_ratio_ci(boot_df, count=NITER)

### Impact of gender (sans mounting/humping) on prevalence:

In [None]:
# Score, sex and the q02_main_7 (mounting) flag for every dog that passes the bias filter.
fields = 'q02_score, dog_sex, q02_main_7'
labels = ['behavior problems', 'gender', 'mounting']
df = createStringDataFrame(DOG_TABLE, fields, labels)
# Discard dogs whose sex was left blank, then convert every column to numbers.
has_gender = df[labels[1]] != ''
df = df[has_gender].apply(pd.to_numeric)
# Exclude dogs flagged for mounting and drop that column afterwards.
not_mounting = df[labels[2]] != 1
df = df[not_mounting].drop(columns=[labels[2]])
# Untouched copy for the bootstrap, which recodes each resample itself.
boot_df = df.copy()

def gender_to_binary_response(x):
    """Map a sex code to 1 when its integer value is 1, otherwise to 0."""
    return 1 if int(x) == 1 else 0

# Recode both columns with the binary-response helpers.
df['behavior problems'] = df['behavior problems'].apply(convert_to_binary_response)
df['gender'] = df['gender'].apply(gender_to_binary_response)

# Execute a chi-squared test of independence (no continuity correction).
contingency = pd.crosstab(df[labels[0]], df[labels[1]], margins=False)
print('Chi-squared Test of Independence for %s and %s:' %(labels[0], labels[1]))
c, p, dof, expected = scs.chi2_contingency(contingency, correction=False)
print('chi2 = %f, p = %.2E, dof = %d' %(c, p, dof))
displayOddsRatio(contingency)

def get_bootstrap_odds_ratio_ci(data, count=10, alpha=0.95):
    """Print a percentile-bootstrap interval for the behavior/gender odds ratio.

    Every iteration resamples ``data`` with replacement (same number of rows),
    recodes both columns with the binary-response helpers, cross-tabulates
    the columns named by the module-level ``labels`` and keeps the odds
    ratio returned by ``getOddsRatioAndConfidenceInterval``.

    Args:
        data: DataFrame with the raw 'behavior problems' and 'gender' columns.
        count: Number of resamples; at least 1 is needed to index the results.
        alpha: Confidence level (0.95 reads the 2.5% and 97.5% positions).

    Returns:
        None. The interval and the elapsed time are printed.
    """
    start = timer()
    # Collect in a list: np.append would copy the whole array on every iteration.
    odds_ratios = []
    for _ in range(count):
        sample_df = data.sample(len(data.index), replace=True)
        sample_df['behavior problems'] = sample_df['behavior problems'].apply(
            convert_to_binary_response)
        sample_df['gender'] = sample_df['gender'].apply(gender_to_binary_response)
        contingency = pd.crosstab(sample_df[labels[0]], sample_df[labels[1]], margins=False)
        odds, _ci_low, _ci_high, _tot = getOddsRatioAndConfidenceInterval(contingency)
        odds_ratios.append(odds)
    arr = np.sort(np.asarray(odds_ratios, dtype=float))
    lower = (1-alpha)/2
    upper = alpha+lower
    # The label follows `alpha` (prints "95%" for the default).
    print('%g%% CI: %.2f-%.2f' %(alpha * 100, arr[int(lower * len(arr))], arr[int(upper * len(arr))]))
    end = timer()
    print('\nbootstrap time: %.2f' %(end-start))

get_bootstrap_odds_ratio_ci(boot_df, count=NITER)

### Impact of age on prevalence:

In [None]:
# Behavior-problem score and age in months for every dog that passes the bias filter.
fields = 'q02_score, q01_age_months'
labels = ['behavior problems', 'age']
df = createStringDataFrame(DOG_TABLE, fields, labels)
# Discard dogs with a blank age, then convert every column to numbers.
has_age = df[labels[1]] != ''
df = df[has_age].apply(pd.to_numeric)
# Untouched copy for the bootstrap, which recodes each resample itself.
boot_df = df.copy()

# Recode age with 12.5 and the score with 1 as the second helper argument.
outcome_col, factor_col = labels
df[factor_col] = df[factor_col].apply(lambda months: convert_to_binary_response(months, 12.5))
df[outcome_col] = df[outcome_col].apply(lambda score: convert_to_binary_response(score, 1))

# Execute a chi-squared test of independence (no continuity correction).
contingency = pd.crosstab(df[outcome_col], df[factor_col], margins=False)
print('Chi-squared Test of Independence for %s and %s:' %(outcome_col, factor_col))
c, p, dof, expected = scs.chi2_contingency(contingency, correction=False)
print('chi2 = %f, p = %.2E, dof = %d' %(c, p, dof))
displayOddsRatio(contingency)

def get_bootstrap_odds_ratio_ci(data, count=10, alpha=0.95):
    """Print a percentile-bootstrap interval for the behavior/age odds ratio.

    Every iteration resamples ``data`` with replacement (same number of rows),
    recodes 'age' (second helper argument 12.5) and 'behavior problems'
    (second helper argument 1) with ``convert_to_binary_response``,
    cross-tabulates the columns named by the module-level ``labels`` and
    keeps the odds ratio returned by ``getOddsRatioAndConfidenceInterval``.

    Args:
        data: DataFrame with the raw 'behavior problems' and 'age' columns.
        count: Number of resamples; at least 1 is needed to index the results.
        alpha: Confidence level (0.95 reads the 2.5% and 97.5% positions).

    Returns:
        None. The interval and the elapsed time are printed.
    """
    start = timer()
    # Collect in a list: np.append would copy the whole array on every iteration.
    odds_ratios = []
    for _ in range(count):
        sample_df = data.sample(len(data.index), replace=True)
        sample_df['age'] = sample_df['age'].apply(
            lambda x: convert_to_binary_response(x, 12.5))
        sample_df['behavior problems'] = sample_df['behavior problems'].apply(
            lambda x: convert_to_binary_response(x, 1))
        contingency = pd.crosstab(sample_df[labels[0]], sample_df[labels[1]], margins=False)
        odds, _ci_low, _ci_high, _tot = getOddsRatioAndConfidenceInterval(contingency)
        odds_ratios.append(odds)
    arr = np.sort(np.asarray(odds_ratios, dtype=float))
    lower = (1-alpha)/2
    upper = alpha+lower
    # The label follows `alpha` (prints "95%" for the default).
    print('%g%% CI: %.2f-%.2f' %(alpha * 100, arr[int(lower * len(arr))], arr[int(upper * len(arr))]))
    end = timer()
    print('\nbootstrap time: %.2f' %(end-start))

get_bootstrap_odds_ratio_ci(boot_df, count=NITER)

## <center>Number of Behavior Problems</center>

### Number of behavior problems per dog:

In [None]:
# Per-dog behavior problem score (q02_score) for every dog that passes the bias filter.
fields = 'q02_score'
labels = ['number of behavior problems']
df = createNumericDataFrame(DOG_TABLE, fields, labels)
displaySeriesMedian(df[labels[0]], 'behavior problems')

# Replace the raw scores with a frequency table: one row per distinct score.
df = df.apply(pd.value_counts)
# Index entries are converted to strings and the single column is renamed to "frequency".
df.rename(index=str, columns={labels[0]: "frequency"}, inplace=True)
# The original column label becomes the name of the columns axis.
df.columns.name = labels[0]
display(df)

### Impact of gender on number of behavior problems:

In [None]:
# Behavior-problem score and sex for every dog that passes the bias filter.
fields = 'q02_score, dog_sex'
labels = ['behavior problems', 'gender']
df = createStringDataFrame(DOG_TABLE, fields, labels)
# Discard dogs whose sex was left blank, then convert every column to numbers.
df = df[df[labels[1]] != '']
df = df.apply(pd.to_numeric)

print('Males:')
males = df[(df['gender'] == 1)].drop(columns=['gender'])
displaySeriesMean(males[labels[0]], 'behavior problems')

print('\nFemales:')
females = df[(df['gender'] == 2)].drop(columns=['gender'])
displaySeriesMean(females[labels[0]], 'behavior problems')

# Unequal-variance t-test on the two score Series. Passing the Series rather
# than the one-column DataFrames makes ttest_ind return scalars, so the
# p-value is formatted directly instead of through a length-1 array.
tval, pval = ttest_ind(males[labels[0]], females[labels[0]], equal_var=False)
# NOTE(review): `tot` is the combined sample size, not the Welch degrees of
# freedom -- confirm what the t(...) figure is meant to report.
tot = males[labels[0]].count() + females[labels[0]].count()
print('\nt(%d) = %.2f, p = %.2E' %(tot, round(tval, 2), pval))

### Impact of neuter status on number of behavior problems:

In [None]:
# Behavior-problem score and neuter status for every dog that passes the bias filter.
fields = 'q02_score, dog_spayed'
labels = ['behavior problems', 'neutered']
df = createStringDataFrame(DOG_TABLE, fields, labels)
# Discard dogs with a blank neuter status, then convert every column to numbers.
df = df[df[labels[1]] != '']
df = df.apply(pd.to_numeric)

print('Neutered:')
neutered = df[(df['neutered'] == 1)].drop(columns=['neutered'])
displaySeriesMean(neutered[labels[0]], 'behavior problems')

print('\nIntact:')
intact = df[(df['neutered'] == 0)].drop(columns=['neutered'])
displaySeriesMean(intact[labels[0]], 'behavior problems')

# Unequal-variance t-test on the two score Series. Passing the Series rather
# than the one-column DataFrames makes ttest_ind return scalars, so the
# p-value is formatted directly instead of through a length-1 array.
tval, pval = ttest_ind(neutered[labels[0]], intact[labels[0]], equal_var=False)
# NOTE(review): `tot` is the combined sample size, not the Welch degrees of
# freedom -- confirm what the t(...) figure is meant to report.
tot = neutered[labels[0]].count() + intact[labels[0]].count()
print('\nt(%d) = %.2f, p = %.2E' %(tot, round(tval, 2), pval))

### Impact of purebred lineage on number of behavior problems:

In [None]:
# Behavior-problem score and purebred flag for every dog that passes the bias filter.
fields = 'q02_score, purebred'
labels = ['behavior problems', 'purebred']
df = createStringDataFrame(DOG_TABLE, fields, labels)
# Discard dogs with a blank purebred flag, then convert every column to numbers.
df = df[df[labels[1]] != '']
df = df.apply(pd.to_numeric)

print('Purebred:')
purebred = df[(df['purebred'] == 1)].drop(columns=['purebred'])
displaySeriesMean(purebred[labels[0]], 'behavior problems')

print('\nNon-purebred:')
notpure = df[(df['purebred'] == 0)].drop(columns=['purebred'])
displaySeriesMean(notpure[labels[0]], 'behavior problems')

# Unequal-variance t-test on the two score Series (non-purebred first, as
# before, which fixes the sign of t). Passing the Series rather than the
# one-column DataFrames makes ttest_ind return scalars, so the p-value is
# formatted directly instead of through a length-1 array.
tval, pval = ttest_ind(notpure[labels[0]], purebred[labels[0]], equal_var=False)
# NOTE(review): `tot` is the combined sample size, not the Welch degrees of
# freedom -- confirm what the t(...) figure is meant to report.
tot = purebred[labels[0]].count() + notpure[labels[0]].count()
print('\nt(%d) = %.2f, p = %.2E' %(tot, round(tval, 2), pval))

### Impact of acquisition source on number of behavior problems:

In [None]:
# Behavior-problem score and acquisition source for every dog that passes the bias filter.
fields = 'q02_score, acquisition_source'
labels = ['behavior problems', 'origin']
df = createStringDataFrame(DOG_TABLE, fields, labels)
# Discard dogs with a blank origin, then convert every column to numbers.
df = df[df[labels[1]] != '']
df = df.apply(pd.to_numeric)

def rescue_to_binary_response(x):
    """Return 1 when the integer value of the origin code is 1, else 0."""
    x = int(x)
    if x != 1:
        return 0
    return 1
df['origin'] = df['origin'].apply(rescue_to_binary_response)

print('Rescue:')
rescue = df[(df['origin'] == 1)].drop(columns=['origin'])
displaySeriesMean(rescue[labels[0]], 'behavior problems')

print('\nNon-rescue:')
nonrescue = df[(df['origin'] == 0)].drop(columns=['origin'])
displaySeriesMean(nonrescue[labels[0]], 'behavior problems')

# Unequal-variance t-test on the two score Series. Passing the Series rather
# than the one-column DataFrames makes ttest_ind return scalars, so the
# p-value is formatted directly instead of through a length-1 array.
tval, pval = ttest_ind(rescue[labels[0]], nonrescue[labels[0]], equal_var=False)
# NOTE(review): `tot` is the combined sample size, not the Welch degrees of
# freedom -- confirm what the t(...) figure is meant to report.
tot = rescue[labels[0]].count() + nonrescue[labels[0]].count()
print('\nt(%d) = %.2f, p = %.2E' %(tot, round(tval, 2), pval))

## <center>Separation Anxiety, Noise Phobia, and Thunderstorm Phobia</center>

### Paired independence:

In [None]:
# Columns q04_1, q04_2 and q04_9, relabelled as thunderstorm phobia, noise
# phobia and separation anxiety, for every dog that passes the bias filter.
fields = 'q04_1, q04_2, q04_9'
labels = ['Thunderstorm phobia', 'Noise phobia', 'Separation anxiety']
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)

def get_bootstrap_odds_ratio_ci(data, label1, label2, count=10, alpha=0.95):
    """Print a percentile-bootstrap interval for the odds ratio of two columns.

    Every iteration resamples ``data`` with replacement (same number of rows),
    cross-tabulates ``label1`` against ``label2`` and keeps the odds ratio
    returned by ``getOddsRatioAndConfidenceInterval``.

    Args:
        data: DataFrame containing both columns.
        label1: Name of the column used for the rows of the cross tabulation.
        label2: Name of the column used for its columns.
        count: Number of resamples; at least 1 is needed to index the results.
        alpha: Confidence level (0.95 reads the 2.5% and 97.5% positions).

    Returns:
        None. The interval and the elapsed time are printed.
    """
    start = timer()
    # Collect in a list: np.append would copy the whole array on every iteration.
    odds_ratios = []
    for _ in range(count):
        sample_df = data.sample(len(data.index), replace=True)
        contingency = pd.crosstab(sample_df[label1], sample_df[label2], margins=False)
        odds, _ci_low, _ci_high, _tot = getOddsRatioAndConfidenceInterval(contingency)
        odds_ratios.append(odds)
    arr = np.sort(np.asarray(odds_ratios, dtype=float))
    lower = (1-alpha)/2
    upper = alpha+lower
    # The label follows `alpha` (prints "95%" for the default).
    print('%g%% CI: %.2f-%.2f' %(alpha * 100, arr[int(lower * len(arr))], arr[int(upper * len(arr))]))
    end = timer()
    print('\nbootstrap time: %.2f' %(end-start))

# Every unordered pair of the three conditions, in the order (0, 1), (0, 2), (1, 2).
pairs = list(combinations(labels, 2))
for first, second in pairs:
    # Execute a chi-squared test of independence.
    contingency = pd.crosstab(df[first], df[second])
    print('\nChi-squared Test of Independence for %s and %s:' %(first, second))
    c, p, dof, expected = scs.chi2_contingency(contingency, correction=False)
    print('chi2 = %f, p = %.2E, dof = %d' %(c, p, dof))
    displayOddsRatio(contingency)
    get_bootstrap_odds_ratio_ci(df, label1=first, label2=second, count=NITER)

### Grouped independence (Figure 2):

In [None]:
labels = ['Thunderstorm phobia', 'Noise phobia', 'Separation anxiety']
# Rows: separation anxiety. Columns: (thunderstorm phobia, noise phobia).
# Uses the `df` loaded in the paired-independence cell above.
contingency = pd.crosstab(df[labels[2]], [df[labels[0]], df[labels[1]]])

# Convert the cross tabulated dataframe to a series of sums.
# Indexing order is contingency[thunderstorm][noise][separation]; the entries
# follow the venn3 subset order (A, B, AB, C, AC, BC, ABC) with
# A = separation anxiety, B = noise phobia, C = thunderstorm phobia.
d = {labels[2]: contingency[0][0][1],
     labels[1]: contingency[0][1][0],
     'Separation-Noise': contingency[0][1][1],
     labels[0]: contingency[1][0][0],
     'Separation-Thunderstorm': contingency[1][0][1],
     'Noise-Thunderstorm': contingency[1][1][0],
     'All': contingency[1][1][1]}
cross_sums = pd.Series(d)

# Display the cross tabulated data as a venn diagram.
v = venn3(cross_sums, set_labels=[cross_sums.index.values[i] for i in [0, 1, 3]])
lbl = v.get_label_by_id('A')
if lbl is not None:
    x, y = lbl.get_position()
    lbl.set_position((x+0.25, y+0.05))
# Labels may be None for empty sets/regions, so skip those.
for text in v.set_labels:
    if text is not None:
        text.set_fontsize(16)
for text in v.subset_labels:
    if text is not None:
        text.set_fontsize(14)
exportFigure(plt, 'figure_2')
plt.show()

# Execute a chi-squared test of independence.
print('Chi-squared Test of Independence for %s, %s, and %s:' %(labels[0], labels[1], labels[2]))
c, p, dof, expected = scs.chi2_contingency(contingency, correction=False)
print('chi2 = %f, p = %.2E, dof = %d' %(c, p, dof))

## <center>Separation Anxiety, Destructive Behavior, and House Soiling</center>

### Destruction and separation anxiety:

In [None]:
fields = 'q02_main_9, q04_9'
labels = ['Destruction', 'Separation anxiety']
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)
# Untouched copy used as input for the bootstrap.
boot_df = df.copy()

# Cross tabulate the relevant columns (rows: labels[0], columns: labels[1]).
contingency = pd.crosstab(df[labels[0]], df[labels[1]], margins=False)

# Convert the cross tabulated dataframe to a series of sums:
# first condition only, second condition only, both.
d = {labels[0]: contingency[0][1],
     labels[1]: contingency[1][0],
     'Both': contingency[1][1]}
cross_sums = pd.Series(d)

# Display the cross tabulated data as a venn diagram.
labels = cross_sums.index.values
v = venn2(cross_sums, set_labels=labels[0:2])
lbl = v.get_label_by_id('B')
if lbl is not None:
    x, y = lbl.get_position()
    lbl.set_position((x+0.15, y+0.05))
# Labels may be None for empty sets/regions, so skip those.
for text in v.set_labels:
    if text is not None:
        text.set_fontsize(16)
for text in v.subset_labels:
    if text is not None:
        text.set_fontsize(14)
plt.show()

# Execute a chi-squared test of independence.
c, p, dof, expected = scs.chi2_contingency(contingency, correction=False)
print('Chi-square Test of Independence:')
print('chi2 = %f, p = %.2E, dof = %d' %(c, p, dof))
displayOddsRatio(contingency)

def get_bootstrap_odds_ratio_ci(data, count=10, alpha=0.95):
    """Print a percentile-bootstrap interval for the odds ratio of two columns.

    Every iteration resamples ``data`` with replacement (same number of rows),
    cross-tabulates the columns named by the first two entries of the
    module-level ``labels`` and keeps the odds ratio returned by
    ``getOddsRatioAndConfidenceInterval``.

    Args:
        data: DataFrame containing both columns.
        count: Number of resamples; at least 1 is needed to index the results.
        alpha: Confidence level (0.95 reads the 2.5% and 97.5% positions).

    Returns:
        None. The interval and the elapsed time are printed.
    """
    start = timer()
    # Collect in a list: np.append would copy the whole array on every iteration.
    odds_ratios = []
    for _ in range(count):
        sample_df = data.sample(len(data.index), replace=True)
        contingency = pd.crosstab(sample_df[labels[0]], sample_df[labels[1]], margins=False)
        odds, _ci_low, _ci_high, _tot = getOddsRatioAndConfidenceInterval(contingency)
        odds_ratios.append(odds)
    arr = np.sort(np.asarray(odds_ratios, dtype=float))
    lower = (1-alpha)/2
    upper = alpha+lower
    # The label follows `alpha` (prints "95%" for the default).
    print('%g%% CI: %.2f-%.2f' %(alpha * 100, arr[int(lower * len(arr))], arr[int(upper * len(arr))]))
    end = timer()
    print('\nbootstrap time: %.2f' %(end-start))

get_bootstrap_odds_ratio_ci(boot_df, count=NITER)

### House soiling during owner absence and separation anxiety:

In [None]:
fields = 'q06_situation_2, q04_9'
labels = ['House soiling (owner absence)', 'Separation anxiety']
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)
# Untouched copy used as input for the bootstrap.
boot_df = df.copy()

# Cross tabulate the relevant columns (rows: labels[0], columns: labels[1]).
contingency = pd.crosstab(df[labels[0]], df[labels[1]], margins=False)

# Convert the cross tabulated dataframe to a series of sums:
# first condition only, second condition only, both.
d = {labels[0]: contingency[0][1],
     labels[1]: contingency[1][0],
     'Both': contingency[1][1]}
cross_sums = pd.Series(d)

# Display the cross tabulated data as a venn diagram.
labels = cross_sums.index.values
v = venn2(cross_sums, set_labels=labels[0:2])
lbl = v.get_label_by_id('B')
if lbl is not None:
    x, y = lbl.get_position()
    lbl.set_position((x+0.15, y+0.05))
# Labels may be None for empty sets/regions, so skip those.
for text in v.set_labels:
    if text is not None:
        text.set_fontsize(16)
for text in v.subset_labels:
    if text is not None:
        text.set_fontsize(14)
plt.show()

# Execute a chi-squared test of independence.
c, p, dof, expected = scs.chi2_contingency(contingency, correction=False)
print('Chi-square Test of Independence:')
print('chi2 = %f, p = %.2E, dof = %d' %(c, p, dof))
displayOddsRatio(contingency)

def get_bootstrap_odds_ratio_ci(data, count=10, alpha=0.95):
    """Print a percentile-bootstrap interval for the odds ratio of two columns.

    Every iteration resamples ``data`` with replacement (same number of rows),
    cross-tabulates the columns named by the first two entries of the
    module-level ``labels`` and keeps the odds ratio returned by
    ``getOddsRatioAndConfidenceInterval``.

    Args:
        data: DataFrame containing both columns.
        count: Number of resamples; at least 1 is needed to index the results.
        alpha: Confidence level (0.95 reads the 2.5% and 97.5% positions).

    Returns:
        None. The interval and the elapsed time are printed.
    """
    start = timer()
    # Collect in a list: np.append would copy the whole array on every iteration.
    odds_ratios = []
    for _ in range(count):
        sample_df = data.sample(len(data.index), replace=True)
        contingency = pd.crosstab(sample_df[labels[0]], sample_df[labels[1]], margins=False)
        odds, _ci_low, _ci_high, _tot = getOddsRatioAndConfidenceInterval(contingency)
        odds_ratios.append(odds)
    arr = np.sort(np.asarray(odds_ratios, dtype=float))
    lower = (1-alpha)/2
    upper = alpha+lower
    # The label follows `alpha` (prints "95%" for the default).
    print('%g%% CI: %.2f-%.2f' %(alpha * 100, arr[int(lower * len(arr))], arr[int(upper * len(arr))]))
    end = timer()
    print('\nbootstrap time: %.2f' %(end-start))

get_bootstrap_odds_ratio_ci(boot_df, count=NITER)

## <center>Fear/Anxiety and House Soiling</center>

## <center>Fear/Anxiety and Aggression</center>

### Overall aggression and fearful/anxious behavior (Figure 3):

In [None]:
fields = 'q02_main_1, q02_main_2'
labels = ['Aggression', 'Fear/Anxiety']
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)
# Untouched copy used as input for the bootstrap.
boot_df = df.copy()

# Create a contingency table (rows: labels[0], columns: labels[1]).
contingency = pd.crosstab(df[labels[0]], df[labels[1]], margins=False)

# Convert the cross tabulated dataframe to a series of sums:
# first condition only, second condition only, both.
d = {labels[0]: contingency[0][1],
     labels[1]: contingency[1][0],
     'Both': contingency[1][1]}
cross_sums = pd.Series(d)

# Display the cross tabulated data as a venn diagram.
labels = cross_sums.index.values
v = venn2(cross_sums, set_labels=labels[0:2])
lbl = v.get_label_by_id('A')
if lbl is not None:
    x, y = lbl.get_position()
    lbl.set_position((x-0.1, y+0.05))
# Labels may be None for empty sets/regions, so skip those.
for text in v.set_labels:
    if text is not None:
        text.set_fontsize(16)
for text in v.subset_labels:
    if text is not None:
        text.set_fontsize(14)
exportFigure(plt, 'figure_3')
plt.show()

# Execute a chi-squared test of independence.
c, p, dof, expected = scs.chi2_contingency(contingency, correction=False)
print('Chi-square Test of Independence:')
print('chi2 = %f, p = %.2E, dof = %d' %(c, p, dof))
displayOddsRatio(contingency)

def get_bootstrap_odds_ratio_ci(data, count=10, alpha=0.95):
    """Print a percentile-bootstrap interval for the odds ratio of two columns.

    Every iteration resamples ``data`` with replacement (same number of rows),
    cross-tabulates the columns named by the first two entries of the
    module-level ``labels`` and keeps the odds ratio returned by
    ``getOddsRatioAndConfidenceInterval``.

    Args:
        data: DataFrame containing both columns.
        count: Number of resamples; at least 1 is needed to index the results.
        alpha: Confidence level (0.95 reads the 2.5% and 97.5% positions).

    Returns:
        None. The interval and the elapsed time are printed.
    """
    start = timer()
    # Collect in a list: np.append would copy the whole array on every iteration.
    odds_ratios = []
    for _ in range(count):
        sample_df = data.sample(len(data.index), replace=True)
        contingency = pd.crosstab(sample_df[labels[0]], sample_df[labels[1]], margins=False)
        odds, _ci_low, _ci_high, _tot = getOddsRatioAndConfidenceInterval(contingency)
        odds_ratios.append(odds)
    arr = np.sort(np.asarray(odds_ratios, dtype=float))
    lower = (1-alpha)/2
    upper = alpha+lower
    # The label follows `alpha` (prints "95%" for the default).
    print('%g%% CI: %.2f-%.2f' %(alpha * 100, arr[int(lower * len(arr))], arr[int(upper * len(arr))]))
    end = timer()
    print('\nbootstrap time: %.2f' %(end-start))

get_bootstrap_odds_ratio_ci(boot_df, count=NITER)

### Owner directed aggression and fearful/anxious behavior (Figure 4):

In [None]:
fields = 'q03_main_1, q02_main_2'
labels = ['Owner-directed\naggression', 'Fear/Anxiety']
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)
# Untouched copy used as input for the bootstrap.
boot_df = df.copy()

# Create a contingency table (rows: labels[0], columns: labels[1]).
contingency = pd.crosstab(df[labels[0]], df[labels[1]], margins=False)

# Convert the cross tabulated dataframe to a series of sums:
# first condition only, second condition only, both.
d = {labels[0]: contingency[0][1],
     labels[1]: contingency[1][0],
     'Both': contingency[1][1]}
cross_sums = pd.Series(d)

# Display the cross tabulated data as a venn diagram.
labels = cross_sums.index.values
v = venn2(cross_sums, set_labels=labels[0:2])
lbl = v.get_label_by_id('A')
if lbl is not None:
    x, y = lbl.get_position()
    lbl.set_position((x-0.1, y+0.05))
# Labels may be None for empty sets/regions, so skip those.
for text in v.set_labels:
    if text is not None:
        text.set_fontsize(16)
for text in v.subset_labels:
    if text is not None:
        text.set_fontsize(14)
exportFigure(plt, 'figure_4')
plt.show()

# Execute a chi-squared test of independence.
c, p, dof, expected = scs.chi2_contingency(contingency, correction=False)
print('Chi-square Test of Independence:')
print('chi2 = %f, p = %.2E, dof = %d' %(c, p, dof))
displayOddsRatio(contingency)

def get_bootstrap_odds_ratio_ci(data, count=10, alpha=0.95):
    """Print a percentile-bootstrap interval for the odds ratio of two columns.

    Every iteration resamples ``data`` with replacement (same number of rows),
    cross-tabulates the columns named by the first two entries of the
    module-level ``labels`` and keeps the odds ratio returned by
    ``getOddsRatioAndConfidenceInterval``.

    Args:
        data: DataFrame containing both columns.
        count: Number of resamples; at least 1 is needed to index the results.
        alpha: Confidence level (0.95 reads the 2.5% and 97.5% positions).

    Returns:
        None. The interval and the elapsed time are printed.
    """
    start = timer()
    # Collect in a list: np.append would copy the whole array on every iteration.
    odds_ratios = []
    for _ in range(count):
        sample_df = data.sample(len(data.index), replace=True)
        contingency = pd.crosstab(sample_df[labels[0]], sample_df[labels[1]], margins=False)
        odds, _ci_low, _ci_high, _tot = getOddsRatioAndConfidenceInterval(contingency)
        odds_ratios.append(odds)
    arr = np.sort(np.asarray(odds_ratios, dtype=float))
    lower = (1-alpha)/2
    upper = alpha+lower
    # The label follows `alpha` (prints "95%" for the default).
    print('%g%% CI: %.2f-%.2f' %(alpha * 100, arr[int(lower * len(arr))], arr[int(upper * len(arr))]))
    end = timer()
    print('\nbootstrap time: %.2f' %(end-start))

get_bootstrap_odds_ratio_ci(boot_df, count=NITER)

### Owner directed aggression and separation anxiety (Figure 5):

In [None]:
fields = 'q03_main_1, q04_9'
labels = ['Owner-directed\naggression', 'Separation anxiety']
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)
# Untouched copy used as input for the bootstrap.
boot_df = df.copy()

# Create a contingency table (rows: labels[0], columns: labels[1]).
contingency = pd.crosstab(df[labels[0]], df[labels[1]], margins=False)

# Convert the cross tabulated dataframe to a series of sums:
# first condition only, second condition only, both.
d = {labels[0]: contingency[0][1],
     labels[1]: contingency[1][0],
     'Both': contingency[1][1]}
cross_sums = pd.Series(d)

# Display the cross tabulated data as a venn diagram.
labels = cross_sums.index.values
v = venn2(cross_sums, set_labels=labels[0:2])
# Labels may be None for empty sets/regions, so skip those.
for text in v.set_labels:
    if text is not None:
        text.set_fontsize(16)
for text in v.subset_labels:
    if text is not None:
        text.set_fontsize(14)
exportFigure(plt, 'figure_5')
plt.show()

# Execute a chi-squared test of independence.
c, p, dof, expected = scs.chi2_contingency(contingency, correction=False)
print('Chi-square Test of Independence:')
print('chi2 = %f, p = %.2E, dof = %d' %(c, p, dof))
displayOddsRatio(contingency)

def get_bootstrap_odds_ratio_ci(data, count=10, alpha=0.95, random_state=None):
    """Print a percentile-bootstrap confidence interval for the odds ratio.

    ``data`` is resampled with replacement ``count`` times. Each resample is
    cross tabulated on the notebook-level ``labels[0]`` and ``labels[1]``
    columns and passed to ``getOddsRatioAndConfidenceInterval``; the first
    value it returns is collected as that resample's odds ratio.

    Args:
        data: DataFrame containing the two columns named by ``labels``.
        count: Number of bootstrap resamples; must be at least 1.
        alpha: Confidence level as a fraction (0.95 gives a 95% interval).
        random_state: Optional seed for reproducible resampling. ``None``
            (the default) leaves the sampling unseeded.

    Raises:
        ValueError: If ``count`` is less than 1.
    """
    if count < 1:
        raise ValueError('count must be at least 1')
    start = timer()
    # One generator is shared by all iterations; re-seeding per draw would
    # produce the same resample every time.
    rng = None if random_state is None else np.random.RandomState(random_state)
    # Collect in a list: np.append copies the whole array on every call.
    odds_ratios = []
    for _ in range(count):
        resample = data.sample(len(data.index), replace=True, random_state=rng)
        table = pd.crosstab(resample[labels[0]], resample[labels[1]], margins=False)
        odds, _ci_low, _ci_high, _tot = getOddsRatioAndConfidenceInterval(table)
        odds_ratios.append(odds)
    arr = np.sort(np.array(odds_ratios, dtype=float))
    lower = (1-alpha)/2
    upper = alpha+lower
    # Clamp both positions so an alpha at or near 1 cannot index past the end.
    last = len(arr) - 1
    low_idx = min(int(lower * len(arr)), last)
    high_idx = min(int(upper * len(arr)), last)
    # Label the interval with the requested level instead of a fixed 95%.
    print('%g%% CI: %.2f-%.2f' %(alpha * 100, arr[low_idx], arr[high_idx]))
    end = timer()
    print('\nbootstrap time: %.2f' %(end-start))

get_bootstrap_odds_ratio_ci(boot_df, count=NITER)

## <center>Fear/Anxiety and Compulsive Behavior</center>

### Fear/anxiety and compulsion (Figure 6):

In [None]:
# Load the fear/anxiety and compulsion columns per dog.
fields = 'q02_main_2, q02_main_3'
labels = ['Fear/Anxiety', 'Compulsion']
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)
# Untouched copy handed to the bootstrap at the end of the cell.
boot_df = df.copy()

# Cross tabulate the two columns (rows: first label, columns: second label).
contingency = pd.crosstab(index=df[labels[0]], columns=df[labels[1]])

# Reduce the table to three counts: first only, second only, and both.
d = {labels[0]: contingency.loc[1, 0],
     labels[1]: contingency.loc[0, 1],
     'Both': contingency.loc[1, 1]}
cross_sums = pd.Series(d)

# Draw the counts as a two-set venn diagram and pass it to exportFigure.
# Note that `labels` is rebound to the series index (it now also holds 'Both').
labels = cross_sums.index.values
v = venn2(cross_sums, set_labels=labels[0:2])
# Shift the label with id 'B' right and up from its default position.
lbl = v.get_label_by_id('B')
lbl_x, lbl_y = lbl.get_position()
lbl.set_position((lbl_x+0.15, lbl_y+0.05))
for label_group, font_size in ((v.set_labels, 16), (v.subset_labels, 14)):
    for text in label_group:
        text.set_fontsize(font_size)
exportFigure(plt, 'figure_6')
plt.show()

# Chi-squared test of independence (no continuity correction) and odds ratio.
c, p, dof, expected = scs.chi2_contingency(contingency, correction=False)
print('Chi-square Test of Independence:')
print('chi2 = %f, p = %.2E, dof = %d' % (c, p, dof))
displayOddsRatio(contingency)

def get_bootstrap_odds_ratio_ci(data, count=10, alpha=0.95, random_state=None):
    """Print a percentile-bootstrap confidence interval for the odds ratio.

    ``data`` is resampled with replacement ``count`` times. Each resample is
    cross tabulated on the notebook-level ``labels[0]`` and ``labels[1]``
    columns and passed to ``getOddsRatioAndConfidenceInterval``; the first
    value it returns is collected as that resample's odds ratio.

    Args:
        data: DataFrame containing the two columns named by ``labels``.
        count: Number of bootstrap resamples; must be at least 1.
        alpha: Confidence level as a fraction (0.95 gives a 95% interval).
        random_state: Optional seed for reproducible resampling. ``None``
            (the default) leaves the sampling unseeded.

    Raises:
        ValueError: If ``count`` is less than 1.
    """
    if count < 1:
        raise ValueError('count must be at least 1')
    start = timer()
    # One generator is shared by all iterations; re-seeding per draw would
    # produce the same resample every time.
    rng = None if random_state is None else np.random.RandomState(random_state)
    # Collect in a list: np.append copies the whole array on every call.
    odds_ratios = []
    for _ in range(count):
        resample = data.sample(len(data.index), replace=True, random_state=rng)
        table = pd.crosstab(resample[labels[0]], resample[labels[1]], margins=False)
        odds, _ci_low, _ci_high, _tot = getOddsRatioAndConfidenceInterval(table)
        odds_ratios.append(odds)
    arr = np.sort(np.array(odds_ratios, dtype=float))
    lower = (1-alpha)/2
    upper = alpha+lower
    # Clamp both positions so an alpha at or near 1 cannot index past the end.
    last = len(arr) - 1
    low_idx = min(int(lower * len(arr)), last)
    high_idx = min(int(upper * len(arr)), last)
    # Label the interval with the requested level instead of a fixed 95%.
    print('%g%% CI: %.2f-%.2f' %(alpha * 100, arr[low_idx], arr[high_idx]))
    end = timer()
    print('\nbootstrap time: %.2f' %(end-start))

get_bootstrap_odds_ratio_ci(boot_df, count=NITER)

## <center>Coprophagia and Age</center>

In [None]:
# Load the q02_main_8 response together with the dog's age in months.
fields = 'q02_main_8, q01_age_months'
labels = [FR.categories[7], 'age']
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)
# Drop dogs without a finite age. The explicit copy makes the result an
# independent frame, so the later `df['age'] = ...` assignment does not
# raise a SettingWithCopyWarning.
df = df[np.isfinite(df['age'])].copy()
# Snapshot with the raw ages, used by the bootstrap at the end of the cell.
boot_df = df.copy()

# Age grouping used for the chi-squared test:
#   1 -> 12 months or younger
#   0 -> older than 12 months
def age_sort(row):
    """Return the 0/1 age-group flag for one row.

    Args:
        row: Mapping-like row with an 'age' entry (in months).

    Returns:
        1 when the age is at most 12, otherwise 0.
    """
    return 1 if row['age'] <= 12 else 0

# Replace each age in months with its 0/1 age-group flag.
df['age'] = df.apply(age_sort, axis=1)

# Chi-squared test of independence (no continuity correction) and odds ratio.
contingency = pd.crosstab(index=df[labels[0]], columns=df[labels[1]])
c, p, dof, expected = scs.chi2_contingency(contingency, correction=False)
print('Chi-square Test of Independence:')
print('chi2 = %f, p = %.2E, dof = %d' % (c, p, dof))
displayOddsRatio(contingency)

def get_bootstrap_odds_ratio_ci(data, count=10, alpha=0.95, random_state=None):
    """Print a percentile-bootstrap confidence interval for the odds ratio.

    ``data`` is resampled with replacement ``count`` times. Each resample has
    its 'age' column recoded with ``age_sort``, is cross tabulated on the
    notebook-level ``labels[0]`` and ``labels[1]`` columns and passed to
    ``getOddsRatioAndConfidenceInterval``; the first value it returns is
    collected as that resample's odds ratio.

    Args:
        data: DataFrame containing the two columns named by ``labels``, with
            'age' still holding the value ``age_sort`` expects.
        count: Number of bootstrap resamples; must be at least 1.
        alpha: Confidence level as a fraction (0.95 gives a 95% interval).
        random_state: Optional seed for reproducible resampling. ``None``
            (the default) leaves the sampling unseeded.

    Raises:
        ValueError: If ``count`` is less than 1.
    """
    if count < 1:
        raise ValueError('count must be at least 1')
    start = timer()
    # One generator is shared by all iterations; re-seeding per draw would
    # produce the same resample every time.
    rng = None if random_state is None else np.random.RandomState(random_state)
    # Collect in a list: np.append copies the whole array on every call.
    odds_ratios = []
    for _ in range(count):
        resample = data.sample(len(data.index), replace=True, random_state=rng)
        resample['age'] = resample.apply(age_sort, axis=1)
        table = pd.crosstab(resample[labels[0]], resample[labels[1]], margins=False)
        odds, _ci_low, _ci_high, _tot = getOddsRatioAndConfidenceInterval(table)
        odds_ratios.append(odds)
    arr = np.sort(np.array(odds_ratios, dtype=float))
    lower = (1-alpha)/2
    upper = alpha+lower
    # Clamp both positions so an alpha at or near 1 cannot index past the end.
    last = len(arr) - 1
    low_idx = min(int(lower * len(arr)), last)
    high_idx = min(int(upper * len(arr)), last)
    # Label the interval with the requested level instead of a fixed 95%.
    print('%g%% CI: %.2f-%.2f' %(alpha * 100, arr[low_idx], arr[high_idx]))
    end = timer()
    print('\nbootstrap time: %.2f' %(end-start))

get_bootstrap_odds_ratio_ci(boot_df, count=NITER)

## <center>Overactivity/Hyperactivity and Age</center>

In [None]:
# Load the q02_main_13 response together with the dog's age in months.
fields = 'q02_main_13, q01_age_months'
labels = [FR.categories[11], 'age']
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)
# Drop dogs without a finite age. The explicit copy makes the result an
# independent frame, so the later `df['age'] = ...` assignment does not
# raise a SettingWithCopyWarning.
df = df[np.isfinite(df['age'])].copy()
# Snapshot with the raw ages, used by the bootstrap at the end of the cell.
boot_df = df.copy()

# Age grouping used for the chi-squared test:
#   1 -> 12 months or younger
#   0 -> older than 12 months
def age_sort(row):
    """Return the 0/1 age-group flag for one row.

    Args:
        row: Mapping-like row with an 'age' entry (in months).

    Returns:
        1 when the age is at most 12, otherwise 0.
    """
    return 1 if row['age'] <= 12 else 0

# Replace each age in months with its 0/1 age-group flag.
df['age'] = df.apply(age_sort, axis=1)

# Chi-squared test of independence (no continuity correction) and odds ratio.
contingency = pd.crosstab(index=df[labels[0]], columns=df[labels[1]])
c, p, dof, expected = scs.chi2_contingency(contingency, correction=False)
print('Chi-square Test of Independence:')
print('chi2 = %f, p = %.2E, dof = %d' % (c, p, dof))
displayOddsRatio(contingency)

def get_bootstrap_odds_ratio_ci(data, count=10, alpha=0.95, random_state=None):
    """Print a percentile-bootstrap confidence interval for the odds ratio.

    ``data`` is resampled with replacement ``count`` times. Each resample has
    its 'age' column recoded with ``age_sort``, is cross tabulated on the
    notebook-level ``labels[0]`` and ``labels[1]`` columns and passed to
    ``getOddsRatioAndConfidenceInterval``; the first value it returns is
    collected as that resample's odds ratio.

    Args:
        data: DataFrame containing the two columns named by ``labels``, with
            'age' still holding the value ``age_sort`` expects.
        count: Number of bootstrap resamples; must be at least 1.
        alpha: Confidence level as a fraction (0.95 gives a 95% interval).
        random_state: Optional seed for reproducible resampling. ``None``
            (the default) leaves the sampling unseeded.

    Raises:
        ValueError: If ``count`` is less than 1.
    """
    if count < 1:
        raise ValueError('count must be at least 1')
    start = timer()
    # One generator is shared by all iterations; re-seeding per draw would
    # produce the same resample every time.
    rng = None if random_state is None else np.random.RandomState(random_state)
    # Collect in a list: np.append copies the whole array on every call.
    odds_ratios = []
    for _ in range(count):
        resample = data.sample(len(data.index), replace=True, random_state=rng)
        resample['age'] = resample.apply(age_sort, axis=1)
        table = pd.crosstab(resample[labels[0]], resample[labels[1]], margins=False)
        odds, _ci_low, _ci_high, _tot = getOddsRatioAndConfidenceInterval(table)
        odds_ratios.append(odds)
    arr = np.sort(np.array(odds_ratios, dtype=float))
    lower = (1-alpha)/2
    upper = alpha+lower
    # Clamp both positions so an alpha at or near 1 cannot index past the end.
    last = len(arr) - 1
    low_idx = min(int(lower * len(arr)), last)
    high_idx = min(int(upper * len(arr)), last)
    # Label the interval with the requested level instead of a fixed 95%.
    print('%g%% CI: %.2f-%.2f' %(alpha * 100, arr[low_idx], arr[high_idx]))
    end = timer()
    print('\nbootstrap time: %.2f' %(end-start))

get_bootstrap_odds_ratio_ci(boot_df, count=NITER)

## <center>Bite Severity</center>

### Prevalence of biting:

In [None]:
# Count the dogs with a q03_form_5 (bite) response and report the share of
# all remaining dogs.
fields = 'q03_form_5'
labels = ['bites']
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)

# Sum the single column so `tot` is a scalar; formatting a one-element
# Series with %d relies on a deprecated implicit int() conversion.
tot = df[labels[0]].sum()
print('Dogs that bite: n = %d dogs (%d%%)' %(tot, round((tot/REMAINING_DOGS)*100, 0)))

### Bite people:

In [None]:
# Total number of biting dogs, used as the denominator below.
fields = 'q03_form_5'
labels = ['bites']
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)
# Sum the column itself: df.sum()[0] depends on the deprecated positional
# fallback when indexing a label-indexed Series with an integer.
all_bites = df[labels[0]].sum()

fields = 'q03_person'
labels = ['person']
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)

# Scalar count so the %d placeholders receive numbers rather than a Series.
tot = df[labels[0]].sum()
print('Dogs that bit a person: n = %d dogs (%d%%)' %(tot, round((tot/all_bites)*100, 0)))

# Median of the reported per-dog frequency.
fields = 'q03_person_freq'
labels = ['person count']
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)
print('\nNumber of times the dog has bitten a person:')
displaySeriesMedian(df[labels[0]], labels[0])

### Bite dogs:

In [None]:
# Total number of biting dogs, used as the denominator below.
fields = 'q03_form_5'
labels = ['bites']
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)
# Sum the column itself: df.sum()[0] depends on the deprecated positional
# fallback when indexing a label-indexed Series with an integer.
all_bites = df[labels[0]].sum()

fields = 'q03_dog'
labels = ['dog']
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)

# Scalar count so the %d placeholders receive numbers rather than a Series.
tot = df[labels[0]].sum()
print('Dogs that bit a dog: n = %d dogs (%d%%)' %(tot, round((tot/all_bites)*100, 0)))

# Median of the reported per-dog frequency.
fields = 'q03_dog_freq'
labels = ['dog count']
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)
print('\nNumber of times the dog has bitten a dog:')
displaySeriesMedian(df[labels[0]], labels[0])

### Multiple bites per incident:

In [None]:
# Total number of biting dogs, used as the denominator below.
fields = 'q03_form_5'
labels = ['bites']
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)
# Sum the column itself: df.sum()[0] depends on the deprecated positional
# fallback when indexing a label-indexed Series with an integer.
all_bites = df[labels[0]].sum()

fields = 'q03_bite_quantity'
labels = ['multi']
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)

# Scalar count so the %d placeholders receive numbers rather than a Series.
tot = df[labels[0]].sum()
print('Dogs with multi-bite incidents: n = %d dogs (%d%%)' %(tot, round((tot/all_bites)*100, 0)))

### Breakdown of bite severity:

In [None]:
def _print_severity_breakdown(severity):
    """Print the count and share of three cumulative bite-severity tiers.

    The tiers are level > 1 ('teeth contact'), level >= 3 ('broke skin') and
    level == 5 ('multiple bites'). Shares are relative to the number of
    non-missing severity values.

    Args:
        severity: Series of numeric severity levels.
    """
    # Series.value_counts replaces the deprecated top-level pd.value_counts
    # and skips missing values by default.
    counts = severity.value_counts()
    levels = counts.index.astype(float)
    total = counts.sum()
    tiers = (('\nteeth contact', levels > 1),
             ('broke skin', levels >= 3),
             ('multiple bites', levels == 5))
    for name, in_tier in tiers:
        n = counts[in_tier].sum()
        print('%s: n = %d (%d%%)' %(name, n, round((n/total)*100, 0)))


print('OVERALL:')
fields = 'q03_severity'
labels = ['severity']
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)
_print_severity_breakdown(df[labels[0]])

# The same breakdown restricted to dogs whose q03_person / q03_dog flag is 1.
for heading, flag_field, flag_label in (('\nPEOPLE:', 'q03_person', 'person'),
                                        ('\nDOGS:', 'q03_dog', 'dog')):
    print(heading)
    fields = 'q03_severity, ' + flag_field
    labels = ['severity', flag_label]
    df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)
    _print_severity_breakdown(df.loc[df[labels[1]] == 1, labels[0]])

### Bite severity by behavior problem:

In [None]:
print('OVERALL:')
# One q02_main_N field per behavior category. The numbering skips 11, so
# categories past the tenth use their position plus one.
fields = ['q02_main_{}'.format(position + 1 if position > 10 else position)
          for position, _ in enumerate(FR.categories, 1)]
labels = [category for category in FR.categories]
fields.append('q03_severity')
labels.append('severity')
fields = ', '.join(fields)
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)

# Three groups of dogs by severity: below 3, at least 3, and above 0.
# NOTE(review): the "< 3" group would also include severity 0 if that value
# occurs in the data, unlike the "> 0" group - confirm this is intended.
rngs = [df[in_group].drop(columns=['severity'])
        for in_group in (df['severity'] < 3,
                         df['severity'] >= 3,
                         df['severity'] > 0)]

def get_group_prevalence(df):
    """Return each column's sum as a whole-number percentage of the row count.

    Args:
        df: DataFrame of numeric columns.

    Returns:
        Integer Series indexed by column name: sum / number of rows * 100,
        rounded.
    """
    group_size = len(df.index)
    percentages = df.sum().apply(lambda total: (total / group_size) * 100)
    return percentages.round().astype(int)

def _severity_prevalence_table(frame):
    """Build the behavior-problem prevalence table for one set of dogs.

    Args:
        frame: DataFrame of behavior-problem columns plus a 'severity' column.

    Returns:
        DataFrame with one row per behavior-problem column and the columns
        'superf.' (severity < 3), 'broke skin' (severity >= 3) and 'any'
        (severity > 0), each computed by get_group_prevalence.
    """
    severity = frame['severity']
    groups = (('superf.', severity < 3),
              ('broke skin', severity >= 3),
              ('any', severity > 0))
    prevalences = {name: get_group_prevalence(frame[in_group].drop(columns=['severity']))
                   for name, in_group in groups}
    table = pd.DataFrame(index=prevalences['superf.'].index, data=prevalences)
    table.columns.name = 'Behavior problem'
    return table


# OVERALL: `df` still holds the frame loaded at the top of this cell.
df = _severity_prevalence_table(df)
display(df)

# The same table restricted to dogs whose q03_person / q03_dog flag is 1.
for heading, flag_field, flag_label in (('\nPEOPLE:', 'q03_person', 'person'),
                                        ('\nDOGS:', 'q03_dog', 'dog')):
    print(heading)
    # q02_main numbering skips 11, hence the shift past the tenth category.
    fields = ['q02_main_{}'.format(n + 1 if n > 10 else n)
              for n, _ in enumerate(FR.categories, 1)]
    labels = list(FR.categories)
    fields.extend(('q03_severity', flag_field))
    labels.extend(('severity', flag_label))
    fields = ', '.join(fields)
    df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)
    # Chained filter + drop avoids an in-place drop on a filtered slice.
    df = df[df[flag_label] == 1].drop(columns=flag_label)
    df = _severity_prevalence_table(df)
    display(df)

### Bite severity and fear/anxiety:

In [None]:
print('OVERALL:')
# Fields and labels registered for FR.categories[1], plus the bite severity.
fields = ', '.join(list(FR.fields[FR.categories[1]]) + ['q03_severity'])
labels = list(FR.labels[FR.categories[1]].values()) + ['severity']
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)

# Three groups of dogs by severity: below 3, at least 3, and above 0.
# NOTE(review): the "< 3" group would also include severity 0 if that value
# occurs in the data, unlike the "> 0" group - confirm this is intended.
rngs = [df[in_group].drop(columns=['severity'])
        for in_group in (df['severity'] < 3,
                         df['severity'] >= 3,
                         df['severity'] > 0)]

def get_group_prevalence(df):
    """Return each column's sum as a whole-number percentage of the row count.

    Args:
        df: DataFrame of numeric columns.

    Returns:
        Integer Series indexed by column name: sum / number of rows * 100,
        rounded.
    """
    group_size = len(df.index)
    percentages = df.sum().apply(lambda total: (total / group_size) * 100)
    return percentages.round().astype(int)

def _severity_prevalence_table(frame):
    """Build the prevalence table for one set of dogs.

    Args:
        frame: DataFrame of problem columns plus a 'severity' column.

    Returns:
        DataFrame with one row per problem column and the columns 'superf.'
        (severity < 3), 'broke skin' (severity >= 3) and 'any'
        (severity > 0), each computed by get_group_prevalence.
    """
    severity = frame['severity']
    groups = (('superf.', severity < 3),
              ('broke skin', severity >= 3),
              ('any', severity > 0))
    prevalences = {name: get_group_prevalence(frame[in_group].drop(columns=['severity']))
                   for name, in_group in groups}
    table = pd.DataFrame(index=prevalences['superf.'].index, data=prevalences)
    table.columns.name = 'Behavior problem'
    return table


# OVERALL: `df` still holds the frame loaded at the top of this cell.
df = _severity_prevalence_table(df)
display(df)

# The same table restricted to dogs whose q03_person / q03_dog flag is 1.
for heading, flag_field, flag_label in (('\nPEOPLE:', 'q03_person', 'person'),
                                        ('\nDOGS:', 'q03_dog', 'dog')):
    print(heading)
    fields = ', '.join(FR.fields[FR.categories[1]])
    labels = list(FR.labels[FR.categories[1]].values())
    fields += ', q03_severity, ' + flag_field
    labels.extend(('severity', flag_label))
    df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)
    # Chained filter + drop avoids an in-place drop on a filtered slice.
    df = df[df[flag_label] == 1].drop(columns=flag_label)
    df = _severity_prevalence_table(df)
    display(df)

### Bite severity and gender and neutered status:

In [None]:
# Load bite severity with the dog's sex and neuter status.
labels = ['severity', 'Gender', 'Neutered']
fields = ', '.join(['q03_severity', 'dog_sex', 'dog_spayed'])
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)

# Collapse severity into a 0/1 "bites" flag (1 when severity is above 0).
df['bites'] = np.where(df['severity'] > 0, 1, 0)
df = df.drop(columns=['severity'])

# Split into the four sex (1 or 2) by neuter status (0 or 1) groups, keeping
# only the bites flag in each.
status_columns = ['Gender', 'Neutered']
is_male, is_female = df['Gender'] == 1, df['Gender'] == 2
is_intact, is_neutered = df['Neutered'] == 0, df['Neutered'] == 1
df_intact_male = df[is_male & is_intact].drop(columns=status_columns)
df_neutered_male = df[is_male & is_neutered].drop(columns=status_columns)
df_intact_female = df[is_female & is_intact].drop(columns=status_columns)
df_neutered_female = df[is_female & is_neutered].drop(columns=status_columns)

def get_group_prevalence(df):
    """Return each column's sum as a rounded integer.

    Despite the name, no percentage is computed here: the division by one
    only forces a float conversion before rounding and casting to int.

    Args:
        df: DataFrame of numeric columns.

    Returns:
        Integer Series of column sums, indexed by column name.
    """
    column_sums = df.sum()
    as_float = column_sums.apply(lambda value: (value / 1))
    return as_float.round().astype(int)

# Number of biting dogs in each sex / neuter-status group.
intact_male_p = get_group_prevalence(df_intact_male)
neutered_male_p = get_group_prevalence(df_neutered_male)
intact_female_p = get_group_prevalence(df_intact_female)
neutered_female_p = get_group_prevalence(df_neutered_female)

df = pd.DataFrame(index=intact_male_p.index, data={'intact males': intact_male_p,
                                                   'castrated males': neutered_male_p,
                                                   'intact females': intact_female_p,
                                                   'spayed females': neutered_female_p})
# The grand total does not change inside the loop, so compute it once.
total_bites = df.sum().sum()
for _, group_counts in df.iterrows():
    # Series.items() replaces iteritems(), which was removed in pandas 2.0.
    for group, n in group_counts.items():
        print('%s: n = %d (%d%%)' %(group, n, round((n/total_bites)*100, 0)))

## <center>Jumping and Mounting/Humping</center>

### Jumping and mounting/humping:

In [None]:
# Load the two response columns and convert them to numbers.
fields, labels = 'q02_main_6, q02_main_7', ['jumping', 'mounting']
df = createStringDataFrame(DOG_TABLE, fields, labels).apply(pd.to_numeric)
# Snapshot for the bootstrap at the end of the cell.
boot_df = df.copy()

# Chi-squared test of independence (no continuity correction) and odds ratio.
contingency = pd.crosstab(index=df[labels[0]], columns=df[labels[1]])
print('Chi-squared Test of Independence for %s and %s:' % tuple(labels[:2]))
c, p, dof, expected = scs.chi2_contingency(contingency, correction=False)
print('chi2 = %f, p = %.2E, dof = %d' % (c, p, dof))
displayOddsRatio(contingency)

def get_bootstrap_odds_ratio_ci(data, count=10, alpha=0.95, random_state=None):
    """Print a percentile-bootstrap confidence interval for the odds ratio.

    ``data`` is resampled with replacement ``count`` times. Each resample is
    cross tabulated on the notebook-level ``labels[0]`` and ``labels[1]``
    columns and passed to ``getOddsRatioAndConfidenceInterval``; the first
    value it returns is collected as that resample's odds ratio.

    Args:
        data: DataFrame containing the two columns named by ``labels``.
        count: Number of bootstrap resamples; must be at least 1.
        alpha: Confidence level as a fraction (0.95 gives a 95% interval).
        random_state: Optional seed for reproducible resampling. ``None``
            (the default) leaves the sampling unseeded.

    Raises:
        ValueError: If ``count`` is less than 1.
    """
    if count < 1:
        raise ValueError('count must be at least 1')
    start = timer()
    # One generator is shared by all iterations; re-seeding per draw would
    # produce the same resample every time.
    rng = None if random_state is None else np.random.RandomState(random_state)
    # Collect in a list: np.append copies the whole array on every call.
    odds_ratios = []
    for _ in range(count):
        resample = data.sample(len(data.index), replace=True, random_state=rng)
        table = pd.crosstab(resample[labels[0]], resample[labels[1]], margins=False)
        odds, _ci_low, _ci_high, _tot = getOddsRatioAndConfidenceInterval(table)
        odds_ratios.append(odds)
    arr = np.sort(np.array(odds_ratios, dtype=float))
    lower = (1-alpha)/2
    upper = alpha+lower
    # Clamp both positions so an alpha at or near 1 cannot index past the end.
    last = len(arr) - 1
    low_idx = min(int(lower * len(arr)), last)
    high_idx = min(int(upper * len(arr)), last)
    # Label the interval with the requested level instead of a fixed 95%.
    print('%g%% CI: %.2f-%.2f' %(alpha * 100, arr[low_idx], arr[high_idx]))
    end = timer()
    print('\nbootstrap time: %.2f' %(end-start))

get_bootstrap_odds_ratio_ci(boot_df, count=NITER)

### Maleness and jumping up:

In [None]:
# Load the jumping response with the dog's sex, dropping rows whose sex is an
# empty string before converting everything to numbers.
fields, labels = 'q02_main_6, dog_sex', ['jumping', 'gender']
df = createStringDataFrame(DOG_TABLE, fields, labels)
df = df.loc[df[labels[1]] != ''].apply(pd.to_numeric)
# Snapshot taken before the binary recoding; the bootstrap recodes each
# resample itself.
boot_df = df.copy()

def gender_to_binary_response(x):
    """Map a sex code to a binary flag.

    Args:
        x: Value convertible with int().

    Returns:
        1 when int(x) equals 1, otherwise 0.
    """
    return 1 if int(x) == 1 else 0

# Recode the first column with convert_to_binary_response and the sex column
# with gender_to_binary_response.
for column, to_binary in ((labels[0], convert_to_binary_response),
                          (labels[1], gender_to_binary_response)):
    df[column] = df[column].apply(lambda value: to_binary(value))

# Chi-squared test of independence (no continuity correction) and odds ratio.
contingency = pd.crosstab(index=df[labels[0]], columns=df[labels[1]])
print('Chi-squared Test of Independence for %s and %s:' % tuple(labels[:2]))
c, p, dof, expected = scs.chi2_contingency(contingency, correction=False)
print('chi2 = %f, p = %.2E, dof = %d' % (c, p, dof))
displayOddsRatio(contingency)

def get_bootstrap_odds_ratio_ci(data, count=10, alpha=0.95, random_state=None):
    """Print a percentile-bootstrap confidence interval for the odds ratio.

    ``data`` is resampled with replacement ``count`` times. In each resample
    the notebook-level ``labels[0]`` column is recoded with
    ``convert_to_binary_response`` and ``labels[1]`` with
    ``gender_to_binary_response``; the two columns are then cross tabulated
    and passed to ``getOddsRatioAndConfidenceInterval``, whose first return
    value is collected as that resample's odds ratio.

    Args:
        data: DataFrame containing the two (not yet recoded) columns named
            by ``labels``.
        count: Number of bootstrap resamples; must be at least 1.
        alpha: Confidence level as a fraction (0.95 gives a 95% interval).
        random_state: Optional seed for reproducible resampling. ``None``
            (the default) leaves the sampling unseeded.

    Raises:
        ValueError: If ``count`` is less than 1.
    """
    if count < 1:
        raise ValueError('count must be at least 1')
    start = timer()
    # One generator is shared by all iterations; re-seeding per draw would
    # produce the same resample every time.
    rng = None if random_state is None else np.random.RandomState(random_state)
    # Collect in a list: np.append copies the whole array on every call.
    odds_ratios = []
    for _ in range(count):
        resample = data.sample(len(data.index), replace=True, random_state=rng)
        resample[labels[0]] = resample[labels[0]].apply(
            lambda x: convert_to_binary_response(x))
        resample[labels[1]] = resample[labels[1]].apply(
            lambda x: gender_to_binary_response(x))
        table = pd.crosstab(resample[labels[0]], resample[labels[1]], margins=False)
        odds, _ci_low, _ci_high, _tot = getOddsRatioAndConfidenceInterval(table)
        odds_ratios.append(odds)
    arr = np.sort(np.array(odds_ratios, dtype=float))
    lower = (1-alpha)/2
    upper = alpha+lower
    # Clamp both positions so an alpha at or near 1 cannot index past the end.
    last = len(arr) - 1
    low_idx = min(int(lower * len(arr)), last)
    high_idx = min(int(upper * len(arr)), last)
    # Label the interval with the requested level instead of a fixed 95%.
    print('%g%% CI: %.2f-%.2f' %(alpha * 100, arr[low_idx], arr[high_idx]))
    end = timer()
    print('\nbootstrap time: %.2f' %(end-start))

get_bootstrap_odds_ratio_ci(boot_df, count=NITER)

### Maleness and mounting/humping:

In [None]:
# Load the mounting response with the dog's sex, dropping rows whose sex is
# an empty string before converting everything to numbers.
fields, labels = 'q02_main_7, dog_sex', ['mounting', 'gender']
df = createStringDataFrame(DOG_TABLE, fields, labels)
df = df.loc[df[labels[1]] != ''].apply(pd.to_numeric)
# Snapshot taken before the binary recoding; the bootstrap recodes each
# resample itself.
boot_df = df.copy()

def gender_to_binary_response(x):
    """Map a sex code to a binary flag.

    Args:
        x: Value convertible with int().

    Returns:
        1 when int(x) equals 1, otherwise 0.
    """
    return 1 if int(x) == 1 else 0

# Recode the first column with convert_to_binary_response and the sex column
# with gender_to_binary_response.
for column, to_binary in ((labels[0], convert_to_binary_response),
                          (labels[1], gender_to_binary_response)):
    df[column] = df[column].apply(lambda value: to_binary(value))

# Chi-squared test of independence (no continuity correction) and odds ratio.
contingency = pd.crosstab(index=df[labels[0]], columns=df[labels[1]])
print('Chi-squared Test of Independence for %s and %s:' % tuple(labels[:2]))
c, p, dof, expected = scs.chi2_contingency(contingency, correction=False)
print('chi2 = %f, p = %.2E, dof = %d' % (c, p, dof))
displayOddsRatio(contingency)

def get_bootstrap_odds_ratio_ci(data, count=10, alpha=0.95, random_state=None):
    """Print a percentile-bootstrap confidence interval for the odds ratio.

    ``data`` is resampled with replacement ``count`` times. In each resample
    the notebook-level ``labels[0]`` column is recoded with
    ``convert_to_binary_response`` and ``labels[1]`` with
    ``gender_to_binary_response``; the two columns are then cross tabulated
    and passed to ``getOddsRatioAndConfidenceInterval``, whose first return
    value is collected as that resample's odds ratio.

    Args:
        data: DataFrame containing the two (not yet recoded) columns named
            by ``labels``.
        count: Number of bootstrap resamples; must be at least 1.
        alpha: Confidence level as a fraction (0.95 gives a 95% interval).
        random_state: Optional seed for reproducible resampling. ``None``
            (the default) leaves the sampling unseeded.

    Raises:
        ValueError: If ``count`` is less than 1.
    """
    if count < 1:
        raise ValueError('count must be at least 1')
    start = timer()
    # One generator is shared by all iterations; re-seeding per draw would
    # produce the same resample every time.
    rng = None if random_state is None else np.random.RandomState(random_state)
    # Collect in a list: np.append copies the whole array on every call.
    odds_ratios = []
    for _ in range(count):
        resample = data.sample(len(data.index), replace=True, random_state=rng)
        resample[labels[0]] = resample[labels[0]].apply(
            lambda x: convert_to_binary_response(x))
        resample[labels[1]] = resample[labels[1]].apply(
            lambda x: gender_to_binary_response(x))
        table = pd.crosstab(resample[labels[0]], resample[labels[1]], margins=False)
        odds, _ci_low, _ci_high, _tot = getOddsRatioAndConfidenceInterval(table)
        odds_ratios.append(odds)
    arr = np.sort(np.array(odds_ratios, dtype=float))
    lower = (1-alpha)/2
    upper = alpha+lower
    # Clamp both positions so an alpha at or near 1 cannot index past the end.
    last = len(arr) - 1
    low_idx = min(int(lower * len(arr)), last)
    high_idx = min(int(upper * len(arr)), last)
    # Label the interval with the requested level instead of a fixed 95%.
    print('%g%% CI: %.2f-%.2f' %(alpha * 100, arr[low_idx], arr[high_idx]))
    end = timer()
    print('\nbootstrap time: %.2f' %(end-start))

get_bootstrap_odds_ratio_ci(boot_df, count=NITER)

### Jumping and overactivity/hyperactivity:

In [None]:
# Load the two response columns and convert them to numbers.
fields, labels = 'q02_main_6, q02_main_13', ['jumping', 'hyperactivity']
df = createStringDataFrame(DOG_TABLE, fields, labels).apply(pd.to_numeric)
# Snapshot for the bootstrap at the end of the cell.
boot_df = df.copy()

# Chi-squared test of independence (no continuity correction) and odds ratio.
contingency = pd.crosstab(index=df[labels[0]], columns=df[labels[1]])
print('Chi-squared Test of Independence for %s and %s:' % tuple(labels[:2]))
c, p, dof, expected = scs.chi2_contingency(contingency, correction=False)
print('chi2 = %f, p = %.2E, dof = %d' % (c, p, dof))
displayOddsRatio(contingency)

def get_bootstrap_odds_ratio_ci(data, count=10, alpha=0.95, random_state=None):
    """Print a percentile-bootstrap confidence interval for the odds ratio.

    ``data`` is resampled with replacement ``count`` times. Each resample is
    cross tabulated on the notebook-level ``labels[0]`` and ``labels[1]``
    columns and passed to ``getOddsRatioAndConfidenceInterval``; the first
    value it returns is collected as that resample's odds ratio.

    Args:
        data: DataFrame containing the two columns named by ``labels``.
        count: Number of bootstrap resamples; must be at least 1.
        alpha: Confidence level as a fraction (0.95 gives a 95% interval).
        random_state: Optional seed for reproducible resampling. ``None``
            (the default) leaves the sampling unseeded.

    Raises:
        ValueError: If ``count`` is less than 1.
    """
    if count < 1:
        raise ValueError('count must be at least 1')
    start = timer()
    # One generator is shared by all iterations; re-seeding per draw would
    # produce the same resample every time.
    rng = None if random_state is None else np.random.RandomState(random_state)
    # Collect in a list: np.append copies the whole array on every call.
    odds_ratios = []
    for _ in range(count):
        resample = data.sample(len(data.index), replace=True, random_state=rng)
        table = pd.crosstab(resample[labels[0]], resample[labels[1]], margins=False)
        odds, _ci_low, _ci_high, _tot = getOddsRatioAndConfidenceInterval(table)
        odds_ratios.append(odds)
    arr = np.sort(np.array(odds_ratios, dtype=float))
    lower = (1-alpha)/2
    upper = alpha+lower
    # Clamp both positions so an alpha at or near 1 cannot index past the end.
    last = len(arr) - 1
    low_idx = min(int(lower * len(arr)), last)
    high_idx = min(int(upper * len(arr)), last)
    # Label the interval with the requested level instead of a fixed 95%.
    print('%g%% CI: %.2f-%.2f' %(alpha * 100, arr[low_idx], arr[high_idx]))
    end = timer()
    print('\nbootstrap time: %.2f' %(end-start))

get_bootstrap_odds_ratio_ci(boot_df, count=NITER)

### Jumping and destruction:

In [None]:
# Load the two response columns and convert them to numbers.
fields, labels = 'q02_main_6, q02_main_9', ['jumping', 'destruction']
df = createStringDataFrame(DOG_TABLE, fields, labels).apply(pd.to_numeric)
# Snapshot for the bootstrap at the end of the cell.
boot_df = df.copy()

# Chi-squared test of independence (no continuity correction) and odds ratio.
contingency = pd.crosstab(index=df[labels[0]], columns=df[labels[1]])
print('Chi-squared Test of Independence for %s and %s:' % tuple(labels[:2]))
c, p, dof, expected = scs.chi2_contingency(contingency, correction=False)
print('chi2 = %f, p = %.2E, dof = %d' % (c, p, dof))
displayOddsRatio(contingency)

def get_bootstrap_odds_ratio_ci(data, count=10, alpha=0.95, random_state=None):
    """Print a percentile-bootstrap confidence interval for the odds ratio.

    ``data`` is resampled with replacement ``count`` times. Each resample is
    cross tabulated on the notebook-level ``labels[0]`` and ``labels[1]``
    columns and passed to ``getOddsRatioAndConfidenceInterval``; the first
    value it returns is collected as that resample's odds ratio.

    Args:
        data: DataFrame containing the two columns named by ``labels``.
        count: Number of bootstrap resamples; must be at least 1.
        alpha: Confidence level as a fraction (0.95 gives a 95% interval).
        random_state: Optional seed for reproducible resampling. ``None``
            (the default) leaves the sampling unseeded.

    Raises:
        ValueError: If ``count`` is less than 1.
    """
    if count < 1:
        raise ValueError('count must be at least 1')
    start = timer()
    # One generator is shared by all iterations; re-seeding per draw would
    # produce the same resample every time.
    rng = None if random_state is None else np.random.RandomState(random_state)
    # Collect in a list: np.append copies the whole array on every call.
    odds_ratios = []
    for _ in range(count):
        resample = data.sample(len(data.index), replace=True, random_state=rng)
        table = pd.crosstab(resample[labels[0]], resample[labels[1]], margins=False)
        odds, _ci_low, _ci_high, _tot = getOddsRatioAndConfidenceInterval(table)
        odds_ratios.append(odds)
    arr = np.sort(np.array(odds_ratios, dtype=float))
    lower = (1-alpha)/2
    upper = alpha+lower
    # Clamp both positions so an alpha at or near 1 cannot index past the end.
    last = len(arr) - 1
    low_idx = min(int(lower * len(arr)), last)
    high_idx = min(int(upper * len(arr)), last)
    # Label the interval with the requested level instead of a fixed 95%.
    print('%g%% CI: %.2f-%.2f' %(alpha * 100, arr[low_idx], arr[high_idx]))
    end = timer()
    print('\nbootstrap time: %.2f' %(end-start))

get_bootstrap_odds_ratio_ci(boot_df, count=NITER)

### Mounting/humping and overactivity/hyperactivity:

In [None]:
# Load the two response columns and convert them to numbers.
fields, labels = 'q02_main_7, q02_main_13', ['mounting', 'hyperactivity']
df = createStringDataFrame(DOG_TABLE, fields, labels).apply(pd.to_numeric)
# Snapshot for the bootstrap at the end of the cell.
boot_df = df.copy()

# Chi-squared test of independence (no continuity correction) and odds ratio.
contingency = pd.crosstab(index=df[labels[0]], columns=df[labels[1]])
print('Chi-squared Test of Independence for %s and %s:' % tuple(labels[:2]))
c, p, dof, expected = scs.chi2_contingency(contingency, correction=False)
print('chi2 = %f, p = %.2E, dof = %d' % (c, p, dof))
displayOddsRatio(contingency)

def get_bootstrap_odds_ratio_ci(data, count=10, alpha=0.95, random_state=None):
    """Print a percentile-bootstrap confidence interval for the odds ratio.

    ``data`` is resampled with replacement ``count`` times. Each resample is
    cross tabulated on the notebook-level ``labels[0]`` and ``labels[1]``
    columns and passed to ``getOddsRatioAndConfidenceInterval``; the first
    value it returns is collected as that resample's odds ratio.

    Args:
        data: DataFrame containing the two columns named by ``labels``.
        count: Number of bootstrap resamples; must be at least 1.
        alpha: Confidence level as a fraction (0.95 gives a 95% interval).
        random_state: Optional seed for reproducible resampling. ``None``
            (the default) leaves the sampling unseeded.

    Raises:
        ValueError: If ``count`` is less than 1.
    """
    if count < 1:
        raise ValueError('count must be at least 1')
    start = timer()
    # One generator is shared by all iterations; re-seeding per draw would
    # produce the same resample every time.
    rng = None if random_state is None else np.random.RandomState(random_state)
    # Collect in a list: np.append copies the whole array on every call.
    odds_ratios = []
    for _ in range(count):
        resample = data.sample(len(data.index), replace=True, random_state=rng)
        table = pd.crosstab(resample[labels[0]], resample[labels[1]], margins=False)
        odds, _ci_low, _ci_high, _tot = getOddsRatioAndConfidenceInterval(table)
        odds_ratios.append(odds)
    arr = np.sort(np.array(odds_ratios, dtype=float))
    lower = (1-alpha)/2
    upper = alpha+lower
    # Clamp both positions so an alpha at or near 1 cannot index past the end.
    last = len(arr) - 1
    low_idx = min(int(lower * len(arr)), last)
    high_idx = min(int(upper * len(arr)), last)
    # Label the interval with the requested level instead of a fixed 95%.
    print('%g%% CI: %.2f-%.2f' %(alpha * 100, arr[low_idx], arr[high_idx]))
    end = timer()
    print('\nbootstrap time: %.2f' %(end-start))

get_bootstrap_odds_ratio_ci(boot_df, count=NITER)

## <center>Rolling in Repulsive Materials</center>

### Rolling in repulsive materials and coprophagia:

In [None]:
# Load the two response columns and convert them to numbers.
fields, labels = 'q02_main_8, q02_main_10', ['coprophagia', 'rolling']
df = createStringDataFrame(DOG_TABLE, fields, labels).apply(pd.to_numeric)

# Chi-squared test of independence (no continuity correction) and odds ratio.
contingency = pd.crosstab(index=df[labels[0]], columns=df[labels[1]])
print('Chi-squared Test of Independence for %s and %s:' % tuple(labels[:2]))
c, p, dof, expected = scs.chi2_contingency(contingency, correction=False)
print('chi2 = %f, p = %.2E, dof = %d' % (c, p, dof))
displayOddsRatio(contingency)

## <center>Owner-directed Aggression</center>

### Owner-directed aggression and maleness:

In [None]:
# Load the owner-directed aggression response with the dog's sex, dropping
# rows whose sex is an empty string before converting everything to numbers.
fields, labels = 'q03_main_1, dog_sex', ['owner-directed', 'gender']
df = createStringDataFrame(DOG_TABLE, fields, labels)
df = df.loc[df[labels[1]] != ''].apply(pd.to_numeric)

def gender_to_binary_response(x):
    """Map a sex code to a binary flag.

    Args:
        x: Value convertible with int().

    Returns:
        1 when int(x) equals 1, otherwise 0.
    """
    return 1 if int(x) == 1 else 0

# Recode the first column with convert_to_binary_response and the sex column
# with gender_to_binary_response.
for column, to_binary in ((labels[0], convert_to_binary_response),
                          (labels[1], gender_to_binary_response)):
    df[column] = df[column].apply(lambda value: to_binary(value))

# Chi-squared test of independence (no continuity correction) and odds ratio.
contingency = pd.crosstab(index=df[labels[0]], columns=df[labels[1]])
print('Chi-squared Test of Independence for %s and %s:' % tuple(labels[:2]))
c, p, dof, expected = scs.chi2_contingency(contingency, correction=False)
print('chi2 = %f, p = %.2E, dof = %d' % (c, p, dof))
displayOddsRatio(contingency)

## <center>Running Away/Escaping</center>

### Impact of fear/anxiety on running away/escaping:

In [None]:
# Load the two response columns as numbers.
fields, labels = 'q02_main_12, q02_main_2', ['escape', 'fear']
df = createNumericDataFrame(DOG_TABLE, fields, labels)

# Chi-squared test of independence (no continuity correction) and odds ratio.
contingency = pd.crosstab(index=df[labels[0]], columns=df[labels[1]])
print('Chi-squared Test of Independence for %s and %s:' % tuple(labels[:2]))
c, p, dof, expected = scs.chi2_contingency(contingency, correction=False)
print('chi2 = %f, p = %.2E, dof = %d' % (c, p, dof))
displayOddsRatio(contingency)

### Distribution of fear/anxiety problems associated with running away/escaping:

In [None]:
# Fields and labels registered for FR.categories[1], plus the q02_main_12
# response used to select the dogs.
fields = ', '.join(FR.fields[FR.categories[1]]) + ', q02_main_12'
labels = list(FR.labels[FR.categories[1]].values()) + ['escape']
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)
# Keep only dogs with escape == 1; the selector column is then dropped.
df = df[df['escape'] == 1].drop(columns='escape')

# Per-problem counts, largest first.
sums = df.sum().sort_values(ascending=False)

# Share of the selected dogs showing each problem, as whole percentages.
sub_prevalence = lambda x: (x / len(df.index)) * 100
prevalences = sums.apply(sub_prevalence).round().astype(int)

# Assemble and show the summary table.
table_columns = {'Count':sums.values,
                 'Prevalence (%)': prevalences.values.round(2)}
df = pd.DataFrame(index=sums.index, data=table_columns)
df.columns.name = 'Category'
display(df)

### Running away/escaping confinement and fear/anxiety:

In [None]:
# Load the two response columns as numbers.
fields, labels = 'q14_conf, q02_main_2', ['escape_conf', 'fear']
df = createNumericDataFrame(DOG_TABLE, fields, labels)

# Chi-squared test of independence (no continuity correction) and odds ratio.
contingency = pd.crosstab(index=df[labels[0]], columns=df[labels[1]])
print('Chi-squared Test of Independence for %s and %s:' % tuple(labels[:2]))
c, p, dof, expected = scs.chi2_contingency(contingency, correction=False)
print('chi2 = %f, p = %.2E, dof = %d' % (c, p, dof))
displayOddsRatio(contingency)

### Impact of gender and mounting/humping on number of behavior problems:

In [None]:
# Load the behavior-problem score with sex and mounting response, dropping
# rows whose sex is an empty string before converting to numbers.
fields = 'q02_score, dog_sex, q02_main_7'
labels = ['behavior problems', 'gender', 'mounting']
df = createStringDataFrame(DOG_TABLE, fields, labels)
df = df[df[labels[1]] != '']
df = df.apply(pd.to_numeric)

# Sex code 1 with mounting == 0.
print('Males [w/o mnt]:')
mdf = df[(df['gender'] == 1)].drop(columns=['gender'])
mdf2 = mdf[(mdf['mounting'] == 0)].drop(columns=['mounting'])
displaySeriesMean(mdf2[labels[0]], 'behavior problems')

# Sex code 2 with mounting == 0.
print('\nFemales [w/o mnt]:')
fdf = df[(df['gender'] == 2)].drop(columns=['gender'])
fdf2 = fdf[(fdf['mounting'] == 0)].drop(columns=['mounting'])
displaySeriesMean(fdf2[labels[0]], 'behavior problems')

# Unequal-variance t-test on the two score columns. Passing Series (not the
# one-column DataFrames) makes the statistic and p-value scalars, so the
# format string no longer depends on implicit 1-element array conversion.
tval, pval = ttest_ind(mdf2[labels[0]], fdf2[labels[0]], equal_var=False)
tot = mdf2[labels[0]].count() + fdf2[labels[0]].count()
# NOTE(review): the value printed inside t(...) is the combined sample size,
# not the test's degrees of freedom - confirm this is the intended report.
print('\nt(%d) = %.2f, p = %.2E' %(tot, round(tval, 2), pval))