# <center>Demographics and Comorbidity of Behavior Problems in Dogs</center>
<center>Ian R. Dinwoodie, Vivian Zottola, Barbara Dwyer, Donna Gleason, Nicholas H. Dodman</center>

In [None]:
import sqlite3
import pandas as pd
import plotly.offline as py
import plotly.graph_objs as go
from matplotlib_venn import venn2, venn3
import scipy.stats as scs
import textwrap
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from  itertools import combinations
import os
from matplotlib.colors import ListedColormap
from matplotlib import ticker

# IPython magics for this notebook.
%matplotlib inline

# Initiate plotly IPython notebook mode.
py.init_notebook_mode(connected=True)

# Use latex font for matplotlib
#from matplotlib import rc
#rc('font',**{'family':'sans-serif','sans-serif':['Helvetica']})
## for Palatino and other serif fonts use:
#rc('font',**{'family':'serif','serif':['Palatino']})
#rc('text', usetex=True)
plt.rc('text', usetex=True)
plt.rc('font', family='serif')

In [None]:
class FieldRegistry:
    """Lookup of survey database fields and display labels, grouped by category.

    Both attributes are plain dictionaries keyed by category name:
    ``fields`` maps a category to the field list it was registered with and
    ``labels`` maps a category to an ordered ``{code: label}`` dictionary.
    """

    def __init__(self):
        self.fields = {}
        self.labels = {}

    def addToRegistry(self, index, labels, fields, category):
        """Register the labels and fields belonging to one category.

        Each label is stored under a code made of ``index`` followed by the
        label's 1-based position, zero-padded to two digits (e.g. ``A01``).
        Registering a ``category`` a second time replaces its earlier entries.
        """
        self.labels[category] = {
            '{}{:02}'.format(index, position): label
            for position, label in enumerate(labels, 1)
        }
        self.fields[category] = fields
        
# Single registry instance shared by the analysis cells below.
FR = FieldRegistry()

# Behavior problem categories; the position of each name is the index used
# when the category is registered and looked up (CATEGORY[0], CATEGORY[1], ...).
CATEGORY = [
    'Aggression',
    'Fear/Anxiety',
    'Compulsion',
    'House Soiling',
    'Excessive Barking',
    'Jumping',
    'Mounting/Humping',
    'Consuming Feces',
    'Destructive Behavior',
    'Rolling in Repulsive Material',
    'Running Away/Escaping',
    'Overactivity/Hyperactivity',
]
        
# Aggression: one label/field pair per target of the aggression.
# The labels are displayed in the result tables, so the misspelling
# "Stangers" has been corrected to "Strangers".
labels = ['Familiar people in the home', 'Strangers visiting the home',
          'Strangers away from the home', 'Another dog in the home',
          'Unfamiliar dogs visiting the home', 'Unfamiliar dogs on walks (off lead)',
          'Unfamiliar dogs on walks (on lead)', 'Veterinarians', 'Trainers', 'Groomers',
          'Animals other than dogs in the home']
# Fields q03_main_1 .. q03_main_11, in the same order as the labels.
fields = ['q03_main_{}'.format(number) for number in range(1, 12)]
FR.addToRegistry('A', labels, fields, CATEGORY[0])

# Fear/anxiety: fields q04_1 .. q04_11, one per label.
labels = [
    'Thunderstorm phobia', 'Noise phobia', 'Crowd phobia',
    'Phobia of other dogs', 'PTSD', 'Generalized anxiety',
    'Situational anxiety', 'Veterinarian phobia', 'Separation anxiety',
    'Travel anxiety', 'Other',
]
fields = ['q04_{}'.format(number) for number in range(1, 12)]
FR.addToRegistry('B', labels, fields, CATEGORY[1])

# Compulsion: fields q05_main_1 .. q05_main_13, one per label.
labels = [
    'Spinning', 'Tail chasing', 'Shadow/light chasing',
    'Running in geometric patterns', 'Licking of wrist/hock', 'Fly snapping',
    'Sucking flank region/blankets', 'Tennis ball fetish',
    'Collecting/arranging objects', 'Nail biting', 'Digging in yard',
    'Stone/rock chewing', 'Other',
]
fields = ['q05_main_{}'.format(number) for number in range(1, 14)]
FR.addToRegistry('C', labels, fields, CATEGORY[2])

# House soiling: three soil types, two locations, then three situations.
labels = [
    'Urine', 'Feces', 'Urine and feces',
    'Specific locations', 'Anywhere',
    'Owner present', 'Owner away', 'Excited/overwhelmed',
]
fields = (['q06_soil_type_{}'.format(number) for number in range(1, 4)]
          + ['q06_soil_location_{}'.format(number) for number in range(1, 3)]
          + ['q06_situation_{}'.format(number) for number in range(1, 4)])
FR.addToRegistry('D', labels, fields, CATEGORY[3])

# Excessive barking: one label/field pair per situation.
# The label is displayed in the result tables, so the misspelling
# "tiggers" has been corrected to "triggers".
labels = ['Owner present', 'Owner away', 'To get attention', 'At triggers (inside)',
          'At triggers (outside)', 'During car rides']
# NOTE(review): the "sitatuon" spelling is kept on purpose; it presumably
# mirrors the database column names - confirm before renaming.
fields = ['q07_sitatuon_{}'.format(number) for number in range(1, 7)]
FR.addToRegistry('E', labels, fields, CATEGORY[4])

# Jumping: who the dog jumps on.
labels = ['Owner', 'Family members', 'Strangers']
fields = ['q08_who_{}'.format(number) for number in range(1, 4)]
FR.addToRegistry('F', labels, fields, CATEGORY[5])

# Mounting/humping: what the dog mounts.
labels = ['People', 'Familiar dogs', 'Unfamiliar dogs', 'Inanimate objects']
fields = ['q09_main_{}'.format(number) for number in range(1, 5)]
FR.addToRegistry('G', labels, fields, CATEGORY[6])

# Consuming feces: whose feces are consumed.
labels = ['Their own', "Other dogs'", "Other species'"]
fields = ['q10_main_{}'.format(number) for number in range(1, 4)]
FR.addToRegistry('H', labels, fields, CATEGORY[7])

# Destructive behavior: owner home versus away.
labels = ['Owner is home', 'Owner is away']
fields = ['q11_situation_{}'.format(number) for number in range(1, 3)]
FR.addToRegistry('I', labels, fields, CATEGORY[8])

# Rolling in repulsive materials: what the dog rolls in.
labels = ['Urine', 'Feces', 'Dead Stuff', 'Garbage']
fields = ['q12_main_{}'.format(number) for number in range(1, 5)]
FR.addToRegistry('J', labels, fields, CATEGORY[9])

# Running away/escaping: these fields are named by suffix, not numbered.
labels = [
    'Escapes when out',
    'Escapes from home',
    'Escapes from confinement',
    'Returns home after escape',
]
fields = ['q14_{}'.format(suffix) for suffix in ('out', 'house', 'conf', 'return')]
FR.addToRegistry('K', labels, fields, CATEGORY[10])

# Overactivity/hyperactivity: observed symptoms.
labels = [
    'Constant moving/jumping',
    'Difficulty settling',
    'Highly distractible',
    'Impulsive',
]
fields = ['q15_main_{}'.format(number) for number in range(1, 5)]
FR.addToRegistry('L', labels, fields, CATEGORY[11])

In [None]:
# Data Globals
# Default p-value threshold used by checkSignificance().
SIG_P = 0.01

# Population counters. They start at zero and are overwritten by the
# demographic cells further down the notebook.
TOTAL_USERS = REMAINING_USERS = 0
TOTAL_DOGS = REMAINING_DOGS = 0


def PREVALENCE(x):
    """Return ``x`` as a percentage of REMAINING_DOGS (looked up at call time)."""
    return (x / REMAINING_DOGS) * 100


# Placeholders for the pairwise matrices; both start out as empty frames.
CATEGORY_MATRIX = pd.DataFrame()
QUESTION_MATRIX = pd.DataFrame()

# Database Globals
# Names of the two tables queried by this notebook.
USER_TABLE = 'users'
DOG_TABLE = 'dogs'
# SQL fragment that createStringDataFrame() appends after "JOIN <other table>":
# it joins the two tables on record_id and keeps rows where
# question_reason_for_part_3 is 0, or where it is 1 but q01_main is not 1.
# NOTE(review): reason 3 appears to be the "suspicion of behavior problems"
# enrollment motive and q01_main the owner's "has ever had a behavior problem"
# answer - confirm against the survey definition.
BIAS_FILTER = '''
    USING (record_id)
    WHERE question_reason_for_part_3 = 0
    OR (question_reason_for_part_3 = 1 AND q01_main != 1)'''
# Shared SQLite connection used by every query; the path is relative to the
# directory the notebook is run from.
CON = sqlite3.connect('../data/processed/processed.db')

In [None]:
def createStringDataFrame(table, fields, labels, filtered=True):
    """Select ``fields`` from ``table`` and return the rows as a DataFrame.

    ``fields`` is a ready-made SQL column list. When ``filtered`` is true the
    query joins the users table (if ``table`` is the dogs table) or otherwise
    the dogs table, and appends BIAS_FILTER. The result columns are renamed
    to ``labels``; no numeric conversion is applied here.
    """
    parts = ['SELECT ', fields, ' FROM ', table]
    if filtered:
        other_table = USER_TABLE if table == DOG_TABLE else DOG_TABLE
        parts += [' JOIN ', other_table, ' ', BIAS_FILTER]
    frame = pd.read_sql_query(''.join(parts), CON)
    frame.columns = labels
    return frame

def convertToNumeric(df):
    """Return a new frame with every column passed through ``pd.to_numeric``.

    Values that cannot be parsed as numbers are coerced to NaN.
    """
    return df.apply(pd.to_numeric, errors='coerce')

def createNumericDataFrame(table, fields, labels, filtered=True):
    """Run the same query as createStringDataFrame() and coerce the result to numbers."""
    return convertToNumeric(createStringDataFrame(table, fields, labels, filtered))

def replaceFields(df, column, replacement_dict):
    """Replace values of ``df[column]`` according to ``replacement_dict``.

    ``df`` is modified in place and nothing is returned. The replaced column
    is assigned back to the frame rather than calling ``replace(inplace=True)``
    on ``df[column]``: that chained in-place form acts on an intermediate
    object and no longer updates ``df`` when pandas Copy-on-Write is active.
    """
    df[column] = df[column].replace(replacement_dict)

def getValueCountAndPrevalence(df, field):
    """Tabulate how often each value of ``df[field]`` occurs.

    Returns a frame indexed by value with a ``frequency`` column (the count)
    and a ``prevalence`` column (the count passed through PREVALENCE).
    """
    counts = df[field].value_counts()
    table = pd.concat([counts, counts.apply(PREVALENCE)], axis=1)
    table.columns = ['frequency', 'prevalence']
    return table

def checkSignificance(p, sig=SIG_P):
    """Print a notice when ``p`` is strictly below the threshold ``sig``."""
    if not p < sig:
        return
    print('The resulting p-value is below the set significance threshold (%.2f).' % sig)
        
def exportTable(data, title):
    """Write ``data`` as a standalone LaTeX document.

    The file is saved as ``../reports/tables/<title>.tex`` relative to the
    working directory; the target directory must already exist.

    ``data`` must provide a ``to_latex()`` method (e.g. a pandas DataFrame).
    The table is rendered from the ``data`` argument itself; previously the
    function ignored its argument and wrote whatever global ``df`` happened to
    hold at call time.
    """
    file_ = os.path.join('..', 'reports', 'tables', title) + '.tex'
    with open(file_, 'w') as tf:
        # Minimal standalone wrapper so the table compiles on its own.
        tf.write(r'\documentclass[varwidth=\maxdimen]{standalone}\usepackage{booktabs}\begin{document}')
        tf.write(data.to_latex())
        tf.write(r'\end{document}')
        
def exportFigure(figure, title):
    """Tighten the layout of ``figure`` and save it as a 500 dpi PNG.

    The image is written to ``../reports/figures/<title>.png`` relative to the
    working directory. ``figure`` only needs ``tight_layout()`` and
    ``savefig()``.
    """
    destination = '{}.png'.format(os.path.join('..', 'reports', 'figures', title))
    figure.tight_layout()
    figure.savefig(destination, dpi=500)

def createCategoryMatrix():
    """Return pairwise chi-squared p-values between the behavior categories.

    One ``q02_main_<n>`` field is queried per registered category (number 11
    is skipped) and labelled with the category's letter code. Every pair of
    columns is cross-tabulated and tested with an uncorrected chi-squared test
    of independence. The result is a symmetric DataFrame of p-values, sorted
    by index; no value is computed for a category against itself.
    """
    fields = []
    labels = []
    counter = 1
    for subdict in FR.labels.values():
        first_key = next(iter(subdict), None)
        if first_key is not None:
            if counter == 11:
                counter += 1
            fields.append('q02_main_{}'.format(counter))
            # The leading character of a label code is the category letter.
            labels.append(first_key[0])
        counter += 1
    df = createNumericDataFrame(DOG_TABLE, ', '.join(fields), labels, filtered=True)

    pvalue = {col: {} for col in df}
    for left, right in combinations(df.columns, 2):
        contingency = pd.crosstab(df[left], df[right])
        _, p, _, _ = scs.chi2_contingency(contingency, correction=False)
        # Store both orientations so the matrix is symmetric.
        pvalue[left][right] = p
        pvalue[right][left] = p
    return pd.DataFrame(pvalue).sort_index(ascending=True)

def createQuestionMatrix():
    """Return pairwise chi-squared p-values between all registered questions.

    Every field in the registry is queried and labelled with its registry code
    (e.g. ``A01``). Every pair of columns is cross-tabulated and tested with an
    uncorrected chi-squared test of independence. The result is a symmetric
    DataFrame of p-values, sorted by index; no value is computed for a
    question against itself.
    """
    fields = ', '.join(
        str(field) for sublist in FR.fields.values() for field in sublist)
    labels = [key for subdict in FR.labels.values() for key in subdict]
    df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)

    pvalue = {col: {} for col in df}
    for left, right in combinations(df.columns, 2):
        contingency = pd.crosstab(df[left], df[right])
        _, p, _, _ = scs.chi2_contingency(contingency, correction=False)
        # Store both orientations so the matrix is symmetric.
        pvalue[left][right] = p
        pvalue[right][left] = p
    return pd.DataFrame(pvalue).sort_index(ascending=True)

def createCorrelationMatrix():
    """Return the correlation matrix (``DataFrame.corr``) of the category flags.

    One ``q02_main_<n>`` field is queried per registered category (number 11
    is skipped) and labelled with the category's letter code.
    """
    fields = []
    labels = []
    counter = 1
    for subdict in FR.labels.values():
        first_key = next(iter(subdict), None)
        if first_key is not None:
            if counter == 11:
                counter += 1
            fields.append('q02_main_{}'.format(counter))
            # The leading character of a label code is the category letter.
            labels.append(first_key[0])
        counter += 1
    flags = createNumericDataFrame(DOG_TABLE, ', '.join(fields), labels, filtered=True)
    return flags.corr()

def get_significance_category(p):
    """Bucket a p-value into a significance category.

    Returns ``p`` unchanged when it is NaN, ``-1`` when ``p`` is above 1e-3,
    ``0`` when it is in the interval (1e-6, 1e-3], and ``1`` otherwise.
    """
    if np.isnan(p):
        return p
    if p > 10 ** (-3):
        return -1
    if p > 10 ** (-6):
        return 0
    return 1

## 1. Participant Demographics

### Number of participants:

In [None]:
# Count every row of the users table; the bias filter is deliberately off.
df = createNumericDataFrame(USER_TABLE, 'COUNT(*)', ['count'], filtered=False)

# Publish the count through the notebook-wide global.
TOTAL_USERS = df.loc[0, 'count']
print('Total number of participants: %d' % TOTAL_USERS)

### Motivation for enrollment:

In [None]:
# Query the five enrollment-reason flags for every user (unfiltered).
fields = ', '.join('question_reason_for_part_{}'.format(number)
                   for number in range(1, 6))
labels = ['Love for dogs', 'You help shelter animals', 'Suspicion of behavior problems',
          'Work with animals', 'Other']
df = createNumericDataFrame(USER_TABLE, fields, labels, filtered=False)

# Total each reason and list the most common first.
s = df.sum(0, skipna=False).sort_values(ascending=False)
df = pd.DataFrame({'Count': s.values}, index=s.index)
df.columns.name = 'Motivation'
display(df)
# NOTE: This does not include a count of users who did not provide a reason.

### Suspicion of behavior problems as sole motivating factor:

In [None]:
# Query the five enrollment-reason flags for every user (unfiltered).
fields = ('question_reason_for_part_1, question_reason_for_part_2, '
          'question_reason_for_part_3, question_reason_for_part_4, '
          'question_reason_for_part_5')
labels = ['love for dogs', 'you help shelter animals', 'suspicion of behavior problems',
          'work with animals', 'other']
df = createNumericDataFrame(USER_TABLE, fields, labels, filtered=False)

# Keep the users who selected "suspicion of behavior problems" and add the
# number of reasons each of them selected. assign() returns a new frame, which
# avoids setting a column on a filtered slice (SettingWithCopyWarning).
df = df[df[labels[2]] == 1].assign(sum=lambda frame: frame.sum(axis=1))

# A row total of 1 means suspicion was the only reason given.
df = df[df['sum'] == 1]
s = df.sum(0, skipna=False)

print('Number of users who reported suspicion of behavior problems as their sole motivation: %d' %(s.iloc[2]))

## 2. Participating Dog Demographics

### Number of participating dogs:

In [None]:
# Count every row of the dogs table; the bias filter is deliberately off.
df = createNumericDataFrame(DOG_TABLE, 'COUNT(*)', ['count'], filtered=False)

# Publish the count through the notebook-wide global.
TOTAL_DOGS = df.loc[0, 'count']
print('Total number of participating dogs: %d' % TOTAL_DOGS)

### Adjusting sample for bias:

It is in this section that we define and apply a filter to remove bias from the data set.
 
The current bias filter focuses largely on the participants' motivation for enrollment ("Why did you decide to participate in this survey?", pg. 1). The answer choice "my dog may have a behavior problem" was included to identify participants who enrolled with motivational bias. However, participants may not have a motivational bias towards all of their participating dogs. To account for this, the question "do you believe \[dog name\] has ever had a behavior problem" (pg. 4) was used in conjunction with the enrollment motivation to identify the dogs most likely to be influenced by the bias of the participating owner. 

In [None]:
fields = 'q02_score'
labels = ['Score']

# Gross sample: every dog, no bias filter. A non-zero score counts as a dog
# with a reported behavior problem.
df_gross = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=False)
cnt_total_dogs_w_problems = int((df_gross[labels[0]] != 0).sum())

# Adjusted sample: the same query with the bias filter applied.
df_adjusted_dogs = createNumericDataFrame(DOG_TABLE, fields, labels)
REMAINING_DOGS = len(df_adjusted_dogs.index)
cnt_total_dogs_w_problems_adjusted = int((df_adjusted_dogs[labels[0]] != 0).sum())

# Owners that still have at least one dog after filtering.
df_adjusted_users = createNumericDataFrame(USER_TABLE, 'COUNT(DISTINCT email)', ['count'])
REMAINING_USERS = df_adjusted_users['count'][0]

# Display the count results.
print('Pre-adjustment Responses (Gross):',
      'Total number of owners: %d' % TOTAL_USERS,
      'Total number of dogs: %d' % TOTAL_DOGS,
      'Total number of dogs with a reported behavior problem: %d' % cnt_total_dogs_w_problems,
      sep='\n')

print('\nPost-adjustment Responses (Adjusted):',
      'Total number of owners: %d' % REMAINING_USERS,
      'Total number of dogs: %d' % REMAINING_DOGS,
      'Total number of dogs with a reported behavior problem: %d'
      % cnt_total_dogs_w_problems_adjusted,
      sep='\n')

### Dogs per household:

In [None]:
fields = 'record_id'
labels = ['record index']
df = createStringDataFrame(DOG_TABLE, fields, labels)

# Tally how many dogs share each record id.
# NOTE(review): a record id presumably identifies one owner/household - confirm.
record_dict = {}
for key in df.iloc[:, 0]:
    record_dict[key] = record_dict.get(key, 0) + 1

# Summarize the number of dogs per record.
s = pd.Series(record_dict, name='dogs')
df = pd.DataFrame({'dogs': s.values}, index=s.index)
display(df.describe().round(2))

### Prevalence of overall behavior problems:

Prevalence is defined as the proportion of a population found to be affected by a medical condition. This section aims to calculate the aggregate prevalence of the most common behavior problems in a given canine population.

In [None]:
# Gross prevalence uses the unfiltered totals; the adjusted prevalence goes
# through PREVALENCE, which divides by the filtered dog count.
prevalence_gross = (cnt_total_dogs_w_problems / TOTAL_DOGS) * 100
prevalence_adjusted = PREVALENCE(cnt_total_dogs_w_problems_adjusted)

# Shift caused by the adjustment; only a positive shift gets an explicit sign.
diff_prevalence = prevalence_adjusted - prevalence_gross
if diff_prevalence > 0:
    sign = '+'
else:
    sign = ''

# Display the prevalence results.
print('Pre-adjustment prevalence: %.2f%% (%d/%d dogs)'
      % (prevalence_gross, cnt_total_dogs_w_problems, TOTAL_DOGS))
print('Post-adjustment prevalence: %.2f%% (%d/%d dogs)'
      % (prevalence_adjusted, cnt_total_dogs_w_problems_adjusted, REMAINING_DOGS))
print('Shift in prevalence as a result of adjustment: %s%.2f%%'
      % (sign, diff_prevalence))

### Number of behavior problems per dog:

In [None]:
# Summary statistics of the per-dog score in the adjusted sample.
fields = 'q02_score'
labels = ['number of behavior problems']
df = createNumericDataFrame(DOG_TABLE, fields, labels)
display(df.describe())

# Frequency of each score. pd.Series.value_counts replaces the top-level
# pd.value_counts helper, which pandas has deprecated.
df = df.apply(pd.Series.value_counts)
df = df.rename(index=str, columns={labels[0]: "frequency"})
df.columns.name = labels[0]
display(df)

### Number of dogs per source of origin:

In [None]:
# Construct the dataframe.
fields = 'acquisition_source'
labels = ['origin']
df = createStringDataFrame(DOG_TABLE, fields, labels)
# Rescue fields (online: 1, in-person: 4) were combined in the database.
replacements = {'':'no response', '1': 'rescue', '2': 'online (non-rescue)', '3': 'pet store', '5': 'breeder',
                '6': 'self-bred', '7': 'friends/family', '8': 'found', '9': 'other'}
replaceFields(df, labels[0], replacements)
df = df.apply(pd.value_counts)
df.rename(index=str, columns={labels[0]: "frequency"}, inplace=True)
df.columns.name = labels[0]
display(df)

### Age at date of response:

In [None]:
# Summary statistics of the dogs' age in months (adjusted sample).
labels = ['age (months)']
fields = 'dog_age_today_months'
df = createNumericDataFrame(DOG_TABLE, fields, labels)
display(df.describe())

### Age at neutering:

In [None]:
# Summary statistics of the dog_sex_month field, reported as an age in months
# (adjusted sample).
labels = ['age (months)']
fields = 'dog_sex_month'
df = createNumericDataFrame(DOG_TABLE, fields, labels)
display(df.describe())

### Age at onset:

In [None]:
# Summary statistics of the q01_age_months field (adjusted sample).
labels = ['age (months)']
fields = 'q01_age_months'
df = createNumericDataFrame(DOG_TABLE, fields, labels)
display(df.describe())

# Count the dogs whose q01_acq flag equals 1.
labels = ['evident when acquired']
fields = 'q01_acq'
df = createNumericDataFrame(DOG_TABLE, fields, labels)
cnt_evident_when_acquired = int((df[labels[0]] == 1).sum())
print('Number of users who reported that the behavior problem was evident when acquired: %d'
      % cnt_evident_when_acquired)

### Gender:

In [None]:
# Construct the dataframe.
fields = 'dog_sex'
labels = ['gender']
df = createStringDataFrame(DOG_TABLE, fields, labels)
replacements = {'':'no response', '1':'male', '2':'female'}
replaceFields(df, labels[0], replacements)
df = df.apply(pd.value_counts)
df.rename(index=str, columns={labels[0]: "frequency"}, inplace=True)
df.columns.name = labels[0]
display(df)

### Neutered status:

In [None]:
# status: unverified
# Construct the dataframe.
fields = 'dog_sex, dog_spayed'
labels = ['Gender', 'Neutered']
df = createStringDataFrame(DOG_TABLE, fields, labels)

# Translate the stored codes of each column into readable names.
code_names = (
    {'':'No response', '1':'Male', '2':'Female'},
    {'':'No response', '0':'No', '1':'Yes', '2':"I don't know"},
)
for column, replacements in zip(labels, code_names):
    replaceFields(df, column, replacements)

# Cross-tabulate gender against neutered status, including totals.
display(pd.crosstab(df[labels[0]], df[labels[1]], margins=True))

### Number of purebred dogs:

In [None]:
# status: verified
fields = 'purebred'
labels = ['purebred']
df = createStringDataFrame(DOG_TABLE, fields, labels)

# Translate the stored codes into readable names.
# (A stray df.head() call that produced no output was removed here.)
replacements = {'':'No response', '0':'No', '1':'Yes'}
replaceFields(df, labels[0], replacements)

# Frequency of each answer. pd.Series.value_counts replaces the top-level
# pd.value_counts helper, which pandas has deprecated.
df = df.apply(pd.Series.value_counts)
df = df.rename(index=str, columns={labels[0]: "frequency"})
df.columns.name = labels[0]
display(df)

### Number of purebred dogs without a breed designated:

In [None]:
# status: verified
# Create the necessary dataframe.
fields = 'purebred_breed, purebred'
labels = ['breed', 'purebred']
df = createStringDataFrame(DOG_TABLE, fields, labels)

# Keep the breed column of purebred dogs whose breed was left empty. Selecting
# with .loc builds a new frame instead of dropping a column in place on a
# filtered slice.
df = df.loc[(df[labels[1]] == '1') & (df[labels[0]] == ''), [labels[0]]]

# Count the remaining rows directly; the previous describe().iloc[0][0] relied
# on positional [] access to a label-indexed Series, which pandas has deprecated.
purebred_missing_breed = len(df.index)
print('Number of purebreds without breed designated: %d dogs' %(purebred_missing_breed))


### Number of purebred breeds:

In [None]:
# status: verified
# Create the necessary dataframe.
fields = 'purebred_breed, purebred'
labels = ['breed', 'purebred']
df = createStringDataFrame(DOG_TABLE, fields, labels)

# Keep only the breed column of purebred dogs that have a breed filled in.
has_breed = (df[labels[1]] == '1') & (df[labels[0]] != '')
df = df.loc[has_breed, [labels[0]]]
display(df.describe())

### Numbers of dogs per purebred breed:

In [None]:
# status: verified
# Create the necessary dataframe.
fields = 'purebred_breed, purebred'
labels = ['breed', 'purebred']
df = createStringDataFrame(DOG_TABLE, fields, labels)

# Keep only the breed column of purebred dogs that have a breed filled in.
has_breed = (df[labels[1]] == '1') & (df[labels[0]] != '')
df = df.loc[has_breed, [labels[0]]]

# Count the dogs per breed, add the prevalence, and show the first 30 rows.
df = getValueCountAndPrevalence(df, labels[0]).round(2)
df.columns.name = labels[0]
display(df[:30])

## 3. Prevalence of Individual Behavior Problems

In [None]:
# One q02_main_<n> flag per category; positions after the tenth are shifted
# by one, so q02_main_11 is never queried.
labels = list(CATEGORY)
fields = ', '.join(
    'q02_main_{}'.format(position if position <= 10 else position + 1)
    for position in range(1, len(CATEGORY) + 1))
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)

# Count the dogs per category, most common first.
sums = df.sum().sort_values(ascending=False)

# Express each count as a prevalence.
total_dogs = len(df.index)
prevalences = sums.apply(PREVALENCE)

# Assemble, show, and export the table.
df = pd.DataFrame({'Count': sums.values,
                   'Prevalence (%)': prevalences.values.round(2)},
                  index=sums.index)
df.columns.name = 'Category'
display(df)
exportTable(df, 'table_1')

## 4. Behavior Problem Compositions

In [None]:
# Collect the sorted sub-question counts of every category. The pieces are
# gathered in a list and concatenated once: Series.append was removed in
# pandas 2.0, and starting from a dtype-less empty pd.Series() is deprecated.
category_sums = []
for category in CATEGORY:
    all_fields = FR.fields[category]
    all_labels = list(FR.labels[category].values())
    df = createNumericDataFrame(DOG_TABLE, ', '.join(all_fields), all_labels, filtered=True)
    category_sums.append(df.sum().sort_values(ascending=False))
sums = pd.concat(category_sums)

# Calculate the prevalence of each behavior problem.
prevalences = sums.apply(PREVALENCE)

# Create a table.
df = pd.DataFrame(index=sums.index, data={'Frequency':sums.values,
                                          'Prevalence (%)': prevalences.values.round(2)})
df.columns.name = 'Behavior problem'
display(df)
#exportTable(df, 'table_2') # exporting will overwrite custom format

### Aggression:

In [None]:
# Load the aggression sub-questions for the adjusted sample.
labels = [*FR.labels[CATEGORY[0]].values()]
fields = ', '.join(FR.fields[CATEGORY[0]])
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)

# Count the dogs per target, most common first, and derive the prevalence.
sums = df.sum().sort_values(ascending=False)
prevalences = sums.apply(PREVALENCE)

# Assemble and show the table.
df = pd.DataFrame({'Frequency': sums.values,
                   'Prevalence (%)': prevalences.values.round(2)},
                  index=sums.index)
df.columns.name = 'Target group'
display(df)
#exportTable(df, 'table_2b')

### Fear/Anxiety:

In [None]:
# Load the fear/anxiety sub-questions for the adjusted sample.
labels = [*FR.labels[CATEGORY[1]].values()]
fields = ', '.join(FR.fields[CATEGORY[1]])
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)

# Count the dogs per problem, most common first, and derive the prevalence.
sums = df.sum().sort_values(ascending=False)
prevalences = sums.apply(PREVALENCE)

# Assemble and show the table.
df = pd.DataFrame({'Frequency': sums.values,
                   'Prevalence (%)': prevalences.values.round(2)},
                  index=sums.index)
df.columns.name = 'Behavior problem'
display(df)
#exportTable(df, 'table_3')

### Compulsion:

In [None]:
# Load the compulsion sub-questions for the adjusted sample.
labels = [*FR.labels[CATEGORY[2]].values()]
fields = ', '.join(FR.fields[CATEGORY[2]])
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)

# Count the dogs per problem, most common first, and derive the prevalence.
sums = df.sum().sort_values(ascending=False)
prevalences = sums.apply(PREVALENCE)

# Assemble and show the table.
df = pd.DataFrame({'Frequency': sums.values,
                   'Prevalence (%)': prevalences.values.round(2)},
                  index=sums.index)
df.columns.name = 'Behavior problem'
display(df)
#exportTable(df, 'table_4')

### House soiling:

In [None]:
# Load the house-soiling sub-questions for the adjusted sample.
labels = [*FR.labels[CATEGORY[3]].values()]
fields = ', '.join(FR.fields[CATEGORY[3]])
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)

# Count the dogs per item, most common first, and derive the prevalence.
sums = df.sum().sort_values(ascending=False)
prevalences = sums.apply(PREVALENCE)

# Assemble and show the table.
df = pd.DataFrame({'Frequency': sums.values,
                   'Prevalence (%)': prevalences.values.round(2)},
                  index=sums.index)
df.columns.name = 'Type/scenario/location'
display(df)
#exportTable(df, 'table_5')

### Excessive barking:

In [None]:
# Load the excessive-barking sub-questions for the adjusted sample.
labels = [*FR.labels[CATEGORY[4]].values()]
fields = ', '.join(FR.fields[CATEGORY[4]])
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)

# Count the dogs per scenario, most common first, and derive the prevalence.
sums = df.sum().sort_values(ascending=False)
prevalences = sums.apply(PREVALENCE)

# Assemble and show the table.
df = pd.DataFrame({'Frequency': sums.values,
                   'Prevalence (%)': prevalences.values.round(2)},
                  index=sums.index)
df.columns.name = 'Scenario'
display(df)
#exportTable(df, 'table_6')

### Jumping:

In [None]:
# Load the jumping sub-questions for the adjusted sample.
labels = [*FR.labels[CATEGORY[5]].values()]
fields = ', '.join(FR.fields[CATEGORY[5]])
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)

# Count the dogs per target, most common first, and derive the prevalence.
sums = df.sum().sort_values(ascending=False)
prevalences = sums.apply(PREVALENCE)

# Assemble and show the table.
df = pd.DataFrame({'Frequency': sums.values,
                   'Prevalence (%)': prevalences.values.round(2)},
                  index=sums.index)
df.columns.name = 'Target group'
display(df)
#exportTable(df, 'table_7')

### Mounting/Humping:

In [None]:
# Load the mounting/humping sub-questions for the adjusted sample.
labels = [*FR.labels[CATEGORY[6]].values()]
fields = ', '.join(FR.fields[CATEGORY[6]])
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)

# Count the dogs per target, most common first, and derive the prevalence.
sums = df.sum().sort_values(ascending=False)
prevalences = sums.apply(PREVALENCE)

# Assemble and show the table.
df = pd.DataFrame({'Frequency': sums.values,
                   'Prevalence (%)': prevalences.values.round(2)},
                  index=sums.index)
df.columns.name = 'Target group'
display(df)
#exportTable(df, 'table_8')

### Consumption of feces:

In [None]:
# Load the feces-consumption sub-questions for the adjusted sample.
labels = [*FR.labels[CATEGORY[7]].values()]
fields = ', '.join(FR.fields[CATEGORY[7]])
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)

# Count the dogs per source, most common first, and derive the prevalence.
sums = df.sum().sort_values(ascending=False)
prevalences = sums.apply(PREVALENCE)

# Assemble and show the table.
df = pd.DataFrame({'Frequency': sums.values,
                   'Prevalence (%)': prevalences.values.round(2)},
                  index=sums.index)
df.columns.name = 'Source'
display(df)
#exportTable(df, 'table_9')

### Destructive behavior:

In [None]:
# Load the destructive-behavior sub-questions for the adjusted sample.
labels = [*FR.labels[CATEGORY[8]].values()]
fields = ', '.join(FR.fields[CATEGORY[8]])
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)

# Count the dogs per scenario, most common first, and derive the prevalence.
sums = df.sum().sort_values(ascending=False)
prevalences = sums.apply(PREVALENCE)

# Assemble and show the table.
df = pd.DataFrame({'Frequency': sums.values,
                   'Prevalence (%)': prevalences.values.round(2)},
                  index=sums.index)
df.columns.name = 'Scenario'
display(df)
#exportTable(df, 'table_10')

### Rolling in repulsive materials:

In [None]:
# Load the rolling-in-repulsive-material sub-questions for the adjusted sample.
labels = [*FR.labels[CATEGORY[9]].values()]
fields = ', '.join(FR.fields[CATEGORY[9]])
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)

# Count the dogs per material, most common first, and derive the prevalence.
sums = df.sum().sort_values(ascending=False)
prevalences = sums.apply(PREVALENCE)

# Assemble and show the table.
df = pd.DataFrame({'Frequency': sums.values,
                   'Prevalence (%)': prevalences.values.round(2)},
                  index=sums.index)
df.columns.name = 'Object'
display(df)
#exportTable(df, 'table_11')

### Running away/Escaping:

In [None]:
# Load the running-away/escaping sub-questions for the adjusted sample.
labels = [*FR.labels[CATEGORY[10]].values()]
fields = ', '.join(FR.fields[CATEGORY[10]])
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)

# Count the dogs per scenario as integers, most common first, and derive the
# prevalence.
sums = df.sum().astype(int).sort_values(ascending=False)
prevalences = sums.apply(PREVALENCE)

# Assemble and show the table.
df = pd.DataFrame({'Frequency': sums.values,
                   'Prevalence (%)': prevalences.values.round(2)},
                  index=sums.index)
df.columns.name = 'Scenario'
display(df)
#exportTable(df, 'table_12')

### Overactivity/Hyperactivity:

In [None]:
# Load the overactivity/hyperactivity sub-questions for the adjusted sample.
labels = [*FR.labels[CATEGORY[11]].values()]
fields = ', '.join(FR.fields[CATEGORY[11]])
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)

# Count the dogs per problem, most common first, and derive the prevalence.
sums = df.sum().sort_values(ascending=False)
prevalences = sums.apply(PREVALENCE)

# Assemble and show the table.
df = pd.DataFrame({'Frequency': sums.values,
                   'Prevalence (%)': prevalences.values.round(2)},
                  index=sums.index)
df.columns.name = 'Behavior problem'
display(df)
#exportTable(df, 'table_13')

## 5. Separation Anxiety, Noise Phobia, and Thunderstorm Phobia

### Paired independence:

In [None]:
# Load the three phobia/anxiety responses as numeric columns.
fields = 'q04_1, q04_2, q04_9'
labels = ['Thunderstorm phobia', 'Noise phobia', 'Separation anxiety']
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)

# Every unordered pair of labels, in the order (0, 1), (0, 2), (1, 2).
pairs = [list(combo) for combo in combinations(labels, 2)]
for pair in pairs:
    first, second = pair
    # Cross tabulate the two responses.
    contingency = pd.crosstab(df[first], df[second])
    # Chi-squared test of independence without the continuity correction.
    print('Chi-squared Test of Independence for %s and %s:' %(first, second))
    c, p, dof, expected = scs.chi2_contingency(contingency, correction=False)
    print('chi2 = %f, p = %.2E, dof = %d' %(c, p, dof))
    checkSignificance(p)
    print()

### Grouped independence:

In [None]:
# NOTE: this cell reuses the `df` built by the "Paired independence" cell.
labels = ['Thunderstorm phobia', 'Noise phobia', 'Separation anxiety']

# Create a contingency table. Rows are indexed by separation anxiety; the
# columns form a two-level index ordered (thunderstorm phobia, noise phobia).
contingency = pd.crosstab(df[labels[2]], [df[labels[0]], df[labels[1]]])

# Convert the cross tabulated dataframe to a series of sums.
# Every lookup is contingency.loc[separation, (thunderstorm, noise)] so the
# level order is explicit. The previous chained indexing read the column
# levels as (noise, thunderstorm), which swapped the noise-only and
# thunderstorm-only regions and their overlaps with separation anxiety.
# Insertion order follows the seven-region order documented for venn3
# (Abc, aBc, ABc, abC, AbC, aBC, ABC) with A = separation anxiety,
# B = noise phobia, C = thunderstorm phobia.
# NOTE(review): assumes createNumericDataFrame names the columns in the
# order given by `labels` -- confirm against its definition.
d = {labels[2]: contingency.loc[1, (0, 0)],
     labels[1]: contingency.loc[0, (0, 1)],
     'Separation-Noise': contingency.loc[1, (0, 1)],
     labels[0]: contingency.loc[0, (1, 0)],
     'Separation-Thunderstorm': contingency.loc[1, (1, 0)],
     'Noise-Thunderstorm': contingency.loc[0, (1, 1)],
     'All': contingency.loc[1, (1, 1)]}
cross_sums = pd.Series(d)

# Display the cross tabulated data as a venn diagram. The set names are the
# three single-condition entries of the series (positions 0, 1 and 3).
v = venn3(cross_sums, set_labels=[cross_sums.index.values[i] for i in [0, 1, 3]])
# Nudge the label returned for id 'A' away from its default position.
lbl = v.get_label_by_id('A')
x, y = lbl.get_position()
lbl.set_position((x+0.25, y+0.05))
for text in v.set_labels:
    text.set_fontsize(16)
for text in v.subset_labels:
    text.set_fontsize(14)
exportFigure(plt, 'figure_1')
plt.show()

# Execute a chi-squared test of independence (no continuity correction).
print('Chi-squared Test of Independence for %s, %s, and %s:' %(labels[0], labels[1], labels[2]))
c, p, dof, expected = scs.chi2_contingency(contingency, correction=False)
print('chi2 = %f, p = %.2E, dof = %d' %(c, p, dof))
checkSignificance(p)
display(df.corr())

## 6. Compulsion and Fearful/Anxious Behaviors

In [None]:
# Load the two category flags as numeric columns.
fields = 'q02_main_2, q02_main_3'
labels = ['Fear/Anxiety', 'Compulsion']
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)

# Cross tabulate: rows follow the first label, columns the second.
contingency = pd.crosstab(df[labels[0]], df[labels[1]])
display(contingency)

# Pull out the three diagram regions; indexing below is [column][row].
second_absent, second_present = contingency[0], contingency[1]
d = {labels[0]: second_absent[1],
     labels[1]: second_present[0],
     'Both': second_present[1]}
cross_sums = pd.Series(d)

# Draw the venn diagram, naming the sets after the first two series entries.
labels = cross_sums.index.values
v = venn2(cross_sums, set_labels=labels[:2])
# Nudge the label returned for id 'B' away from its default position.
shifted_label = v.get_label_by_id('B')
pos_x, pos_y = shifted_label.get_position()
shifted_label.set_position((pos_x + 0.15, pos_y + 0.05))
for texts, size in ((v.set_labels, 16), (v.subset_labels, 14)):
    for item in texts:
        item.set_fontsize(size)
exportFigure(plt, 'figure_5')
plt.show()

# Chi-squared test of independence without the continuity correction.
c, p, dof, expected = scs.chi2_contingency(contingency, correction=False)
print('Chi-square Test of Independence:')
print('chi2 = %f, p = %.2E, dof = %d' %(c, p, dof))
checkSignificance(p)
display(df.corr())

## 7. Aggression and Fearful/Anxious Behavior

### Overall aggression and fearful/anxious behavior:

In [None]:
# Load the two category flags as numeric columns.
fields = 'q02_main_1, q02_main_2'
labels = ['Aggression', 'Fear/Anxiety']
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)

# Cross tabulate: rows follow the first label, columns the second.
contingency = pd.crosstab(df[labels[0]], df[labels[1]])
display(contingency)

# Pull out the three diagram regions; indexing below is [column][row].
second_absent, second_present = contingency[0], contingency[1]
d = {labels[0]: second_absent[1],
     labels[1]: second_present[0],
     'Both': second_present[1]}
cross_sums = pd.Series(d)

# Draw the venn diagram, naming the sets after the first two series entries.
labels = cross_sums.index.values
v = venn2(cross_sums, set_labels=labels[:2])
# Nudge the label returned for id 'A' away from its default position.
shifted_label = v.get_label_by_id('A')
pos_x, pos_y = shifted_label.get_position()
shifted_label.set_position((pos_x - 0.1, pos_y + 0.05))
for texts, size in ((v.set_labels, 16), (v.subset_labels, 14)):
    for item in texts:
        item.set_fontsize(size)
exportFigure(plt, 'figure_2')
plt.show()

# Chi-squared test of independence without the continuity correction.
c, p, dof, expected = scs.chi2_contingency(contingency, correction=False)
print('Chi-square Test of Independence:')
print('chi2 = %f, p = %.2E, dof = %d' %(c, p, dof))
checkSignificance(p)
display(df.corr())

### Owner directed aggression and fearful/anxious behavior:

In [None]:
# Load the two response flags as numeric columns.
fields = 'q03_main_1, q02_main_2'
labels = ['Owner directed\naggression', 'Fear/Anxiety']
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)

# Cross tabulate: rows follow the first label, columns the second.
contingency = pd.crosstab(df[labels[0]], df[labels[1]])
display(contingency)

# Pull out the three diagram regions; indexing below is [column][row].
second_absent, second_present = contingency[0], contingency[1]
d = {labels[0]: second_absent[1],
     labels[1]: second_present[0],
     'Both': second_present[1]}
cross_sums = pd.Series(d)

# Draw the venn diagram, naming the sets after the first two series entries.
labels = cross_sums.index.values
v = venn2(cross_sums, set_labels=labels[:2])
# Nudge the label returned for id 'A' away from its default position.
shifted_label = v.get_label_by_id('A')
pos_x, pos_y = shifted_label.get_position()
shifted_label.set_position((pos_x - 0.1, pos_y + 0.05))
for texts, size in ((v.set_labels, 16), (v.subset_labels, 14)):
    for item in texts:
        item.set_fontsize(size)
exportFigure(plt, 'figure_3')
plt.show()

# Chi-squared test of independence without the continuity correction.
c, p, dof, expected = scs.chi2_contingency(contingency, correction=False)
print('Chi-square Test of Independence:')
print('chi2 = %f, p = %.2E, dof = %d' %(c, p, dof))
checkSignificance(p)
display(df.corr())

### Owner directed aggression and separation anxiety:

In [None]:
# Load the two response flags as numeric columns.
fields = 'q03_main_1, q04_9'
labels = ['Owner directed\naggression', 'Separation anxiety']
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)

# Cross tabulate: rows follow the first label, columns the second.
contingency = pd.crosstab(df[labels[0]], df[labels[1]])
display(contingency)

# Pull out the three diagram regions; indexing below is [column][row].
second_absent, second_present = contingency[0], contingency[1]
d = {labels[0]: second_absent[1],
     labels[1]: second_present[0],
     'Both': second_present[1]}
cross_sums = pd.Series(d)

# Draw the venn diagram, naming the sets after the first two series entries.
labels = cross_sums.index.values
v = venn2(cross_sums, set_labels=labels[:2])
for texts, size in ((v.set_labels, 16), (v.subset_labels, 14)):
    for item in texts:
        item.set_fontsize(size)
exportFigure(plt, 'figure_4')
plt.show()

# Chi-squared test of independence without the continuity correction.
c, p, dof, expected = scs.chi2_contingency(contingency, correction=False)
print('Chi-square Test of Independence:')
print('chi2 = %f, p = %.2E, dof = %d' %(c, p, dof))
checkSignificance(p)
display(df.corr())

## 8. Factors Influencing Prevalence and Frequency

### Prevalence ranked by age of onset:

In [None]:
# Build the field list: one q02_main_N flag per category. Field number 11 is
# skipped, so categories after the tenth map to q02_main_12 onward.
fields = ['q02_main_{}'.format(pos + 1 if pos > 10 else pos)
          for pos in range(1, len(CATEGORY) + 1)]
labels = list(CATEGORY)
fields += ['q01_age_months', 'q01_acq']
labels += ['Age', 'Evident at Acquisition']
fields = ', '.join(fields)
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)
#df = df[np.isfinite(df[labels[12]])]

# Ranges: 0=0-3m, 1=3-6m, 2=6m-1y, 3=1-3y, 4=3y+, 5=evident at acquisition
grouping_columns = ['Age', 'Evident at Acquisition']
age = df['Age']
group_masks = [
    age < 3,
    (age >= 3) & (age < 6),
    (age >= 6) & (age < 12),
    (age >= 12) & (age < 36),
    age >= 36,
    df['Evident at Acquisition'] == 1,
]
# Keep only the behavior columns in each group.
rngs = [df[mask].drop(columns=grouping_columns) for mask in group_masks]

def get_group_prevalence(df):
    """Return each column's sum as a percentage of the row count, rounded to 2 places."""
    group_size = len(df.index)
    return df.sum().apply(lambda total: (total / group_size) * 100).round(2)

prevs = [get_group_prevalence(group) for group in rngs]

# One column per range, in the same order as `rngs`.
range_names = ['0-3m', '3-6m', '6-12m', '12-36m', '+36m', 'Evident at Acquisition']
df = pd.DataFrame(dict(zip(range_names, prevs)), index=prevs[0].index)
df.columns.name = 'Behavior problem'
display(df)
#exportTable(df, 'table_4') # Exporting will overwrite custom formatting

### Prevalence by gender and neutered status:

In [None]:
# Build the field list: one q02_main_N flag per category. Field number 11 is
# skipped, so categories after the tenth map to q02_main_12 onward.
fields = ['q02_main_{}'.format(pos + 1 if pos > 10 else pos)
          for pos in range(1, len(CATEGORY) + 1)]
labels = list(CATEGORY)
fields += ['dog_sex', 'dog_spayed']
labels += ['Gender', 'Neutered']
fields = ', '.join(fields)
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)

# Split by sex (Gender 1 = male, 2 = female) and neuter status (0 = intact,
# 1 = neutered), keeping only the behavior columns in each group.
status_columns = ['Gender', 'Neutered']
is_male = df['Gender'] == 1
is_female = df['Gender'] == 2
is_intact = df['Neutered'] == 0
is_neutered = df['Neutered'] == 1
df_intact_male = df[is_male & is_intact].drop(columns=status_columns)
df_neutered_male = df[is_male & is_neutered].drop(columns=status_columns)
df_intact_female = df[is_female & is_intact].drop(columns=status_columns)
df_neutered_female = df[is_female & is_neutered].drop(columns=status_columns)

def get_group_prevalence(df):
    """Return each column's sum as a percentage of the row count, rounded to 2 places."""
    group_size = len(df.index)
    return df.sum().apply(lambda total: (total / group_size) * 100).round(2)

intact_male_p = get_group_prevalence(df_intact_male)
neutered_male_p = get_group_prevalence(df_neutered_male)
intact_female_p = get_group_prevalence(df_intact_female)
neutered_female_p = get_group_prevalence(df_neutered_female)

# One column per group.
group_columns = {'Intact males': intact_male_p,
                 'Castrated males': neutered_male_p,
                 'Intact females': intact_female_p,
                 'Spayed females': neutered_female_p}
df = pd.DataFrame(group_columns, index=intact_male_p.index)
df.columns.name = 'Behavior problem'
display(df)
#exportTable(df, 'table_3') # Exporting will overwrite custom formatting

### Prevalence by origin:

In [None]:
# Create the necessary dataframe: one q02_main_N flag per category plus the
# acquisition source. Field number 11 is skipped, so categories after the
# tenth map to q02_main_12 onward.
fields = ['q02_main_{}'.format(pos + 1 if pos > 10 else pos)
          for pos in range(1, len(CATEGORY) + 1)]
labels = list(CATEGORY)
fields.append('acquisition_source')
labels.append('origin')
fields = ', '.join(fields)
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)
# Rescue fields (online: 1, in-person: 4) were combined in the database.
#replacements = {'':'no response', '1': 'rescue', '2': 'online (non-rescue)', '3': 'pet store', '5': 'breeder',
#                '6': 'self-bred', '7': 'friends/family', '8': 'found', '9': 'other'}

# Origin codes in table-column order. Code 4 is absent because in-person
# rescue was merged into code 1 (see the note above).
origin_codes = [1, 2, 3, 5, 6, 7, 8, 9]
origin_names = ['Rescue', 'Online', 'Pet store', 'Breeder',
                'Self-bred', 'Family/Friends', 'Found', 'Other']

# One sub-frame per origin, keeping only the behavior columns.
rngs = [df[df['origin'] == code].drop(columns=['origin']) for code in origin_codes]

def get_group_prevalence(df):
    """Return each column's sum as a percentage of the row count, rounded to 2 places."""
    df = df.sum().apply(lambda x: (x / len(df.index)) * 100)
    return df.round(2)

prevs = [get_group_prevalence(group) for group in rngs]

# Index the table by this cell's own results rather than by a variable
# (intact_male_p) left over from the gender/neutered cell.
df = pd.DataFrame(index=prevs[0].index, data=dict(zip(origin_names, prevs)))
df.columns.name = 'Behavior problem'
display(df)
#exportTable(df, 'table_5') # Exporting will overwrite custom formatting

### Prevalence by purebred lineage:

In [None]:
# Create the necessary dataframe: one q02_main_N flag per category plus the
# purebred flag. Field number 11 is skipped, so categories after the tenth
# map to q02_main_12 onward.
fields = ['q02_main_{}'.format(pos + 1 if pos > 10 else pos)
          for pos in range(1, len(CATEGORY) + 1)]
labels = list(CATEGORY)
fields.append('purebred')
labels.append('purebred')
fields = ', '.join(fields)
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)

# Split into purebred (1) and non-purebred (0), keeping only the behavior
# columns in each group.
rngs = [df[df['purebred'] == flag].drop(columns=['purebred']) for flag in (1, 0)]

def get_group_prevalence(df):
    """Return each column's sum as a percentage of the row count, rounded to 2 places."""
    df = df.sum().apply(lambda x: (x / len(df.index)) * 100)
    return df.round(2)

prevs = [get_group_prevalence(group) for group in rngs]

# Index the table by this cell's own results rather than by a variable
# (intact_male_p) left over from the gender/neutered cell.
df = pd.DataFrame(index=prevs[0].index, data={'Purebred': prevs[0], 'Non-purebred': prevs[1]})
df.columns.name = 'Behavior problem'
display(df)
#exportTable(df, 'table_6') # Exporting will overwrite custom formatting

### Impact of gender on behavior problem prevalence:

In [None]:
# Load the raw string responses, drop rows with no gender, then go numeric.
fields = 'q02_score, dog_sex'
labels = ['behavior problems', 'gender']
df = createStringDataFrame(DOG_TABLE, fields, labels)
df = df[df[labels[1]] != ''].apply(pd.to_numeric)

def convert_to_binary_response(x):
    """Return 0 when int(x) is below 1, otherwise 1."""
    return 0 if int(x) < 1 else 1

def gender_to_binary_response(x):
    """Return 0 when int(x) is below 2, otherwise 1."""
    return 0 if int(x) < 2 else 1

# Collapse both columns to 0/1 before cross tabulating.
df['behavior problems'] = df['behavior problems'].apply(convert_to_binary_response)
df['gender'] = df['gender'].apply(gender_to_binary_response)
contingency = pd.crosstab(df[labels[0]], df[labels[1]])
display(contingency)

# Chi-squared test of independence without the continuity correction.
print('Chi-squared Test of Independence for %s and %s:' %(labels[0], labels[1]))
c, p, dof, expected = scs.chi2_contingency(contingency, correction=False)
print('chi2 = %f, p = %.2E, dof = %d' %(c, p, dof))
checkSignificance(p)
display(df.corr())

### Impact of neutered status on behavior problem prevalence:

In [None]:
# Load the raw string responses and drop rows whose neutered value is blank
# or '2', then convert everything to numbers.
fields = 'q02_score, dog_spayed'
labels = ['behavior problems', 'neutered']
df = createStringDataFrame(DOG_TABLE, fields, labels)
has_answer = df[labels[1]] != ''
df = df[has_answer]
df = df[df[labels[1]] != '2'].apply(pd.to_numeric)

def convert_to_binary_response(x):
    """Return 0 when int(x) is below 1, otherwise 1."""
    return 0 if int(x) < 1 else 1

# Collapse both columns to 0/1 before cross tabulating.
for column in ('behavior problems', 'neutered'):
    df[column] = df[column].apply(convert_to_binary_response)
contingency = pd.crosstab(df[labels[0]], df[labels[1]])
display(contingency)

# Chi-squared test of independence without the continuity correction.
print('Chi-squared Test of Independence for %s and %s:' %(labels[0], labels[1]))
c, p, dof, expected = scs.chi2_contingency(contingency, correction=False)
print('chi2 = %f, p = %.2E, dof = %d' %(c, p, dof))
checkSignificance(p)
display(df.corr())

### Impact of source of origin on behavior problem prevalence:

In [None]:
# Load the raw string responses.
fields = 'q02_score, acquisition_source'
labels = ['behavior problems', 'origin']
df = createStringDataFrame(DOG_TABLE, fields, labels)
# Rescue fields (online: 1, in-person: 4) were combined in the database.
#replacements = {'':'no response', '1': 'rescue', '2': 'online (non-rescue)', '3': 'pet store', '5': 'breeder',
#                '6': 'self-bred', '7': 'friends/family', '8': 'found', '9': 'other'}
# Drop rows with no origin, then convert everything to numbers.
df = df[df[labels[1]] != ''].apply(pd.to_numeric)

def convert_to_binary_response(x):
    """Return 0 when int(x) is below 1, otherwise 1."""
    return 0 if int(x) < 1 else 1

def rescue_to_binary_response(x):
    """Return 1 when int(x) equals 1 (the rescue code), otherwise 0."""
    return 1 if int(x) == 1 else 0

# Collapse both columns to 0/1 before cross tabulating.
df['behavior problems'] = df['behavior problems'].apply(convert_to_binary_response)
df['origin'] = df['origin'].apply(rescue_to_binary_response)
contingency = pd.crosstab(df[labels[0]], df[labels[1]])
display(contingency)

# Chi-squared test of independence without the continuity correction.
print('Chi-squared Test of Independence for %s and %s:' %(labels[0], labels[1]))
c, p, dof, expected = scs.chi2_contingency(contingency, correction=False)
print('chi2 = %f, p = %.2E, dof = %d' %(c, p, dof))
checkSignificance(p)
display(df.corr())

### Impact of purebred status on behavior problem prevalence:

In [None]:
# Load the raw string responses, drop rows with no purebred answer, then
# convert everything to numbers.
fields = 'q02_score, purebred'
labels = ['behavior problems', 'purebred']
df = createStringDataFrame(DOG_TABLE, fields, labels)
df = df[df[labels[1]] != ''].apply(pd.to_numeric)

def convert_to_binary_response(x):
    """Return 0 when int(x) is below 1, otherwise 1."""
    return 0 if int(x) < 1 else 1

# Collapse the problem score to 0/1 before cross tabulating.
df['behavior problems'] = df['behavior problems'].apply(convert_to_binary_response)
contingency = pd.crosstab(df[labels[0]], df[labels[1]])
display(contingency)

# Chi-squared test of independence without the continuity correction.
print('Chi-squared Test of Independence for %s and %s:' %(labels[0], labels[1]))
c, p, dof, expected = scs.chi2_contingency(contingency, correction=False)
print('chi2 = %f, p = %.2E, dof = %d' %(c, p, dof))
checkSignificance(p)
display(df.corr())

### Impact of age range on prevalence:

In [None]:
# TODO: How should the impact of age range on prevalence be assessed?

### Number of behavior problems by age range:

In [None]:
# Create the necessary dataframe.
fields = ['q02_score', 'q01_age_months']
labels = ['behavior problems', 'age']
fields = ', '.join(fields)
df = createNumericDataFrame(DOG_TABLE, fields, labels, filtered=True)

# Ranges: 0=0-3m, 1=3-6m, 2=6m-1y, 3=1-3y, 4=3y+
age = df['age']
age_masks = [
    age < 3,
    (age >= 3) & (age < 6),
    (age >= 6) & (age < 12),
    (age >= 12) & (age < 36),
    age >= 36,
]
# Keep only the behavior problem score in each group.
rngs = [df[mask].drop(columns=['age']) for mask in age_masks]

def get_group_mean(df):
    """Return the mean of a single-column frame as a float.

    The element is taken explicitly with .iloc[0] because calling float()
    directly on a one-element Series is deprecated in recent pandas.
    """
    return float(df.mean().iloc[0])

means = [get_group_mean(group) for group in rngs]

df = pd.DataFrame(index=['0-3m', '3-6m', '6-12m', '12-36m', '+36m'], data={'Number of Behavior Problems': means})
df.columns.name = 'Age Range'
display(df)
exportTable(df, 'table_7') # Exporting will overwrite custom formatting

## 9. Heatmaps

### Category reference chart:

In [None]:
# Map each category's index character to the category name, using the first
# key registered for that category.
# NOTE(review): only the first character of the key is kept -- confirm that
# category indices are always a single character.
cat_dict = {}
for cat, subdict in FR.labels.items():
    first_key = next(iter(subdict), None)
    if first_key is not None:
        cat_dict[first_key[0]] = cat

# Show the mapping as a one-column table and export it.
s = pd.Series(cat_dict)
df = pd.DataFrame({'Category': s.values}, index=s.index)
df.columns.name = 'Index'
display(df)
exportTable(df, 'table_category_reference')

### Category heatmap:

In [None]:
# Build the category matrix only when it has not been populated yet.
if CATEGORY_MATRIX.empty:
    CATEGORY_MATRIX = createCategoryMatrix()

# Map every cell of the matrix to its significance category.
df = CATEGORY_MATRIX.applymap(get_significance_category)
df_corr = createCorrelationMatrix()

# Heatmap colored by significance category and annotated with df_corr.
fig, ax = plt.subplots(figsize=(20, 15))
cmap = sns.cubehelix_palette(n_colors=3)
heatmap_options = dict(
    cmap=ListedColormap(cmap),
    linewidths=0.5,
    linecolor='black',
    cbar_kws={'orientation': 'vertical'},
    mask=df.isnull(),
    annot=df_corr,
    annot_kws={'size': 20},
)
ax = sns.heatmap(data=df, ax=ax, **heatmap_options)

# Label the three color bands.
# NOTE(review): the tick positions presumably assume category codes spanning
# -1 to 1 -- confirm against get_significance_category.
colorbar = ax.collections[0].colorbar
colorbar.set_ticks([-0.667, 0, 0.667])
colorbar.set_ticklabels(['$p>10^{-3}$', '$10^{-3}>p>10^{-6}$', '$10^{-6}>p$'])
for set_ticks in (plt.xticks, plt.yticks):
    set_ticks(rotation=0, fontsize=20)
ax.invert_yaxis()
exportFigure(fig, 'figure_6')
plt.show()

### Questionnaire reference chart:

In [None]:
# Flatten the label registry into three parallel lists: the response key,
# the response text, and the category each response belongs to.
locs, responses, categories = [], [], []
for cat, subdict in FR.labels.items():
    locs.extend(subdict.keys())
    responses.extend(subdict.values())
    categories.extend([cat] * len(subdict))

# Show the lists as a table indexed by response key and export it.
df = pd.DataFrame({'Category': categories, 'Response': responses}, index=locs)
df.columns.name = 'Index'
display(df)
exportTable(df, 'table_questionnaire_reference')

### Questionnaire heatmap:

In [None]:
# Build the question matrix only when it has not been populated yet.
if QUESTION_MATRIX.empty:
    QUESTION_MATRIX = createQuestionMatrix()

# Map every cell of the matrix to its significance category.
df = QUESTION_MATRIX.applymap(get_significance_category)

# Heatmap colored by significance category.
fig, ax = plt.subplots(figsize=(20, 15))
cmap = sns.cubehelix_palette(n_colors=3)
heatmap_options = dict(
    cmap=ListedColormap(cmap),
    linewidths=0.1,
    linecolor='black',
    cbar_kws={'orientation': 'vertical'},
    mask=df.isnull(),
)
ax = sns.heatmap(data=df, ax=ax, **heatmap_options)

# Label the three color bands.
# NOTE(review): the tick positions presumably assume category codes spanning
# -1 to 1 -- confirm against get_significance_category.
colorbar = ax.collections[0].colorbar
colorbar.set_ticks([-0.667, 0, 0.667])
colorbar.set_ticklabels(['$p>10^{-3}$', '$10^{-3}>p>10^{-6}$', '$10^{-6}>p$'])
plt.xticks(rotation=90)
ax.invert_yaxis()
exportFigure(fig, 'figure_7')
plt.show()