# Pet Dog Behavior Study Analysis

## 1. Workspace Setup

In [20]:
import sqlite3
import pandas as pd
import textwrap
import scipy.stats as scs
from IPython.display import display
%matplotlib inline
import matplotlib.pyplot as plt
import plotly.plotly as py
import plotly.figure_factory as ff
import plotly.graph_objs as go


# Establish a connection to the database.
# The path is relative, so it is resolved against the kernel's working directory.
# Every query in this notebook reads through this single `con` handle.
# NOTE: sqlite3.connect() creates an empty database when the file does not exist, so a
# wrong path only surfaces later as a "no such table" error on the first query.
con = sqlite3.connect('../data/processed/processed.db')

## 2. Overall Prevalence of Behavior Problems

### 2.1 Count gross and adjusted responses.

In [21]:
# Load one row per joined user/dog record: the "reason for participation #3" flag
# and the q02 behavior score.
query = (
    'SELECT record_id, question_reason_for_part_3, q02_score '
    'FROM users JOIN dogs USING(record_id)'
)
df_gross = pd.read_sql_query(query, con)
df_gross.columns = ['id', 'suspicion', 'problems']

# Row-wise flags. Both columns are compared against the string '0':
#   - a dog has a reported behavior problem when "problems" is anything but '0';
#   - a response is unbiased when "suspicion of a behavior problem" was NOT given
#     as a reason for joining the study.
reported_problem = df_gross['problems'] != '0'
joined_without_suspicion = df_gross['suspicion'] == '0'

# Gross figures cover every dog in the joined result.
cnt_total_dogs = df_gross.shape[0]
cnt_total_dogs_w_problems = int(reported_problem.sum())

# Adjusted figures cover only the unbiased responses.
df_adjusted = df_gross[joined_without_suspicion]
cnt_total_dogs_adjusted = df_adjusted.shape[0]
cnt_total_dogs_w_problems_adjusted = int((reported_problem & joined_without_suspicion).sum())

# Report the four counts, one per line.
count_report = [
    ('Total number of dogs: %d', cnt_total_dogs),
    ('Total number of dogs with a reported behavior problem: %d', cnt_total_dogs_w_problems),
    ('Adjusted total number of dogs: %d', cnt_total_dogs_adjusted),
    ('Adjusted total number of dogs with a reported behavior problem: %d',
     cnt_total_dogs_w_problems_adjusted),
]
for template, count in count_report:
    print(template % count)

Total number of dogs: 5018
Total number of dogs with a reported behavior problem: 4407
Adjusted total number of dogs: 3749
Adjusted total number of dogs with a reported behavior problem: 3207


### 2.2 Calculate prevalence.

In [34]:
# Prevalence = dogs with a reported problem / all dogs, expressed as a percentage.
# Computed once for the gross counts and once for the adjusted counts.
prevalence_gross = 100 * (cnt_total_dogs_w_problems / cnt_total_dogs)
prevalence_adjusted = 100 * (cnt_total_dogs_w_problems_adjusted / cnt_total_dogs_adjusted)

# Change (in percentage points) introduced by the adjustment.
diff_prevalence = prevalence_adjusted - prevalence_gross
# A negative number already prints its own '-'; only a positive shift needs a '+'.
if diff_prevalence > 0:
    sign = '+'
else:
    sign = ''

# Report the results.
gross_line = 'Gross prevalence: %.2f%% (%d/%d)' % (
    prevalence_gross, cnt_total_dogs_w_problems, cnt_total_dogs)
adjusted_line = 'Adjusted prevalence: %.2f%% (%d/%d)' % (
    prevalence_adjusted, cnt_total_dogs_w_problems_adjusted, cnt_total_dogs_adjusted)
shift_line = 'Shift in prevalence as a result of adjustment: %s%.2f%%' % (sign, diff_prevalence)
print(gross_line)
print(adjusted_line)
print(shift_line)

Gross prevalence: 87.82% (4407/5018)
Adjusted prevalence: 85.54% (3207/3749)
Shift in prevalence as a result of adjustment: -2.28%


## 3. Separation Anxiety, Noise Phobia, and Thunderstorm Phobia

### 3.1 Preparation of data

In [23]:
# Load the three q04 columns used in this section and give them readable names.
query = 'SELECT q04_1, q04_2, q04_9 FROM dogs'
df = pd.read_sql_query(query, con)
df.columns = ['thunderstorm phobia', 'noise phobia', 'separation anxiety']
# Convert every column to a numeric dtype.
df = df.apply(pd.to_numeric)

# Significance threshold used by the chi-squared tests in the cells that follow.
sig_p = 0.01

# Report how many dogs are in the frame.
cnt_total_dogs = df.shape[0]
print('Total dogs: %d' % cnt_total_dogs)

Total dogs: 5018


### 3.2 Tables and calculations

In [24]:
# Each pairwise combination of the three conditions: [row variable, column variable].
pairs = [
    ['thunderstorm phobia', 'noise phobia'],
    ['thunderstorm phobia', 'separation anxiety'],
    ['noise phobia', 'separation anxiety'],
]
for row_label, col_label in pairs:
    # Cross-tabulate the two conditions and show the resulting table.
    contingency = pd.crosstab(df[row_label], df[col_label])
    display(contingency)

    # Chi-squared test of independence, with the continuity correction disabled.
    print('Chi-squared Test of Independence for %s and %s:' % (row_label, col_label))
    chi2_stat, p_value, dof, _ = scs.chi2_contingency(contingency, correction=False)
    print('chi2 = %f, p = %.2E, dof = %d' % (chi2_stat, p_value, dof))

    # Call out results that clear the significance threshold.
    if p_value < sig_p:
        print('The resulting p-value is below the set significance threshold (%.2f).' % sig_p)

noise phobia,0,1
thunderstorm phobia,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3839,427
1,210,542


Chi-squared Test of Independence for thunderstorm phobia and noise phobia:
chi2 = 1580.493072, p = 0.00E+00, dof = 1
The resulting p-value is below the set significance threshold (0.01).


separation anxiety,0,1
thunderstorm phobia,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3750,516
1,501,251


Chi-squared Test of Independence for thunderstorm phobia and separation anxiety:
chi2 = 223.618927, p = 1.47E-50, dof = 1
The resulting p-value is below the set significance threshold (0.01).


separation anxiety,0,1
noise phobia,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3592,457
1,659,310


Chi-squared Test of Independence for noise phobia and separation anxiety:
chi2 = 258.860919, p = 3.04E-58, dof = 1
The resulting p-value is below the set significance threshold (0.01).


### 3.3 Further exploration

In [25]:
# Cross-tabulate separation anxiety (rows) against every combination of
# noise phobia and thunderstorm phobia (two-level columns).
column_factors = [df['noise phobia'], df['thunderstorm phobia']]
contingency = pd.crosstab(df['separation anxiety'], column_factors)
display(contingency)


# Chi-squared test of independence on the combined table; the heading is wrapped
# to 90 characters before printing.
title = ('Chi-squared Test of Independence for separation anxiety and the combination of '
         'noise and thunderstorm phobia:')
print(textwrap.fill(title, width=90))
chi2_stat, p_value, dof, _ = scs.chi2_contingency(contingency, correction=False)
print('chi2 = %f, p = %.2E, dof = %d' % (chi2_stat, p_value, dof))
if p_value < sig_p:
    print('The resulting p-value is below the set significance threshold (%.2f).' % sig_p)

noise phobia,0,0,1,1
thunderstorm phobia,0,1,0,1
separation anxiety,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,3452,140,298,361
1,387,70,129,181


Chi-squared Test of Independence for separation anxiety and the combination of noise and
thunderstorm phobia:
chi2 = 343.870316, p = 3.17E-74, dof = 3
The resulting p-value is below the set significance threshold (0.01).


## 4. Compulsion and Fearful/Anxious Behaviors

### 4.1 Preparation of data

In [26]:
# Load the two q02 columns used in this section and give them readable names.
query = 'SELECT q02_main_2, q02_main_3 FROM dogs'
df = pd.read_sql_query(query, con)
df.columns = ['fearful/anxious behavior', 'repetitive behavior']
# Convert every column to a numeric dtype.
df = df.apply(pd.to_numeric)

### 4.2 Tables and calculations

In [27]:
# Cross-tabulate the two behaviors: rows are fearful/anxious behavior,
# columns are repetitive behavior.
contingency = pd.crosstab(df['fearful/anxious behavior'], df['repetitive behavior'])

# Pick out three cells by (row label, column label):
#   (1, 0) -> fearful/anxious without repetitive behavior
#   (0, 1) -> repetitive without fearful/anxious behavior
#   (1, 1) -> both behaviors
fearful_only = contingency.loc[1, 0]
repetitive_only = contingency.loc[0, 1]
both_behaviors = contingency.loc[1, 1]

# Collect the three counts in a single-row frame.
c_df = pd.DataFrame(
    {
        'fearful/anxious': fearful_only,
        'repetitive': repetitive_only,
        'both': both_behaviors,
    },
    index=[0],
)

# Render the summary as a Plotly table.
table = ff.create_table(c_df)
py.iplot(table, filename='jupyter-styled-table')

In [28]:
# Draw a two-circle Venn diagram of fearful/anxious vs. repetitive behavior from the
# one-row summary frame `c_df`.
#
# The trace carries only text: one label per Venn region, placed inside that region.
# The counts must NOT be used as y-values, because the circles live in a small
# 0-3.5 by 0-2 coordinate box and the counts are on an unrelated scale.
venn_counts = c_df.iloc[0]
region_labels = [
    'fearful/anxious only<br>%d' % venn_counts['fearful/anxious'],
    'both<br>%d' % venn_counts['both'],
    'repetitive only<br>%d' % venn_counts['repetitive'],
]
data = [go.Scatter(
    # Horizontal centres of the left-only, overlapping, and right-only regions
    # (left circle spans x 0-2, right circle spans x 1.5-3.5).
    x=[0.75, 1.75, 2.75],
    y=[1, 1, 1],
    text=region_labels,
    mode='text',
)]

# The axes are purely positional, so every decoration is hidden.
hidden_axis = {
    'showticklabels': False,
    'showgrid': False,
    'zeroline': False,
}
layout = {
    # Ranges are pinned so both circles are always fully in view.
    'xaxis': dict(hidden_axis, range=[-0.25, 3.75]),
    'yaxis': dict(hidden_axis, range=[-0.25, 2.25]),
    'shapes': [
        # Left circle: fearful/anxious behavior.
        {
            'opacity': 0.3,
            'xref': 'x',
            'yref': 'y',
            'fillcolor': 'blue',
            'x0': 0,
            'y0': 0,
            'x1': 2,
            'y1': 2,
            'type': 'circle',
            'line': {
                'color': 'blue',
            },
        },
        # Right circle: repetitive behavior.
        {
            'opacity': 0.3,
            'xref': 'x',
            'yref': 'y',
            'fillcolor': 'gray',
            'x0': 1.5,
            'y0': 0,
            'x1': 3.5,
            'y1': 2,
            'type': 'circle',
            'line': {
                'color': 'gray',
            },
        }
    ],
    'margin': {
        'l': 20,
        'r': 20,
        'b': 100
    },
    'height': 600,
    'width': 800,
}
fig = {
    'data': data,
    'layout': layout,
}
py.iplot(fig, filename='venn-diagram')

In [29]:
# Chi-squared test of independence on the current contingency table,
# with the continuity correction disabled.
chi2_stat, p_value, dof, _ = scs.chi2_contingency(contingency, correction=False)
print('Chi-square Test of Independence:')
print('chi2 = %f, p = %.2E, dof = %d' % (chi2_stat, p_value, dof))
# Call out a result that clears the significance threshold.
if p_value < sig_p:
    print('The resulting p-value is below the set significance threshold (%.2f).' % sig_p)

Chi-square Test of Independence:
chi2 = 64.426486, p = 1.00E-15, dof = 1
The resulting p-value is below the set significance threshold (0.01).


## 5. Aggression and Fearful/Anxious Behavior

### 5.1 Preparation of data

In [30]:
# Load the two q02 columns used in this section and give them readable names.
query = 'SELECT q02_main_1, q02_main_2 FROM dogs'
df = pd.read_sql_query(query, con)
df.columns = ['aggression', 'fearful/anxious behavior']
# Convert every column to a numeric dtype.
df = df.apply(pd.to_numeric)

### 5.2 Tables and calculations

In [31]:
# Cross-tabulate aggression (rows) against fearful/anxious behavior (columns).
row_factor = df['aggression']
col_factor = df['fearful/anxious behavior']
contingency = pd.crosstab(row_factor, col_factor)
display(contingency)

# Chi-squared test of independence, with the continuity correction disabled.
chi2_stat, p_value, dof, _ = scs.chi2_contingency(contingency, correction=False)
print('Chi-square Test of Independence:')
print('chi2 = %f, p = %.2E, dof = %d' % (chi2_stat, p_value, dof))
if p_value < sig_p:
    print('The resulting p-value is below the set significance threshold (%.2f).' % sig_p)

fearful/anxious behavior,0,1
aggression,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1905,1325
1,726,1062


Chi-square Test of Independence:
chi2 = 155.793414, p = 9.39E-36, dof = 1
The resulting p-value is below the set significance threshold (0.01).


## 6. Reasons for Joining the Study

### 6.1 Preparation of data

In [32]:
# Load the five "reason for participation" columns from the users table and give
# them readable names.
query = (
    'SELECT question_reason_for_part_1, question_reason_for_part_2, '
    'question_reason_for_part_3, question_reason_for_part_4, '
    'question_reason_for_part_5 FROM users'
)
df = pd.read_sql_query(query, con)
df.columns = ['Love', 'Shelter Animals', 'Behavior Problems', 'Work', 'Other']
# Convert every column to a numeric dtype.
df = df.apply(pd.to_numeric)

### 6.2 Breakdown of reasons for joining the study

In [33]:
# Bar chart of the column totals: one bar per listed reason for joining the study.
reason_totals = df.sum()
data = [go.Bar(x=df.columns, y=reason_totals)]
layout = go.Layout(
    title="Motivation for Participation",
    xaxis={'title': 'Motivation'},
    yaxis={'title': 'Total'},
    autosize=False,
    width=500,
    height=500,
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='jupyter-styled-bar')

# NOTE: Users who did not provide any reason are not counted in this chart.