In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import re
import matplotlib.pyplot as plt
import seaborn as sns

# Statistical Analysis: Priority

### Import cleaned CAD data

In [2]:
# Load the cleaned CAD export. Every later cell compares `priority` against
# string labels ('1'..'9' and 'P'), so force the column to be read as text.
# Without an explicit dtype, read_csv infers types chunk by chunk
# (low_memory=True) and can parse all-digit chunks as integers, which would
# silently exclude those rows from the string comparisons below.
cad = pd.read_csv("../data/cleaned_full_class_data.csv", dtype={'priority': str})

#### Convert `calltime` back to a datetime object

In [3]:
# Re-parse `calltime` as datetimes; values that cannot be parsed are coerced
# to NaT instead of raising.
parsed_calltime = pd.to_datetime(cad['calltime'], errors='coerce')
cad['calltime'] = parsed_calltime

#### Create `results` DataFrame that will later be exported as a .csv file

In [4]:
# Empty accumulator table: each test below appends one row holding the test
# label, its test statistic, and its p-value.
result_columns = ['test', 'statistic', 'p_value']
results = pd.DataFrame(columns=result_columns)

## Test #1: ANOVA

Use: to determine if there are significant differences between the means of three or more groups or samples.

Hypotheses:
- null: all priorities' means are equal.
- alternative: at least one priority's mean is different.

**input**: the continuous response variable (`secs_to_arrv`) from data categorized into groups based on the levels of an independent variable (`priority`)

**output**: F-stat and p-value

#### Split `cad` into separate `secs_to_arrv` Series by priority

In [5]:
# One Series of `secs_to_arrv` values per priority label, unpacked into the
# names used by the ANOVA cell.
priority_labels = ['1', '2', '3', '4', '5', '6', '7', '8', '9', 'P']
p1, p2, p3, p4, p5, p6, p7, p8, p9, pp = (
    cad.loc[cad['priority'] == label, 'secs_to_arrv']
    for label in priority_labels
)

In [6]:
from scipy.stats import f_oneway

# One-way ANOVA of `secs_to_arrv` across the ten priority groups. Missing
# values are dropped first (as the per-priority t-tests do), because a NaN in
# any group would make f_oneway return NaN for both outputs.
anova_groups = [grp.dropna() for grp in (p1, p2, p3, p4, p5, p6, p7, p8, p9, pp)]
stat, pvalue = f_oneway(*anova_groups)

# Append the row as a plain list. Wrapping it in np.array would coerce the
# numeric statistic and p-value to strings because of the string label.
results.loc[len(results)] = ['ANOVA', stat, pvalue]

print(f'f-stat = {stat}')
print(f'p = {pvalue}')

f-stat = 10390.481645113088
p = 0.0


## Test #2: T-Test
Use: to determine if there is a significant difference between the means of two groups.

#### Hypotheses:
- H₀: The mean arrival time is the same for EPD and CAHOOTS.
- H₁: The mean arrival time is different for EPD and CAHOOTS.

**input**: the continuous response variable (`secs_to_arrv`) from two groups, EPD and CAHOOTS

**output**: T-stat and p-value

#### Split into separate arrays based on `agency`

In [7]:
# Row masks selecting each agency's calls.
is_cahoots = cad['agency'] == 'CAHOOTS'
is_epd = cad['agency'] == 'EPD'

# Per-agency frames (reused by the per-priority split) and their
# `secs_to_arrv` columns.
cahoots, epd = cad[is_cahoots], cad[is_epd]
cahoots_times, epd_times = cahoots['secs_to_arrv'], epd['secs_to_arrv']

#### Normalize both groups and run t-test using `scipy.stats`

In [8]:
from scipy.stats import ttest_ind

# Drop missing values up front so the pooled statistics and the t-test use
# the same observations (a NaN in either input would make ttest_ind return NaN).
cahoots_obs = cahoots_times.dropna()
epd_obs = epd_times.dropna()

# Standardize both groups with the pooled mean and population (ddof=0)
# standard deviation. This shared shift and scale does not alter the
# t-statistic; it is kept to follow the analysis steps described above.
all_times = np.concatenate([cahoots_obs, epd_obs])
mean_all = np.mean(all_times)
std_all = np.std(all_times)

cahoots_norm = (cahoots_obs - mean_all) / std_all
epd_norm = (epd_obs - mean_all) / std_all

# equal_var=False selects Welch's t-test (no equal-variance assumption).
stat, pvalue = ttest_ind(cahoots_norm, epd_norm, equal_var=False)

# Append the row as a plain list. Wrapping it in np.array would coerce the
# numeric statistic and p-value to strings because of the string label.
results.loc[len(results)] = ['t_test_agency', stat, pvalue]

print(f't-stat = {stat}')
print(f'p = {pvalue}')

t-stat = 81.98689179525557
p = 0.0


#### Just to take a quick look at the means

In [9]:
# Raw (un-normalized) mean of `secs_to_arrv` for each agency.
cahoots_avg = np.mean(cahoots_times)
epd_avg = np.mean(epd_times)

print(f'cahoots avg time: {cahoots_avg}')
print(f'epd avg time: {epd_avg}')

cahoots avg time: 5978.226631499021
epd avg time: 3726.1499152343185


CAHOOTS is, on *average*, slower, but looking at the difference within each priority might give a better explanation.

## Test #3: T-Test (10x)
Use: to determine if there is a significant difference between the means of two groups.

#### Hypotheses:
- H₀: The mean arrival time is the same for a priority, no matter if EPD or CAHOOTS has responded to it.
- H₁: The mean arrival time is different for a priority depending on whether EPD or CAHOOTS has responded to it.

**input**: the continuous response variable (`secs_to_arrv`) for each priority, from both groups, EPD and CAHOOTS

**output**: T-stat and p-value

#### Split each priority *and* agency into separate arrays

In [10]:
# Priority labels present in the `priority` column.
priority_labels = ['1', '2', '3', '4', '5', '6', '7', '8', '9', 'P']

# `secs_to_arrv` for each priority among the CAHOOTS rows.
(cahoots_p1, cahoots_p2, cahoots_p3, cahoots_p4, cahoots_p5,
 cahoots_p6, cahoots_p7, cahoots_p8, cahoots_p9, cahoots_pp) = (
    cahoots.loc[cahoots['priority'] == label, 'secs_to_arrv']
    for label in priority_labels
)

# `secs_to_arrv` for each priority among the EPD rows.
(epd_p1, epd_p2, epd_p3, epd_p4, epd_p5,
 epd_p6, epd_p7, epd_p8, epd_p9, epd_pp) = (
    epd.loc[epd['priority'] == label, 'secs_to_arrv']
    for label in priority_labels
)

#### Analysis Steps

1. Create a dictionary where the key is the priority and the value is a tuple of (cahoots, epd) arrival times
2. Use a `for` loop to go through each priority's pair
3. Normalize each pair using the combined mean and standard deviation
4. Use `scipy.stats` to get the t-stat and p-value

In [11]:
# Map each priority label to its (CAHOOTS, EPD) pair of `secs_to_arrv` Series.
priority_pairs = {
    '1': (cahoots_p1, epd_p1),
    '2': (cahoots_p2, epd_p2),
    '3': (cahoots_p3, epd_p3),
    '4': (cahoots_p4, epd_p4),
    '5': (cahoots_p5, epd_p5),
    '6': (cahoots_p6, epd_p6),
    '7': (cahoots_p7, epd_p7),
    '8': (cahoots_p8, epd_p8),
    '9': (cahoots_p9, epd_p9),
    'P': (cahoots_pp, epd_pp),
}

for pr, (c_group, e_group) in priority_pairs.items():
    # Ignore rows with a missing `secs_to_arrv` value.
    c_group = c_group.dropna()
    e_group = e_group.dropna()

    # Standardize both agencies with this priority's pooled mean and
    # population (ddof=0) standard deviation.
    combined = np.concatenate([c_group, e_group])
    mean_comb = np.mean(combined)
    std_comb = np.std(combined, ddof=0)

    c_norm = (c_group - mean_comb) / std_comb
    e_norm = (e_group - mean_comb) / std_comb

    # equal_var=False selects Welch's t-test (no equal-variance assumption).
    t_stat, p_val = ttest_ind(c_norm, e_norm, equal_var=False)

    # Append the row as a plain list. Wrapping it in np.array would coerce the
    # numeric statistic and p-value to strings because of the string label.
    results.loc[len(results)] = [f't_test_priority_{pr}', t_stat, p_val]

    print(f"Priority {pr}: t-stat = {t_stat:.3f}, p = {p_val:.3g}")

Priority 1: t-stat = 24.596, p = 6.33e-126
Priority 2: t-stat = 1.945, p = 0.0625
Priority 3: t-stat = -13.487, p = 3.35e-41
Priority 4: t-stat = -14.335, p = 5.31e-32
Priority 5: t-stat = -67.659, p = 0
Priority 6: t-stat = 8.117, p = 8.88e-15
Priority 7: t-stat = 0.643, p = 0.52
Priority 8: t-stat = 14.229, p = 1.54e-43
Priority 9: t-stat = 1.978, p = 0.0792
Priority P: t-stat = 0.682, p = 0.495


#### Take a quick look at the results so far

In [12]:
# Display the accumulated results table (one row per test run so far).
results

Unnamed: 0,test,statistic,p_value
0,ANOVA,10390.481645113088,0.0
1,t_test_agency,81.98689179525557,0.0
2,t_test_priority_1,24.596237858049605,6.334146341463216e-126
3,t_test_priority_2,1.9450806917662704,0.0624803385217386
4,t_test_priority_3,-13.487246163129084,3.347214007213987e-41
5,t_test_priority_4,-14.334843280695724,5.314287148346843e-32
6,t_test_priority_5,-67.65893370433757,0.0
7,t_test_priority_6,8.117497376606899,8.883262582404328e-15
8,t_test_priority_7,0.6433414758608552,0.5201247482493778
9,t_test_priority_8,14.228776619270278,1.5416443941575946e-43


### For priorities where the difference is *statistically significant*, look at the means of the two groups

In [13]:
# Raw mean of `secs_to_arrv` for priority 1 calls, per agency.
cahoots_p1_mean = np.mean(cahoots_p1)
epd_p1_mean = np.mean(epd_p1)

print(f'cahoots priority 1 mean: {cahoots_p1_mean}')
print(f'epd priority 1 mean: {epd_p1_mean}')

cahoots priority 1 mean: 1342.6136242682278
epd priority 1 mean: 772.6996636698791


In [14]:
# Raw mean of `secs_to_arrv` for priority 3 calls, per agency.
cahoots_p3_mean = np.mean(cahoots_p3)
epd_p3_mean = np.mean(epd_p3)

print(f'cahoots priority 3 mean: {cahoots_p3_mean}')
print(f'epd priority 3 mean: {epd_p3_mean}')

cahoots priority 3 mean: 1963.7854251012145
epd priority 3 mean: 2320.64217950422


In [15]:
# Raw mean of `secs_to_arrv` for priority 4 calls, per agency.
cahoots_p4_mean = np.mean(cahoots_p4)
epd_p4_mean = np.mean(epd_p4)

print(f'cahoots priority 4 mean: {cahoots_p4_mean}')
print(f'epd priority 4 mean: {epd_p4_mean}')

cahoots priority 4 mean: 5430.15
epd priority 4 mean: 11597.646849819173


In [16]:
# Raw mean of `secs_to_arrv` for priority 5 calls, per agency.
cahoots_p5_mean = np.mean(cahoots_p5)
epd_p5_mean = np.mean(epd_p5)

print(f'cahoots priority 5 mean: {cahoots_p5_mean}')
print(f'epd priority 5 mean: {epd_p5_mean}')

cahoots priority 5 mean: 2313.0375392023316
epd priority 5 mean: 7385.411898213666


In [17]:
# Raw mean of `secs_to_arrv` for priority 6 calls, per agency.
cahoots_p6_mean = np.mean(cahoots_p6)
epd_p6_mean = np.mean(epd_p6)

print(f'cahoots priority 6 mean: {cahoots_p6_mean}')
print(f'epd priority 6 mean: {epd_p6_mean}')

cahoots priority 6 mean: 4733.07250755287
epd priority 6 mean: 1716.3178589548452


In [18]:
# Raw mean of `secs_to_arrv` for priority 8 calls, per agency.
cahoots_p8_mean = np.mean(cahoots_p8)
epd_p8_mean = np.mean(epd_p8)

print(f'cahoots priority 8 mean: {cahoots_p8_mean}')
print(f'epd priority 8 mean: {epd_p8_mean}')

cahoots priority 8 mean: 24488.431353456668
epd priority 8 mean: 9742.138806851743


### RESULTS AND INTERPRETATION:

| Priority | t-stat | p-value   | Interpretation                                  |
| -------- | ------ | --------- | ----------------------------------------------- |
| 1        | 24.60  | 6.33e-126 | Huge difference — EPD is **much faster** |
| 2        | 1.95   | 0.0625  | Marginal — not statistically significant |
| 3        | -13.49 | 3.35e-41  | CAHOOTS is **faster**                           |
| 4        | -14.34 | 5.31e-32  | CAHOOTS is **faster**                           |
| 5        | -67.66 | \~0       | Huge difference — CAHOOTS is **much faster** |
| 6        | 8.12   | 8.88e-15  | EPD is **faster**                               |
| 7        | 0.64   | 0.52    | No difference                            |
| 8        | 14.23  | 1.54e-43  | EPD is **faster**                               |
| 9        | 1.98   | 0.079   | Marginal — not statistically significant      |
| P        | 0.68   | 0.495   | No difference                            |


### General consensus: 
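- CAHOOTS faster for (most) higher priority calls (priorities 3, 4, and 5; the exception is priority 1, where EPD is faster)
- EPD faster for (most) lower priority calls (priorities 6 and 8)
- No statistically significant difference for priorities 2, 7, 9, and P
- **Not uniform across all priority levels**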

### Save results to .csv file

In [19]:
# Write the accumulated test results to disk; the DataFrame index is written
# as the first, unnamed column.
results_path = '../results/priority_results.csv'
results.to_csv(results_path)