In [1]:
import pandas as pd
from scipy import stats

In [2]:
# Read in Data from SQL Outputs and convert to a pandas data frame
felony_counts = '../SQL_Analysis/outputs/felony_counts.csv'
felony_severity = '../SQL_Analysis/outputs/felony_severity.csv'


counts_df = pd.read_csv(felony_counts)
severity_df = pd.read_csv(felony_severity)

counts_df

Unnamed: 0,court_district,charges_1_3,charges_4_6,charges_7_9,charges_10plus
0,1,2891,1174,298,238
1,2,11741,3476,597,360
2,3,31804,9747,1968,1051
3,4,8358,3243,705,409
4,5,5952,2326,628,276
5,6,2028,808,191,104
6,7,2043,915,222,99
7,8,2462,929,197,107


Going to test if court district makes an impact on the distribution of charges and violation severities. Since this categorical data will be using a Chi-Square Test of Independence, where the null hypothesis is that their would be no difference in count or severity distribution between the court district porportions. It will test this by finding the difference between observed and expected values throughout the categories.

In [3]:
# Find the porportion of charges groups 
# Find total cases for each court district
counts_df['total_cases'] = counts_df[['charges_1_3', 'charges_4_6', 
                                      'charges_7_9', 'charges_10plus']].sum(axis=1)

# Loop through each column except first one to divide the column by the number of total cases, 
# giving porportion by each district

for col in counts_df.columns:
    if col == 'court_district':
        continue
    else:
        counts_df[col] = counts_df[col]/ counts_df['total_cases']

# Create a contingency table for proportions to set up for chi-square test
contingency_table_counts = counts_df.set_index('court_district')[['charges_1_3', 
                                                           'charges_4_6', 'charges_7_9', 
                                                           'charges_10plus']]


contingency_table_counts

Unnamed: 0_level_0,charges_1_3,charges_4_6,charges_7_9,charges_10plus
court_district,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.628342,0.255162,0.064769,0.051728
2,0.725918,0.214913,0.036911,0.022258
3,0.713574,0.21869,0.044155,0.023581
4,0.657334,0.255053,0.055446,0.032167
5,0.648225,0.253322,0.068395,0.030059
6,0.647716,0.258065,0.061003,0.033216
7,0.623056,0.279048,0.067704,0.030192
8,0.666306,0.251421,0.053315,0.028958


In [4]:
chi2, p, dof, expected = stats.chi2_contingency(contingency_table_counts)

print("Chi-Square Test on Charge Counts:")
print("---------------------------------")
print(f"Chi-Square Statistic: {chi2}")
print(f"P-value: {p}")
print(f"Expectd Values: {expected}")


Chi-Square Test on Charge Counts:
---------------------------------
Chi-Square Statistic: 0.061604807414694734
P-value: 1.0
Expectd Values: [[0.66380883 0.24820914 0.0564622  0.03151983]
 [0.66380883 0.24820914 0.0564622  0.03151983]
 [0.66380883 0.24820914 0.0564622  0.03151983]
 [0.66380883 0.24820914 0.0564622  0.03151983]
 [0.66380883 0.24820914 0.0564622  0.03151983]
 [0.66380883 0.24820914 0.0564622  0.03151983]
 [0.66380883 0.24820914 0.0564622  0.03151983]
 [0.66380883 0.24820914 0.0564622  0.03151983]]


In [6]:
# Do the same test on the severity of the violations
severity_df

Unnamed: 0,court_district,count_capital,count_1st_degree_felony,count_2nd_degree_felony,count_3rd_degree_felony,count_class_a_misdemeanor,count_class_b_misdemeanor,count_class_c_misdemeanor,count_infraction
0,8,0,173,584,1934,877,119,7,1
1,7,0,98,667,1701,752,59,2,0
2,1,2,297,700,2302,1182,111,5,2
3,5,0,340,1990,4927,1706,196,15,8
4,2,0,736,2883,9747,2638,160,9,1
5,4,0,670,2073,6452,3115,379,16,5
6,6,0,119,591,1646,702,67,6,0
7,3,1,2673,8800,24002,8443,600,43,6


In [7]:
# Find total violations for each court district
severity_df['total_violations'] = severity_df[['count_capital', 'count_1st_degree_felony', 
                                               'count_2nd_degree_felony','count_3rd_degree_felony', 
                                               'count_class_a_misdemeanor', 'count_class_b_misdemeanor', 
                                               'count_class_c_misdemeanor',	'count_infraction']].sum(axis=1)

# Loop through each column except first one to divide the column by the number of total cases, 
# giving porportion by each district

for col in severity_df.columns:
    if col == 'court_district':
        continue
    else:
        severity_df[col] = severity_df[col]/ severity_df['total_violations']


# Create a contingency table for proportions to set up for chi-square test
contingency_table_violations = severity_df.set_index('court_district')[['count_capital', 'count_1st_degree_felony', 
                                               'count_2nd_degree_felony','count_3rd_degree_felony', 
                                               'count_class_a_misdemeanor', 'count_class_b_misdemeanor', 
                                               'count_class_c_misdemeanor',	'count_infraction']]


contingency_table_violations


Unnamed: 0_level_0,count_capital,count_1st_degree_felony,count_2nd_degree_felony,count_3rd_degree_felony,count_class_a_misdemeanor,count_class_b_misdemeanor,count_class_c_misdemeanor,count_infraction
court_district,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
8,0.0,0.04682,0.158051,0.52341,0.237348,0.032206,0.001894,0.000271
7,0.0,0.029887,0.203416,0.518756,0.229338,0.017993,0.00061,0.0
1,0.000435,0.064551,0.152141,0.500326,0.256901,0.024125,0.001087,0.000435
5,0.0,0.037029,0.216728,0.536593,0.185798,0.021346,0.001634,0.000871
2,0.0,0.045505,0.178249,0.602634,0.163101,0.009892,0.000556,6.2e-05
4,0.0,0.052714,0.1631,0.507632,0.245083,0.029819,0.001259,0.000393
6,0.0,0.038007,0.188758,0.525711,0.22421,0.021399,0.001916,0.0
3,2.2e-05,0.059976,0.197451,0.538548,0.189441,0.013463,0.000965,0.000135


In [9]:
chi2, p, dof, expected = stats.chi2_contingency(contingency_table_violations)

print("Chi-Square Test on Highest Violation Type:")
print("---------------------------------")
print(f"Chi-Square Statistic: {chi2}")
print(f"P-value: {p}")


Chi-Square Test on Highest Violation Type:
---------------------------------
Chi-Square Statistic: 0.11574397716251532
P-value: 1.0
