In [1]:
import pandas as pd
from scipy import stats

In [2]:
# Paths to the CSV exports written by the SQL analysis step
felony_counts = '../SQL_Analysis/outputs/felony_counts.csv'
felony_severity = '../SQL_Analysis/outputs/felony_severity.csv'
counts_vs_severity = '../SQL_Analysis/outputs/charge_vs_severity.csv'

# Load each export into its own pandas DataFrame (same order as the paths above)
counts_df, severity_df, versus_df = (
    pd.read_csv(csv_path)
    for csv_path in (felony_counts, felony_severity, counts_vs_severity)
)

# Show the charge-count table as the cell output
counts_df

Unnamed: 0,court_district,charges_1_3,charges_4_6,charges_7_9,charges_10plus
0,1,2891,1174,298,238
1,2,11741,3476,597,360
2,3,31804,9747,1968,1051
3,4,8358,3243,705,409
4,5,5952,2326,628,276
5,6,2028,808,191,104
6,7,2043,915,222,99
7,8,2462,929,197,107


To test whether court district affects the distribution of charge counts and violation severities, I will use the Chi-Square Test of Independence. This test is appropriate because both the violation severities and my grouping of charge counts are categorical data. The null hypothesis is that there is no difference in the charge-count or severity distribution between the court districts (i.e., the counts in each category are proportional to the number of cases in each district). I will test this by comparing the observed values with the expected values across the categories.

In [3]:
# Build the chi-square contingency table: court_district becomes the row index,
# leaving the remaining columns of counts_df as the table's categories.
contingency_table_counts = counts_df.set_index(keys='court_district')

contingency_table_counts

Unnamed: 0_level_0,charges_1_3,charges_4_6,charges_7_9,charges_10plus
court_district,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2891,1174,298,238
2,11741,3476,597,360
3,31804,9747,1968,1051
4,8358,3243,705,409
5,5952,2326,628,276
6,2028,808,191,104
7,2043,915,222,99
8,2462,929,197,107


In [4]:
# Chi-square test of independence on the court-district x charge-count table.
# chi2_contingency returns the test statistic, the p-value, the degrees of
# freedom, and the expected frequencies under the null hypothesis.
chi2, p, dof, expected = stats.chi2_contingency(contingency_table_counts.values)

print("Chi-Square Test on Charge Counts:")
print("---------------------------------")
print(f"Chi-Square Statistic: {chi2}")
print(f"P-value: {p}")
# Label spelling corrected (was "Expectd Values")
print(f"Expected Values: {expected}")

Chi-Square Test on Charge Counts:
---------------------------------
Chi-Square Statistic: 688.9945076145193
P-value: 2.8040113820653452e-132
Expectd Values: [[ 3179.86870679  1069.01515198   227.15035902   124.9657822 ]
 [11178.26482583  3757.93329019   798.5068261    439.29505789]
 [30803.46625987 10355.57603213  2200.41110666  1210.54660133]
 [ 8787.66150986  2954.25508747   627.73675614   345.34664653]
 [ 6345.91490236  2133.38342219   453.31332244   249.38835301]
 [ 2163.91413192   727.46934163   154.57678203    85.03974442]
 [ 2266.20071497   761.85626676   161.88350951    89.05950877]
 [ 2553.7089484    858.51140764   182.4213381    100.35830585]]


In [5]:
# Do the same test on the severity of the violations.
# Drop the capital and infraction charges: their expected values are lower than 5,
# which can distort the chi-square test.
# errors='ignore' keeps this cell idempotent: severity_df is reassigned here, so a
# second run would otherwise raise KeyError because the columns are already gone.
severity_df = severity_df.drop(
    columns=['count_capital', 'count_infraction'],
    errors='ignore',
)

# Create a contingency table (court_district as the row index) for the chi-square test
contingency_table_violations = severity_df.set_index('court_district')
contingency_table_violations

Unnamed: 0_level_0,count_1st_degree_felony,count_2nd_degree_felony,count_3rd_degree_felony,count_class_a_misdemeanor,count_class_b_misdemeanor,count_class_c_misdemeanor
court_district,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
8,173,584,1934,877,119,7
7,98,667,1701,752,59,2
1,297,700,2302,1182,111,5
5,340,1990,4927,1706,196,15
2,736,2883,9747,2638,160,9
4,670,2073,6452,3115,379,16
6,119,591,1646,702,67,6
3,2673,8800,24002,8443,600,43


In [9]:
# Chi-square test of independence on the court-district x violation-severity table.
# Only the statistic and p-value are reported; dof and expected are unpacked but unused here.
observed_violations = contingency_table_violations.values
chi2, p, dof, expected = stats.chi2_contingency(observed_violations)

report_lines = (
    "Chi-Square Test on Highest Violation Type:",
    "---------------------------------",
    f"Chi-Square Statistic: {chi2}",
    f"P-value: {p}",
)
for report_line in report_lines:
    print(report_line)


Chi-Square Test on Highest Violation Type:
---------------------------------
Chi-Square Statistic: 1175.4346036539723
P-value: 3.37748027118191e-224


In [7]:
# Side question: relationship between highest severity rank and max charge amount.
# Keep only rows whose highest_severity_rank is not 0
# (presumably 0 marks cases without a ranked violation; confirm against the SQL output).
has_nonzero_rank = versus_df['highest_severity_rank'].ne(0)
filtered_df = versus_df.loc[has_nonzero_rank]

The relationship between the highest violation rank and the maximum number of charges will be tested with Kendall's Tau correlation. This test is used because the 8 different types of violations are ordinal data: the order of the categories matters but, unlike numerical data, the intervals between them do not. The ranks also contain many ties, which Kendall's Tau handles well.

In [8]:
# Measure the rank-based (ordinal) association between highest severity rank and
# max charge count with Kendall's Tau. The call is stats.kendalltau, so the label
# below names Kendall's Tau (it previously said "Spearman").
correlation, p_value = stats.kendalltau(filtered_df['highest_severity_rank'], filtered_df['max_charge'])
print(f"Kendall's Tau Correlation Coefficient: {correlation}")
# Report the p-value as well, matching the chi-square cells
print(f"P-value: {p_value}")


Spearman Correlation Coefficient: 0.18333871340736435
