In [1]:
from pathlib import Path
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import datetime

In [2]:
 # Create a file path 
url = "/Users/aileen/Downloads/chicago_crimes_all_cleaned (2).csv"

 # Read in the data. 
crime_data = pd.read_csv(url)
crime_data.head()

Unnamed: 0,Case Number,Date,Block,Primary Type,Description,District,Ward,Year,Latitude,Longitude,Location,Population,Zipcode
0,JC104662,12/31/2018 11:59:00 PM,112XX S SACRAMENTO AVE,MINOR PROPERTY CRIME,TO VEHICLE,22,19.0,2018,41.689079,-87.696064,"(41.689078832, -87.696064026)",28804.0,60655.0
1,JC100043,12/31/2018 11:57:00 PM,084XX S SANGAMON ST,MINOR PROPERTY CRIME,TO PROPERTY,6,21.0,2018,41.740521,-87.647391,"(41.740520866, -87.647390719)",68096.0,60620.0
2,JC100006,12/31/2018 11:56:00 PM,018XX S ALLPORT ST,ASSAULT & BATTERY,AGG: HANDS/FIST/FEET NO/MINOR INJURY,12,25.0,2018,41.857068,-87.657625,"(41.857068095, -87.657625201)",79205.0,60608.0
3,JC100031,12/31/2018 11:55:00 PM,078XX S SANGAMON ST,ASSAULT & BATTERY,DOMESTIC BATTERY SIMPLE,6,17.0,2018,41.751914,-87.647717,"(41.75191443, -87.647716532)",68096.0,60620.0
4,JC100026,12/31/2018 11:49:00 PM,052XX W GLADYS AVE,ASSAULT & BATTERY,AGGRAVATED - HANDGUN,15,29.0,2018,41.875684,-87.760479,"(41.87568438, -87.760479356)",47712.0,60644.0


In [3]:
#calculating the total count of districts 
counts_of_districts = crime_data["District"].nunique()
counts_of_districts

22

In [6]:
#calculating total crime
total_crime = crime_data["Primary Type"].count()
total_crime

1145670

In [7]:
crime_count_by_district = crime_data.groupby("District")["Primary Type"].count()
crime_count_by_district

District
1     59665
2     52762
3     57296
4     63844
5     52105
6     74703
7     59949
8     67500
9     50039
10    53620
11    77988
12    60084
14    39203
15    39493
16    37940
17    30992
18    61019
19    53373
20    21917
22    35809
24    36947
25    59422
Name: Primary Type, dtype: int64

In [8]:
#expected value
expected_total = total_crime/22
expected_total

52075.90909090909

In [9]:
# Observed counts of crime by district (cumulative, over the last 5 years)
observed = pd.Series([59665.00000000000,52762.00000000000,57296.00000000000,63844.00000000000,52105.00000000000,74703.00000000000,59949.00000000000,67500.00000000000,50039.00000000000,53620.00000000000,77988.00000000000,60084.00000000000,39203.00000000000,39493.00000000000,37940.00000000000,30992.00000000000,61019.00000000000,53373.00000000000,21917.00000000000,35809.00000000000,36947.00000000000,59422.00000000000], 
                     index=['1','2','3','4','5','6','7','8','9','10','11','12','14','15','16','17','18','19','20','22','24','25'])

In [10]:
# Create a data frame
df = pd.DataFrame([observed]).T

In [11]:
# Add a column whose default values are the expected values
df[1] = 52075.90909090909

In [12]:
# Rename columns
df.columns = ["observed", "expected"]

In [13]:
# View the data frame
df 

Unnamed: 0,observed,expected
1,59665.0,52075.909091
2,52762.0,52075.909091
3,57296.0,52075.909091
4,63844.0,52075.909091
5,52105.0,52075.909091
6,74703.0,52075.909091
7,59949.0,52075.909091
8,67500.0,52075.909091
9,50039.0,52075.909091
10,53620.0,52075.909091


In [14]:
# The degree of freedom is 22-1 = 21
# With a p-value of 0.05, the confidence level is 1.00-0.05 = 0.95.
critical_value = stats.chi2.ppf(q = 0.95, df = 21)
# The critical value
critical_value 

32.670573340917315

In [15]:
# Run the chi square test with stats.chisquare()
stats.chisquare(df['observed'], df['expected']) 

Power_divergenceResult(statistic=82280.93870311696, pvalue=0.0)

Conclusion

Since the chi square value of 82,281 exceeds the critical value of 32.67, we conclude that the results are statistically significant.

In [16]:
#get an array of all years
years = crime_data["Year"].unique()

In [17]:
# Split data file into years
df_2018 = crime_data[crime_data["Year"]==2018]
df_2019 = crime_data[crime_data["Year"]==2019]
df_2020 = crime_data[crime_data["Year"]==2020]
df_2021 = crime_data[crime_data["Year"]==2021]
df_2022 = crime_data[crime_data["Year"]==2022]
df_2018

Unnamed: 0,Case Number,Date,Block,Primary Type,Description,District,Ward,Year,Latitude,Longitude,Location,Population,Zipcode
0,JC104662,12/31/2018 11:59:00 PM,112XX S SACRAMENTO AVE,MINOR PROPERTY CRIME,TO VEHICLE,22,19.0,2018,41.689079,-87.696064,"(41.689078832, -87.696064026)",28804.0,60655.0
1,JC100043,12/31/2018 11:57:00 PM,084XX S SANGAMON ST,MINOR PROPERTY CRIME,TO PROPERTY,6,21.0,2018,41.740521,-87.647391,"(41.740520866, -87.647390719)",68096.0,60620.0
2,JC100006,12/31/2018 11:56:00 PM,018XX S ALLPORT ST,ASSAULT & BATTERY,AGG: HANDS/FIST/FEET NO/MINOR INJURY,12,25.0,2018,41.857068,-87.657625,"(41.857068095, -87.657625201)",79205.0,60608.0
3,JC100031,12/31/2018 11:55:00 PM,078XX S SANGAMON ST,ASSAULT & BATTERY,DOMESTIC BATTERY SIMPLE,6,17.0,2018,41.751914,-87.647717,"(41.75191443, -87.647716532)",68096.0,60620.0
4,JC100026,12/31/2018 11:49:00 PM,052XX W GLADYS AVE,ASSAULT & BATTERY,AGGRAVATED - HANDGUN,15,29.0,2018,41.875684,-87.760479,"(41.87568438, -87.760479356)",47712.0,60644.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
894540,JB115033,01/02/2018 11:00:00 AM,064XX W ADDISON ST,THEFT (ALL TYPES),OVER $500,16,36.0,2018,41.945654,-87.786853,"(41.945653748, -87.786852586)",24272.0,60706.0
894541,JB101356,01/02/2018 08:05:00 AM,051XX N NORMANDY AVE,ASSAULT & BATTERY,SIMPLE,16,41.0,2018,41.974559,-87.793957,"(41.974559221, -87.793956917)",24272.0,60706.0
894542,JB101127,01/01/2018 10:30:00 PM,062XX W CORNELIA AVE,OTHER MINOR CRIMES,HARASSMENT BY ELECTRONIC MEANS,16,36.0,2018,41.943873,-87.782380,"(41.943873325, -87.782379691)",24272.0,60706.0
894543,JB100762,01/01/2018 12:00:00 PM,063XX W WAVELAND AVE,OTHER MINOR CRIMES,HARASSMENT BY ELECTRONIC MEANS,16,38.0,2018,41.947501,-87.784668,"(41.947501166, -87.784667989)",24272.0,60706.0


In [18]:
# Create a data frame of population from year to year, noting that zip codes repeat, thus creating repeated population counts,
# since we pulled population based on zip code. So, we groupby unique zip code, get the population for each zip code, and sum by population,
# then by population across all zip codes for one year
population = [df_2018.groupby("Zipcode")["Population"].unique().sum().sum(),
              df_2019.groupby("Zipcode")["Population"].unique().sum().sum(),
              df_2020.groupby("Zipcode")["Population"].unique().sum().sum(),
              df_2021.groupby("Zipcode")["Population"].unique().sum().sum(),
              df_2022.groupby("Zipcode")["Population"].unique().sum().sum()]


In [19]:
# Look at types of crime per capita on the year level
# Count total crimes by crime type per year
year_crime_cnt = crime_data.groupby("Year")["Primary Type"].count()
year_crime_cnt

# Put it in a dataframe
crime_by_year = pd.DataFrame({"Year" : year_crime_cnt.index, "Total Crime" : year_crime_cnt.values})
crime_by_year

Unnamed: 0,Year,Total Crime
0,2018,258966
1,2019,254518
2,2020,204271
3,2021,199060
4,2022,228855


In [20]:
#total number of unique crime
crime_2018 = df_2018["Primary Type"].count()
crime_2018

258966

In [21]:
#crime by district
crime_by_district_2018 = df_2018.groupby("District")["Primary Type"].count()
crime_by_district_2018

District
1     15249
2     11608
3     11926
4     13569
5     11775
6     16272
7     14124
8     15203
9     10874
10    12480
11    18817
12    12818
14     9151
15     8778
16     7945
17     7201
18    15829
19    11930
20     4836
22     7887
24     7546
25    13148
Name: Primary Type, dtype: int64

In [22]:
#expected value
expected_2018 = crime_2018/22
expected_2018

11771.181818181818

In [23]:
# Observed counts of crime by district
observed_2018 = pd.Series([15249.000000000000,11608.000000000000,11926.000000000000,13569.000000000000,11775.000000000000,16272.000000000000,14124.000000000000,15203.000000000000,10874.000000000000,12480.000000000000,18817.000000000000,12818.000000000000,9151.000000000000,8778.000000000000,7945.000000000000,7201.000000000000,15829.000000000000,11930.000000000000,4836.000000000000,7887.000000000000,7546.000000000000,13148.000000000000], 
                     index=['1','2','3','4','5','6','7','8','9','10','11','12','14','15','16','17','18','19','20','22','24','25'])

In [24]:
# Create a data frame
chi_2018 = pd.DataFrame([observed_2018]).T

In [25]:
 # Add a column whose default values are the expected values
chi_2018[1] = 11771.181818181818

In [26]:
# Rename columns
chi_2018.columns = ["observed", "expected"]

In [27]:
# View the data frame
chi_2018

Unnamed: 0,observed,expected
1,15249.0,11771.181818
2,11608.0,11771.181818
3,11926.0,11771.181818
4,13569.0,11771.181818
5,11775.0,11771.181818
6,16272.0,11771.181818
7,14124.0,11771.181818
8,15203.0,11771.181818
9,10874.0,11771.181818
10,12480.0,11771.181818


In [28]:
# Run the chi square test with stats.chisquare()
stats.chisquare(chi_2018['observed'], chi_2018['expected'])

Power_divergenceResult(statistic=21728.357884818855, pvalue=0.0)

Conclusion

Since the chi square value of 21,728 exceeds the critical value of 32.67, we conclude that the results are statistically significant.

In [29]:
#total number of unique crime
crime_2019 = df_2019["Primary Type"].count()
crime_2019

254518

In [30]:
#crime by district
crime_by_district_2019 = df_2019.groupby("District")["Primary Type"].count()
crime_by_district_2019

District
1     15002
2     11072
3     12404
4     13518
5     11274
6     16755
7     13674
8     14598
9     10934
10    12413
11    18597
12    12960
14     9106
15     8867
16     7710
17     6440
18    14919
19    11531
20     4284
22     7745
24     7874
25    12841
Name: Primary Type, dtype: int64

In [31]:
#expected value
expected_2019 = crime_2019/22
expected_2019

11569.0

In [32]:
# Observed counts of crime by district
observed_2019 = pd.Series([15002, 11072,12404,13518,11274,16755,13674,14598,10934,12413,18597,12960,9106,8867,7710,6440,14919,11531,4284,7745,7874,12841], 
                     index=['1','2','3','4','5','6','7','8','9','10','11','12','14','15','16','17','18','19','20','22','24','25'])

In [33]:
# Create a data frame
chi_2019 = pd.DataFrame([observed_2019]).T

In [34]:
 # Add a column whose default values are the expected values
chi_2019[1] = 11569

In [35]:
# Rename columns
chi_2019.columns = ["observed", "expected"]

In [37]:
# View the data frame
chi_2019

Unnamed: 0,observed,expected
1,15002,11569
2,11072,11569
3,12404,11569
4,13518,11569
5,11274,11569
6,16755,11569
7,13674,11569
8,14598,11569
9,10934,11569
10,12413,11569


In [38]:
# Run the chi square test with stats.chisquare()
stats.chisquare(chi_2019['observed'], chi_2019['expected'])

Power_divergenceResult(statistic=22328.108393119543, pvalue=0.0)

Conclusion

Since the chi square value of 22,328 exceeds the critical value of 32.67, we conclude that the results are statistically significant.

In [39]:
#total number of unique crime
crime_2020 = df_2020["Primary Type"].count()
crime_2020

204271

In [40]:
#crime by district
crime_by_district_2020 = df_2020.groupby("District")["Primary Type"].count()
crime_by_district_2020

District
1      8150
2      9270
3     10869
4     11711
5     10067
6     13902
7     11648
8     12178
9      9105
10     9994
11    14780
12     9886
14     6455
15     7813
16     7024
17     5566
18     8555
19     8977
20     3930
22     6628
24     6621
25    11142
Name: Primary Type, dtype: int64

In [41]:
#expected value
expected_2020 = crime_2020/22
expected_2020

9285.045454545454

In [42]:
# Observed counts of crime by district
observed_2020 = pd.Series([8150.000000000000,9270.000000000000,10869.000000000000,11711.000000000000,10067.000000000000,13902.000000000000,11648.000000000000,12178.000000000000,9105.000000000000,9994.000000000000,14780.000000000000,9886.000000000000,6455.000000000000,7813.000000000000,7024.000000000000,5566.000000000000,8555.000000000000,8977.000000000000,3930.000000000000,6628.000000000000,6621.000000000000,11142.000000000000], 
                     index=['1','2','3','4','5','6','7','8','9','10','11','12','14','15','16','17','18','19','20','22','24','25'])

In [43]:
# Create a data frame
chi_2020 = pd.DataFrame([observed_2020]).T

In [44]:
 # Add a column whose default values are the expected values
chi_2020[1] = 9285.045454545454

In [45]:
# Rename columns
chi_2020.columns = ["observed", "expected"]

In [46]:
# View the data frame
chi_2020

Unnamed: 0,observed,expected
1,8150.0,9285.045455
2,9270.0,9285.045455
3,10869.0,9285.045455
4,11711.0,9285.045455
5,10067.0,9285.045455
6,13902.0,9285.045455
7,11648.0,9285.045455
8,12178.0,9285.045455
9,9105.0,9285.045455
10,9994.0,9285.045455


In [47]:
 # Run the chi square test with stats.chisquare()
stats.chisquare(chi_2020['observed'], chi_2020['expected'])

Power_divergenceResult(statistic=16443.998007548795, pvalue=0.0)

Conclusion

Since the chi square value of 16,443 exceeds the critical value of 32.67, we conclude that the results are statistically significant.

In [48]:
#total number of unique crime
crime_2021 = df_2021["Primary Type"].count()
crime_2021

199060

In [49]:
#crime by district
crime_by_district_2021 = df_2021.groupby("District")["Primary Type"].count()
crime_by_district_2021

District
1      8768
2      9313
3     10350
4     11805
5      9395
6     13307
7     10339
8     11845
9      8949
10     8938
11    12906
12    10506
14     6618
15     6869
16     7097
17     5315
18     9716
19     9216
20     4073
22     6473
24     6660
25    10602
Name: Primary Type, dtype: int64

In [50]:
#expected value
expected_2021 = crime_2021/22
expected_2021

9048.181818181818

In [51]:
# Observed counts of crime by district
observed_2021 = pd.Series([8768.000000000000,9313.000000000000,10350.000000000000,11805.000000000000,9395.000000000000,13307.000000000000,10339.000000000000,11845.000000000000,8949.000000000000,8938.000000000000,12906.000000000000,10506.000000000000,6618.000000000000,6869.000000000000,7097.000000000000,5315.000000000000,9716.000000000000,9216.000000000000,4073.000000000000,6473.000000000000,6660.000000000000,10602.000000000000], 
                     index=['1','2','3','4','5','6','7','8','9','10','11','12','14','15','16','17','18','19','20','22','24','25'])

In [52]:
# Create a data frame
chi_2021 = pd.DataFrame([observed_2021]).T

In [53]:
 # Add a column whose default values are the expected values
chi_2021[1] = 9048.181818181818

In [54]:
# Rename columns
chi_2021.columns = ["observed", "expected"]

In [55]:
# View the data frame
chi_2021

Unnamed: 0,observed,expected
1,8768.0,9048.181818
2,9313.0,9048.181818
3,10350.0,9048.181818
4,11805.0,9048.181818
5,9395.0,9048.181818
6,13307.0,9048.181818
7,10339.0,9048.181818
8,11845.0,9048.181818
9,8949.0,9048.181818
10,8938.0,9048.181818


In [56]:
 # Run the chi square test with stats.chisquare()
stats.chisquare(chi_2021['observed'], chi_2021['expected'])

Power_divergenceResult(statistic=13549.006611072038, pvalue=0.0)

Conclusion

Since the chi square value of 13,549 exceeds the critical value of 32.67, we conclude that the results are statistically significant.

In [57]:
#total number of unique crime
crime_2022 = df_2022["Primary Type"].count()
crime_2022

228855

In [58]:
#crime by district
crime_by_district_2022 = df_2022.groupby("District")["Primary Type"].count()
crime_by_district_2022

District
1     12496
2     11499
3     11747
4     13241
5      9594
6     14467
7     10164
8     13676
9     10177
10     9795
11    12888
12    13914
14     7873
15     7166
16     8164
17     6470
18    12000
19    11719
20     4794
22     7076
24     8246
25    11689
Name: Primary Type, dtype: int64

In [59]:
#expected value
expected_2022 = crime_2022/22
expected_2022

10402.5

In [60]:
# Observed counts of crime by district
observed_2022 = pd.Series([12496.0,11499.0,11747.0,13241.0,9594.0,14467.0,10164.0,13676.0,10177.0,9795.0,12888.0,13914.0,7873.0,7166.0,8164.0,6470.0,12000.0,11719.0,4794.0,7076.0,8246.0,11689.0], 
                     index=['1','2','3','4','5','6','7','8','9','10','11','12','14','15','16','17','18','19','20','22','24','25'])

In [61]:
# Create a data frame
chi_2022 = pd.DataFrame([observed_2022]).T

In [62]:
 # Add a column whose default values are the expected values
chi_2022[1] = 10402.5

In [63]:
# Rename columns
chi_2022.columns = ["observed", "expected"]

In [64]:
# View the data frame
chi_2022

Unnamed: 0,observed,expected
1,12496.0,10402.5
2,11499.0,10402.5
3,11747.0,10402.5
4,13241.0,10402.5
5,9594.0,10402.5
6,14467.0,10402.5
7,10164.0,10402.5
8,13676.0,10402.5
9,10177.0,10402.5
10,9795.0,10402.5


In [65]:
 # Run the chi square test with stats.chisquare()
stats.chisquare(chi_2022['observed'], chi_2022['expected'])

Power_divergenceResult(statistic=14687.330112953618, pvalue=0.0)

Conclusion

Since the chi square value of 14,687 exceeds the critical value of 32.67, we conclude that the results are statistically significant.

In [66]:
# Look at per capita crime rate per district over the five year timespan
district_df = crime_data[["Year", "District", "Primary Type", "Population", "Zipcode"]]
district_df

# Get the population per district by creating a new data frame and dropping any unique combinations of distict, zip code, and population
unique_pop = district_df[["District", "Zipcode", "Population"]].drop_duplicates()
pop_per_district = unique_pop.groupby("District")["Population"].sum()

# Get the per capita crime count
cnt_per_district = district_df.groupby("District")["Primary Type"].count()
cnt_per_district

# Get the per capita crime count
per_cap_crime_cnt = cnt_per_district/pop_per_district

# Put in a data frame
district_per_capita = pd.DataFrame({"District" : per_cap_crime_cnt.index, "Per Capita Crime Count" : per_cap_crime_cnt.values})
district_per_capita

Unnamed: 0,District,Per Capita Crime Count
0,1,0.09749
1,2,0.056675
2,3,0.055896
3,4,0.067163
4,5,0.036991
5,6,0.066932
6,7,0.079481
7,8,0.042925
8,9,0.024156
9,10,0.030581


In [115]:
total_capita = district_per_capita["Per Capita Crime Count"].sum()
total_capita

1.0285460386540908

In [116]:
# Observed counts of crime by district (cumulative, over the last 5 years)
per_capita_observed = pd.Series([0.097490000000000000,0.056675000000000000,0.055896000000000000,0.067163000000000000,0.036991000000000000,0.066932000000000000,0.079481000000000000,0.042925000000000000,0.024156000000000000,0.030581000000000000,0.060918000000000000,0.041275000000000000,0.025076000000000000,0.049419000000000000,0.024038000000000000,0.021608000000000000,0.075589000000000000,0.025457000000000000,0.019074000000000000,0.045885000000000000,0.053523000000000000,0.028393000000000000], 
                     index=['1','2','3','4','5','6','7','8','9','10','11','12','14','15','16','17','18','19','20','22','24','25'])

In [127]:
expected_per_capita = (per_capita_observed.sum()) / 22
expected_per_capita

0.04675204545454545

In [128]:
# Create a data frame
chi_per_capita = pd.DataFrame([per_capita_observed]).T

In [129]:
# Add a column whose default values are the expected values
chi_per_capita[1] = expected_per_capita

In [130]:
# Rename columns
chi_per_capita.columns = ["observed", "expected"]

In [131]:
# View the data frame
chi_per_capita 

Unnamed: 0,observed,expected
1,0.09749,0.046752
2,0.056675,0.046752
3,0.055896,0.046752
4,0.067163,0.046752
5,0.036991,0.046752
6,0.066932,0.046752
7,0.079481,0.046752
8,0.042925,0.046752
9,0.024156,0.046752
10,0.030581,0.046752


In [132]:
# Run the chi square test with stats.chisquare()
stats.chisquare(chi_per_capita['observed'], chi_per_capita['expected']) 

Power_divergenceResult(statistic=0.21013004358292542, pvalue=1.0)

Conclusion

Since the chi square value of 0.21 is below the critical value of 32.67, we conclude that the results are statistically significant.