In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [31]:
# convert JSON data to pandas df
df = pd.read_json('crime-data.json')

df

Unnamed: 0,jurisdiction,year,population,murder,rape,robbery,agg_assault,b_e,larceny_theft,m_v_theft,...,overall_percent_change_per_100_000_people,violent_crime_rate_percent_change_per_100_000_people,property_crime_rate_percent_change_per_100_000_people,murder_rate_percent_change_per_100_000_people,rape_rate_percent_change_per_100_000_people,robbery_rate_percent_change_per_100_000_people,agg_assault_rate_percent_change_per_100_000_people,b_e_rate_percent_change_per_100_000_people,larceny_theft_rate_percent_change_per_100_000_people,m_v_theft_rate_percent_change_per_100_000_people
0,Allegany County,1975,79655,3,5,20,114,669,1425,93,...,,,,,,,,,,
1,Allegany County,1976,83923,2,2,24,59,581,1384,73,...,-13.4,-41.8,-11.6,-36.7,-62.0,13.9,-50.9,-17.6,-7.8,-25.5
2,Allegany County,1977,82102,3,7,32,85,592,1390,102,...,6.4,49.2,4.5,53.3,257.8,36.3,47.3,4.2,2.7,42.8
3,Allegany County,1978,79966,1,2,18,81,539,1390,100,...,-1.0,-17.5,0.0,-65.8,-70.7,-42.2,-2.2,-6.5,2.7,0.7
4,Allegany County,1979,79721,1,7,18,84,502,1611,99,...,9.3,8.2,9.4,0.3,251.1,0.3,4.0,-6.6,16.3,-0.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,Washington County,2004,138016,5,17,88,426,699,2138,270,...,1.1,7.2,0.1,390.9,-27.4,-19.3,16.2,0.5,0.4,-2.9
996,Washington County,2005,140687,4,18,111,388,702,2306,261,...,2.1,-4.6,3.2,-21.5,3.9,23.7,-10.6,-1.5,5.8,-5.2
997,Washington County,2006,142284,4,28,158,378,786,2367,300,...,4.9,7.8,4.4,-1.1,53.8,40.7,-3.7,10.7,1.5,13.7
998,Washington County,2007,144219,7,38,137,324,679,2338,240,...,-7.7,-12.1,-6.9,72.7,33.9,-14.5,-15.4,-14.8,-2.6,-21.1


In [32]:
# cleaning data
df = df.fillna(0)
df = df.set_index(['jurisdiction', 'year'])

df

Unnamed: 0_level_0,Unnamed: 1_level_0,population,murder,rape,robbery,agg_assault,b_e,larceny_theft,m_v_theft,grand_total,violent_crime_total,...,overall_percent_change_per_100_000_people,violent_crime_rate_percent_change_per_100_000_people,property_crime_rate_percent_change_per_100_000_people,murder_rate_percent_change_per_100_000_people,rape_rate_percent_change_per_100_000_people,robbery_rate_percent_change_per_100_000_people,agg_assault_rate_percent_change_per_100_000_people,b_e_rate_percent_change_per_100_000_people,larceny_theft_rate_percent_change_per_100_000_people,m_v_theft_rate_percent_change_per_100_000_people
jurisdiction,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Allegany County,1975,79655,3,5,20,114,669,1425,93,2329,142,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Allegany County,1976,83923,2,2,24,59,581,1384,73,2125,87,...,-13.4,-41.8,-11.6,-36.7,-62.0,13.9,-50.9,-17.6,-7.8,-25.5
Allegany County,1977,82102,3,7,32,85,592,1390,102,2211,127,...,6.4,49.2,4.5,53.3,257.8,36.3,47.3,4.2,2.7,42.8
Allegany County,1978,79966,1,2,18,81,539,1390,100,2131,102,...,-1.0,-17.5,0.0,-65.8,-70.7,-42.2,-2.2,-6.5,2.7,0.7
Allegany County,1979,79721,1,7,18,84,502,1611,99,2322,110,...,9.3,8.2,9.4,0.3,251.1,0.3,4.0,-6.6,16.3,-0.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Washington County,2004,138016,5,17,88,426,699,2138,270,3643,536,...,1.1,7.2,0.1,390.9,-27.4,-19.3,16.2,0.5,0.4,-2.9
Washington County,2005,140687,4,18,111,388,702,2306,261,3790,521,...,2.1,-4.6,3.2,-21.5,3.9,23.7,-10.6,-1.5,5.8,-5.2
Washington County,2006,142284,4,28,158,378,786,2367,300,4021,568,...,4.9,7.8,4.4,-1.1,53.8,40.7,-3.7,10.7,1.5,13.7
Washington County,2007,144219,7,38,137,324,679,2338,240,3763,506,...,-7.7,-12.1,-6.9,72.7,33.9,-14.5,-15.4,-14.8,-2.6,-21.1


In [33]:
# normalize number of crimes commited for each county, for every year
df_crimes = df[['murder', 'rape', 'robbery', 'agg_assault', 'b_e', 'larceny_theft', 'm_v_theft']]
df_crimes_normed = df_crimes.divide(df_crimes.sum(axis=1), axis=0)

df_crimes_normed

Unnamed: 0_level_0,Unnamed: 1_level_0,murder,rape,robbery,agg_assault,b_e,larceny_theft,m_v_theft
jurisdiction,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Allegany County,1975,0.001288,0.002147,0.008587,0.048948,0.287248,0.611851,0.039931
Allegany County,1976,0.000941,0.000941,0.011294,0.027765,0.273412,0.651294,0.034353
Allegany County,1977,0.001357,0.003166,0.014473,0.038444,0.267752,0.628675,0.046133
Allegany County,1978,0.000469,0.000939,0.008447,0.038010,0.252933,0.652276,0.046926
Allegany County,1979,0.000431,0.003015,0.007752,0.036176,0.216193,0.693798,0.042636
...,...,...,...,...,...,...,...,...
Washington County,2004,0.001372,0.004666,0.024156,0.116937,0.191875,0.586879,0.074115
Washington County,2005,0.001055,0.004749,0.029288,0.102375,0.185224,0.608443,0.068865
Washington County,2006,0.000995,0.006963,0.039294,0.094006,0.195474,0.588660,0.074608
Washington County,2007,0.001860,0.010098,0.036407,0.086102,0.180441,0.621313,0.063779


## Use Jaccard Similarity by Crime Percent Change to Predict Population

In [34]:
# create df with chosen crime statistics
df_population_rates = df.iloc[:, 8:]
df_population_rates['population'] = df['population']

df_population_rates

Unnamed: 0_level_0,Unnamed: 1_level_0,grand_total,violent_crime_total,violent_crime_percent,property_crime_totals,property_crime_percent,overall_crime_rate_per_100_000_people,violent_crime_rate_per_100_000_people,property_crime_rate_per_100_000_people,murder_per_100_000_people,rape_per_100_000_people,...,violent_crime_rate_percent_change_per_100_000_people,property_crime_rate_percent_change_per_100_000_people,murder_rate_percent_change_per_100_000_people,rape_rate_percent_change_per_100_000_people,robbery_rate_percent_change_per_100_000_people,agg_assault_rate_percent_change_per_100_000_people,b_e_rate_percent_change_per_100_000_people,larceny_theft_rate_percent_change_per_100_000_people,m_v_theft_rate_percent_change_per_100_000_people,population
jurisdiction,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Allegany County,1975,2329,142,6.1,2187,93.9,2923.9,178.3,2745.6,3.8,6.3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,79655
Allegany County,1976,2125,87,4.1,2038,95.9,2532.1,103.7,2428.4,2.4,2.4,...,-41.8,-11.6,-36.7,-62.0,13.9,-50.9,-17.6,-7.8,-25.5,83923
Allegany County,1977,2211,127,5.7,2084,94.3,2693.0,154.7,2538.3,3.7,8.5,...,49.2,4.5,53.3,257.8,36.3,47.3,4.2,2.7,42.8,82102
Allegany County,1978,2131,102,4.8,2029,95.2,2664.9,127.6,2537.3,1.3,2.5,...,-17.5,0.0,-65.8,-70.7,-42.2,-2.2,-6.5,2.7,0.7,79966
Allegany County,1979,2322,110,4.7,2212,95.3,2912.7,138.0,2774.7,1.3,8.8,...,8.2,9.4,0.3,251.1,0.3,4.0,-6.6,16.3,-0.7,79721
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Washington County,2004,3643,536,14.7,3107,85.3,2639.5,388.4,2251.2,3.6,12.3,...,7.2,0.1,390.9,-27.4,-19.3,16.2,0.5,0.4,-2.9,138016
Washington County,2005,3790,521,13.7,3269,86.3,2693.9,370.3,2323.6,2.8,12.8,...,-4.6,3.2,-21.5,3.9,23.7,-10.6,-1.5,5.8,-5.2,140687
Washington County,2006,4021,568,14.1,3453,85.9,2826.0,399.2,2426.8,2.8,19.7,...,7.8,4.4,-1.1,53.8,40.7,-3.7,10.7,1.5,13.7,142284
Washington County,2007,3763,506,13.4,3257,86.6,2609.2,350.9,2258.4,4.9,26.3,...,-12.1,-6.9,72.7,33.9,-14.5,-15.4,-14.8,-2.6,-21.1,144219


In [35]:
# split test/train data
all_ids = df.index.values
training_ids, testing_ids = train_test_split(all_ids, test_size=0.2)

training_ids.shape, testing_ids.shape

((800,), (200,))

In [36]:
k_nn = 10
prediction_rows = []

# find k most similar counties in the training set

for test_id in testing_ids:

    target_obj = set(df_population_rates.loc[test_id[0], test_id[1]].values)
    
    test_county_similarities = []

    for this_id in training_ids:
        this_obj = set(df_population_rates.loc[this_id[0], this_id[1]].values)

        this_intersect = this_obj.intersection(target_obj)
        this_union = this_obj.union(target_obj)

        jaccard = len(this_intersect) / len(this_union)

        test_county_similarities.append({
            "county": this_id,
            "jaccard": jaccard,
        })

    similarity_df = pd.DataFrame(test_county_similarities, columns=["county", "jaccard"])
    top_counties = similarity_df.sort_values(by="jaccard", ascending=False).head(k_nn)

    relevant_populations = []
    for this_id in top_counties["county"]:
        
        # add relevant populations
        relevant_populations.append(df['population'].loc[this_id[0], this_id[1]])
        
    # take average total population
    predicted_populations = np.mean(relevant_populations)

    # track predictions
    prediction_rows.append({
            "county": test_id,
            "population": predicted_populations,
        })
    
predicted_df = pd.DataFrame(prediction_rows).set_index("county")


In [37]:
correct_populations = []

# count number of accurate predictions
for county,row in predicted_df.iterrows():

    actual_population = df['population'].loc[county[0], county[1]]

    # mark correct if prediction is less/more than 40000 of the original total population
    correct_populations.append(1 if row['population'] >= actual_population-40000 and row['population'] <= actual_population+40000 else 0)
    
accuracy = sum(correct_populations) / len(correct_populations)
print("Accuracy:", accuracy)

Accuracy: 0.11


In [38]:
def find_real_population(row):
    return df['population'].loc[row.name[0], row.name[1]]

# create df comparing predicted and real populations
model = predicted_df
model['real_population'] = predicted_df.apply(find_real_population, axis=1)

model

Unnamed: 0_level_0,population,real_population
county,Unnamed: 1_level_1,Unnamed: 2_level_1
"(Dorchester County, 1977)",252232.6,29991
"(Cecil County, 2014)",93373.4,102296
"(Frederick County, 2005)",263072.4,219311
"(Allegany County, 2002)",55548.3,77216
"(Dorchester County, 2006)",156615.3,31487
...,...,...
"(Talbot County, 2013)",101771.3,38253
"(Anne Arundel County, 1977)",436927.8,347538
"(Harford County, 2018)",107116.8,251139
"(Somerset County, 2001)",105207.1,25115


## Find similarities for each County and Year

In [39]:
# create combined features column with all percent change rates
def combined_features(row):
    return str(row['murder_rate_percent_change_per_100_000_people'])+" "+str(row['rape_rate_percent_change_per_100_000_people'])+", "+str(row['robbery_rate_percent_change_per_100_000_people'])+", "+str(row['agg_assault_rate_percent_change_per_100_000_people'])+", "+str(row['b_e_rate_percent_change_per_100_000_people'])+", "+str(row['larceny_theft_rate_percent_change_per_100_000_people'])+", "+str(row['m_v_theft_rate_percent_change_per_100_000_people'])

df['all_change_rates'] = df.apply(combined_features, axis=1)
# count number of features present in each title
cv = CountVectorizer()
count_matrix = cv.fit_transform(df['all_change_rates'])
print("Count Matrix: ", count_matrix.toarray())

# set similarity metric
cosine_sim = cosine_similarity(count_matrix)

Count Matrix:  [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [40]:
df1 = df.reset_index()
similarity_dict = {}

for index, row in df1.iterrows():
    # generate distances from target county to all other counties
    similar_counties = list(enumerate(cosine_sim[index]))
    
    # sort to get most similar counties first
    sorted_similar_counties = sorted(similar_counties, key = lambda x:x[1], reverse = True)

    similarity_dict[index] = sorted_similar_counties

In [41]:
# sample similarities for target county and year

# Baltimore County, 1982
target_similar_counties = similarity_dict[99]
print(df1['jurisdiction'].loc[99], df1['year'].loc[99])

# print top ten most similar counties
i=0
for county in target_similar_counties:
    print(df1['jurisdiction'].loc[county[0]], df1['year'].loc[county[0]], county[1])
    i = i+1;
    if i>10:
        break

# Montgomery County, 2005
target_similar_counties = similarity_dict[720]
print(df1['jurisdiction'].loc[720], df1['year'].loc[720])

# print top ten most similar counties
i=0
for county in target_similar_counties:
    print(df1['jurisdiction'].loc[county[0]], df1['year'].loc[county[0]], county[1])
    i = i+1;
    if i>10:
        break

# St. Mary's County, 2011
target_similar_counties = similarity_dict[910]
print(df1['jurisdiction'].loc[910], df1['year'].loc[910])

# print top ten most similar countiess
i=0
for county in target_similar_counties:
    print(df1['jurisdiction'].loc[county[0]], df1['year'].loc[county[0]], county[1])
    i = i+1;
    if i>10:
        break

Baltimore City 1982
Baltimore City 1982 1.0000000000000002
Baltimore County 1999 0.8728715609439696
Baltimore County 1982 0.8660254037844388
Baltimore City 1977 0.816496580927726
Baltimore County 1996 0.816496580927726
Montgomery County 1990 0.816496580927726
Charles County 1996 0.7071067811865477
Anne Arundel County 1987 0.6666666666666669
Anne Arundel County 2001 0.6666666666666669
Anne Arundel County 2019 0.6666666666666669
Baltimore City 1980 0.6666666666666669
