In [57]:
# Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import json
import os
import warnings
warnings.filterwarnings('ignore')

# Load the cleaned NYPD complaint data.
# The CSV location can be overridden with the NYPD_COMPLAINT_CSV environment
# variable so the notebook is not tied to a single machine; when the variable
# is not set, the original local path is used.
filename = os.environ.get(
    'NYPD_COMPLAINT_CSV',
    r'C:\Users\jbh\Desktop\NYPD_Complaint_Data_Cleaned.csv',
)
df = pd.read_csv(filename)

# Show (rows, columns) as the cell output.
df.shape

(8170556, 30)

In [58]:
# Use case 1: focus crimes against individual victims.
# Focus crimes: FELONY ASSAULT, ROBBERY, RAPE
focus_crimes = ['FELONY ASSAULT', 'ROBBERY', 'RAPE']

# Victim_Sex codes that are removed from this subset:
#   D = Business/Organization, E = PSNY/People of the State of New York
#       (neither is relevant to this use case).
#   L = not explained by the dataset footnotes. We think it might be "Lady" or
#       "LGBTQ" but we are not sure, so these rows are dropped. In this subset
#       they only account for approx. 800 rows out of 800k.
# Any other value (including missing values) is kept, exactly as before.
excluded_victim_codes = ['D', 'E', 'L']

# One boolean mask instead of four successive filters.
keep_rows = (
    df['Offense_Description'].isin(focus_crimes)
    & ~df['Victim_Sex'].isin(excluded_victim_codes)
)

# Columns needed: Complaint_From_Date, Complaint_From_Time, Latitude, Longitude, Victim_Sex.
# .loc with a boolean mask yields a new frame and every later step returns a
# new frame as well, so no column is ever assigned onto a slice of `df`.
df_focus = (
    df.loc[
        keep_rows,
        ['Complaint_From_Date', 'Complaint_From_Time', 'Latitude', 'Longitude', 'Victim_Sex'],
    ]
    .assign(
        # day name of the complaint date ("Monday", "Tuesday", ...)
        weekday=lambda frame: pd.to_datetime(frame['Complaint_From_Date']).dt.day_name(),
        # hour of day, taken from a single parse of the HH:MM:SS strings; an
        # intermediate conversion to datetime.time objects followed by a second
        # parse is not needed to obtain the hour.
        hour=lambda frame: pd.to_datetime(
            frame['Complaint_From_Time'], format='%H:%M:%S'
        ).dt.hour,
    )
    # the raw date/time columns are only needed to derive weekday and hour
    .drop(columns=['Complaint_From_Date', 'Complaint_From_Time'])
    # convert latitude and longitude to float
    .astype({'Latitude': float, 'Longitude': float})
    # rename to "lat" and "lon"
    .rename(columns={'Latitude': 'lat', 'Longitude': 'lon'})
)

# Optional while developing: only keep 10k rows
# df_focus = df_focus.sample(n=10000)

# Save the frame as one JSON array: comma-separated records enclosed in square brackets.
df_focus.to_json('data2.json', orient='records', lines=False)

In [59]:
# Use case 2: a business owner wants to avoid all crimes towards businesses,
# i.e. complaints with Victim_Sex == "D" (Business/Organization).

# Select the business-victim rows and the columns needed
# (Complaint_From_Date, Latitude, Longitude, Offense_Description), then derive
# the year and normalise the column names in one chain.
df_focus2 = (
    df.loc[
        df['Victim_Sex'] == 'D',
        ['Complaint_From_Date', 'Latitude', 'Longitude', 'Offense_Description'],
    ]
    # "Year" replaces the full complaint date
    .assign(Year=lambda frame: pd.to_datetime(frame['Complaint_From_Date']).dt.year)
    .drop(columns=['Complaint_From_Date'])
    # latitude and longitude as float
    .astype({'Latitude': float, 'Longitude': float})
    # "lat" / "lon" for the coordinates, "crime_id" for the offense description
    .rename(columns={
        'Latitude': 'lat',
        'Longitude': 'lon',
        'Offense_Description': 'crime_id',
    })
)

# Only keep offenses whose normalised share of the rows is above 0.01.
value_counts = df_focus2['crime_id'].value_counts(normalize=True)
to_keep = value_counts.loc[value_counts.gt(0.01)].index
df_focus2 = df_focus2.loc[df_focus2['crime_id'].isin(to_keep)]

# Give every remaining offense description an arbitrary integer id,
# numbered in order of first appearance.
offense_description_dict = {
    offense: code
    for code, offense in enumerate(df_focus2['crime_id'].unique())
}
df_focus2 = df_focus2.assign(
    crime_id=df_focus2['crime_id'].map(offense_description_dict)
)

# Persist the description -> id lookup next to the data.
with open('offense_description_dict.json', 'w') as fp:
    json.dump(offense_description_dict, fp)

# Save the frame as one JSON array: comma-separated records enclosed in square brackets.
df_focus2.to_json('data_usecase2.json', orient='records', lines=False)