In [2]:
# Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import json
import os
import warnings
warnings.filterwarnings('ignore')

# Load the cleaned NYPD complaint data.
# The location defaults to the original local path; set the NYPD_DATA_PATH
# environment variable to read the CSV from somewhere else without editing
# this cell.
filename = os.environ.get(
    'NYPD_DATA_PATH',
    r'C:\Users\jbh\Desktop\NYPD_Complaint_Data_Cleaned.csv',
)
df = pd.read_csv(filename)

# (rows, columns) of the full dataset
df.shape

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


(8170556, 30)

In [16]:
# Use case 1: build the per-complaint export (DataUseCase1-v2.json).
#
# Offense_Description values kept for this use case.
# NOTE(review): the strings are matched exactly against the data, so
# 'HARRASSMENT 2' is presumably the dataset's own spelling - confirm before
# "correcting" it.
focus_crimes = [
    'HARRASSMENT 2',
    'ROBBERY',
    'ASSAULT 3 & RELATED OFFENSES',
    'FELONY ASSAULT',
    'PETIT LARCENY',
    'GRAND LARCENY',
    'SEX CRIMES',
    'RAPE',
]

# Select the focus-crime rows and the columns we need in one step.
# Victim_Sex codes that are dropped:
#   D = Business/Organization and E = PSNY/People of the State of New York,
#       which are not relevant to this use case.
#   L = not explained by the dataset footnotes (possibly "Lady" or "LGBTQ",
#       but we are not sure). In this subset it accounts for roughly 800 of
#       about 800k rows, so dropping it is acceptable.
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of `df`.
df_focus = df.loc[
    df['Offense_Description'].isin(focus_crimes)
    & ~df['Victim_Sex'].isin(['D', 'E', 'L']),
    [
        'Complaint_From_Date',
        'Complaint_From_Time',
        'Latitude',
        'Longitude',
        'Victim_Sex',
        'Offense_Description',
    ],
].copy()

# Derive the weekday name and the year from the complaint date.
complaint_date = pd.to_datetime(df_focus['Complaint_From_Date'])
df_focus['weekday'] = complaint_date.dt.day_name()
df_focus['year'] = complaint_date.dt.year

# Optional: restrict to recent years.
# df_focus = df_focus[df_focus['year'] >= 2019]

# Derive the hour of day; the time string is parsed once as HH:MM:SS.
df_focus['hour'] = pd.to_datetime(
    df_focus['Complaint_From_Time'], format='%H:%M:%S'
).dt.hour

# The raw date/time columns are no longer needed.
df_focus = df_focus.drop(columns=['Complaint_From_Date', 'Complaint_From_Time'])

# Encode each offense description as an integer id.
# The ids follow the order in which the descriptions first appear in the data.
offense_description_dict = {
    offense_description: code
    for code, offense_description in enumerate(df_focus['Offense_Description'].unique())
}
df_focus['Offense_Description'] = df_focus['Offense_Description'].map(offense_description_dict)

# Encode each weekday name as an integer.
# NOTE: like the offense ids, these codes follow first appearance in the data,
# NOT calendar order (0 is not necessarily Monday). Whatever consumes the JSON
# must use the same mapping.
weekday_dict = {
    weekday: code
    for code, weekday in enumerate(df_focus['weekday'].unique())
}
df_focus['weekday'] = df_focus['weekday'].map(weekday_dict)

# Make sure the coordinates are floats, then switch to the short column names
# used in the exported JSON.
df_focus = df_focus.astype({'Latitude': float, 'Longitude': float})
df_focus = df_focus.rename(columns={
    'Offense_Description': 'crime_id',
    'weekday': 'day',
    'Victim_Sex': 'sex',
    'Latitude': 'lat',
    'Longitude': 'lon',
})

# Optional: only keep 10k rows while experimenting.
# df_focus = df_focus.sample(n=10000)

# Save as a single JSON array of records: [{...}, {...}, ...]
df_focus.to_json('DataUseCase1-v2.json', orient='records', lines=False)

df_focus

# REMEMBER TO UPDATE THE HTML FOR USECASE1 WITH CHANGES MADE HERE

Unnamed: 0,lat,lon,sex,crime_id,day,year,hour
3,40.903862,-73.846994,M,0,0,2006,16
13,40.875756,-73.835718,M,0,1,2008,14
16,40.849288,-73.938936,F,0,2,2009,8
18,40.823281,-73.923923,M,0,2,2009,4
20,40.816002,-73.941332,M,0,0,2010,17
...,...,...,...,...,...,...,...
8170477,40.699324,-73.831571,F,1,2,2019,16
8170504,40.842353,-73.844951,M,0,5,2014,18
8170506,40.726529,-73.734865,F,1,5,2021,12
8170513,40.726529,-73.734865,F,1,1,2021,21


In [14]:
# Use case 2: a business owner wants to avoid crimes against businesses,
# i.e. rows where Victim_Sex == "D" (Business/Organization).
#
# Keep only those rows and the columns needed for the export. Victim_Sex is
# constant after the filter, so it is not carried along. .copy() gives an
# independent frame, so the assignments below do not write into a slice of `df`.
df_focus2 = df.loc[
    df['Victim_Sex'] == 'D',
    ['Complaint_From_Date', 'Latitude', 'Longitude', 'Offense_Description'],
].copy()

# Replace the complaint date with its year.
df_focus2['Year'] = pd.to_datetime(df_focus2['Complaint_From_Date']).dt.year
df_focus2 = df_focus2.drop(columns=['Complaint_From_Date'])

# Make sure the coordinates are floats, then switch to the short column names
# used in the exported JSON.
df_focus2 = df_focus2.astype({'Latitude': float, 'Longitude': float})
df_focus2 = df_focus2.rename(columns={
    'Latitude': 'lat',
    'Longitude': 'lon',
    'Offense_Description': 'crime_id',
})

# Only keep offenses that make up more than 1% of the business-victim rows.
value_counts = df_focus2['crime_id'].value_counts(normalize=True)
to_keep = value_counts[value_counts > 0.01].index
df_focus2 = df_focus2[df_focus2['crime_id'].isin(to_keep)].copy()

# Encode each remaining offense description as an integer id.
# The ids follow the order in which the descriptions first appear in the data.
offense_description_dict = {
    offense: code
    for code, offense in enumerate(df_focus2['crime_id'].unique())
}
df_focus2['crime_id'] = df_focus2['crime_id'].map(offense_description_dict)

# Persist the description -> id mapping so the ids can be decoded later.
with open('offense_description_dict.json', 'w') as fp:
    json.dump(offense_description_dict, fp)

# Keep complaints from 2019 onwards (no upper bound is applied).
# NOTE(review): the 1% threshold and the id mapping above are computed on all
# years, before this filter, so the exported subset may not contain every id
# in offense_description_dict.json - confirm this is intended.
df_focus2 = df_focus2[df_focus2['Year'] >= 2019]

# Save as a single JSON array of records: [{...}, {...}, ...]
df_focus2.to_json('DataUseCase2-v2.json', orient='records', lines=False)