In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

In [2]:
# Read the NYPD arrests CSV from the local Data directory into a DataFrame.
file_path = "./Data/NYPD_Arrests_Data.csv"
crime_df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows to confirm the load worked.
crime_df.head(n=5)

Unnamed: 0,ARREST_KEY,ARREST_DATE,PD_CD,PD_DESC,KY_CD,OFNS_DESC,LAW_CODE,LAW_CAT_CD,ARREST_BORO,ARREST_PRECINCT,...,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,Lon_Lat,Zip Codes,Community Districts,Borough Boundaries,City Council Districts,Police Precincts
0,159838726,01/01/2017,115.0,RECKLESS ENDANGERMENT 2,355.0,OFFENSES AGAINST THE PERSON,PL 1202000,M,M,14,...,988912.0,212647.0,40.75035,-73.983175,POINT (-73.98317545899994 40.750350440000034),12080.0,11.0,4.0,51.0,8.0
1,159824786,01/01/2017,101.0,ASSAULT 3,344.0,ASSAULT 3 & RELATED OFFENSES,PL 1200001,M,M,18,...,992043.0,217246.0,40.762971,-73.97187,POINT (-73.97186979099996 40.76297132000008),12419.0,11.0,4.0,51.0,10.0
2,159840237,01/01/2017,729.0,"FORGERY,ETC.,UNCLASSIFIED-FELONY",113.0,FORGERY,PL 1702500,F,B,40,...,1006669.0,233631.0,40.807919,-73.919017,POINT (-73.91901728199997 40.80791877300004),10932.0,49.0,5.0,35.0,23.0
3,159828861,01/01/2017,744.0,BAIL JUMPING 3,359.0,OFFENSES AGAINST PUBLIC ADMINISTRATION,PL 2155500,M,K,67,...,1009039.0,176368.0,40.65074,-73.910667,POINT (-73.91066707899995 40.650739599000076),13827.0,61.0,2.0,25.0,40.0
4,159834533,01/01/2017,203.0,"TRESPASS 3, CRIMINAL",352.0,CRIMINAL TRESPASS,PL 1401000,M,K,60,...,986229.0,148311.0,40.573763,-73.992878,POINT (-73.99287775699997 40.573763375000055),18184.0,21.0,2.0,45.0,35.0


In [3]:
# Inspect the dtype pandas inferred for each column right after loading.
crime_df.dtypes

ARREST_KEY                  int64
ARREST_DATE                object
PD_CD                     float64
PD_DESC                    object
KY_CD                     float64
OFNS_DESC                  object
LAW_CODE                   object
LAW_CAT_CD                 object
ARREST_BORO                object
ARREST_PRECINCT             int64
JURISDICTION_CODE           int64
AGE_GROUP                  object
PERP_SEX                   object
PERP_RACE                  object
X_COORD_CD                float64
Y_COORD_CD                float64
Latitude                  float64
Longitude                 float64
Lon_Lat                    object
Zip Codes                 float64
Community Districts       float64
Borough Boundaries        float64
City Council Districts    float64
Police Precincts          float64
dtype: object

In [4]:
# Parse ARREST_DATE (loaded as strings such as "01/01/2017") into datetime64.
# Spelling out the month/day/year format avoids per-value format guessing,
# rules out a silent day/month swap, and makes a malformed date raise an error
# instead of being misparsed.
crime_df['ARREST_DATE'] = pd.to_datetime(crime_df['ARREST_DATE'], format='%m/%d/%Y')

# Confirm the column now reports a datetime dtype.
print(crime_df.dtypes)

ARREST_KEY                         int64
ARREST_DATE               datetime64[ns]
PD_CD                            float64
PD_DESC                           object
KY_CD                            float64
OFNS_DESC                         object
LAW_CODE                          object
LAW_CAT_CD                        object
ARREST_BORO                       object
ARREST_PRECINCT                    int64
JURISDICTION_CODE                  int64
AGE_GROUP                         object
PERP_SEX                          object
PERP_RACE                         object
X_COORD_CD                       float64
Y_COORD_CD                       float64
Latitude                         float64
Longitude                        float64
Lon_Lat                           object
Zip Codes                        float64
Community Districts              float64
Borough Boundaries               float64
City Council Districts           float64
Police Precincts                 float64
dtype: object


In [5]:
# Show every column label of the raw frame as a plain Python list.
list(crime_df.columns)

['ARREST_KEY',
 'ARREST_DATE',
 'PD_CD',
 'PD_DESC',
 'KY_CD',
 'OFNS_DESC',
 'LAW_CODE',
 'LAW_CAT_CD',
 'ARREST_BORO',
 'ARREST_PRECINCT',
 'JURISDICTION_CODE',
 'AGE_GROUP',
 'PERP_SEX',
 'PERP_RACE',
 'X_COORD_CD',
 'Y_COORD_CD',
 'Latitude',
 'Longitude',
 'Lon_Lat',
 'Zip Codes',
 'Community Districts',
 'Borough Boundaries',
 'City Council Districts',
 'Police Precincts']

In [6]:
# Drop the columns listed below. What remains is ARREST_DATE, LAW_CAT_CD,
# ARREST_BORO, AGE_GROUP, PERP_SEX, PERP_RACE, Latitude, Longitude and Lon_Lat.
# `columns=` already selects the column axis, so no separate `axis` argument
# is needed.
columns_to_drop = [
    'PD_CD',
    'PD_DESC',
    'KY_CD',
    'OFNS_DESC',
    'ARREST_PRECINCT',
    'JURISDICTION_CODE',
    'ARREST_KEY',
    'LAW_CODE',
    'X_COORD_CD',
    'Y_COORD_CD',
    'Zip Codes',
    'Community Districts',
    'Borough Boundaries',
    'City Council Districts',
    'Police Precincts',
]
crime_df_cleaned = crime_df.drop(columns=columns_to_drop)

# Preview the first ten rows of the trimmed frame.
crime_df_cleaned.head(10)

Unnamed: 0,ARREST_DATE,LAW_CAT_CD,ARREST_BORO,AGE_GROUP,PERP_SEX,PERP_RACE,Latitude,Longitude,Lon_Lat
0,2017-01-01,M,M,18-24,M,WHITE,40.75035,-73.983175,POINT (-73.98317545899994 40.750350440000034)
1,2017-01-01,M,M,25-44,M,WHITE,40.762971,-73.97187,POINT (-73.97186979099996 40.76297132000008)
2,2017-01-01,F,B,25-44,M,WHITE HISPANIC,40.807919,-73.919017,POINT (-73.91901728199997 40.80791877300004)
3,2017-01-01,M,K,25-44,M,BLACK,40.65074,-73.910667,POINT (-73.91066707899995 40.650739599000076)
4,2017-01-01,M,K,18-24,M,BLACK,40.573763,-73.992878,POINT (-73.99287775699997 40.573763375000055)
5,2017-01-01,M,Q,25-44,M,BLACK,40.675403,-73.792898,POINT (-73.79289777099996 40.67540344900006)
6,2017-01-01,M,B,25-44,M,BLACK HISPANIC,40.881793,-73.863713,POINT (-73.86371337599996 40.88179268700002)
7,2017-01-01,M,K,45-64,M,BLACK,40.678444,-73.947757,POINT (-73.94775722799994 40.678443525000034)
8,2017-01-01,M,M,25-44,F,BLACK,40.724793,-73.976785,POINT (-73.97678528999995 40.72479293300007)
9,2017-01-01,M,B,18-24,M,BLACK,40.881793,-73.863713,POINT (-73.86371337599996 40.88179268700002)


In [7]:
# Preview the first ten rows of the trimmed frame (same preview as the previous cell).
crime_df_cleaned.head(10)

Unnamed: 0,ARREST_DATE,LAW_CAT_CD,ARREST_BORO,AGE_GROUP,PERP_SEX,PERP_RACE,Latitude,Longitude,Lon_Lat
0,2017-01-01,M,M,18-24,M,WHITE,40.75035,-73.983175,POINT (-73.98317545899994 40.750350440000034)
1,2017-01-01,M,M,25-44,M,WHITE,40.762971,-73.97187,POINT (-73.97186979099996 40.76297132000008)
2,2017-01-01,F,B,25-44,M,WHITE HISPANIC,40.807919,-73.919017,POINT (-73.91901728199997 40.80791877300004)
3,2017-01-01,M,K,25-44,M,BLACK,40.65074,-73.910667,POINT (-73.91066707899995 40.650739599000076)
4,2017-01-01,M,K,18-24,M,BLACK,40.573763,-73.992878,POINT (-73.99287775699997 40.573763375000055)
5,2017-01-01,M,Q,25-44,M,BLACK,40.675403,-73.792898,POINT (-73.79289777099996 40.67540344900006)
6,2017-01-01,M,B,25-44,M,BLACK HISPANIC,40.881793,-73.863713,POINT (-73.86371337599996 40.88179268700002)
7,2017-01-01,M,K,45-64,M,BLACK,40.678444,-73.947757,POINT (-73.94775722799994 40.678443525000034)
8,2017-01-01,M,M,25-44,F,BLACK,40.724793,-73.976785,POINT (-73.97678528999995 40.72479293300007)
9,2017-01-01,M,B,18-24,M,BLACK,40.881793,-73.863713,POINT (-73.86371337599996 40.88179268700002)


In [8]:
# Make sure ARREST_DATE in the trimmed frame is a datetime column.
# NOTE(review): crime_df['ARREST_DATE'] was already converted before the
# columns were dropped, so this looks like a no-op safety net.
arrest_dates = pd.to_datetime(crime_df_cleaned['ARREST_DATE'])
crime_df_cleaned['ARREST_DATE'] = arrest_dates

# Show the resulting dtypes of the trimmed frame.
print(crime_df_cleaned.dtypes)

ARREST_DATE    datetime64[ns]
LAW_CAT_CD             object
ARREST_BORO            object
AGE_GROUP              object
PERP_SEX               object
PERP_RACE              object
Latitude              float64
Longitude             float64
Lon_Lat                object
dtype: object


In [9]:
# Collect the distinct values found in ARREST_BORO, in order of first appearance.
unique_values = pd.unique(crime_df_cleaned['ARREST_BORO'])

# Display them.
print(unique_values)

['M' 'B' 'K' 'Q' 'S']


In [10]:
# Collect the distinct values found in LAW_CAT_CD (missing values show up as nan).
unique_values = pd.unique(crime_df_cleaned['LAW_CAT_CD'])

# Display them.
print(unique_values)

['M' 'F' 'V' 'I' nan]


In [11]:
# Collect the distinct values found in AGE_GROUP, in order of first appearance.
unique_values = pd.unique(crime_df_cleaned['AGE_GROUP'])

# Display them.
print(unique_values)

['18-24' '25-44' '45-64' '<18' '65+']


In [12]:
# Collect the distinct values found in PERP_RACE, in order of first appearance.
unique_values = pd.unique(crime_df_cleaned['PERP_RACE'])

# Display them.
print(unique_values)

['WHITE' 'WHITE HISPANIC' 'BLACK' 'BLACK HISPANIC'
 'ASIAN / PACIFIC ISLANDER' 'UNKNOWN' 'AMERICAN INDIAN/ALASKAN NATIVE']


In [13]:
# Remove every row whose LAW_CAT_CD equals 'I'. Rows with any other value,
# including rows where LAW_CAT_CD is missing, are kept.
keep_mask = crime_df_cleaned['LAW_CAT_CD'].ne('I')
crime_df_cleaned = crime_df_cleaned.loc[keep_mask]

# Preview the filtered frame.
crime_df_cleaned.head()

Unnamed: 0,ARREST_DATE,LAW_CAT_CD,ARREST_BORO,AGE_GROUP,PERP_SEX,PERP_RACE,Latitude,Longitude,Lon_Lat
0,2017-01-01,M,M,18-24,M,WHITE,40.75035,-73.983175,POINT (-73.98317545899994 40.750350440000034)
1,2017-01-01,M,M,25-44,M,WHITE,40.762971,-73.97187,POINT (-73.97186979099996 40.76297132000008)
2,2017-01-01,F,B,25-44,M,WHITE HISPANIC,40.807919,-73.919017,POINT (-73.91901728199997 40.80791877300004)
3,2017-01-01,M,K,25-44,M,BLACK,40.65074,-73.910667,POINT (-73.91066707899995 40.650739599000076)
4,2017-01-01,M,K,18-24,M,BLACK,40.573763,-73.992878,POINT (-73.99287775699997 40.573763375000055)


In [14]:
# Re-check the distinct LAW_CAT_CD values now that the 'I' rows are gone.
unique_values = pd.unique(crime_df_cleaned['LAW_CAT_CD'])

# Display them.
print(unique_values)

['M' 'F' 'V' nan]


In [15]:
# Tally the missing values in each column of the cleaned frame.
null_counts = crime_df_cleaned.isna().sum(axis=0)

# Display the per-column totals.
print(null_counts)

ARREST_DATE       0
LAW_CAT_CD     7411
ARREST_BORO       0
AGE_GROUP         0
PERP_SEX          0
PERP_RACE         0
Latitude          0
Longitude         0
Lon_Lat           0
dtype: int64


In [16]:
# Discard every row that has at least one missing value in any column.
crime_df_cleaned = crime_df_cleaned.dropna(axis=0, how='any')

# Preview the frame after the null rows are removed.
crime_df_cleaned.head()

Unnamed: 0,ARREST_DATE,LAW_CAT_CD,ARREST_BORO,AGE_GROUP,PERP_SEX,PERP_RACE,Latitude,Longitude,Lon_Lat
0,2017-01-01,M,M,18-24,M,WHITE,40.75035,-73.983175,POINT (-73.98317545899994 40.750350440000034)
1,2017-01-01,M,M,25-44,M,WHITE,40.762971,-73.97187,POINT (-73.97186979099996 40.76297132000008)
2,2017-01-01,F,B,25-44,M,WHITE HISPANIC,40.807919,-73.919017,POINT (-73.91901728199997 40.80791877300004)
3,2017-01-01,M,K,25-44,M,BLACK,40.65074,-73.910667,POINT (-73.91066707899995 40.650739599000076)
4,2017-01-01,M,K,18-24,M,BLACK,40.573763,-73.992878,POINT (-73.99287775699997 40.573763375000055)


In [17]:
# Lift pandas' display limits so neither rows nor columns are elided in output.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

# Count the rows for each combination of the six keys below. The result is a
# Series of group sizes indexed by those keys; it becomes a DataFrame only once
# reset_index() is applied to it.
group_keys = [
    'ARREST_DATE',
    'LAW_CAT_CD',
    'PERP_RACE',
    'ARREST_BORO',
    'AGE_GROUP',
    'PERP_SEX',
]
crime_df_grouped = crime_df_cleaned.groupby(group_keys).size()

In [18]:
# Flatten the grouped Series of sizes into a DataFrame with a 'counts' column.
# The isinstance guard keeps this cell re-runnable: after the first run
# crime_df_grouped is already a DataFrame, and DataFrame.reset_index() does not
# accept `name`, so an unguarded second run would raise TypeError.
if isinstance(crime_df_grouped, pd.Series):
    crime_df_grouped = crime_df_grouped.reset_index(name='counts')

In [19]:
# Order the grouped rows from the largest count to the smallest.
crime_df_sorted = crime_df_grouped.sort_values('counts', ascending=False)

In [20]:
# Show the 50 rows with the highest counts.
crime_df_sorted.head(50)

Unnamed: 0,ARREST_DATE,LAW_CAT_CD,PERP_RACE,ARREST_BORO,AGE_GROUP,PERP_SEX,counts
188626,2019-08-14,F,BLACK,K,25-44,M,124
159325,2019-03-11,F,WHITE,K,25-44,M,111
64032,2017-11-01,F,BLACK,M,25-44,F,95
234584,2020-06-01,F,BLACK,M,18-24,M,95
252960,2020-10-28,F,BLACK,M,25-44,M,93
45348,2017-08-02,M,WHITE,M,45-64,M,84
164802,2019-04-09,F,BLACK,M,25-44,M,77
299201,2021-09-21,F,WHITE HISPANIC,B,25-44,M,75
98850,2018-04-25,F,BLACK,K,18-24,M,75
234652,2020-06-01,M,BLACK,M,18-24,M,74


In [21]:
# Index the sorted frame by ARREST_DATE so it can be resampled over time.
# The guard keeps the cell re-runnable: once ARREST_DATE is the index it is no
# longer a column, and calling set_index('ARREST_DATE') again raises KeyError.
if crime_df_sorted.index.name != 'ARREST_DATE':
    crime_df_sorted = crime_df_sorted.set_index('ARREST_DATE')

# For each combination of the grouping columns, sum the counts per calendar
# month. Only the 'counts' column is resampled, so the result does not depend
# on how the installed pandas version treats the grouping columns inside
# groupby(...).resample(...). Months with no rows inside a group's date range
# come out as 0.
# NOTE: pandas 2.2+ deprecates the 'M' alias in favour of 'ME'; 'M' is kept
# here so the cell still runs on older pandas versions.
group_cols = ['LAW_CAT_CD', 'PERP_RACE', 'ARREST_BORO', 'AGE_GROUP', 'PERP_SEX']
monthly_counts = (
    crime_df_sorted
    .groupby(group_cols)['counts']
    .resample('M')
    .sum()
)

# Flatten the index back into columns and fix the column order explicitly.
monthly_counts = monthly_counts.reset_index()[group_cols + ['ARREST_DATE', 'counts']]

In [22]:
# Preview the first five rows of the monthly aggregation.
monthly_counts.head()

Unnamed: 0,LAW_CAT_CD,PERP_RACE,ARREST_BORO,AGE_GROUP,PERP_SEX,ARREST_DATE,counts
0,F,AMERICAN INDIAN/ALASKAN NATIVE,B,18-24,F,2019-06-30,1
1,F,AMERICAN INDIAN/ALASKAN NATIVE,B,18-24,F,2019-07-31,0
2,F,AMERICAN INDIAN/ALASKAN NATIVE,B,18-24,F,2019-08-31,0
3,F,AMERICAN INDIAN/ALASKAN NATIVE,B,18-24,F,2019-09-30,0
4,F,AMERICAN INDIAN/ALASKAN NATIVE,B,18-24,F,2019-10-31,0


In [23]:
# Rank the monthly totals from highest to lowest.
monthly_counts_sorted = monthly_counts.sort_values(by=['counts'], ascending=[False])

# Preview the five largest monthly totals.
monthly_counts_sorted.head(n=5)

Unnamed: 0,LAW_CAT_CD,PERP_RACE,ARREST_BORO,AGE_GROUP,PERP_SEX,ARREST_DATE,counts
23233,M,BLACK,K,25-44,M,2017-01-31,969
23235,M,BLACK,K,25-44,M,2017-03-31,969
23245,M,BLACK,K,25-44,M,2018-01-31,954
23238,M,BLACK,K,25-44,M,2017-06-30,942
23236,M,BLACK,K,25-44,M,2017-04-30,935
