# Arrest data in NYC, an exploration and regression analysis
## Author: Jack Robbins

In [1]:
# Important imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML
from sklearn import preprocessing
from matplotlib.gridspec import GridSpec
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, ConfusionMatrixDisplay
from sklearn.tree import DecisionTreeClassifier




In [None]:
# Load the historic NYPD arrests CSV into a DataFrame
arrests = pd.read_csv(filepath_or_buffer="data/NYPD_Arrests_Data__Historic__20241116.csv")

In [None]:
# Let's see what we're working with
arrests.head()

## Data Preprocessing
As we can see from above, we have a lot of NULLs and data that we may or may not want. We'll clean this data up before doing anything involving data analysis.

In [None]:
# Count the missing entries in each column and report the totals
null_values = arrests.isnull().sum(axis=0)
print("Detecting missing values:\n", null_values)

### Unneeded columns
Looking here, we have a column titled "ARREST_KEY", which is likely the primary key for the database where this data is stored. We don't need this column and will therefore drop it. We also don't need "Lon_Lat", as it is just a combination of two other columns. The same can be said for X_COORD_CD and Y_COORD_CD, because these are just proxies for longitude and latitude. Offense description (OFNS_DESC) is a free-entry text field that is likely to contain large amounts of junk with no consistent pattern, so we'll get rid of that as well. The cell below additionally drops PD_DESC and the raw Latitude and Longitude columns.

In [None]:
# Remove, in place, the columns we are not going to use
arrests.drop(
    columns=[
        'ARREST_KEY',
        'X_COORD_CD',
        'Y_COORD_CD',
        'OFNS_DESC',
        'PD_DESC',
        'Lon_Lat',
        'Latitude',
        'Longitude',
    ],
    inplace=True,
)

In [None]:
# Re-check the per-column missing-value counts after dropping columns
null_values = arrests.isnull().sum(axis=0)
print("Detecting missing values:\n", null_values)

In [None]:
arrests.shape

### Removing NA's
As we can see, we have nearly 6 million rows of data to work with here. Additionally, there are at most around 40,000 NA's in the dataset. In my opinion, dropping these rows is an acceptable loss. A lot of these NA's are also in categorical data columns, so there's really no sensible way to fill them in.

In [None]:
# Discard, in place, every row that has a missing value in any column
arrests.dropna(axis=0, how='any', inplace=True)

In [None]:
# Confirm that no missing values remain after the row removal
null_values = arrests.isnull().sum(axis=0)
print("Detecting missing values:\n", null_values)

In [None]:
arrests.shape

As we can see, we still have well over 5 million records to analyze after doing the NA removal, and currently no more NA's in our dataset

### Cleaning up type mismatches and other miscellaneous preprocessing tasks

In [None]:
arrests.info()

Arrest date is one that specifically interests me, but I'm fairly certain that date is too granular. However **month** may not be, so I'm going to convert all of these dates to months.

In [None]:
# Convert a date to a month
def date_to_months(date):
    """Extract the month number from a month-first, slash-separated date string.

    Parameters
    ----------
    date : str
        Date text such as ``"01/15/2020"``; the first ``/``-separated field
        is taken to be the month.

    Returns
    -------
    int or float
        The month as an ``int``, or ``np.nan`` when ``date`` is not a string
        or does not split into exactly three ``/``-separated fields.

    Raises
    ------
    ValueError
        If the first field cannot be parsed as an integer.
    """
    # Missing values (None / float NaN) have no .split(); treat them as unparseable
    if not isinstance(date, str):
        return np.nan

    parts = date.split("/")
    if len(parts) != 3:
        # Previously returned the undefined name `Nan`, which raised NameError
        return np.nan
    return int(parts[0])

# Derive the arrest month for every row from its ARREST_DATE value
arrests['ARREST_MONTH'] = arrests['ARREST_DATE'].apply(func=date_to_months)

In [None]:
# Check whether the month conversion introduced any missing values
null_values = arrests.isnull().sum(axis=0)
print("Detecting missing values:\n", null_values)

In [None]:
# It looks like age group is a mess here, so we'll have to remove these junk values...
arrests["AGE_GROUP"].unique()

In [None]:
# Keep only rows whose AGE_GROUP is one of the meaningful categories
valid_age_groups = ['25-44', '45-64', '18-24', '65+', '<18']
has_valid_age = arrests["AGE_GROUP"].isin(valid_age_groups)
indices_to_drop = arrests.index[~has_valid_age]
arrests.drop(index=indices_to_drop, inplace=True)

In [None]:
# Let's see how it looks now
arrests["AGE_GROUP"].describe()

In [None]:
arrests

In [None]:
# What about perpetrator race?
arrests["PERP_RACE"].unique()

In [None]:
# Remove the rows whose race is recorded as UNKNOWN or OTHER
excluded_races = ['UNKNOWN', 'OTHER']
indices_to_drop = arrests.index[arrests["PERP_RACE"].isin(excluded_races)]
arrests.drop(index=indices_to_drop, inplace=True)

In [None]:
# Let's see how we did
arrests["PERP_RACE"].unique()

In [None]:
arrests.shape

Additionally, since we have the arrest month down, we no longer need the arrest date so we'll drop it

In [None]:
# ARREST_MONTH now carries the date information we need, so drop the raw date
arrests.drop(columns=['ARREST_DATE'], inplace=True)

In [None]:
arrests.info()

We can also see that JURISDICTION_CODE, PD_CD, and KY_CD are all floats, but looking at the data shows us that they all hold values like "5.0", so the fractional part is not really needed. As such, we'll convert all of these to ints.

In [None]:
# Cast the float-typed code columns to integers.
# A vectorized astype() replaces the per-row Python lambda, which is needlessly
# slow on a frame with millions of rows; the resulting values are the same
# (fractional parts are truncated, and a NaN still raises a ValueError).
for code_column in ('PD_CD', 'KY_CD', 'JURISDICTION_CODE'):
    arrests[code_column] = arrests[code_column].astype('int64')

In [None]:
arrests

In [None]:
arrests.info()

### Encoding Categorical Data
We're almost there. Now we'll encode any/all categorical data using individual label encoders

In [None]:
# One LabelEncoder per categorical column, each kept under its own name
law_code_le, law_cat_le, borough_le, age_le, sex_le, race_le = (
    preprocessing.LabelEncoder() for _ in range(6)
)

# Fit each encoder on its column and overwrite the column with the integer codes
column_encoders = (
    ('LAW_CODE', law_code_le),
    ('LAW_CAT_CD', law_cat_le),
    ('ARREST_BORO', borough_le),
    ('AGE_GROUP', age_le),
    ('PERP_SEX', sex_le),
    ('PERP_RACE', race_le),
)
for column_name, encoder in column_encoders:
    arrests[column_name] = encoder.fit_transform(arrests[column_name])

In [None]:
arrests

In [None]:
arrests.info()

## Data Visualization - Hunting for correlations & patterns

In [None]:
# Stretch the notebook container to the full page width
display(HTML("<style>.container { width:100% !important; }</style>"))

# Pearson correlation between every pair of columns, drawn as an annotated heatmap
corr = arrests.corr(method='pearson')
sns.set(font_scale=1)
plt.figure(figsize=(16, 12))
sns_plot = sns.heatmap(data=corr, vmin=-1, vmax=1, annot=True, cmap='RdBu_r');

In [None]:
# NOTE: this cell repeats the correlation heatmap drawn by the previous cell.
from IPython.display import display, HTML

display(HTML("<style>.container { width:100% !important; }</style>"))
sns.set(font_scale=1)
corr = arrests.corr(method='pearson')
plt.figure(figsize=(16, 12))
sns_plot = sns.heatmap(corr, annot=True, cmap='RdBu_r', vmin=-1, vmax=1);

In [None]:
# For every column, draw a three-panel figure: a scatter plot against AGE_GROUP
# on top, with a histogram and a horizontal box plot of the column underneath.
columns = arrests.columns
for x_value in columns:
    t_value = x_value.title()
    fig = plt.figure(constrained_layout=True)
    gs = GridSpec(2, 2, figure=fig)

    # Top row spans both grid columns; every plot is bound to its axes
    # explicitly instead of relying on matplotlib's "current axes" state
    ax1 = fig.add_subplot(gs[0, :])
    sns.scatterplot(data=arrests, x=x_value, y='AGE_GROUP', ax=ax1)
    ax2 = fig.add_subplot(gs[1, 0])
    sns.histplot(data=arrests, x=x_value, bins=16, ax=ax2)
    ax3 = fig.add_subplot(gs[1, 1])
    sns.boxplot(data=arrests, x=x_value, orient='h', ax=ax3)

    # Title the figure with the title-cased column name
    fig.suptitle(t_value)
    print('\n')

In [None]:
arrests["AGE_GROUP"].describe()

In [None]:
# 3D scatter of three of the label-encoded columns
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

x = arrests['AGE_GROUP']
y = arrests['ARREST_BORO']
z = arrests['LAW_CAT_CD']

# Label each axis with the column actually plotted on it (the previous labels
# named columns that do not exist in this dataset)
ax.set_xlabel("AGE_GROUP")
ax.set_ylabel("ARREST_BORO")
ax.set_zlabel("LAW_CAT_CD")

ax.scatter(x, y, z)

 # Author: Daniyal Khan
 ### Split the dataset: Features and Target


In [None]:
# After cleaning up the dataset, split it into features (X) and target (y).
# LAW_CAT_CD is the target; PD_CD and KY_CD are left out of the features too
# (presumably because these offense codes closely track the category -- confirm).
# `columns=` takes a flat list of labels: the previous nested list ([[...]])
# is not a valid set of column labels and makes drop() fail.
X = arrests.drop(columns=['LAW_CAT_CD', 'PD_CD', 'KY_CD'])
y = arrests['LAW_CAT_CD']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=45)

print("Here is the dataset of all the features we will use to train the model")
print(X.shape)
X

In [None]:
# Announce the target series, show its shape, then display it
print("Here is the dataset of the target variable we will use to test the model", y.shape, sep="\n")
y

# Choosing prediction model:
- We can go with the K Nearest Neighbors model because our target variable falls into **discrete categories** rather than a continuous numeric range (which would call for regression). Categorical targets work well with classification tasks.
- Aligns with classification task since target is predicted based on set of features.


# KNN Algorithm

In [None]:
# Fit a k-nearest-neighbours classifier (k = 13) on the training split
model = KNeighborsClassifier(n_neighbors=13)
model.fit(X=X_train, y=y_train)


In [None]:
# Predict the target for the held-out test rows and display the predictions
y_pred = model.predict(X=X_test)
y_pred

In [None]:
# Score the KNN predictions against the held-out test labels.
# Precision, recall and F1 are averaged with weights proportional to class
# support; zero_division=0.0 scores an undefined per-class value as 0 instead
# of emitting a warning (now applied to F1 as well, matching the others).
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0.0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0.0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0.0)

# These are the KNN results (the labels previously said "Decision Tree")
print(f'Accuracy of our KNN Model:{accuracy: .3f}')
print(f'Precision of our KNN Model:{precision: .3f}')
print(f'Recall of our KNN Model:{recall: .3f}')
print(f'F1 score of our KNN Model:{f1: .3f}')

# 2nd Algorithm to use: DecisionTree

In [None]:
# Fit an entropy-criterion decision tree on the training split (seeded)
model2 = DecisionTreeClassifier(random_state=42, criterion="entropy")
model2.fit(X=X_train, y=y_train)


In [None]:
# Predict the target for the held-out test rows with the decision tree
y_pred2 = model2.predict(X=X_test)
y_pred2

In [None]:
# Score the decision-tree predictions against the held-out test labels
weighted_no_warn = {'average': 'weighted', 'zero_division': 0.0}

accuracy2 = accuracy_score(y_test, y_pred2)
precision2 = precision_score(y_test, y_pred2, **weighted_no_warn)
recall2 = recall_score(y_test, y_pred2, **weighted_no_warn)
# F1 uses weighted averaging with the library's default zero-division handling
f12 = f1_score(y_test, y_pred2, average='weighted')

# Report each score to three decimal places
print(f'Accuracy of our Decision Tree Model:{accuracy2: .3f}')
print(f'Precision of our Decision Tree Model:{precision2: .3f}')
print(f'Recall of our Decision Tree Model:{recall2: .3f}')
print(f'F1 score of our Decision Tree Model:{f12: .3f}')