## Packages

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
#from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
%matplotlib inline

# Preprocessing

In [2]:
# Read the crime-report CSV (expected next to the notebook) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='Edited_Crime_Data_from_2020_to_Present.csv')

In [3]:
# Per-column count of missing values in the freshly loaded frame.
df.isna().sum()

Date Rptd              0
DATE OCC               0
TIME OCC               0
AREA                   0
AREA NAME              0
Rpt Dist No            0
Part 1-2               0
Crm Cd                 0
Crm Cd Desc            0
Vict Age               0
Vict Sex           41975
Vict Descent       41981
Premis Cd              5
Premis Desc          109
Weapon Used Cd    204007
Weapon Desc       204007
Status                 0
Status Desc            0
Crm Cd 1               2
LOCATION               0
Cross Street      264247
LAT                    0
LON                    0
dtype: int64

In [4]:
# Keep only the incidents that have a weapon code; that column is the target.
with_weapon = df.dropna(subset=['Weapon Used Cd'])

# Drop every column that is non-null in fewer than 75% of the remaining rows.
# `thresh` is documented as an int, so the row count is rounded up explicitly
# (count >= ceil(x) selects exactly the same columns as count >= x).
min_non_null = int(np.ceil(len(with_weapon) * 0.75))

# .copy() yields an independent frame, so the column assignment below does not
# write into a slice of the loaded data (avoids SettingWithCopyWarning).
df = with_weapon.dropna(axis=1, thresh=min_non_null).copy()

# Collapse each weapon code to its hundreds group (e.g. a code of 511 becomes 5.0).
# Floor division replaces the separate divide-then-floor steps.
# NOTE: this cell is not idempotent - running it twice divides the already
# grouped codes again. Reload `df` from the CSV before re-running it.
df['Weapon Used Cd'] = df['Weapon Used Cd'] // 100

In [5]:
# Keep only the numeric (and boolean) columns; the remaining text columns are
# dropped here and do not appear in the feature matrix built below.
# select_dtypes is the public API; the previous call used the private
# DataFrame._get_numeric_data() helper, which may change without notice.
df = df.select_dtypes(include=['number', 'bool'])

In [6]:
# Print the distinct weapon groups in ascending order. Sorting makes the
# printed order deterministic; iterating a set gives no ordering guarantee.
for value in sorted(df['Weapon Used Cd'].unique()):
    print(value)

1.0
2.0
3.0
4.0
5.0


In [7]:
# Target: the grouped weapon code. Features: every other remaining column.
y = df['Weapon Used Cd']
X = df.drop(columns=['Weapon Used Cd'])
X, y

(        TIME OCC  AREA  Rpt Dist No  Part 1-2  Crm Cd  Vict Age  Premis Cd  \
 0           2230     3          377         2     624        36      501.0   
 1            330     1          163         2     624        25      102.0   
 5             30     1          163         1     121        25      735.0   
 9             30    18         1871         2     930        21      101.0   
 10          2200     1          192         1     330        29      101.0   
 ...          ...   ...          ...       ...     ...       ...        ...   
 321980       915     2          246         2     624        54      101.0   
 321985       813     6          645         2     624        33      135.0   
 321991      2135    21         2143         2     624        44      101.0   
 321993      2210     5          564         2     434        41      502.0   
 321994      1050    17         1798         2     624        40      501.0   
 
         Crm Cd 1      LAT       LON  
 0         

In [8]:
# Encoder used to turn the target's distinct values into integer class ids.
# NOTE(review): despite the "quality" in its name, this encoder is fit on the
# weapon-group target in the next cell; consider renaming it in a follow-up.
label_quality = LabelEncoder()

In [9]:
# Replace the float weapon groups in y with integer class ids (a NumPy array).
# The id -> original value mapping stays available as label_quality.classes_,
# which is needed to translate predictions back to weapon groups.
y = label_quality.fit_transform(y)
y

array([3, 4, 4, ..., 3, 1, 4], dtype=int64)

In [10]:
# Fill the remaining gaps in the features with each column's mean. The fitted
# imputer is kept in `imp` so the same fill values can be applied to new data
# at prediction time.
imp = SimpleImputer(strategy='mean').fit(X)

# Rebuilding the frame from the returned array gives it a fresh 0..n-1 index,
# which lines up positionally with the encoded label array `y`.
X = pd.DataFrame(imp.transform(X), columns=X.columns)

X.isna().sum()

TIME OCC       0
AREA           0
Rpt Dist No    0
Part 1-2       0
Crm Cd         0
Vict Age       0
Premis Cd      0
Crm Cd 1       0
LAT            0
LON            0
dtype: int64

In [11]:
# Standardise every feature column; the fitted scaler is kept in `sc_input`
# so the identical transformation can be applied to new inputs later.
# NOTE(review): the scaler is fit on all rows, i.e. before the train/test split
# further down, so test-set statistics leak into the scaling - consider fitting
# on the training split only.
sc_input = StandardScaler().fit(X)
X = pd.DataFrame(sc_input.transform(X), columns=X.columns)

X

Unnamed: 0,TIME OCC,AREA,Rpt Dist No,Part 1-2,Crm Cd,Vict Age,Premis Cd,Crm Cd 1,LAT,LON
0,1.296737,-1.221098,-1.174017,0.908887,0.484085,0.075093,0.904622,0.488062,0.071099,-0.082996
1,-1.532655,-1.543728,-1.520049,0.908887,0.484085,-0.545235,-0.980030,0.488062,0.081893,-0.078763
2,-1.979401,-1.543728,-1.520049,-1.100247,-1.614306,-0.545235,2.009907,-1.614835,0.081656,-0.078655
3,-1.979401,1.198624,1.241739,0.908887,1.760642,-0.770809,-0.984753,1.767360,0.033011,-0.082390
4,1.252062,-1.543728,-1.473157,-1.100247,-0.742410,-0.319662,-0.984753,-0.741066,0.078499,-0.079770
...,...,...,...,...,...,...,...,...,...,...
117985,-0.661500,-1.382413,-1.385840,0.908887,0.484085,1.090176,-0.984753,0.488062,0.086442,-0.080767
117986,-0.813394,-0.737154,-0.740669,0.908887,0.484085,-0.094088,-0.824156,0.488062,0.100802,-0.087034
117987,1.155267,1.682569,1.681555,0.908887,0.484085,0.526241,-0.984753,0.488062,0.129283,-0.115436
117988,1.266953,-0.898469,-0.871643,0.908887,-0.308548,0.357060,0.909346,-0.306273,-0.021134,-0.081393


In [12]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Machine Learning Models

## Random Forest Classifier

In [13]:
# Fit a 50-tree random forest on the training split and predict the held-out rows.
# random_state pins the forest's internal randomness (bootstrap samples and
# per-split feature subsets) so the reported scores are the same on every
# Restart & Run All; 42 matches the seed already used for train_test_split.
rfc = RandomForestClassifier(n_estimators=50, random_state=42)
rfc.fit(X_train, y_train)
pred_rfc = rfc.predict(X_test)

In [14]:
# Per-class precision/recall/F1 followed by the confusion matrix
# (rows = true class, columns = predicted class) for the random forest.
print(
    classification_report(y_test, pred_rfc),
    confusion_matrix(y_test, pred_rfc),
    sep='\n',
)

              precision    recall  f1-score   support

           0       0.54      0.58      0.56      2888
           1       0.42      0.37      0.39      2113
           2       0.33      0.20      0.25      1439
           3       0.80      0.91      0.85     12470
           4       0.70      0.56      0.62      4688

    accuracy                           0.71     23598
   macro avg       0.56      0.52      0.53     23598
weighted avg       0.69      0.71      0.69     23598

[[ 1682   350   135   567   154]
 [  443   774   130   505   261]
 [  276   231   283   387   262]
 [  401   206   143 11305   415]
 [  290   301   166  1326  2605]]


## SVM Classifier

In [None]:
# Train a support-vector classifier with default settings and predict the
# held-out rows (fit() returns the estimator itself, so the calls can be chained).
# NOTE(review): this cell has no recorded execution count - confirm that
# training completes in reasonable time on the full training split.
clf = SVC().fit(X_train, y_train)
pred_clf = clf.predict(X_test)

In [None]:
# Per-class precision/recall/F1 followed by the confusion matrix
# (rows = true class, columns = predicted class) for the SVM.
print(
    classification_report(y_test, pred_clf),
    confusion_matrix(y_test, pred_clf),
    sep='\n',
)