In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [4]:
# Load the data
df = pd.read_csv('TransformGainesville_CrimesALL.csv', index_col = 0)

df.head()

Unnamed: 0,ID,CFS,CFS_Type,Classification,offenseDate,offenseHour,offenseDOW,reportDate,reportHour,reportDOW,...,longitude,location,date,month,day,year,fullDate,DOW,time,moonPhase
0,221009267,Domestic Aggravated Battery,Battery,Person,2021-07-04 21:24:00,21,Sunday,7/4/2021 22:37,22,Sunday,...,-82.326069,POINT (-82.326069 29.688534000000004),,,,,,,,
1,221009608,Domestic Aggravated Battery,Battery,Person,2021-07-11 22:54:00,22,Sunday,7/11/2021 22:55,22,Sunday,...,-82.387148,POINT (-82.387148 29.632687000000004),,,,,,,,
2,221009391,Domestic Aggravated Battery,Battery,Person,2021-07-07 19:12:00,19,Wednesday,7/7/2021 19:13,19,Wednesday,...,-82.29939,POINT (-82.29939 29.640249),,,,,,,,
3,221009308,Domestic Aggravated Battery,Battery,Person,2021-07-06 07:26:00,7,Tuesday,7/6/2021 7:27,7,Tuesday,...,-82.398242,POINT (-82.398242 29.641625),,,,,,,,
4,221011388,Domestic Aggravated Battery,Battery,Person,2021-08-16 17:25:00,17,Monday,8/16/2021 17:26,17,Monday,...,-82.326069,POINT (-82.326069 29.688534000000004),,,,,,,,


In [5]:
df.columns

Index(['ID', 'CFS', 'CFS_Type', 'Classification', 'offenseDate', 'offenseHour',
       'offenseDOW', 'reportDate', 'reportHour', 'reportDOW', 'city', 'state',
       'address', 'latitude', 'longitude', 'location', 'date ', 'month', 'day',
       'year', 'fullDate', 'DOW', 'time ', 'moonPhase'],
      dtype='object')

In [6]:
#Find out unique names of CFS, CFS_type, and moonPhase

In [7]:
df.Classification.unique()

array(['Person', 'Other ', 'Property', 'Government'], dtype=object)

In [8]:
df.CFS.unique()

array(['Domestic Aggravated Battery', 'Domestic Simple Battery',
       'Domestic Disturbance', 'Burglary to Residence', 'Fire',
       'Theft Grand - Retail', 'Driving Under the Influence',
       'Death Investigation', 'Robbery (armed)', 'Theft Petit - Other',
       'Weapons Violation (possessing/concealing)',
       'Drug Poss. of Controlled Substance', 'Damage to Property',
       'Robbery (strong Arm)', 'Theft Petit - Retail',
       'Suspicious Incident', 'Domestic Violence Injunction Violation',
       'Stolen Vehicle (auto)', 'Trespass', 'Burglary to Conveyance',
       'Loitering and Prowling', 'Stolen Vehicle (motorcycle)',
       'Identity Theft', 'Disturbance',
       'Fraud (obtain Money/property by False Pretense)',
       'Battery (simple)', 'Violation of Temporary Injunction',
       'Criminal Mischief (misdemeanor)', 'Robbery',
       'Domestic Battery by Strangulation', 'Making False 911 Call',
       'Fraud (credit Card/atm)', 'Found Property', 'Disorderly Conduct',

In [9]:
df.CFS_Type.unique()

array(['Battery', 'Quality of Life', 'Theft', 'Other', 'Alcohol',
       'Death Inv/Homicide', 'Gov Reg Vio', 'Drugs', 'Fraud', 'Assault',
       'Suicide'], dtype=object)

In [10]:
df.moonPhase.unique()

array([nan, 'Full Moon ', 'First Quarter', 'New  Moon ', 'Third Quarter'],
      dtype=object)

In [None]:
# I think previous cell took out all NaNs, which skews data because now there's no data in which there's not a moon phase attached to it?

In [11]:
# Dropping CFS column because there's too many data points

In [12]:
unused_cols = ['CFS','reportDate', 'offenseHour','reportHour','reportDOW','city','state','address','longitude','latitude','day','location','date ','month','year']

df = df.drop(unused_cols, axis = 1)

df.head(50)

Unnamed: 0,ID,CFS_Type,Classification,offenseDate,offenseDOW,fullDate,DOW,time,moonPhase
0,221009267,Battery,Person,2021-07-04 21:24:00,Sunday,,,,
1,221009608,Battery,Person,2021-07-11 22:54:00,Sunday,,,,
2,221009391,Battery,Person,2021-07-07 19:12:00,Wednesday,,,,
3,221009308,Battery,Person,2021-07-06 07:26:00,Tuesday,,,,
4,221011388,Battery,Person,2021-08-16 17:25:00,Monday,,,,
5,221011524,Battery,Person,2021-08-19 07:30:00,Thursday,,,,
6,221012057,Battery,Person,2021-08-28 16:24:00,Saturday,,,,
7,221012231,Battery,Person,2021-08-31 23:53:00,Tuesday,,,,
8,221012341,Battery,Person,2021-09-02 19:04:00,Thursday,,,,
9,221013249,Battery,Person,2021-09-19 19:35:00,Sunday,,,,


In [13]:
# Convert CFS_Type and moonPhase with get dummies

convert_cols = df[["CFS_Type", "moonPhase"]]
convert_cols

Unnamed: 0,CFS_Type,moonPhase
0,Battery,
1,Battery,
2,Battery,
3,Battery,
4,Battery,
...,...,...
44868,Battery,
44869,Other,
44870,Theft,
44871,Gov Reg Vio,


In [14]:
# Merge dummy columns into main dataframe and drop originals 

new_columns_df = pd.get_dummies(convert_cols)
new_columns_df

Unnamed: 0,CFS_Type_Alcohol,CFS_Type_Assault,CFS_Type_Battery,CFS_Type_Death Inv/Homicide,CFS_Type_Drugs,CFS_Type_Fraud,CFS_Type_Gov Reg Vio,CFS_Type_Other,CFS_Type_Quality of Life,CFS_Type_Suicide,CFS_Type_Theft,moonPhase_First Quarter,moonPhase_Full Moon,moonPhase_New Moon,moonPhase_Third Quarter
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44868,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
44869,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
44870,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
44871,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [15]:
# Drop old columns and merge or CONCATENATE on index

df = df.drop(['CFS_Type',"moonPhase"], axis = 1)


In [16]:
new_df = pd.merge(df,new_columns_df, left_index=True, right_index = True )
new_df

Unnamed: 0,ID,Classification,offenseDate,offenseDOW,fullDate,DOW,time,CFS_Type_Alcohol,CFS_Type_Assault,CFS_Type_Battery,...,CFS_Type_Fraud,CFS_Type_Gov Reg Vio,CFS_Type_Other,CFS_Type_Quality of Life,CFS_Type_Suicide,CFS_Type_Theft,moonPhase_First Quarter,moonPhase_Full Moon,moonPhase_New Moon,moonPhase_Third Quarter
0,221009267,Person,2021-07-04 21:24:00,Sunday,,,,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,221009608,Person,2021-07-11 22:54:00,Sunday,,,,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,221009391,Person,2021-07-07 19:12:00,Wednesday,,,,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,221009308,Person,2021-07-06 07:26:00,Tuesday,,,,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,221011388,Person,2021-08-16 17:25:00,Monday,,,,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44868,219004357,Person,2019-03-14 22:45:00,Thursday,,,,0,0,1,...,0,0,0,0,0,0,0,0,0,0
44869,218014815,Property,2018-08-15 12:15:00,Wednesday,,,,0,0,0,...,0,0,1,0,0,0,0,0,0,0
44870,218019557,Property,2018-10-23 14:53:00,Tuesday,,,,0,0,0,...,0,0,0,0,0,1,0,0,0,0
44871,218017216,Government,2018-09-18 21:46:00,Tuesday,,,,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [28]:
y = pd.get_dummies(new_df['Classification'])


X = new_df[['CFS_Type_Battery', 'CFS_Type_Quality of Life', 'CFS_Type_Theft', 'CFS_Type_Other', 'CFS_Type_Alcohol',
       'CFS_Type_Death Inv/Homicide', 'CFS_Type_Gov Reg Vio', 'CFS_Type_Drugs', 'CFS_Type_Fraud', 'CFS_Type_Assault',
       'CFS_Type_Suicide','moonPhase_Full Moon ', 'moonPhase_First Quarter', 'moonPhase_New  Moon ', 'moonPhase_Third Quarter']]

In [29]:
y.value_counts()

Government  Other   Person  Property
0           0       0       1           25223
                    1       0           11705
            1       0       0            4520
1           0       0       0            3425
dtype: int64

In [30]:
X.describe()

Unnamed: 0,CFS_Type_Battery,CFS_Type_Quality of Life,CFS_Type_Theft,CFS_Type_Other,CFS_Type_Alcohol,CFS_Type_Death Inv/Homicide,CFS_Type_Gov Reg Vio,CFS_Type_Drugs,CFS_Type_Fraud,CFS_Type_Assault,CFS_Type_Suicide,moonPhase_Full Moon,moonPhase_First Quarter,moonPhase_New Moon,moonPhase_Third Quarter
count,44873.0,44873.0,44873.0,44873.0,44873.0,44873.0,44873.0,44873.0,44873.0,44873.0,44873.0,44873.0,44873.0,44873.0,44873.0
mean,0.123237,0.209948,0.39077,0.108551,0.007867,0.013438,0.026319,0.041785,0.057585,0.02048,2.2e-05,0.004301,0.003454,0.003922,0.004368
std,0.328712,0.407276,0.487928,0.311078,0.088346,0.115142,0.160083,0.200099,0.232959,0.141637,0.004721,0.065442,0.058671,0.062505,0.065946
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [31]:
X.shape

(44873, 15)

In [32]:
y.shape

(44873, 4)

In [44]:
# You can modify test_size to account for 30% testing/70% training
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3)

In [45]:
# Resample the training data with the RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)

In [46]:
clf.fit(X_train, y_train)

RandomForestClassifier()

In [48]:
y_pred = clf.predict(X_test)

In [None]:
# https://scikit-learn.org/stable/modules/multiclass.html

In [49]:
from sklearn import metrics

print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.7524884861090477


In [43]:
#Confusion Matrix

from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test.values.argmax(axis=1), y_pred.argmax(axis=1))
print(cm)

[[ 853    3    0    0]
 [ 370  760    0    0]
 [ 802  343 1780    2]
 [1155  122    0 5029]]


In [53]:
# Print the  classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       989
           1       0.62      0.67      0.65      1316
           2       1.00      0.61      0.76      3497
           3       1.00      0.80      0.89      7660

   micro avg       0.95      0.75      0.84     13462
   macro avg       0.90      0.77      0.82     13462
weighted avg       0.96      0.75      0.84     13462
 samples avg       0.75      0.75      0.75     13462



In [38]:
# MODULE DIRECTIONS FOR CONFUSION MATRIX

# Display the confusion matrix
#cm = confusion_matrix(y_test, y_pred)
#cm_df = pd.DataFrame(
    #cm, index=["Actual high_risk", "Actual low_risk"], columns=["Predicted high_risk", "Predicted low_risk"])
#cm_df

ValueError: multilabel-indicator is not supported