In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
import requests
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [4]:
# Load the data
df = pd.read_csv('TransformGainesville_CrimesALL.csv', index_col = 0)

df.head()

Unnamed: 0,ID,CFS,CFS_Type,Classification,offenseDate,offenseHour,offenseDOW,reportDate,reportHour,reportDOW,...,longitude,location,date,month,day,year,fullDate,DOW,time,moonPhase
0,221009267,Domestic Aggravated Battery,Battery,Person,2021-07-04 21:24:00,21,Sunday,7/4/2021 22:37,22,Sunday,...,-82.326069,POINT (-82.326069 29.688534000000004),,,,,,,,
1,221009608,Domestic Aggravated Battery,Battery,Person,2021-07-11 22:54:00,22,Sunday,7/11/2021 22:55,22,Sunday,...,-82.387148,POINT (-82.387148 29.632687000000004),,,,,,,,
2,221009391,Domestic Aggravated Battery,Battery,Person,2021-07-07 19:12:00,19,Wednesday,7/7/2021 19:13,19,Wednesday,...,-82.29939,POINT (-82.29939 29.640249),,,,,,,,
3,221009308,Domestic Aggravated Battery,Battery,Person,2021-07-06 07:26:00,7,Tuesday,7/6/2021 7:27,7,Tuesday,...,-82.398242,POINT (-82.398242 29.641625),,,,,,,,
4,221011388,Domestic Aggravated Battery,Battery,Person,2021-08-16 17:25:00,17,Monday,8/16/2021 17:26,17,Monday,...,-82.326069,POINT (-82.326069 29.688534000000004),,,,,,,,


In [5]:
# Change name of column "offenseDate" to date
df.rename(columns={'offenseDate': 'Date'}, inplace=True)
df.head()

Unnamed: 0,ID,CFS,CFS_Type,Classification,Date,offenseHour,offenseDOW,reportDate,reportHour,reportDOW,...,longitude,location,date,month,day,year,fullDate,DOW,time,moonPhase
0,221009267,Domestic Aggravated Battery,Battery,Person,2021-07-04 21:24:00,21,Sunday,7/4/2021 22:37,22,Sunday,...,-82.326069,POINT (-82.326069 29.688534000000004),,,,,,,,
1,221009608,Domestic Aggravated Battery,Battery,Person,2021-07-11 22:54:00,22,Sunday,7/11/2021 22:55,22,Sunday,...,-82.387148,POINT (-82.387148 29.632687000000004),,,,,,,,
2,221009391,Domestic Aggravated Battery,Battery,Person,2021-07-07 19:12:00,19,Wednesday,7/7/2021 19:13,19,Wednesday,...,-82.29939,POINT (-82.29939 29.640249),,,,,,,,
3,221009308,Domestic Aggravated Battery,Battery,Person,2021-07-06 07:26:00,7,Tuesday,7/6/2021 7:27,7,Tuesday,...,-82.398242,POINT (-82.398242 29.641625),,,,,,,,
4,221011388,Domestic Aggravated Battery,Battery,Person,2021-08-16 17:25:00,17,Monday,8/16/2021 17:26,17,Monday,...,-82.326069,POINT (-82.326069 29.688534000000004),,,,,,,,


In [6]:
df['Date'] = pd.to_datetime(df.Date, format='%Y-%m-%d')

In [7]:
df['Date'] = pd.to_datetime(df['Date']).dt.date


In [8]:
url = 'https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/Gainesville,FL/2018-01-01/2021-12-31?unitGroup=us&key=JVFDPCT4LWWPVKADN783XGRVA&include=days&elements=datetime,moonphase'

In [9]:
r = requests.get(url)

In [10]:
json = r.json()
json

{'queryCost': 1461,
 'latitude': 29.652,
 'longitude': -82.3228,
 'resolvedAddress': 'Gainesville, FL, United States',
 'address': 'Gainesville,FL',
 'timezone': 'America/New_York',
 'tzoffset': -5.0,
 'days': [{'datetime': '2018-01-01', 'moonphase': 0.5},
  {'datetime': '2018-01-02', 'moonphase': 0.5},
  {'datetime': '2018-01-03', 'moonphase': 0.52},
  {'datetime': '2018-01-04', 'moonphase': 0.54},
  {'datetime': '2018-01-05', 'moonphase': 0.58},
  {'datetime': '2018-01-06', 'moonphase': 0.63},
  {'datetime': '2018-01-07', 'moonphase': 0.68},
  {'datetime': '2018-01-08', 'moonphase': 0.73},
  {'datetime': '2018-01-09', 'moonphase': 0.78},
  {'datetime': '2018-01-10', 'moonphase': 0.83},
  {'datetime': '2018-01-11', 'moonphase': 0.88},
  {'datetime': '2018-01-12', 'moonphase': 0.92},
  {'datetime': '2018-01-13', 'moonphase': 0.95},
  {'datetime': '2018-01-14', 'moonphase': 0.98},
  {'datetime': '2018-01-15', 'moonphase': 0.99},
  {'datetime': '2018-01-16', 'moonphase': 1.0},
  {'dateti

In [11]:
json.keys()

dict_keys(['queryCost', 'latitude', 'longitude', 'resolvedAddress', 'address', 'timezone', 'tzoffset', 'days'])

In [12]:
moonphases_df = pd.DataFrame(json['days'])
moonphases_df

Unnamed: 0,datetime,moonphase
0,2018-01-01,0.50
1,2018-01-02,0.50
2,2018-01-03,0.52
3,2018-01-04,0.54
4,2018-01-05,0.58
...,...,...
1456,2021-12-27,0.78
1457,2021-12-28,0.83
1458,2021-12-29,0.88
1459,2021-12-30,0.93


In [13]:
json['days']

[{'datetime': '2018-01-01', 'moonphase': 0.5},
 {'datetime': '2018-01-02', 'moonphase': 0.5},
 {'datetime': '2018-01-03', 'moonphase': 0.52},
 {'datetime': '2018-01-04', 'moonphase': 0.54},
 {'datetime': '2018-01-05', 'moonphase': 0.58},
 {'datetime': '2018-01-06', 'moonphase': 0.63},
 {'datetime': '2018-01-07', 'moonphase': 0.68},
 {'datetime': '2018-01-08', 'moonphase': 0.73},
 {'datetime': '2018-01-09', 'moonphase': 0.78},
 {'datetime': '2018-01-10', 'moonphase': 0.83},
 {'datetime': '2018-01-11', 'moonphase': 0.88},
 {'datetime': '2018-01-12', 'moonphase': 0.92},
 {'datetime': '2018-01-13', 'moonphase': 0.95},
 {'datetime': '2018-01-14', 'moonphase': 0.98},
 {'datetime': '2018-01-15', 'moonphase': 0.99},
 {'datetime': '2018-01-16', 'moonphase': 1.0},
 {'datetime': '2018-01-17', 'moonphase': 0.0},
 {'datetime': '2018-01-18', 'moonphase': 0.01},
 {'datetime': '2018-01-19', 'moonphase': 0.03},
 {'datetime': '2018-01-20', 'moonphase': 0.05},
 {'datetime': '2018-01-21', 'moonphase': 0.0

In [14]:
moonphases_df.dtypes

datetime      object
moonphase    float64
dtype: object

In [15]:
bins = [.0 , .25 , .50 , .75, 1 ]

In [16]:
moonphases_df['moonPhases_cat'] = pd.cut(moonphases_df['moonphase'], bins)



In [17]:
moonphases_df

Unnamed: 0,datetime,moonphase,moonPhases_cat
0,2018-01-01,0.50,"(0.25, 0.5]"
1,2018-01-02,0.50,"(0.25, 0.5]"
2,2018-01-03,0.52,"(0.5, 0.75]"
3,2018-01-04,0.54,"(0.5, 0.75]"
4,2018-01-05,0.58,"(0.5, 0.75]"
...,...,...,...
1456,2021-12-27,0.78,"(0.75, 1.0]"
1457,2021-12-28,0.83,"(0.75, 1.0]"
1458,2021-12-29,0.88,"(0.75, 1.0]"
1459,2021-12-30,0.93,"(0.75, 1.0]"


In [18]:
# Labels = 1 = New Moon , 2 = Full Moon, 3 Third Quarter =  4 = back to new moon

labels =[1,2,3,4]

moonphases_df['moonPhases'] = pd.cut(moonphases_df['moonphase'], bins,labels=labels)
print (moonphases_df)


        datetime  moonphase moonPhases_cat moonPhases
0     2018-01-01       0.50    (0.25, 0.5]          2
1     2018-01-02       0.50    (0.25, 0.5]          2
2     2018-01-03       0.52    (0.5, 0.75]          3
3     2018-01-04       0.54    (0.5, 0.75]          3
4     2018-01-05       0.58    (0.5, 0.75]          3
...          ...        ...            ...        ...
1456  2021-12-27       0.78    (0.75, 1.0]          4
1457  2021-12-28       0.83    (0.75, 1.0]          4
1458  2021-12-29       0.88    (0.75, 1.0]          4
1459  2021-12-30       0.93    (0.75, 1.0]          4
1460  2021-12-31       0.97    (0.75, 1.0]          4

[1461 rows x 4 columns]


In [19]:
moonphases_df

Unnamed: 0,datetime,moonphase,moonPhases_cat,moonPhases
0,2018-01-01,0.50,"(0.25, 0.5]",2
1,2018-01-02,0.50,"(0.25, 0.5]",2
2,2018-01-03,0.52,"(0.5, 0.75]",3
3,2018-01-04,0.54,"(0.5, 0.75]",3
4,2018-01-05,0.58,"(0.5, 0.75]",3
...,...,...,...,...
1456,2021-12-27,0.78,"(0.75, 1.0]",4
1457,2021-12-28,0.83,"(0.75, 1.0]",4
1458,2021-12-29,0.88,"(0.75, 1.0]",4
1459,2021-12-30,0.93,"(0.75, 1.0]",4


In [20]:
new_moonphasesdf = moonphases_df[['datetime','moonPhases']]
new_moonphasesdf

Unnamed: 0,datetime,moonPhases
0,2018-01-01,2
1,2018-01-02,2
2,2018-01-03,3
3,2018-01-04,3
4,2018-01-05,3
...,...,...
1456,2021-12-27,4
1457,2021-12-28,4
1458,2021-12-29,4
1459,2021-12-30,4


In [21]:
new_moonphasesdf.rename(columns={'datetime': 'Date'}, inplace=True)
new_moonphasesdf.head()

Unnamed: 0,Date,moonPhases
0,2018-01-01,2
1,2018-01-02,2
2,2018-01-03,3
3,2018-01-04,3
4,2018-01-05,3


In [22]:
new_moonphasesdf['Date'] = pd.to_datetime(new_moonphasesdf.Date, format='%Y-%m-%d')


In [23]:
new_moonphasesdf['Date'] = pd.to_datetime(new_moonphasesdf['Date']).dt.date

In [24]:
new_moonphasesdf

Unnamed: 0,Date,moonPhases
0,2018-01-01,2
1,2018-01-02,2
2,2018-01-03,3
3,2018-01-04,3
4,2018-01-05,3
...,...,...
1456,2021-12-27,4
1457,2021-12-28,4
1458,2021-12-29,4
1459,2021-12-30,4


In [25]:
df.columns

Index(['ID', 'CFS', 'CFS_Type', 'Classification', 'Date', 'offenseHour',
       'offenseDOW', 'reportDate', 'reportHour', 'reportDOW', 'city', 'state',
       'address', 'latitude', 'longitude', 'location', 'date ', 'month', 'day',
       'year', 'fullDate', 'DOW', 'time ', 'moonPhase'],
      dtype='object')

In [26]:
#Find out unique names of CFS, CFS_type, and moonPhase

In [27]:
df.Classification.unique()

array(['Person', 'Other ', 'Property', 'Government'], dtype=object)

In [28]:
df.CFS.unique()

array(['Domestic Aggravated Battery', 'Domestic Simple Battery',
       'Domestic Disturbance', 'Burglary to Residence', 'Fire',
       'Theft Grand - Retail', 'Driving Under the Influence',
       'Death Investigation', 'Robbery (armed)', 'Theft Petit - Other',
       'Weapons Violation (possessing/concealing)',
       'Drug Poss. of Controlled Substance', 'Damage to Property',
       'Robbery (strong Arm)', 'Theft Petit - Retail',
       'Suspicious Incident', 'Domestic Violence Injunction Violation',
       'Stolen Vehicle (auto)', 'Trespass', 'Burglary to Conveyance',
       'Loitering and Prowling', 'Stolen Vehicle (motorcycle)',
       'Identity Theft', 'Disturbance',
       'Fraud (obtain Money/property by False Pretense)',
       'Battery (simple)', 'Violation of Temporary Injunction',
       'Criminal Mischief (misdemeanor)', 'Robbery',
       'Domestic Battery by Strangulation', 'Making False 911 Call',
       'Fraud (credit Card/atm)', 'Found Property', 'Disorderly Conduct',

In [29]:
df.CFS_Type.unique()

array(['Battery', 'Quality of Life', 'Theft', 'Other', 'Alcohol',
       'Death Inv/Homicide', 'Gov Reg Vio', 'Drugs', 'Fraud', 'Assault',
       'Suicide'], dtype=object)

In [30]:
df.moonPhase.unique()

array([nan, 'Full Moon ', 'First Quarter', 'New  Moon ', 'Third Quarter'],
      dtype=object)

In [31]:
# Assign the uncessary columns to variable and drop

unused_cols = ['CFS','reportDate', 'offenseHour','reportHour','reportDOW','city','state','address','longitude','latitude','day','location','date ','month','year', 'moonPhase']

df = df.drop(unused_cols, axis = 1)

df.head(50)

Unnamed: 0,ID,CFS_Type,Classification,Date,offenseDOW,fullDate,DOW,time
0,221009267,Battery,Person,2021-07-04,Sunday,,,
1,221009608,Battery,Person,2021-07-11,Sunday,,,
2,221009391,Battery,Person,2021-07-07,Wednesday,,,
3,221009308,Battery,Person,2021-07-06,Tuesday,,,
4,221011388,Battery,Person,2021-08-16,Monday,,,
5,221011524,Battery,Person,2021-08-19,Thursday,,,
6,221012057,Battery,Person,2021-08-28,Saturday,,,
7,221012231,Battery,Person,2021-08-31,Tuesday,,,
8,221012341,Battery,Person,2021-09-02,Thursday,,,
9,221013249,Battery,Person,2021-09-19,Sunday,,,


In [32]:
# Gather CFS_Type and moonPhase columns for get dummies

convert_cols = df["CFS_Type"]
convert_cols

0            Battery
1            Battery
2            Battery
3            Battery
4            Battery
            ...     
44868        Battery
44869          Other
44870          Theft
44871    Gov Reg Vio
44872          Theft
Name: CFS_Type, Length: 44873, dtype: object

In [33]:
# Convert CFS_Type and moonPhase with get dummies in dataframe

new_columns_df = pd.get_dummies(df['CFS_Type'])
new_columns_df

Unnamed: 0,Alcohol,Assault,Battery,Death Inv/Homicide,Drugs,Fraud,Gov Reg Vio,Other,Quality of Life,Suicide,Theft
0,0,0,1,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
44868,0,0,1,0,0,0,0,0,0,0,0
44869,0,0,0,0,0,0,0,1,0,0,0
44870,0,0,0,0,0,0,0,0,0,0,1
44871,0,0,0,0,0,0,1,0,0,0,0


In [34]:
# Go back and take out the original CFS_Type columns and moonPhase  in the original df

df = df.drop(['CFS_Type'], axis = 1)


In [35]:
new_df = pd.merge(df,new_columns_df, left_index=True, right_index = True )
new_df

Unnamed: 0,ID,Classification,Date,offenseDOW,fullDate,DOW,time,Alcohol,Assault,Battery,Death Inv/Homicide,Drugs,Fraud,Gov Reg Vio,Other,Quality of Life,Suicide,Theft
0,221009267,Person,2021-07-04,Sunday,,,,0,0,1,0,0,0,0,0,0,0,0
1,221009608,Person,2021-07-11,Sunday,,,,0,0,1,0,0,0,0,0,0,0,0
2,221009391,Person,2021-07-07,Wednesday,,,,0,0,1,0,0,0,0,0,0,0,0
3,221009308,Person,2021-07-06,Tuesday,,,,0,0,1,0,0,0,0,0,0,0,0
4,221011388,Person,2021-08-16,Monday,,,,0,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44868,219004357,Person,2019-03-14,Thursday,,,,0,0,1,0,0,0,0,0,0,0,0
44869,218014815,Property,2018-08-15,Wednesday,,,,0,0,0,0,0,0,0,1,0,0,0
44870,218019557,Property,2018-10-23,Tuesday,,,,0,0,0,0,0,0,0,0,0,0,1
44871,218017216,Government,2018-09-18,Tuesday,,,,0,0,0,0,0,0,1,0,0,0,0


In [36]:
# Join the new_moonphasesdf to df on datetime 

dfcomplete = pd.merge(new_df,new_moonphasesdf, left_on = "Date", right_on = 'Date')
dfcomplete.head

<bound method NDFrame.head of               ID Classification        Date offenseDOW fullDate  DOW time   \
0      221009267         Person  2021-07-04     Sunday      NaN  NaN   NaN   
1      221009266         Person  2021-07-04     Sunday      NaN  NaN   NaN   
2      221009262         Person  2021-07-04     Sunday      NaN  NaN   NaN   
3      221009243         Person  2021-07-04     Sunday      NaN  NaN   NaN   
4      221009236         Person  2021-07-04     Sunday      NaN  NaN   NaN   
...          ...            ...         ...        ...      ...  ...   ...   
44868  221010748       Property  2021-07-28  Wednesday      NaN  NaN   NaN   
44869  221012007         Other   2021-07-28  Wednesday      NaN  NaN   NaN   
44870  221010745       Property  2021-07-27    Tuesday      NaN  NaN   NaN   
44871  221013954         Other   2021-07-27    Tuesday      NaN  NaN   NaN   
44872  221013953         Other   2021-07-27    Tuesday      NaN  NaN   NaN   

       Alcohol  Assault  Battery 

In [37]:
y = pd.get_dummies(new_df['Classification'])


X = dfcomplete[['Battery', 'Quality of Life', 'Theft', 'Other', 'Alcohol',
       'Death Inv/Homicide', 'Gov Reg Vio', 'Drugs', 'Fraud', 'Assault',
       'Suicide','moonPhases']]

In [38]:
y.value_counts()

Government  Other   Person  Property
0           0       0       1           25223
                    1       0           11705
            1       0       0            4520
1           0       0       0            3425
dtype: int64

In [39]:
X.describe()

Unnamed: 0,Battery,Quality of Life,Theft,Other,Alcohol,Death Inv/Homicide,Gov Reg Vio,Drugs,Fraud,Assault,Suicide
count,44873.0,44873.0,44873.0,44873.0,44873.0,44873.0,44873.0,44873.0,44873.0,44873.0,44873.0
mean,0.123237,0.209948,0.39077,0.108551,0.007867,0.013438,0.026319,0.041785,0.057585,0.02048,2.2e-05
std,0.328712,0.407276,0.487928,0.311078,0.088346,0.115142,0.160083,0.200099,0.232959,0.141637,0.004721
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [40]:
X.shape

(44873, 12)

In [41]:
y.shape

(44873, 4)

In [42]:
# You can modify test_size to account for 30% testing/70% training
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .5)

In [43]:
# Resample the training data with the RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)

In [44]:
clf.fit(X_train, y_train)

RandomForestClassifier()

In [45]:
y_pred = clf.predict(X_test)

In [46]:
# https://scikit-learn.org/stable/modules/multiclass.html

In [47]:
from sklearn import metrics

print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.5478450773276284


In [48]:
#Confusion Matrix

from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test.values.argmax(axis=1), y_pred.argmax(axis=1))
print(cm)

[[   67     0     0  1629]
 [  113     0     4  2153]
 [  222     0     7  5555]
 [  384     0    18 12285]]


In [49]:
# Print the  classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1696
           1       0.00      0.00      0.00      2270
           2       0.24      0.00      0.00      5784
           3       0.57      0.97      0.72     12687

   micro avg       0.57      0.55      0.56     22437
   macro avg       0.20      0.24      0.18     22437
weighted avg       0.38      0.55      0.41     22437
 samples avg       0.55      0.55      0.55     22437



In [50]:
# MODULE DIRECTIONS FOR CONFUSION MATRIX

# Display the confusion matrix
#cm = confusion_matrix(y_test, y_pred)
#cm_df = pd.DataFrame(
    #cm, index=["Actual high_risk", "Actual low_risk"], columns=["Predicted high_risk", "Predicted low_risk"])
#cm_df

In [51]:
# We need data with every single moon phase attached to each event to have a better data set. The outputs from the classification report tell you that because its getting 1.00 precision on 3/4 data points because theres only 8 moon phases for 40k events.  