<a href="https://colab.research.google.com/github/hastyjr/Group_2_Final_project/blob/mschimmy/Project_Machine_Learning_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Machine Learning Model Mockup

## Setup and Imports

In [None]:
# Import Dependencies
import csv
import io
import os
import pandas as pd
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
from m_config import db_password

In [None]:
# Read the `crime` table from PostgreSQL into a pandas DataFrame.
# The password is taken from `db_password`, imported from m_config in the imports cell.
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/crime_data"

# Create database engine
engine = create_engine(db_string)

# Open the connection in a context manager so it is always closed,
# even if the query raises (previously the connection was never closed).
with engine.connect() as dbConnection:
    # Read data from PostgreSQL database table and load into a DataFrame instance
    crime_df = pd.read_sql("SELECT * FROM crime", dbConnection)

# Show DataFrame
crime_df.head()

Unnamed: 0,index,Incident Number,Highest Offense Description,Highest Offense Code,Family Violence,Occurred Date Time,Occurred Date,Occurred Time,Report Date Time,Report Date,...,Census Tract,Clearance Status,Clearance Date,UCR Category,Category Description,X-coordinate,Y-coordinate,Latitude,Longitude,Location
0,0,2017471000.0,THEFT,600,N,2/16/17 18:00,2/16/17,1800,2/16/17 18:22,2/16/17,...,21.0,N,3/29/17,23H,Theft,3128234.0,3128234.0,30.274788,-97.698514,"(30.27478825, -97.69851396)"
1,1,20172170000.0,THEFT BY SHOPLIFTING,607,N,8/5/17 18:00,8/5/17,1800,8/5/17 18:00,8/5/17,...,19.1,C,8/5/17,23C,Theft,3094135.0,3094135.0,30.257886,-97.807007,"(30.25788603, -97.80700704)"
2,5,20205050000.0,THEFT BY SHOPLIFTING,607,N,11/20/20 13:30,11/20/20,1330,11/20/20 15:22,11/20/20,...,24.37,N,11/23/20,23C,Theft,3099306.0,3099306.0,30.162911,-97.793123,"(30.16291061, -97.79312325)"
3,6,20191440000.0,AUTO THEFT,700,N,5/23/19 20:00,5/23/19,2000,5/24/19 13:43,5/24/19,...,6.0,N,7/2/19,240,Auto Theft,3112186.0,3112186.0,30.284284,-97.749112,"(30.28428417, -97.74911194)"
4,9,20175050000.0,BURGLARY OF VEHICLE,601,N,12/19/17 22:00,12/19/17,2200,12/20/17 6:26,12/20/17,...,23.13,N,12/21/17,23F,Theft,3118326.0,3118326.0,30.228133,-97.731168,"(30.22813316, -97.73116767)"


In [None]:
# Check that all rows have been imported to the DataFrame:
# count() reports the number of non-null entries per column, so every column
# should show the same total if the load was complete.
crime_df.count()

index                          200912
Incident Number                200912
Highest Offense Description    200912
Highest Offense Code           200912
Family Violence                200912
Occurred Date Time             200912
Occurred Date                  200912
Occurred Time                  200912
Report Date Time               200912
Report Date                    200912
Report Time                    200912
Location Type                  200912
Address                        200912
Zip Code                       200912
Council District               200912
APD Sector                     200912
APD District                   200912
PRA                            200912
Census Tract                   200912
Clearance Status               200912
Clearance Date                 200912
UCR Category                   200912
Category Description           200912
X-coordinate                   200912
Y-coordinate                   200912
Latitude                       200912
Longitude   


## Exploratory Data Analysis

In [None]:
# List the column labels to see which fields were loaded from the `crime` table
crime_df.columns

Index(['index', 'Incident Number', 'Highest Offense Description',
       'Highest Offense Code', 'Family Violence', 'Occurred Date Time',
       'Occurred Date', 'Occurred Time', 'Report Date Time', 'Report Date',
       'Report Time', 'Location Type', 'Address', 'Zip Code',
       'Council District', 'APD Sector', 'APD District', 'PRA', 'Census Tract',
       'Clearance Status', 'Clearance Date', 'UCR Category',
       'Category Description', 'X-coordinate', 'Y-coordinate', 'Latitude',
       'Longitude', 'Location'],
      dtype='object')

In [None]:
# Show each column's dtype; the `object` columns are the ones that are
# one-hot encoded in the Preprocessing section below.
crime_df.dtypes

index                            int64
Incident Number                float64
Highest Offense Description     object
Highest Offense Code             int64
Family Violence                 object
Occurred Date Time              object
Occurred Date                   object
Occurred Time                    int64
Report Date Time                object
Report Date                     object
Report Time                      int64
Location Type                   object
Address                         object
Zip Code                       float64
Council District               float64
APD Sector                      object
APD District                    object
PRA                             object
Census Tract                   float64
Clearance Status                object
Clearance Date                  object
UCR Category                    object
Category Description            object
X-coordinate                   float64
Y-coordinate                   float64
Latitude                 

In [None]:
# Determine the number of unique values in each column.
# High-cardinality object columns will each expand into that many one-hot columns later.
crime_df.nunique()

index                          200912
Incident Number                200904
Highest Offense Description        51
Highest Offense Code               44
Family Violence                     2
Occurred Date Time             148024
Occurred Date                    2099
Occurred Time                    1440
Report Date Time               191584
Report Date                      2099
Report Time                      1440
Location Type                      45
Address                         52315
Zip Code                           50
Council District                   10
APD Sector                         12
APD District                       15
PRA                               500
Census Tract                      236
Clearance Status                    3
Clearance Date                   2118
UCR Category                       14
Category Description                7
X-coordinate                    34421
Y-coordinate                    34421
Latitude                        51434
Longitude   

In [None]:
# Report how many missing values each column contains
null_counts = crime_df.isnull().sum()
for column, n_missing in null_counts.items():
    print(f"Column {column} has {n_missing} null values")

Column index has 0 null values
Column Incident Number has 0 null values
Column Highest Offense Description has 0 null values
Column Highest Offense Code has 0 null values
Column Family Violence has 0 null values
Column Occurred Date Time has 0 null values
Column Occurred Date has 0 null values
Column Occurred Time has 0 null values
Column Report Date Time has 0 null values
Column Report Date has 0 null values
Column Report Time has 0 null values
Column Location Type has 0 null values
Column Address has 0 null values
Column Zip Code has 0 null values
Column Council District has 0 null values
Column APD Sector has 0 null values
Column APD District has 0 null values
Column PRA has 0 null values
Column Census Tract has 0 null values
Column Clearance Status has 0 null values
Column Clearance Date has 0 null values
Column UCR Category has 0 null values
Column Category Description has 0 null values
Column X-coordinate has 0 null values
Column Y-coordinate has 0 null values
Column Latitude has

In [None]:
# Count rows that are exact duplicates of an earlier row
n_duplicates = crime_df.duplicated().sum()
print(f"Duplicate entries: {n_duplicates}")

Duplicate entries: 0


In [None]:
# Columns that are unnecessary or redundant for modelling and are removed here
redundant_columns = [
    "index",
    "Highest Offense Description",
    "Occurred Date Time",
    "Report Date Time",
    "Address",
    "Category Description",
    "X-coordinate",
    "Y-coordinate",
    "Location",
]
crime_df = crime_df.drop(columns=redundant_columns)

In [None]:
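# TODO: disabled -- `County` is not among the columns reported by crime_df.columns,
# so the lines below would fail if re-enabled as-is. Point them at an existing
# column (or add a county field to the table) before uncommenting.
# Print out the County value counts
# county_counts = crime_df.County.value_counts()
# county_counts

# Visualize value counts
# county_counts.plot.density()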

## Preprocessing

In [None]:
# Collect the names of all object-dtype (categorical) columns
crime_cat = [name for name, dtype in crime_df.dtypes.items() if dtype == "object"]

In [None]:
# Create a OneHotEncoder instance.
# The encoder is left at its default (sparse) output and densified with .toarray():
# the `sparse=` keyword was renamed to `sparse_output=` and later removed in
# scikit-learn, so avoiding the keyword keeps this cell working across versions.
enc = OneHotEncoder()

# Fit and transform the OneHotEncoder using the categorical variables list
encoded_values = enc.fit_transform(crime_df[crime_cat]).toarray()

# Reuse crime_df's index so the index-based merge in the next cell pairs each
# encoded row with the row it came from, whatever index crime_df carries.
encode_df = pd.DataFrame(encoded_values, index=crime_df.index)

# Add the encoded variable names to the DataFrame.
# get_feature_names() has been removed from scikit-learn; get_feature_names_out()
# is its replacement and yields the same "<column>_<category>" labels.
encode_df.columns = enc.get_feature_names_out(crime_cat)



In [None]:
# Merge one-hot encoded features (matched on the row index) and drop the originals
crime_df = crime_df.merge(encode_df, left_index=True, right_index=True)

# Use the `columns=` keyword: passing the axis positionally (`drop(crime_cat, 1)`)
# triggers a FutureWarning and is rejected by pandas 2.x.
crime_df = crime_df.drop(columns=crime_cat)
crime_df.head()

  crime_df = crime_df.drop(crime_cat, 1)


Unnamed: 0,Incident Number,Highest Offense Code,Occurred Time,Report Time,Zip Code,Council District,Census Tract,Latitude,Longitude,Family Violence_N,...,UCR Category_220,UCR Category_23A,UCR Category_23B,UCR Category_23C,UCR Category_23D,UCR Category_23E,UCR Category_23F,UCR Category_23G,UCR Category_23H,UCR Category_240
0,2017471000.0,600,1800,1822,78721.0,1.0,21.0,30.274788,-97.698514,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,20172170000.0,607,1800,1800,78746.0,8.0,19.1,30.257886,-97.807007,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,20205050000.0,607,1330,1522,78748.0,5.0,24.37,30.162911,-97.793123,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,20191440000.0,700,2000,1343,78705.0,9.0,6.0,30.284284,-97.749112,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,20175050000.0,601,2200,626,78741.0,3.0,23.13,30.228133,-97.731168,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [None]:
# Split the preprocessed data into the features and target arrays.
# The target is the one-hot "Family Violence_Y" column; both Family Violence
# indicator columns are removed from the features so the label cannot leak in.
y = crime_df["Family Violence_Y"].values

# `columns=` replaces the positional axis argument, which pandas 2.x no longer accepts
X = crime_df.drop(columns=["Family Violence_Y", "Family Violence_N"]).values

# Split the preprocessed data into training and testing datasets.
# stratify=y keeps the class ratio the same in both splits; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [None]:
# Standardise the features: the scaler is fitted on the training split only
# and then applied to both the training and the testing split.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

## Logistic Regression Model

In [None]:
# Logistic regression classifier: L-BFGS solver, at most 200 iterations, fixed seed
log_classifier = LogisticRegression(
    solver="lbfgs",
    max_iter=200,
    random_state=1,
)

In [None]:
# Train the logistic regression model on the training split.
# NOTE(review): this fits on the unscaled X_train even though X_train_scaled was
# prepared in the preprocessing section. If this is switched to the scaled
# data, the evaluation cell must predict on X_test_scaled as well -- change both together.
log_classifier.fit(X_train, y_train)

In [None]:
# Evaluate the logistic regression model on the held-out test split
y_pred = log_classifier.predict(X_test)

# Preview predictions next to the true labels.
# display() (provided by the IPython kernel) is used because a notebook only
# renders a cell's *last* bare expression; mid-cell expressions are discarded.
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
display(results.head())

# Calculate the accuracy score
print(f"Logistic regression model accuracy: {accuracy_score(y_test, y_pred):.3f}")

# Generate confusion matrix (rows: actual class, columns: predicted class)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
print("Confusion Matrix")
display(cm_df)

# Generate classification report
print("Classification Report")
print(classification_report(y_test, y_pred))

In [None]:
# Visualize the predictions.
# X holds one column per feature and y_pred only covers the test split, so
# X cannot be scattered or plotted against y / y_pred directly (the sizes do not match).
# Instead, plot how the predicted class-1 probability is distributed for
# each actual class of the test split.
class1_probability = log_classifier.predict_proba(X_test)[:, 1]

fig, ax = plt.subplots()
ax.hist(
    [class1_probability[y_test == 0], class1_probability[y_test == 1]],
    bins=20,
    label=["Actual 0", "Actual 1"],
)
ax.set(
    title="Logistic regression: predicted probability by actual class",
    xlabel="Predicted probability of class 1",
    ylabel="Number of test samples",
)
ax.legend()
plt.show()

In [None]:
# Print the slope (one coefficient per feature) and the y-intercept of the
# fitted logistic regression. The original `print(model.)` was a syntax error
# and referred to an undefined name.
print(log_classifier.coef_)
print(log_classifier.intercept_)

## Random Forest Model

In [None]:
# Random forest classifier: 128 trees, fixed seed
rf_model = RandomForestClassifier(
    n_estimators=128,
    random_state=1,
)

In [None]:
# Train the random forest on the standardised training features.
# The result of fit() is assigned back to rf_model, so later cells keep using the same name.
rf_model = rf_model.fit(X_train_scaled, y_train)

In [None]:
# Evaluate the random forest on the held-out (standardised) test split
y_pred = rf_model.predict(X_test_scaled)

# Preview predictions next to the true labels.
# display() (provided by the IPython kernel) is used because a notebook only
# renders a cell's *last* bare expression; mid-cell expressions are discarded.
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
display(results.head())

# Calculate the accuracy score
print(f"Random forest predictive accuracy: {accuracy_score(y_test, y_pred):.3f}")

# Generate confusion matrix (rows: actual class, columns: predicted class)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
print("Confusion Matrix")
display(cm_df)

# Generate classification report
print("Classification Report")
print(classification_report(y_test, y_pred))

In [None]:
# Calculate feature importance in the Random Forest Model.
# The fitted attribute is `feature_importances_` (plural); the previous
# spelling `feature_importance_` raises AttributeError.
importances = rf_model.feature_importances_
importances

In [None]:
# Sort features by their importance (most important first).
# X is a plain NumPy array and has no `.columns`, so the feature names are
# rebuilt from crime_df by removing the same two target columns that were
# dropped when X was created; the remaining order matches X's column order.
feature_names = crime_df.columns.drop(["Family Violence_Y", "Family Violence_N"])
sorted(zip(rf_model.feature_importances_, feature_names), reverse=True)

In [None]:
# Pearson Correlation

## Deep Learning Model

In [None]:
# Seed TensorFlow so weight initialisation is repeatable between runs, in line
# with the random_state=1 used for the train/test split and the other models.
tf.random.set_seed(1)

# Define the model: two small ReLU hidden layers feeding a single sigmoid output unit
number_input_features = len(X_train[0])
hidden_nodes_layer1 = 6
hidden_nodes_layer2 = 5

nn = tf.keras.models.Sequential()

# First hidden layer (also declares the input width)
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer: one sigmoid unit, matching the binary target
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

In [None]:
# Configure training: binary cross-entropy loss, Adam optimizer, accuracy tracked as a metric
nn.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [None]:
# Train the network for 100 epochs; the returned History object is kept in
# fit_model so the per-epoch metrics can be plotted in the next cell.
# NOTE(review): this trains on the unscaled X_train although X_train_scaled is
# available; if switched, the evaluation cell must use X_test_scaled too.
# NOTE(review): ModelCheckpoint is imported at the top but no callback is passed here.
fit_model = nn.fit(X_train, y_train, epochs=100)

In [None]:
# Collect the per-epoch training metrics in a DataFrame indexed by epoch number (starting at 1)
training_history = fit_model.history
epoch_numbers = range(1, len(training_history["loss"]) + 1)
history_df = pd.DataFrame(training_history, index=epoch_numbers)

# Loss curve
history_df.plot(y="loss")

# Accuracy curve
history_df.plot(y="accuracy")

In [None]:
# Score the trained network on the test split; evaluate() yields the loss followed by the accuracy metric
test_metrics = nn.evaluate(X_test, y_test, verbose=2)
model_loss, model_accuracy = test_metrics
print(f"Deep Learning Model Loss: {model_loss}, Accuracy {model_accuracy}")