<a href="https://colab.research.google.com/github/hastyjr/Group_2_Final_project/blob/mschimmy/Project_Machine_Learning_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Machine Learning Model Mockup

## Setup and Imports

In [None]:
# Import Dependencies
import csv

import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC

In [None]:
# Load the raw crime reports from the local Resources folder.
# The path has to be a quoted string: unquoted, Python parses it as the
# expression `Resources / Crime_Reports.csv` and raises NameError.
crime_df = pd.read_csv("Resources/Crime_Reports.csv")
crime_df.head()

In [None]:
# Show the column labels of the loaded data set.
# This is the cell's last (and only) expression, so the notebook renders it as the cell output.
crime_df.columns

In [None]:
# Show the data type pandas assigned to each column.
# Columns reported as "object" are the ones later collected as categorical variables.
crime_df.dtypes

In [None]:
# Report how many missing values each column contains.
# The per-column totals are computed once and then printed one line per column.
for column, n_missing in crime_df.isnull().sum().items():
  print(f"Column {column} has {n_missing} null values")

In [None]:
# Discard every row that has at least one missing value.
# (axis/how are spelled out; they are the same values dropna uses by default.)
crime_df = crime_df.dropna(axis="index", how="any")

In [None]:
# Count rows that are exact duplicates of an earlier row
print("Duplicate entries:", crime_df.duplicated().sum())

In [None]:
# Generate the categorical variable list: names of all columns with dtype "object".
# The boolean mask must come from the dtypes Series itself; comparing the whole
# DataFrame to "object" produces a frame-shaped mask that cannot index dtypes.
crime_cat = crime_df.dtypes[crime_df.dtypes == "object"].index.tolist()

In [None]:
# Check the number of unique values in each categorical column.
# Select the columns by the categorical name list; indexing the frame with
# itself (crime_df[crime_df]) is not a column selection.
crime_df[crime_cat].nunique()

In [None]:
# Print out the County value counts.
# Only the last expression of a cell is rendered, so the counts are printed explicitly.
county_counts = crime_df.County.value_counts()
print(county_counts)

# Visualize the distribution of the value counts
# (the original referenced the misspelled name `country_counts`, which is never defined)
county_counts.plot.density()

In [None]:
# Persist the cleaned data set as a CSV file; the row index is not written out
file_path = "Resources/crime_data_cleaned.csv"
crime_df.to_csv(path_or_buf=file_path, index=False)

## Preprocessing

In [None]:
# Create a OneHotEncoder instance.
# The dense-output flag was renamed across scikit-learn releases
# (`sparse` -> `sparse_output`), so keep the default sparse output and
# densify explicitly with .toarray() below.
enc = OneHotEncoder()

# Fit and transform the OneHotEncoder using the categorical variables list.
# Reuse crime_df's index so each encoded row stays attached to its source row:
# dropna leaves gaps in crime_df's index, and a fresh 0..n-1 index would not
# line up when the two frames are later merged on their indexes.
encode_df = pd.DataFrame(
    enc.fit_transform(crime_df[crime_cat]).toarray(),
    index=crime_df.index,
)

# Add the encoded variable names to the DataFrame
# (get_feature_names was removed from scikit-learn; get_feature_names_out replaces it)
encode_df.columns = enc.get_feature_names_out(crime_cat)

In [None]:
# Merge one-hot encoded features and drop originals.
# The join is on the row index of both frames (`left_index`, not `left_inside`),
# so it assumes encode_df carries the same index labels as crime_df.
crime_df = crime_df.merge(encode_df, left_index=True, right_index=True)
# `columns=` replaces the positional axis argument, which newer pandas no longer accepts
crime_df = crime_df.drop(columns=crime_cat)
crime_df.head()

In [None]:
# Split the preprocessed data into the features and target arrays.
# TODO: replace the "<TARGET VARIABLE NAME>" placeholder with the real target column.
y = crime_df["<TARGET VARIABLE NAME>"].values
# `.values` (not `.value`) returns the NumPy array; `columns=` replaces the
# positional axis argument, which newer pandas no longer accepts.
X = crime_df.drop(columns=["<TARGET VARIABLE NAME>"]).values

# Split the preprocessed data into training and testing datasets,
# stratified on the target so both splits keep the same class proportions
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [None]:
# Standardize the features.
# The scaler is fitted on the training split only and then applied to both splits.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Transform train and test with the same fitted scaler
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

## Logistic Regression Model

In [None]:
# Build the (not yet trained) logistic regression classifier
log_classifier = LogisticRegression(
    max_iter=200,
    solver="lbfgs",
    random_state=1,
)

In [None]:
# Fit the logistic regression classifier on the training split
log_classifier.fit(X=X_train, y=y_train)

In [None]:
# Evaluate the logistic regression model on the test split
# Calculate the accuracy score
y_pred = log_classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
# Only the last expression of a cell is rendered, so intermediate tables are printed explicitly
print(results.head())
print(f"Logistic regression model accuracy: {accuracy_score(y_test, y_pred):.3f}")

# Generate confusion matrix (the row/column labels assume a two-class target)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
print("Confusion Matrix")
print(cm_df)

# Generate classification report
print("Classification Report")
print(classification_report(y_test, y_pred))

In [None]:
# Visualize the predictions against the actual labels of the test split.
# X is the full multi-column feature matrix and the predictions only cover the
# test rows, so X cannot be used as the x-axis; plot per test sample instead.
# Predictions are recomputed here so the figure does not depend on whichever
# model last assigned y_pred.
log_pred = log_classifier.predict(X_test)
sample_idx = range(len(y_test))
plt.scatter(sample_idx, y_test, label="Actual", alpha=0.5)
plt.scatter(sample_idx, log_pred, color="red", marker="x", label="Predicted")
plt.xlabel("Test sample")
plt.ylabel("Class")
plt.title("Logistic regression: predicted vs. actual")
plt.legend()
plt.show()

In [None]:
# Print the fitted coefficients (slopes, one per feature) and the intercept
# of the logistic regression model trained above.
print(f"Coefficients: {log_classifier.coef_}")
print(f"Intercept: {log_classifier.intercept_}")

## Random Forest Model

In [None]:
# Build the (not yet trained) random forest classifier
rf_model = RandomForestClassifier(
    random_state=1,
    n_estimators=128,
)

In [None]:
# Fit the random forest on the scaled training split
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

In [None]:
# Evaluate the random forest model on the scaled test split
# Calculate the accuracy score
y_pred = rf_model.predict(X_test_scaled)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
# Only the last expression of a cell is rendered, so intermediate tables are printed explicitly
print(results.head())
print(f"Random forest predictive accuracy: {accuracy_score(y_test, y_pred):.3f}")

# Generate confusion matrix (the row/column labels assume a two-class target)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
print("Confusion Matrix")
print(cm_df)

# Generate classification report
print("Classification Report")
print(classification_report(y_test, y_pred))

In [None]:
# Calculate feature importance in the Random Forest Model.
# The fitted attribute is spelled `feature_importances_` (plural, trailing underscore).
importances = rf_model.feature_importances_
importances

In [None]:
# Sort features by their importance, most important first.
# X is a NumPy array and carries no column labels, so the feature names are
# taken from the frame X was built from (every column except the target).
feature_names = crime_df.drop(columns=["<TARGET VARIABLE NAME>"]).columns
sorted(zip(rf_model.feature_importances_, feature_names), reverse=True)

## Support Vector Machine

In [None]:
# Build the (not yet trained) support vector classifier with a linear kernel
svm = SVC(kernel="linear")

In [None]:
# Train the model on the scaled training split.
# The evaluation cell predicts on X_test_scaled, so the model has to be fitted
# on features in the same (standardized) scale.
svm.fit(X_train_scaled, y_train)

In [None]:
# Evaluate the SVM model on the scaled test split
# Calculate the accuracy score
y_pred = svm.predict(X_test_scaled)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
# Only the last expression of a cell is rendered, so intermediate tables are printed explicitly
print(results.head())
# The closing quote of this f-string was missing, which made the whole cell a syntax error
print(f"SVM model accuracy: {accuracy_score(y_test, y_pred):.3f}")

# Generate confusion matrix (the row/column labels assume a two-class target)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
print("Confusion Matrix")
print(cm_df)

# Generate classification report
print("Classification Report")
print(classification_report(y_test, y_pred))

## Deep Learning Model

In [None]:
# Network dimensions: one input per feature column, two small hidden layers
hidden_nodes_layer1 = 6
hidden_nodes_layer2 = 5
number_input_features = len(X_train[0])

# Assemble the feed-forward network from its layer list
nn = tf.keras.models.Sequential([
    # First hidden layer (also declares the input width)
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"),
    # Second hidden layer
    tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"),
    # Output layer: a single sigmoid unit
    tf.keras.layers.Dense(units=1, activation="sigmoid"),
])

# Check the structure of the model
nn.summary()

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy as the tracked metric
nn.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Fit the network on the training split for 100 epochs and keep the returned training history
fit_model = nn.fit(x=X_train, y=y_train, epochs=100)

In [None]:
# Score the trained network on the held-out test split and report loss and accuracy
model_loss, model_accuracy = nn.evaluate(
    x=X_test,
    y=y_test,
    verbose=2,
)
print(f"Deep Learning Model Loss: {model_loss}, Accuracy {model_accuracy}")