In [7]:
!pip install folium
!pip install numpy
!pip install findspark
!pip install pandas
!pip install pyarrow
!pip install seaborn
!pip install matplotlib
!pip install scikit-learn


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip

In [None]:
!wget -q http://archive.apache.org/dist/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3.tgz
!tar xf spark-3.5.1-bin-hadoop3.tgz

In [2]:
import os

# Point PySpark at the JVM and at the Spark distribution unpacked beside the
# notebook. SPARK_HOME is stored as an absolute path so it keeps resolving even
# if the working directory changes later in the session.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/default"
os.environ["SPARK_HOME"] = os.path.abspath("spark-3.5.1-bin-hadoop3")

In [3]:
# Make the pyspark package importable from this kernel. Must run before any
# `pyspark` import; presumably relies on the SPARK_HOME value exported in the
# environment cell — confirm against the findspark documentation.
import findspark
findspark.init()

In [8]:

# === SETUP ===
# Marker/legend colour for each crime category; shared by every map below.
crime_color_map = dict(
    Person='red',
    Property='blue',
    Society='orange',
    Unknown='gray',
)

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, udf
from pyspark.sql.types import StringType
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier, MultilayerPerceptronClassifier
from pyspark.ml.clustering import KMeans
from pyspark.ml import Pipeline
from sklearn.metrics import silhouette_score
import pandas as pd
import numpy as np
import folium
from folium.plugins import HeatMap
import zipfile


In [9]:
# Start (or reuse) the Spark session used by the rest of the notebook
spark = (
    SparkSession.builder
    .appName("KC_Crime_Prediction")
    .getOrCreate()
)

# === CRIME TYPE LABELING ===
# Each row is labelled with whichever of the Person / Property / Society columns
# is >= all of the other category columns (Unknown included). The WHEN clauses
# are evaluated in order, so ties resolve Person, then Property, then Society;
# rows matching none of them fall through to 'Unknown'.
crime_type_case = """
    CASE
        WHEN Person >= Property AND Person >= Society AND Person >= Unknown THEN 'Person'
        WHEN Property >= Person AND Property >= Society AND Property >= Unknown THEN 'Property'
        WHEN Society >= Person AND Society >= Property AND Society >= Unknown THEN 'Society'
        ELSE 'Unknown'
    END
"""

# Read the dataset from HDFS (header row, inferred column types) and attach the label.
# Local-file alternative:
# df = spark.read.csv("data/final_crime_data_F.csv", header=True, inferSchema=True)
df = (
    spark.read
    .csv("hdfs://localhost:9010/data/final_crime_data_F.csv", header=True, inferSchema=True)
    .withColumn("Crime_Type", expr(crime_type_case))
)


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/28 00:29:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/04/28 00:29:29 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
                                                                                

In [10]:
# === SPLIT DATA ===
# Train on the 2022-2023 rows, predict for the 2024 rows.
train_df = df.filter(col("Year").isin([2022, 2023]))
predict_df = df.filter(col("Year") == 2024)

# === RANDOM FOREST PIPELINE ===
# Stages: index the string label -> assemble the feature vector -> random forest.
label_indexer = StringIndexer(inputCol="Crime_Type", outputCol="Crime_Label")
feature_cols = ["Lat", "Lon", "Month", "Hour", "Day_of_Week", "Bus_Stop_Distance"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
# A fixed seed makes the forest (and the exported predictions) reproducible
# across runs, consistent with the seeded MLP and KMeans models below.
rf = RandomForestClassifier(
    labelCol="Crime_Label",
    featuresCol="features",
    numTrees=100,
    seed=42,
)
pipeline_rf = Pipeline(stages=[label_indexer, assembler, rf])
model_rf = pipeline_rf.fit(train_df)
predictions_rf = model_rf.transform(predict_df)

# Convert predictions to labels
# The fitted StringIndexer (stage 0) holds the index -> label mapping.
rf_labels = model_rf.stages[0].labels
label_udf = udf(lambda i: rf_labels[int(i)], StringType())
predictions_rf = predictions_rf.withColumn("Predicted_Crime_Type", label_udf(col("prediction")))

# === EXPORT RF PREDICTIONS ===
# Collect features, zip code and predicted label to pandas and write them to CSV.
predict_rf_pd = predictions_rf.select(
    "Lat", "Lon", "Month", "Hour", "Day_of_Week", "Bus_Stop_Distance",
    "Zip Code", "Predicted_Crime_Type",
).toPandas()
predict_rf_pd.to_csv("predicted_crimes_2024_rf.csv", index=False)
print("Random Forest predictions saved to predicted_crimes_2024_rf.csv")



[Stage 19:>                                                         (0 + 1) / 1]

Random Forest predictions saved to predicted_crimes_2024_rf.csv


                                                                                

In [11]:
# === RANDOM FOREST MAP ZIPPING ===
# Render one folium map per month of RF predictions, then bundle the HTML files.
rf_output_dir = "updated_crime_prediction_maps"
os.makedirs(rf_output_dir, exist_ok=True)
rf_map_files = []

# Table of prediction counts: one row per month, one column per predicted type
monthly_crime_counts_rf = (
    predict_rf_pd
    .groupby(["Month", "Predicted_Crime_Type"])
    .size()
    .unstack(fill_value=0)
)

for month in sorted(predict_rf_pd["Month"].unique()):
    month_df = predict_rf_pd[predict_rf_pd["Month"] == month]

    # Centre the map on the mean coordinates of this month's predictions
    map_center = [month_df["Lat"].mean(), month_df["Lon"].mean()]
    crime_map = folium.Map(location=map_center, zoom_start=11)

    # One circle per predicted crime, coloured by predicted type
    for _, row in month_df.iterrows():
        marker = folium.CircleMarker(
            location=[row["Lat"], row["Lon"]],
            tooltip=f"Zip Code: {row['Zip Code']}",
            radius=3,
            color=crime_color_map.get(row["Predicted_Crime_Type"], 'gray'),
            fill=True,
            fill_opacity=0.6
        )
        marker.add_to(crime_map)

    # Legend: one line per crime type with this month's count
    legend_rows = []
    for crime, count in monthly_crime_counts_rf.loc[month].items():
        legend_rows.append(
            f"&nbsp;<i style='color:{crime_color_map.get(crime, 'gray')};'>●</i> {crime}: {count}<br>"
        )
    legend_items = ''.join(legend_rows)
    legend_html = f"""
    <div style="position: fixed;
    bottom: 50px; left: 50px; width: 180px; height: auto;
    border:2px solid grey; z-index:9999; font-size:14px;
    background-color:white; padding: 5px;">
    &nbsp;<b>RF Crime Prediction - {month}</b><br>
    {legend_items}
    </div>
    """
    crime_map.get_root().html.add_child(folium.Element(legend_html))

    map_file = f"{rf_output_dir}/RF_Crime_Map_Month_{month}.html"
    crime_map.save(map_file)
    rf_map_files.append(map_file)

# Archive the maps flat (file names only, no directory prefix)
with zipfile.ZipFile("RF_Crime_Prediction_Maps_2024.zip", 'w') as archive:
    for html_path in rf_map_files:
        archive.write(html_path, arcname=os.path.basename(html_path))
print("Zipped RF maps: RF_Crime_Prediction_Maps_2024.zip")



Zipped RF maps: RF_Crime_Prediction_Maps_2024.zip


In [12]:
# === ZIP-CODE HEATMAP RF ===
# One heat layer per zip code, with the zip code printed at the mean
# coordinates of that zip code's predictions.
zip_grouped = predict_rf_pd.groupby("Zip Code")

m = folium.Map(location=[predict_rf_pd['Lat'].mean(), predict_rf_pd['Lon'].mean()], zoom_start=11)
for zip_code, group in zip_grouped:
    # [lat, lon] pairs for every prediction in this zip code
    heat_data = group[['Lat', 'Lon']].values.tolist()
    HeatMap(heat_data, radius=10, blur=15, max_zoom=1).add_to(m)

    lat_center = group['Lat'].mean()
    lon_center = group['Lon'].mean()
    # The label marker must be attached with add_to(); previously it was
    # constructed and discarded, so no zip-code labels appeared on the map.
    folium.map.Marker(
        [lat_center, lon_center],
        icon=folium.DivIcon(html=f"""<div style='font-size: 12px; color: white;'><b>{zip_code}</b></div>""")
    ).add_to(m)

# Legend: overall count of each predicted crime type
crime_type_counts = predict_rf_pd['Predicted_Crime_Type'].value_counts().to_dict()
legend_items = ''.join([
    f"&nbsp;<i style='color:{crime_color_map.get(crime, 'gray')};'>●</i> {crime}: {count}<br>"
    for crime, count in crime_type_counts.items()
])
legend_html = f"""
 <div style="position: fixed;
 bottom: 50px; left: 50px; width: 200px; height: auto;
 border:2px solid grey; z-index:9999; font-size:14px;
 background-color:white; padding: 10px;">
 <b>2024 RF Predicted Crime Types</b><br>
 {legend_items}
 </div>
"""
m.get_root().html.add_child(folium.Element(legend_html))
m.save("2024_RF_Crime_Heat_Map_by_Zip_Code.html")



In [13]:
# === NEURAL NETWORK PIPELINE ===
# Layer sizes: the input layer is derived from the feature list and the output
# layer from the number of indexed crime-type labels, so the network stays
# consistent if feature_cols changes (previously the input width was a
# hard-coded 6 and input_size was unused).
class_count = len(rf_labels)
input_size = len(feature_cols)
hidden_layers = [64, 124, 256, 512, 256, 124, 64]
layers = [input_size, *hidden_layers, class_count]

mlp = MultilayerPerceptronClassifier(
    labelCol="Crime_Label",
    featuresCol="features",
    maxIter=100,
    layers=layers,
    blockSize=128,
    seed=42,
)
# Reuses the label indexer and feature assembler defined for the RF pipeline
pipeline_mlp = Pipeline(stages=[label_indexer, assembler, mlp])
model_mlp = pipeline_mlp.fit(train_df)
predictions_mlp = model_mlp.transform(predict_df)

# Convert MLP predictions to labels
# The fitted StringIndexer (stage 0) holds the index -> label mapping.
mlp_labels = model_mlp.stages[0].labels
label_udf_mlp = udf(lambda i: mlp_labels[int(i)], StringType())
predictions_mlp = predictions_mlp.withColumn("Predicted_Crime_Type_MLP", label_udf_mlp(col("prediction")))

# Export MLP predictions (features, zip code, predicted label) to CSV
predict_mlp_pd = predictions_mlp.select(
    "Lat", "Lon", "Month", "Hour", "Day_of_Week", "Bus_Stop_Distance",
    "Zip Code", "Predicted_Crime_Type_MLP",
).toPandas()
predict_mlp_pd.to_csv("predicted_crimes_2024_mlp.csv", index=False)
print("Neural Network predictions saved to predicted_crimes_2024_mlp.csv")


25/04/28 01:11:33 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB
[Stage 181:>                                                        (0 + 1) / 1]

Neural Network predictions saved to predicted_crimes_2024_mlp.csv


                                                                                

In [14]:
# === KMEANS CLUSTERING ===
# Cluster the 2024 prediction rows by location and draw the clusters on a map.
n_clusters = 16  # single source of truth for the model and the legend

kmeans_df = predictions_rf.select("Lat", "Lon").dropna()
kmeans_assembler = VectorAssembler(inputCols=["Lat", "Lon"], outputCol="features")
kmeans_features = kmeans_assembler.transform(kmeans_df)

kmeans = KMeans(k=n_clusters, seed=42, featuresCol="features", predictionCol="Cluster")
kmeans_model = kmeans.fit(kmeans_features)
kmeans_result = kmeans_model.transform(kmeans_features).select("Lat", "Lon", "Cluster")
kmeans_pd = kmeans_result.toPandas()

# NOTE(review): silhouette_score is computed over every row here; sklearn's
# `sample_size` argument can bound the cost if the dataset grows.
sil_score = silhouette_score(kmeans_pd[["Lat", "Lon"]], kmeans_pd["Cluster"])

cluster_colors = [
    'red', 'green', 'blue', 'orange', 'purple', 'brown', 'pink', 'gray',
    'olive', 'cyan', 'magenta', 'lime', 'yellow', 'navy', 'maroon', 'salmon']

kmeans_map = folium.Map(location=[kmeans_pd['Lat'].mean(), kmeans_pd['Lon'].mean()], zoom_start=11)

# One circle per point, coloured by cluster id (palette wraps if k exceeds it)
for lat, lon, cluster_id in zip(kmeans_pd['Lat'], kmeans_pd['Lon'], kmeans_pd['Cluster']):
    folium.CircleMarker(
        location=[lat, lon],
        radius=3,
        color=cluster_colors[int(cluster_id) % len(cluster_colors)],
        fill=True,
        fill_opacity=0.6
    ).add_to(kmeans_map)

# Legend uses the same modulo colour lookup as the markers, so it cannot raise
# IndexError or disagree with the map when n_clusters changes. Cluster ids are
# 0-based; the legend shows them 1-based.
legend_items = ''.join([
    f"&nbsp;<i style='color:{cluster_colors[i % len(cluster_colors)]};'>●</i> Cluster {i+1}<br>"
    for i in range(n_clusters)
])
legend_html = f"""
 <div style="position: fixed;
 bottom: 50px; left: 50px; width: 200px; height: auto;
 border:2px solid grey; z-index:9999; font-size:14px;
 background-color:white; padding: 10px;">
 <b>K-Means Clusters (2024)</b><br>
 {legend_items}
 <br><b>Silhouette Score:</b> {sil_score:.3f}
 </div>
"""
kmeans_map.get_root().html.add_child(folium.Element(legend_html))
kmeans_map.save("2024_KMeans_Crime_Clusters.html")


In [15]:

# === MLP PREDICTION MAPS ===
# Render one folium map per month of MLP predictions, then bundle the HTML files.
mlp_output_dir = "mlp_crime_prediction_maps"
os.makedirs(mlp_output_dir, exist_ok=True)
mlp_map_files = []

# Table of prediction counts: one row per month, one column per predicted type
mlp_monthly_counts = predict_mlp_pd.groupby(["Month", "Predicted_Crime_Type_MLP"]).size().unstack(fill_value=0)

for month in sorted(predict_mlp_pd["Month"].unique()):
    month_df = predict_mlp_pd[predict_mlp_pd["Month"] == month]
    crime_map = folium.Map(location=[month_df["Lat"].mean(), month_df["Lon"].mean()], zoom_start=11)

    # One circle per predicted crime, coloured by predicted type. The zip-code
    # tooltip matches the Random Forest maps so both map sets read the same way.
    for _, row in month_df.iterrows():
        folium.CircleMarker(
            location=[row["Lat"], row["Lon"]],
            tooltip=f"Zip Code: {row['Zip Code']}",
            radius=3,
            color=crime_color_map.get(row["Predicted_Crime_Type_MLP"], 'gray'),
            fill=True,
            fill_opacity=0.6
        ).add_to(crime_map)

    # Legend: one line per crime type with this month's count
    crime_counts = mlp_monthly_counts.loc[month]
    legend_items = ''.join([
        f"&nbsp;<i style='color:{crime_color_map.get(crime, 'gray')};'>●</i> {crime}: {count}<br>"
        for crime, count in crime_counts.items()
    ])
    legend_html = f"""
    <div style="position: fixed;
    bottom: 50px; left: 50px; width: 180px; height: auto;
    border:2px solid grey; z-index:9999; font-size:14px;
    background-color:white; padding: 5px;">
    &nbsp;<b>MLP Crime Prediction - {month}</b><br>
    {legend_items}
    </div>
    """
    crime_map.get_root().html.add_child(folium.Element(legend_html))
    map_file = f"{mlp_output_dir}/MLP_Crime_Map_Month_{month}.html"
    crime_map.save(map_file)
    mlp_map_files.append(map_file)

# Zip MLP map results (file names only, no directory prefix)
with zipfile.ZipFile("MLP_Crime_Prediction_Maps_2024.zip", 'w') as zipf:
    for file in mlp_map_files:
        zipf.write(file, arcname=os.path.basename(file))
print("Zipped MLP maps: MLP_Crime_Prediction_Maps_2024.zip")



Zipped MLP maps: MLP_Crime_Prediction_Maps_2024.zip
