In [None]:
# Run the shared path-setup notebook first via the %run magic.
# NOTE(review): presumably this is what makes `spark_init` importable — confirm.
%run ../notebooks/00_setup_paths.ipynb

from spark_init import init_spark
# Project helper; called with an application name and a driver-memory setting.
# NOTE(review): `spark` is not referenced by any other cell visible in this
# notebook (everything below uses pandas) — confirm the session is still needed.
spark = init_spark("Phase5_UnderReportVICTIM", driver_memory="12g")
# Bare expression: displays the returned object as the cell output.
spark

In [None]:
import os
import warnings
from pathlib import Path

import numpy as np
import pandas as pd

# Silences every warning for the rest of the session (blanket filter).
warnings.filterwarnings("ignore")

# Project root. Defaults to the original local checkout, but can be redirected
# with the CRIME_PROJECT_DIR environment variable so the notebook also runs on
# machines where that absolute path does not exist.
base_dir = Path(os.environ.get("CRIME_PROJECT_DIR", "C:/Users/akaas/crime-projectMain"))
master_path = base_dir / "data_processed" / "incidents_master"

# Sort the part files: glob order depends on the filesystem, and the row order
# of `df` feeds the seeded train/test split further down.
parquet_files = sorted(master_path.glob("*.parquet"))
if not parquet_files:
    # pd.concat on an empty sequence only says "No objects to concatenate";
    # fail early with the directory that was actually searched.
    raise FileNotFoundError(f"No parquet files found in {master_path}")

# Stack all parts into one frame with a fresh 0..n-1 index.
df = pd.concat(
    (pd.read_parquet(part) for part in parquet_files),
    ignore_index=True,
)

df.head()


In [None]:
# Parse the incident timestamp once; values that cannot be parsed become NaT.
incident_ts = pd.to_datetime(df["incident_date"], errors="coerce")
df["incident_date"] = incident_ts

# The hour comes from its own column; gaps are filled with 0 before the float cast.
df["hour"] = df["incident_date_hour"].fillna(0).astype(float)

# Calendar components derived from the parsed timestamp.
df["day_of_week"] = incident_ts.dt.weekday
for part in ("month", "year"):
    df[part] = getattr(incident_ts.dt, part)


In [None]:
def _join_offense_codes(codes):
    """Return the offense codes of one incident as a single space-separated string.

    Accepts lists, tuples and numpy arrays: list-typed parquet columns read
    through pyarrow usually arrive as numpy arrays, which a plain
    ``isinstance(x, list)`` check would reject and turn into an empty text.
    Elements are stringified and ``None`` entries are skipped so a stray
    non-string code does not raise inside ``str.join``. Any other value
    (e.g. a missing cell) yields ``""``.
    """
    if isinstance(codes, (list, tuple, np.ndarray)):
        return " ".join(str(code) for code in codes if code is not None)
    return ""


df["offense_text"] = df["offense_codes"].apply(_join_offense_codes)


In [None]:
# Each flag is "source column, with gaps filled by 0 and cast, is strictly
# greater than a bound": (flag column, source column, cast, bound).
flag_specs = [
    ("has_property_loss", "total_property_value", float, 0),
    ("has_multiple_victims", "num_victims", int, 1),
    ("has_multiple_offenders", "num_offenders", int, 1),
    ("has_arrestee", "num_arrestees", int, 0),
]

for flag_col, source_col, cast, bound in flag_specs:
    df[flag_col] = df[source_col].fillna(0).astype(cast).gt(bound)


In [None]:
# Modus-operandi feature columns, in the order they enter the combined vector.
mo_features = [
    "hour",
    "day_of_week",
    "month",
    "has_property_loss",
    "has_multiple_victims",
    "has_multiple_offenders",
    "has_arrestee",
]

# Independent copy holding the incident id plus the MO features.
df_mo = df.loc[:, ["unique_incident_id", *mo_features]].copy()

# Boolean flags are stored as integers in this copy.
flag_columns = [name for name in mo_features if name.startswith("has_")]
df_mo = df_mo.astype({name: int for name in flag_columns})


In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

# With convert_to_numpy=True, encode() returns one 2-D array holding a
# unit-normalised vector per input text.
offense_embeddings = model.encode(
    df["offense_text"].tolist(),
    convert_to_numpy=True,
    normalize_embeddings=True,
)

# A 2-D array cannot be assigned to a single DataFrame column (pandas raises
# ValueError), so store one 1-D vector per row in an object column instead.
df["offense_embedding"] = list(offense_embeddings)


In [None]:
def combine_vectors(row):
    """Build the full feature vector for one incident row.

    The values of the ``mo_features`` columns are taken from ``row`` as
    floats and the row's ``offense_embedding`` is appended after them.
    Returns a single 1-D numpy array.
    """
    mo_part = row[mo_features].to_numpy(dtype=float)
    embedding = row["offense_embedding"]
    return np.concatenate((mo_part, embedding))

# Row-wise application: each row's MO features and embedding become one vector.
df["combined_vector"] = df.apply(combine_vectors, axis="columns")


In [None]:
pairs = []
WINDOW = 180  # days

df = df.sort_values(["ori", "incident_date"])

# Incidents whose date failed to parse (NaT) are left out of pairing: a NaT
# difference never compares greater than WINDOW, so such rows would otherwise
# be paired regardless of the time window.
dated = df.dropna(subset=["incident_date"])

# Candidate pairs are incidents of the same agency (`ori`) at most WINDOW days
# apart. NOTE: the pair count grows quadratically with the number of incidents
# an agency has inside one window.
for ori, group in dated.groupby("ori"):
    # Pull the two needed columns out once; indexing whole rows inside the
    # double loop would rebuild a Series of every column on each iteration.
    ids = group["unique_incident_id"].tolist()
    dates = group["incident_date"].tolist()
    n_incidents = len(ids)

    for i in range(n_incidents):
        for j in range(i + 1, n_incidents):
            # Dates are ascending within the group, so once one later incident
            # is outside the window every following one is too.
            if (dates[j] - dates[i]).days > WINDOW:
                break

            pairs.append((ids[i], ids[j]))


In [None]:
# Tabulate the candidate pairs; the cell output is the number of pairs.
pairs_df = pd.DataFrame(data=pairs, columns=["id1", "id2"])
pairs_df.shape[0]


In [None]:
# NOTE(review): `pairs_features` (with `cosine_sim` and `time_sim` columns) is
# not created by any cell visible in this notebook — the cell that derives it
# from the candidate pairs appears to be missing; confirm and restore it.

# A pair is labelled as linked (1) when both similarity scores clear their
# thresholds, otherwise 0.
is_similar = pairs_features["cosine_sim"].gt(0.75)
is_close_in_time = pairs_features["time_sim"].gt(0.2)
pairs_features["label"] = (is_similar & is_close_in_time).astype(int)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split

# NOTE(review): `label` is produced by thresholding `cosine_sim` and `time_sim`,
# and both columns remain in X below, so the model can recover the labelling
# rule directly. The scores printed here therefore measure how well that rule
# is reproduced, not real-world linkage quality — confirm this is intended.
X = pairs_features.drop(columns=["id1","id2","label"])
y = pairs_features["label"]

# Stratify so train and test keep the same share of linked pairs; with a
# thresholded label the positive class can be rare, and an unstratified split
# can leave one side with too few positives for a meaningful AUC.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

clf = RandomForestClassifier(
    n_estimators=300,
    max_depth=12,
    random_state=42,
    n_jobs=-1,  # fit trees in parallel; results stay fixed by random_state
)

clf.fit(X_train, y_train)

preds = clf.predict(X_test)
# Probability of the positive class (second column of predict_proba).
probs = clf.predict_proba(X_test)[:,1]

print("Accuracy:", accuracy_score(y_test, preds))
print("AUC:", roc_auc_score(y_test, probs))


In [None]:
# Score every candidate pair (training and test rows alike) with the fitted
# classifier and keep the probability of the positive class.
all_pair_scores = clf.predict_proba(X)
pairs_features["link_probability"] = all_pair_scores[:, 1]

# Persist the scored pairs next to the project root.
results_csv = base_dir / "linkage_results.csv"
pairs_features.to_csv(results_csv, index=False)


In [None]:
import networkx as nx
import matplotlib.pyplot as plt

G = nx.Graph()

# Only pairs with a link probability above 0.7 become edges; the probability
# is stored as the edge weight.
strong_links = pairs_features[pairs_features["link_probability"] > 0.7]
for pair in strong_links.itertuples(index=False):
    G.add_edge(pair.id1, pair.id2, weight=pair.link_probability)

plt.figure(figsize=(12,10))
# spring_layout starts from random positions; a fixed seed makes the figure
# identical on every run.
pos = nx.spring_layout(G, k=0.5, seed=42)

nx.draw_networkx_nodes(G, pos, node_size=30)
nx.draw_networkx_edges(G, pos, alpha=0.5)
plt.title("Crime Linkage Similarity Network (prob > 0.7)")
plt.axis("off")
plt.show()


In [None]:
# Output folder under the project root; created on first use.
out = base_dir.joinpath("crime_linkage_outputs")
out.mkdir(exist_ok=True)

# Scored pairs as CSV.
pairs_csv = out / "linkage_pairs_with_probabilities.csv"
pairs_features.to_csv(pairs_csv, index=False)

# Linkage graph in GEXF format.
network_gexf = out / "crime_linkage_network.gexf"
nx.write_gexf(G, network_gexf)
