# Phase 2: Data Acquisition and Exploration

Syracuse Open Data Civic Project

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Show every column when a DataFrame is displayed (None removes the column cap).
pd.options.display.max_columns = None


In [None]:

# Load the raw code-violations export. The path is relative, so it resolves
# against the current working directory of the running kernel.
raw_csv_path = "../data/raw/Code_Violations.csv"
df = pd.read_csv(raw_csv_path)

# (rows, columns) of the raw table.
df.shape


In [None]:

# Preview the first five rows to eyeball column names and value formats.
df.head(n=5)


In [None]:

# Print column dtypes and non-null counts for a quick schema / completeness check.
df.info()


In [None]:

# Fraction of missing values per column (mean of the boolean NA mask),
# listed from the most incomplete column to the least.
missing_share = df.isna().mean()
missing_share.sort_values(ascending=False)


In [None]:

# Parse the three date columns. errors="coerce" turns unparseable or missing
# entries into NaT instead of raising, so everything derived from these
# columns must tolerate missing dates.
df["open_date_dt"] = pd.to_datetime(df["open_date"], errors="coerce")
df["violation_date_dt"] = pd.to_datetime(df["violation_date"], errors="coerce")
df["status_date_dt"] = pd.to_datetime(df["status_date"], errors="coerce")

# Calendar parts of the open date. The nullable Int64 dtype keeps them as
# integers even when some open dates are NaT; the plain .dt accessor result
# falls back to float in that case, which shows up as "2019.0" in group
# labels and chart ticks.
df["Year"] = df["open_date_dt"].dt.year.astype("Int64")
df["Month"] = df["open_date_dt"].dt.month.astype("Int64")

# A case is flagged open only when a status is recorded and it is not
# "closed" (case-insensitive). NaN != "closed" evaluates to True, so without
# the notna() guard rows with a missing status would be counted as open.
status_lower = df["status_type_name"].str.lower()
df["Is_Open"] = status_lower.notna() & status_lower.ne("closed")


In [None]:

# Normalise the free-text labels so casing / padding variants group together.
#
# astype(str) converts missing values into the literal text "nan", which
# would then be counted as a real category ("nan" / "Nan") by value_counts()
# and could appear in the top-N charts. The .where(...notna()) step puts the
# missing marker back so those rows are excluded from counts by default.
violation_raw = df["violation"]
df["Violation_Clean"] = (
    violation_raw.astype(str).str.lower().str.strip()
    .where(violation_raw.notna())
)

neighborhood_raw = df["Neighborhood"]
df["Neighborhood_Clean"] = (
    neighborhood_raw.astype(str).str.strip().str.title()
    .where(neighborhood_raw.notna())
)


In [None]:

# Whole days between open_date and status_date for every row.
# NOTE(review): presumably status_date is the closing date for closed cases;
# confirm against the dataset's data dictionary.
elapsed = df["status_date_dt"] - df["open_date_dt"]
df["Resolution_Days"] = elapsed.dt.days

# Restrict to rows that have a duration, whose duration is not negative
# (status date before open date), and whose status is "closed".
has_duration = df["Resolution_Days"].notna()
non_negative = df["Resolution_Days"] >= 0
is_closed = df["status_type_name"].str.lower() == "closed"
closed = df[has_duration & non_negative & is_closed]

# Summary statistics of resolution time for the closed subset.
closed["Resolution_Days"].describe()


In [None]:

# Number of records (non-null ObjectId values) per open-date year, as a bar chart.
violations_per_year = df.groupby("Year")["ObjectId"].count()
violations_per_year.plot.bar(
    figsize=(7, 4),
    ylabel="Number of Violations",
    title="Housing Code Violations per Year",
)
plt.show()


In [None]:

# The 15 most frequent cleaned violation labels.
top_violations = df["Violation_Clean"].value_counts().head(15)

# Sorted ascending before plotting so the most frequent label ends up at the
# top of the horizontal bar chart (barh draws the first row at the bottom).
top_violations.sort_values().plot.barh(
    figsize=(8, 6),
    xlabel="Count",
    title="Top 15 Housing Code Violations",
)
plt.show()


In [None]:

# Record counts per (year, status); unstack() pivots the status values into
# columns so each status becomes one segment of the stacked bars.
status_by_year = (
    df.groupby(["Year", "status_type_name"])["ObjectId"]
    .count()
    .unstack()
)
ax = status_by_year.plot(
    kind="bar",
    stacked=True,
    figsize=(8, 5),
    title="Open vs Closed Violations by Year",
)
ax.set_ylabel("Count")
plt.show()


In [None]:

# The ten neighborhoods with the most records, already in descending order
# because value_counts() sorts by frequency.
top_neighborhoods = df["Neighborhood_Clean"].value_counts().head(10)
top_neighborhoods.plot.bar(
    figsize=(8, 5),
    ylabel="Count",
    title="Top 10 Neighborhoods by Violation Count",
)
plt.show()


In [None]:

# Histogram of resolution time (in days) for the closed-case subset.
resolution_days = closed["Resolution_Days"]
ax = resolution_days.plot.hist(
    bins=25,
    edgecolor="black",
    figsize=(7, 5),
    title="Resolution Time Distribution (Closed Cases)",
)
ax.set_xlabel("Resolution Days")
plt.show()


In [None]:

# Hand-written summary table: one row per LLM-suggested claim, whether the
# exploration supported it, and which result serves as evidence.
# Each tuple is (LLM_Claim, Validated, Evidence).
claim_rows = [
    (
        "Some violation types take longer to resolve",
        "Yes",
        "Median Resolution_Days differs across violation categories",
    ),
    (
        "Recent years have more open cases",
        "Yes",
        "Higher proportion of Is_Open=True in recent years",
    ),
    (
        "Violation counts vary by neighborhood",
        "Yes",
        "Neighborhood bar chart shows uneven distribution",
    ),
]

validation = pd.DataFrame(
    claim_rows,
    columns=["LLM_Claim", "Validated", "Evidence"],
)

validation
