In [None]:
# Install missingno (imported below as `msno` for the missing-value matrix plots)
# into the environment the kernel is running in.
%pip install missingno

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from scipy.stats import skew
from pathlib import Path

In [None]:
# Notebook-wide configuration: input file location, pandas display width, plot theme.
DATA_PATH = Path("..") / "data" / "nyc_311_2022_to_2025_sample_150k.csv"
pd.options.display.max_columns = 100
sns.set(style="whitegrid")

In [None]:
# Load the raw NYC 311 sample into `df`; every later cell derives from this frame.
df = pd.read_csv(DATA_PATH, low_memory=False)
print(f" Loaded shape: {df.shape}")

# `created_date` is still raw text here (it is only converted to datetime further
# down), so min()/max() on the column itself would compare strings character by
# character rather than chronologically. Parse a temporary Series to report the
# real date range; `df` itself is left untouched.
created_parsed = pd.to_datetime(df["created_date"], errors="coerce")
print("Earliest:", created_parsed.min())
print("Latest:", created_parsed.max())



In [None]:
# Structural overview of the raw frame: first rows, dtypes/null counts, summary stats.
display(df.head())
# info() prints its report itself and returns None, so wrapping it in display()
# only adds a stray "None" to the output.
df.info()
display(df.describe())

In [None]:
# Missingness pattern of the raw data on a random subset of rows.
# The sample size is defined once so the title always matches what is drawn,
# capped at the frame length so small inputs do not raise, and seeded so the
# figure is identical on every run.
sample_size = min(2000, len(df))
msno.matrix(df.sample(sample_size, random_state=42))
plt.title(f"Missing Values Sample ({sample_size} rows)")
plt.show()

In [None]:
# Drop columns that are mostly empty (more than 50% missing) together with a
# hand-picked list of columns that are removed regardless of completeness.
from matplotlib import axis  # NOTE(review): not used by any visible cell - confirm before removing


# Share of missing values above which a column is dropped automatically.
threshold = 0.5
missing_ratio = df.isnull().mean()
to_drop = missing_ratio[missing_ratio > threshold].index.tolist()

# Columns removed by hand.
COLUMNS_TO_DROP = [
    "unique_key",
    "incident_address",
    "intersection_street_1",
    "intersection_street_2",
    "street_name",
    "cross_street_1",
    "cross_street_2",
    "landmark",
    "address_type",
    "x_coordinate_state_plane",
    "y_coordinate_state_plane",
    "bbl",
    "park_facility_name",
    "park_borough",
    "vehicle_type",
    "taxi_company_borough",
    "taxi_pick_up_location",
    "bridge_highway_name",
    "bridge_highway_direction",
    "road_ramp",
    "bridge_highway_segment",
    "open_data_channel_type",
    "due_date",
    "resolution_action_updated_date",
    "resolution_description",
    "community_board",
    "location",
    "city",
]

# A hand-picked column can also exceed the missing threshold; append only the
# ones not already listed so the final list (and its count) has no duplicates.
threshold_only = [col for col in to_drop if col not in COLUMNS_TO_DROP]
COLUMNS_TO_DROP.extend(threshold_only)
print(COLUMNS_TO_DROP)
print(
    f"Dropping {len(COLUMNS_TO_DROP)} columns in total; "
    f"{len(to_drop)} exceed the {threshold:.0%} missing threshold: {to_drop}"
)

df_cleaned = df.drop(columns=COLUMNS_TO_DROP)


In [None]:
# Missingness pattern after the column drop, on a random subset of rows.
# Title is derived from the actual sample size (it previously said 1000 while
# 2000 rows were drawn) and the sample is seeded for reproducible figures.
sample_size = min(2000, len(df_cleaned))
msno.matrix(df_cleaned.sample(sample_size, random_state=42))
plt.title(f"Missing Values Sample ({sample_size} rows)")
plt.show()

In [None]:
# Number of requests that have no closing timestamp.
df_cleaned["closed_date"].isnull().sum()

In [None]:
# Ten complaint types that most often lack a closing timestamp.
df.loc[df["closed_date"].isna(), "complaint_type"].value_counts().head(10)


In [None]:
# Ten agencies that most often have requests without a closing timestamp.
df.loc[df["closed_date"].isna(), "agency"].value_counts().head(10)

In [None]:
# Keep only requests that have a closing timestamp.
# The explicit copy makes `cleaned_df` an independent frame, so the column
# assignments in the following cells never write into (or warn about) a
# derived view of `df_cleaned`.
cleaned_df = df_cleaned.dropna(subset=["closed_date"]).copy()


In [None]:
# Missingness pattern after removing rows without a closing timestamp.
# Plot `cleaned_df` (the frame produced by that row drop) - plotting
# `df_cleaned` here would just repeat the earlier figure. The title is built
# from the real sample size and the sample is seeded for reproducibility.
sample_size = min(2000, len(cleaned_df))
msno.matrix(cleaned_df.sample(sample_size, random_state=42))
plt.title(f"Missing Values Sample ({sample_size} rows)")
plt.show()

In [None]:
# Peek at the first five rows of the row-filtered frame.
cleaned_df.head(n=5)

In [None]:
# Missingness pattern across every remaining row (no sampling).
# Uses `cleaned_df`, the frame the surrounding cells inspect, and a title that
# reflects that the whole frame is drawn rather than a 1000-row sample.
msno.matrix(cleaned_df)

plt.title(f"Missing Values (all {len(cleaned_df)} rows)")
plt.show()


In [None]:
# Per-column count of missing values still present.
cleaned_df.isnull().sum()

In [None]:
# Fill missing ZIP codes with the single most frequent ZIP in the data.
zip_series = cleaned_df["incident_zip"]
zip_mode = zip_series.mode()[0]
cleaned_df.loc[:, "incident_zip"] = zip_series.fillna(value=zip_mode)

In [None]:
def _fill_with_group_mode(values):
    """Fill gaps in one group's values with that group's most frequent value.

    Falls back to the literal "Missing" when the group has no non-null value
    to take a mode from.
    """
    modes = values.mode()
    replacement = "Missing" if modes.empty else modes.iloc[0]
    return values.fillna(replacement)


# Impute `descriptor` separately within each complaint type.
cleaned_df.loc[:, "descriptor"] = (
    cleaned_df.groupby("complaint_type")["descriptor"].transform(_fill_with_group_mode)
)

In [None]:
# Discard rows that have no latitude.
# Boolean filtering yields a derived frame; copy it so the in-place column
# assignments made in later cells operate on an independent object instead of
# triggering SettingWithCopyWarning.
cleaned_df = cleaned_df[cleaned_df["latitude"].notna()].copy()


In [None]:
# Re-check the per-column missing counts after the imputation steps.
cleaned_df.isnull().sum()

In [None]:
# Missingness pattern of the full frame after imputation and the latitude filter.
# The whole frame is plotted, so the title reports the real row count instead
# of claiming a 1000-row sample.
msno.matrix(cleaned_df)

plt.title(f"Missing Values (all {len(cleaned_df)} rows)")
plt.show()

In [None]:
# Fraction of rows with no location type recorded.
cleaned_df["location_type"].isnull().mean()


In [None]:
# Treat an absent location type as its own explicit category.
location_type_filled = cleaned_df["location_type"].fillna(value="Missing")
cleaned_df.loc[:, "location_type"] = location_type_filled


In [None]:
# Final missingness check over the full frame after all imputation steps.
# The whole frame is plotted, so the title reports the real row count instead
# of claiming a 1000-row sample.
msno.matrix(cleaned_df)

plt.title(f"Missing Values (all {len(cleaned_df)} rows)")
plt.show()

In [None]:
# Build the target variable: hours elapsed between a request's creation and closure.

# Parse both timestamp columns; values that cannot be parsed become NaT
# (errors="coerce") instead of raising.
for date_col in ("created_date", "closed_date"):
    cleaned_df[date_col] = pd.to_datetime(cleaned_df[date_col], errors="coerce")

SECONDS_PER_HOUR = 3600
elapsed = cleaned_df["closed_date"] - cleaned_df["created_date"]
cleaned_df["resolution_time_hrs"] = elapsed.dt.total_seconds() / SECONDS_PER_HOUR

In [None]:
# How many requests appear to have been closed before they were created?
cleaned_df["resolution_time_hrs"].lt(0).sum()

In [None]:
# Keep only rows with a non-negative resolution time. Rows whose duration is
# NaN (an unparseable timestamp) fail the comparison and are removed as well.
# Copy the filtered result so later column assignments act on an independent
# frame rather than a slice (avoids SettingWithCopyWarning).
cleaned_df = cleaned_df[cleaned_df["resolution_time_hrs"] >= 0].copy()


In [None]:
# Summary statistics of the target variable (hours).
cleaned_df.loc[:, "resolution_time_hrs"].describe()

In [None]:
# Distribution of the raw resolution time. Title and axis labels are set so the
# figure can be read on its own when the notebook is skimmed.
ax = cleaned_df["resolution_time_hrs"].hist(bins=50)
ax.set_title("Resolution Time (hrs)")
ax.set_xlabel("Resolution time (hours)")
ax.set_ylabel("Number of Complaints")

In [None]:
import numpy as np

# log1p computes log(1 + x). Negative durations were filtered out in an earlier
# cell, so every remaining value is a valid input.
resolution_hours = cleaned_df["resolution_time_hrs"]
cleaned_df.loc[:, "log_resolution_time_hrs"] = np.log1p(resolution_hours)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Histogram of the log-transformed target with a kernel density overlay.
log_hours = cleaned_df["log_resolution_time_hrs"]
sns.histplot(log_hours, kde=True)
plt.title("Log-transformed Resolution Time (hrs)")
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# How many complaint types to show. Defined once and used for both the
# selection and the title, so the two can no longer disagree (the code used to
# be labelled "top 20" while selecting and titling 25).
n_top_complaints = 25

# Set style
sns.set(style="whitegrid")
plt.figure(figsize=(10, 8))

# Most frequent complaint types, sorted ascending by count for the horizontal bars.
top_complaints = (
    cleaned_df["complaint_type"]
    .value_counts()
    .head(n_top_complaints)
    .sort_values(ascending=True)
)
# Previous (misleading) name, kept so any cell still referring to it keeps working.
top_20_complaints = top_complaints

# Plot
sns.barplot(
    x=top_complaints.values,
    y=top_complaints.index,
    palette="viridis"
)

plt.title(f"Top {n_top_complaints} Complaint Types in NYC 311")
plt.xlabel("Count")
plt.ylabel("Complaint Type")
plt.tight_layout()
plt.show()


In [None]:
# Frequency of every complaint type; print the 30 most common ones.
complaint_counts = cleaned_df["complaint_type"].value_counts()
print(complaint_counts.iloc[:30])


In [None]:
# Collapse the many raw complaint types into a handful of broad groups.
# Any complaint type not listed here ends up in "Other".
mapping = {
    # Noise-related
    "Noise - Residential": "Noise",
    "Noise - Commercial": "Noise",
    "Noise - Street/Sidewalk": "Noise",
    "Noise - Vehicle": "Noise",
    "Noise": "Noise",

    # Parking-related
    "Illegal Parking": "Parking",
    "Blocked Driveway": "Parking",

    # Water/Plumbing
    "Water Leak": "Plumbing",
    "Water System": "Plumbing",
    # NOTE(review): paint/plaster is grouped with plumbing here - confirm this is
    # intended rather than "Maintenance" (where DOOR/WINDOW goes).
    "PAINT/PLASTER": "Plumbing",
    "PLUMBING": "Plumbing",

    # Heat and Hot Water
    "HEAT/HOT WATER": "Heat/Water",

    # General Sanitation
    "UNSANITARY CONDITION": "Sanitation",
    "Dirty Condition": "Sanitation",

    # Street/Traffic
    "Street Condition": "Street",
    "Traffic Signal Condition": "Street",
    "Street Light Condition": "Street",
    "Derelict Vehicles": "Street",
    "Abandoned Vehicle": "Street",

    # Misc
    "Missed Collection": "Sanitation",
    "Encampment": "Public Safety",
    "DOOR/WINDOW": "Maintenance",
    "Snow or Ice": "Weather",
    "General": "Other"
}

# The keys above mix upper-case and title-case labels, and an exact-match lookup
# silently sends any case variant (e.g. "WATER LEAK" vs "Water Leak") to
# "Other". Compare on a whitespace-stripped, case-folded form instead; every
# value that matched exactly before still matches.
normalized_mapping = {key.casefold(): group for key, group in mapping.items()}

cleaned_df["complaint_grouped"] = (
    cleaned_df["complaint_type"]
    .str.strip()
    .str.casefold()
    .map(normalized_mapping)
    .fillna("Other")
)



In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of the grouped complaint categories, most frequent first.
grouped_order = cleaned_df["complaint_grouped"].value_counts().index

plt.figure(figsize=(12,6))
sns.countplot(y="complaint_grouped", data=cleaned_df, order=grouped_order)
plt.title("Grouped Complaint Types")
plt.show()


In [None]:
import seaborn as sns


# Show the borough distribution, then drop the placeholder "Unspecified" rows.
print(cleaned_df["borough"].value_counts())
# Copy after filtering so later column assignments act on an independent frame.
cleaned_df = cleaned_df[cleaned_df["borough"] != "Unspecified"].copy()
# Name the column explicitly: a Series passed positionally is bound to seaborn's
# `data` parameter in current releases, not to `x`, so it is no longer
# interpreted as the variable to count.
sns.countplot(x="borough", data=cleaned_df)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Complaint volume per agency, followed by the exact counts as the cell output.
agency_ax = sns.countplot(data=cleaned_df, x="agency")
plt.xticks(rotation=45)
agency_ax.set_title("Count of Complaints by Agency")
plt.show()

cleaned_df.loc[:, "agency"].value_counts()

In [None]:
# Threshold: keep agencies with at least 7000 complaints (the comment used to
# say 5000, which did not match the value actually applied).
threshold = 7000
agency_counts = cleaned_df["agency"].value_counts()
agencies_to_keep = agency_counts[agency_counts >= threshold].index

# Replace low-frequency agencies with 'Other'. Vectorized where/isin gives the
# same result as the row-by-row membership test without a Python-level loop.
is_frequent_agency = cleaned_df["agency"].isin(agencies_to_keep)
cleaned_df["agency"] = cleaned_df["agency"].where(is_frequent_agency, "Other")


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Same agency chart as before, now drawn after rare agencies were pooled into "Other".
grouped_agency_ax = sns.countplot(data=cleaned_df, x="agency")
plt.xticks(rotation=45)
grouped_agency_ax.set_title("Count of Complaints by Agency")
plt.show()

cleaned_df.loc[:, "agency"].value_counts()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# The 15 ZIP codes with the most complaints, in descending order of frequency.
zip_counts = cleaned_df["incident_zip"].value_counts()
top_zips = zip_counts.nlargest(15).index

# Restrict the frame to rows from those ZIP codes.
in_top_zips = cleaned_df["incident_zip"].isin(top_zips)
filtered_df = cleaned_df[in_top_zips]

plt.figure(figsize=(12, 6))
zip_ax = sns.countplot(data=filtered_df, x="incident_zip", order=top_zips)
plt.xticks(rotation=45)
zip_ax.set_title("Top 15 Most Frequent Incident Zipcodes")
zip_ax.set_xlabel("Zip Code")
zip_ax.set_ylabel("Count")
plt.tight_layout()
plt.show()


In [None]:
# Location types seen at least 1000 times keep their own label; the rest become "Other".
location_type_counts = cleaned_df["location_type"].value_counts()
top_types = location_type_counts[location_type_counts >= 1000].index

is_top_type = cleaned_df["location_type"].isin(top_types)
cleaned_df["location_type_grouped"] = cleaned_df["location_type"].where(is_top_type, "Other")





In [None]:
# Horizontal bar chart of the grouped location types, most frequent first.
location_order = cleaned_df["location_type_grouped"].value_counts().index

plt.figure(figsize=(10, 6))
location_ax = sns.countplot(data=cleaned_df, y="location_type_grouped", order=location_order)
location_ax.set_title("Grouped Location Type Distribution")
location_ax.set_xlabel("Count")
location_ax.set_ylabel("Location Type")
plt.tight_layout()
plt.show()


In [None]:
# Derive calendar features (year, month number, year-month period) from the creation timestamp.
created_ts = pd.to_datetime(cleaned_df['created_date'])
cleaned_df['created_date'] = created_ts
cleaned_df['year'] = created_ts.dt.year
cleaned_df['month'] = created_ts.dt.month
cleaned_df['year_month'] = created_ts.dt.to_period('M')



In [None]:
# Complaints per calendar month, in chronological order.
monthly_trend = cleaned_df['year_month'].value_counts().sort_index()
month_labels = monthly_trend.index.astype(str)

plt.figure(figsize=(14,5))
trend_ax = sns.lineplot(x=month_labels, y=monthly_trend.values, marker='o')
trend_ax.set_title("Monthly Complaint Volume Over Time")
trend_ax.set_xlabel("Year-Month")
trend_ax.set_ylabel("Number of Complaints")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()



In [None]:
# Seasonality view: complaints per month number, pooled over all years.
plt.figure(figsize=(10,5))
month_ax = sns.countplot(x='month', data=cleaned_df, order=range(1,13))
month_ax.set_title("Complaint Count by Month (Across Years)")
month_ax.set_xlabel("Month")
month_ax.set_ylabel("Number of Complaints")
plt.show()


In [None]:
# Complaints per year, as a table and as a bar chart.
# A bare expression in the middle of a cell is evaluated and then discarded, so
# the counts are printed explicitly to make them show up in the output.
print(cleaned_df['created_date'].dt.year.value_counts())

plt.figure(figsize=(8,4))
sns.countplot(data=cleaned_df, x='year')
plt.title("Complaint Count by Year")
plt.xlabel("Year")
plt.ylabel("Number of Complaints")
plt.show()




## Bivariate Analysis


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Spread of resolution time (hours) within each grouped complaint category.
plt.figure(figsize=(12, 6))
complaint_box_ax = sns.boxplot(data=cleaned_df, x="complaint_grouped", y="resolution_time_hrs")
plt.xticks(rotation=45)
complaint_box_ax.set_title("Resolution Time by Complaint Type")
plt.show()


In [None]:
# Spread of resolution time (hours) within each borough.
plt.figure(figsize=(8, 5))
borough_box_ax = sns.boxplot(data=cleaned_df, x="borough", y="resolution_time_hrs")
borough_box_ax.set_title("Resolution Time by Borough")
plt.show()


In [None]:
# Build `agency_grouped`: agencies with more than 10000 complaints keep their
# own label, every other agency is pooled into 'Other'.
agency_counts = cleaned_df['agency'].value_counts()
top_agencies = agency_counts[agency_counts > 10000].index  # adjust threshold if needed

is_top_agency = cleaned_df['agency'].isin(top_agencies)
cleaned_df['agency_grouped'] = cleaned_df['agency'].where(is_top_agency, 'Other')



In [None]:
# Spread of resolution time (hours) within each grouped agency.
plt.figure(figsize=(12, 5))
agency_box_ax = sns.boxplot(data=cleaned_df, x="agency_grouped", y="resolution_time_hrs")
plt.xticks(rotation=45)
agency_box_ax.set_title("Resolution Time by Agency")
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Select every numeric column. Listing only "float64"/"int64" skips numeric
# columns stored at other widths - e.g. the `year`/`month` columns produced by
# the `.dt` accessor, which newer pandas versions return as int32 - so they
# would be missing from the correlation matrix.
numeric_cols = cleaned_df.select_dtypes(include="number").columns

# Pairwise correlation between the numeric features.
corr_matrix = cleaned_df[numeric_cols].corr()

# Plot heatmap
plt.figure(figsize=(6, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", square=True, cbar_kws={"shrink": 0.75})
plt.title("Heatmap of Correlation Between Numeric Features")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
