In [None]:
import pandas as pd
import glob
import os
import seaborn as sns
import matplotlib.pyplot as plt

Note: Data is from https://opendata.dc.gov/

In [None]:
def load_parking_violation_data(data_folder):
    """Load every CSV file in ``data_folder`` into one DataFrame.

    The folder listing is printed first, then each ``*.csv`` file is read in
    sorted path order so the row order of the result is the same on every
    run. A file that fails to load is reported and skipped instead of
    aborting the whole load.

    Parameters
    ----------
    data_folder : str
        Folder that contains the CSV files.

    Returns
    -------
    pandas.DataFrame
        All successfully loaded files concatenated with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no CSV file could be loaded from ``data_folder``.
    """
    files = os.listdir(data_folder)
    print(files)
    # glob gives no ordering guarantee; sort so the concatenation is reproducible.
    all_csvs = sorted(glob.glob(os.path.join(data_folder, "*.csv")))
    dfs = []
    for c in all_csvs:
        try:
            frame = pd.read_csv(c)
        except Exception as e:
            # Best effort: report the unreadable file and keep going.
            print(f"error opening {c}: {e}")
        else:
            dfs.append(frame)
            print(f"successfully loaded {c}")
    if not dfs:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError(f"No CSV files could be loaded from {data_folder!r}")
    return pd.concat(dfs, ignore_index=True)

In [None]:
# Read all raw violation CSVs from the shared data folder into one frame.
raw_data_folder = "../Data/"
df = load_parking_violation_data(raw_data_folder)

In [None]:
# Number of rows loaded.
df.shape[0]

In [None]:
# Non-null value count for each column.
non_null_counts = df.count()
print(non_null_counts)

In [None]:
# How many rows are exact repeats of an earlier row.
duplicate_rows = df.duplicated()
duplicate_rows.sum()

In [None]:
# Show the inferred column types. print() is required here: a notebook only
# displays a cell's last expression, and this cell ends with an assignment.
print(df.dtypes)
# NOTE(review): astype('int64') raises if ISSUE_TIME has missing values — confirm the raw data has none.
df['ISSUE_TIME'] = df['ISSUE_TIME'].astype('int64')

In [None]:
# Columns that contain at least one null, with their null counts.
null_counts = df.isnull().sum()
null_counts[null_counts > 0]

In [None]:
# Parse ISSUE_DATE (unparseable values become NaT), then drop any timezone
# information so the column holds naive timestamps.
parsed_issue_date = pd.to_datetime(df['ISSUE_DATE'], errors='coerce')
df['ISSUE_DATE'] = parsed_issue_date.dt.tz_localize(None)

In [None]:
# Left-pad ISSUE_TIME to four digits and parse it as HHMM; values that do not
# form a valid time become NaT.
time_digits = df['ISSUE_TIME'].astype(str).str.zfill(4)
df['parsed_time'] = pd.to_datetime(time_digits, format='%H%M', errors='coerce')

clock = df['parsed_time'].dt
df['hour'] = clock.hour
df['minute'] = clock.minute

In [None]:
# Keep only rows whose date and time both parsed.
has_valid_timestamp = df['ISSUE_DATE'].notna() & df['hour'].notna() & df['minute'].notna()
# .copy() makes the filtered result an independent frame, so the column
# assignments and in-place drops in later cells do not raise
# SettingWithCopyWarning.
df = df[has_valid_timestamp].copy()

In [None]:
# Combine the calendar day with the parsed hour and minute into one timestamp.
issue_day = df['ISSUE_DATE'].dt.floor('D')
hour_offset = pd.to_timedelta(df['hour'], unit='h')
minute_offset = pd.to_timedelta(df['minute'], unit='m')
df['issue_datetime'] = issue_day + hour_offset + minute_offset

In [None]:
# The raw date/time fields and parsing helpers are all folded into issue_datetime now.
superseded_columns = ['ISSUE_DATE', 'ISSUE_TIME', 'parsed_time', 'hour', 'minute']
df.drop(columns=superseded_columns, inplace=True)

In [None]:
# These columns are empty for almost all rows, so remove them.
mostly_empty_columns = ['VEHICLE_TYPE', 'DISPOSITION_DESC', 'DISPOSITION_DATE']
df.drop(columns=mostly_empty_columns, inplace=True)

In [None]:
# Derive calendar features from issue_datetime for later analysis.
issued = df['issue_datetime'].dt

df['year'] = issued.year
df['month'] = issued.month
df['day'] = issued.day
df['day_of_week'] = issued.day_name()
df['hour'] = issued.hour

weekend_days = ['Saturday', 'Sunday']
df['is_weekend'] = df['day_of_week'].isin(weekend_days)

In [None]:
# Replace missing procedure descriptions with an explicit placeholder.
proc_desc = df['VIOLATION_PROC_DESC']
df['VIOLATION_PROC_DESC'] = proc_desc.where(proc_desc.notna(), "UNKNOWN")

In [None]:
# Fill missing violation type descriptions in place. Assigning to df['']
# would create a stray empty-named column and leave the real column's nulls
# untouched.
df['VIOLATION_TYPE_DESC'] = df['VIOLATION_TYPE_DESC'].fillna("UNKNOWN")

In [None]:
# Remove the XCOORD / YCOORD / MAR_ID columns.
unused_location_columns = ['XCOORD', 'YCOORD', 'MAR_ID']
df.drop(columns=unused_location_columns, inplace=True)

In [None]:
# Frequency of each MULTI_OWNER_NUMBER value, with missing values counted too.
owner_numbers = df['MULTI_OWNER_NUMBER']
owner_numbers.value_counts(dropna=False)

In [None]:
# Per the dataset metadata: "If number exists, this is referring to either a
# fleet, government, or rental vehicle." Only the presence of a number is
# kept, as a boolean flag, and the raw column is then removed.
owner_number_missing = df['MULTI_OWNER_NUMBER'].isna()
df['is_fleet_gov_or_rental'] = ~owner_number_missing
df.drop(columns=['MULTI_OWNER_NUMBER'], inplace=True)

In [None]:
# Location within DC matters for this analysis, so discard violations that
# lack a latitude or a longitude.
coordinate_columns = ['LATITUDE', 'LONGITUDE']
df = df.dropna(subset=coordinate_columns, how='any')

In [None]:
# Per-column overview: data type, number of nulls, number of distinct values.
column_dtypes = df.dtypes
column_nulls = df.isnull().sum()
column_uniques = df.nunique()

summary = pd.DataFrame({
    'dtype': column_dtypes,
    'nulls': column_nulls,
    'unique_vals': column_uniques,
})
summary

### Initial Visualization

In [None]:
# Bar chart of the number of tickets issued in each calendar month.
issue_month = df['issue_datetime'].dt.to_period('M')
tickets_per_month = df.groupby(issue_month).size()
tickets_per_month.plot(kind='bar', figsize=(14,5))

In [None]:
# Ticket counts per weekday, shown Monday through Sunday.
weekday_order = [
    "Monday",
    "Tuesday",
    "Wednesday",
    "Thursday",
    "Friday",
    "Saturday",
    "Sunday",
]
sns.countplot(data=df, x='day_of_week', order=weekday_order)

In [None]:
# Histogram of the issue hour, split into 24 bins.
issue_hours = df['hour']
sns.histplot(issue_hours, bins=24, kde=False)

In [None]:
# Count tickets per issuing agency and keep the three largest counts
# (value_counts already sorts in descending order).
agency_counts = df['ISSUING_AGENCY_NAME'].value_counts()
top_agencies = agency_counts.iloc[:3]

plt.figure(figsize=(10,6))
agency_ax = sns.barplot(x=top_agencies.index, y=top_agencies.values)

agency_ax.set_title("Tickets Issued by Agency")
agency_ax.set_xlabel("Issuing Agency")
agency_ax.set_ylabel("Number of Tickets")
plt.tight_layout()
plt.show()

In [None]:
# Convert every column name to lowercase.
df.columns = df.columns.map(str.lower)

In [None]:
# Preview the first five rows of the cleaned frame.
df.head(n=5)

In [None]:
# Show any rows that have no issue_datetime.
missing_timestamp = df['issue_datetime'].isna()
df[missing_timestamp]

In [None]:
# Write the cleaned data without the index column. to_csv does not create
# missing parent folders, so make sure the output directory exists first.
clean_csv_path = '../CleanData/cleaned_parking_violations_v2.csv'
os.makedirs(os.path.dirname(clean_csv_path), exist_ok=True)
df.to_csv(clean_csv_path, index=False)