# 🧼 Virginia Food Facility Inspection Data Prep
This notebook merges the 2022–2025 (2025 to date) food facility inspection violation datasets, cleans them, and exports the result as CSV and Parquet.

In [2]:
import pandas as pd
from pathlib import Path

# Folder, relative to the notebook, from which the yearly workbooks are read.
data_folder = Path("data")

# Years covered by the merge: 2022 through 2025 inclusive.
years = list(range(2022, 2026))

In [3]:
# Load one workbook per year. Years before 2025 use the bare year in the
# filename; the 2025 workbook is named with a "2025toDate" suffix instead.
dfs = []
for year in years:
    suffix = year if year < 2025 else '2025toDate'
    file = data_folder / f"Food_Facility_Inspection_Violations_{suffix}.xlsx"
    df = pd.read_excel(file)
    # Strip stray whitespace from the headers *before* concatenating.
    # pd.concat aligns frames by column name, so a header that differs only
    # by whitespace between years (e.g. "City " vs "City") would otherwise
    # produce two half-empty columns that collapse into a duplicate name
    # once stripped afterwards.
    df.columns = df.columns.str.strip()
    # Tag every row with the year of the file it was read from.
    df["Year"] = year
    dfs.append(df)

# Merge all yearly frames into one, renumbering the index from 0.
merged_df = pd.concat(dfs, ignore_index=True)
merged_df.head()

Unnamed: 0,healthDistrict,permitName,program,permitType,permitNumber,status,Facility Address,City,Zip,LocalityName,...,category,violationNumber,vnLabel,observation,code,description,class,comments,violationType,Year
0,Alexandria,Thai Herb I,Food Establishment,Mobile Food Unit,-FD-2738,Business Closed,8501 Richmond Hwy,Alexandria,22309,ALEXANDRIA,...,TCS Food Time/Temperature,22,Proper cold holding temperatures.,OUT,3-501.16 (A) (2),"Time/Temperature Control for Safety Food, Cold...",Core,"Observed sliced tomatoes (46), cole slaw (46),...",COS,2022
1,Alexandria,Thai Herb I,Food Establishment,Mobile Food Unit,-FD-2738,Business Closed,8501 Richmond Hwy,Alexandria,22309,ALEXANDRIA,...,TCS Food Time/Temperature,21,Proper hot holding temperatures.,OUT,3-501.16 (A)(1),Time/Temperature Control for Hot Holding,Core,Observed cooked papusa with pork and cheese (5...,COS,2022
2,Alexandria,Thai Herb I,Food Establishment,Mobile Food Unit,-FD-2738,Business Closed,8501 Richmond Hwy,Alexandria,22309,ALEXANDRIA,...,Physical Facilities,50,"Hot & cold water available, adequate pressure.",OUT,5-103.11,Water Capacity,Priority Foundation,The water source and system serving this food ...,COS,2022
3,Alexandria,Thai Herb I,Food Establishment,Mobile Food Unit,-FD-2738,Business Closed,8501 Richmond Hwy,Alexandria,22309,ALEXANDRIA,...,Chemical,28,"Toxic substances properly identified, stored &...",OUT,7-102.11,Toxics - Common Name/working Containers of Toxics,Core,Containers of personal care items do not have ...,COS,2022
4,Alexandria,Thai Herb I,Food Establishment,Mobile Food Unit,-FD-2738,Business Closed,8501 Richmond Hwy,Alexandria,22309,ALEXANDRIA,...,"Utensils, Equipment, and Vending",47,"Food & non-food contact surfaces cleanable, pr...",OUT,4-501.12,Equipment - Cutting Surfaces,Priority Foundation,The cutting board along the cold line are heav...,Violation,2022


In [None]:
# Parse column types. errors='coerce' turns values that cannot be parsed
# into NaT (dates) or NaN (numbers) instead of raising.
merged_df['InspectionDate'] = pd.to_datetime(merged_df['InspectionDate'], errors='coerce')
merged_df['facilityRiskRating'] = pd.to_numeric(merged_df['facilityRiskRating'], errors='coerce')

# Keep only currently permitted facilities. The comparison ignores case and
# surrounding whitespace; rows with a missing status compare as False and
# are dropped.
is_permitted = merged_df['status'].str.strip().str.lower() == 'permitted'

# .copy() makes the filtered result an independent frame, so the column
# assignments in later cells do not raise SettingWithCopyWarning.
merged_df = merged_df[is_permitted].copy()


In [None]:
# Add derived flags and normalise text columns.
#
# Everything is built with a single .assign(), which returns a new frame
# rather than writing column-by-column into a frame that may be a filtered
# slice of the original (the usual source of SettingWithCopyWarning).

# Missing violation types are treated as "no tags" so the flags are plain
# booleans rather than NaN.
violation_tags = merged_df['violationType'].fillna('')

# Remember which ZIPs are genuinely missing: astype(str) would otherwise
# turn them into the literal text "nan".
has_zip = merged_df['Zip'].notna()

merged_df = merged_df.assign(
    # Case-sensitive literal match: a capital "R" appears in both the short
    # "R" tag and the spelled-out "Repeat" tag.
    isRepeat=violation_tags.str.contains('R', regex=False),
    isCorrected=violation_tags.str.contains('COS', regex=False),
    # Exact match only: "Priority Foundation" is NOT counted as priority.
    isPriority=(merged_df['class'].str.strip().str.lower() == 'priority'),
    permitType=merged_df['permitType'].str.strip().str.title(),
    City=merged_df['City'].str.strip().str.title(),
    # Keep the first five characters of the ZIP; missing ZIPs stay missing.
    Zip=merged_df['Zip'].astype(str).str[:5].where(has_zip),
    # Replace the file-based Year with the year of the inspection date.
    Year=merged_df['InspectionDate'].dt.year,
).sort_values(by='InspectionDate')


In [4]:
# Count missing values in each of the key columns.
key_columns = ['InspectionDate', 'violationType', 'permitType']
merged_df[key_columns].isna().sum()


InspectionDate      0
violationType      22
permitType        152
dtype: int64

In [5]:
# Quick summary: the ten most frequent values of a few columns.
# Each entry pairs the heading to print with the column it summarises.
summary_sections = [
    ("Top Violation Types:", 'violationType'),
    ("\nTop ZIP Codes:", 'Zip'),
    ("\nTop Permit Types:", 'permitType'),
]

for heading, column in summary_sections:
    print(heading)
    print(merged_df[column].value_counts().head(10))


Top Violation Types:
violationType
Violation               144794
COS                      89264
V                        37823
COS,Violation            31747
COS,V                    14947
Repeat                   11732
Repeat,Violation          9577
COS,Repeat,Violation      4009
COS,Repeat                3447
R,V                       2703
Name: count, dtype: int64

Top ZIP Codes:
Zip
23320    8351
23502    5821
23233    5679
23294    5086
22314    4885
23451    4812
20147    4378
23230    4194
22401    4087
24060    4003
Name: count, dtype: int64

Top Permit Types:
permitType
Full Service Restaurant              195629
Fast Food                             94478
Educational Facility Food Service     11532
Carry Out                             11493
Mobile Food Unit                       7656
Adult Care Home Service                7132
Child Care Food Service                6030
Health Care Food Facility              5127
Continental Breakfast                  4219
Convenience Store

In [9]:
# Convert all object-type columns to pandas' nullable "string" dtype.
# Plain astype(str) would turn every missing value into the literal text
# "nan"; the "string" dtype keeps missing values as <NA>, so they are
# still recognised as missing afterwards.
object_cols = merged_df.select_dtypes(include='object').columns

# DataFrame.astype with a mapping returns a new frame, so this cell does
# not write into merged_df column-by-column.
merged_df = merged_df.astype({col: "string" for col in object_cols})

In [None]:
# Save as CSV. Mixed-type object columns are written as text, and missing
# values are written as empty fields.
merged_df.to_csv("cleaned_inspections.csv", index=False)

# Object columns can hold mixed Python types, which the Parquet export
# cannot always serialise, so give them one uniform text type first.
# The nullable "string" dtype is used instead of astype(str) so that
# missing values stay missing (<NA>) rather than becoming the text "nan".
object_cols = merged_df.select_dtypes(include='object').columns
merged_df = merged_df.astype({col: "string" for col in object_cols})

# Save as Parquet
merged_df.to_parquet("cleaned_inspections.parquet", index=False)