<a href="https://colab.research.google.com/github/ipeirotis-org/datasets/blob/main/Restaurant_Inspections/DOH_Restaurant_Inspections.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NYC Department of Health Restaurant Inspections

This notebook downloads, cleans, and normalizes the NYC DOH Restaurant Inspection data, then loads it to BigQuery.

**References:**
- [DOH Inspection Process](https://www1.nyc.gov/site/doh/business/food-operators/the-inspection-process.page)
- [Inspection Blue Book (PDF)](https://www1.nyc.gov/assets/doh/downloads/pdf/rii/blue-book.pdf)
- [NYC Open Data Source](https://data.cityofnewyork.us/Health/DOHMH-New-York-City-Restaurant-Inspection-Results/43nn-pn8j)

## Setup and Authentication

In [1]:
# Install required packages
!pip install -q google-cloud-bigquery pandas-gbq

# Authenticate with Google Cloud
from google.colab import auth
auth.authenticate_user()

In [2]:
import pandas as pd
import numpy as np

# Configure display
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Pandas display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

## 1. Download Data

In [3]:
# Download the latest dataset from NYC Open Data
!curl -L 'https://data.cityofnewyork.us/api/views/43nn-pn8j/rows.csv?accessType=DOWNLOAD' -o restaurants.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  138M    0  138M    0     0  3217k      0 --:--:--  0:00:44 --:--:-- 3065k


In [4]:
# Load data - read all columns as strings initially for cleaning
df = pd.read_csv("restaurants.csv", dtype='object')
initial_size = len(df)
print(f"Loaded {initial_size:,} records")

Loaded 296,870 records


## 2. Initial Column Cleanup

In [5]:
# Standardize column names: replace spaces with underscores, uppercase
df.columns = df.columns.str.replace(' ', '_').str.upper()

# Drop redundant location column (lat/lon are separate columns)
df = df.drop('LOCATION', axis=1, errors='ignore')

print("Columns:", list(df.columns))

Columns: ['CAMIS', 'DBA', 'BORO', 'BUILDING', 'STREET', 'ZIPCODE', 'PHONE', 'CUISINE_DESCRIPTION', 'INSPECTION_DATE', 'ACTION', 'VIOLATION_CODE', 'VIOLATION_DESCRIPTION', 'CRITICAL_FLAG', 'SCORE', 'GRADE', 'GRADE_DATE', 'RECORD_DATE', 'INSPECTION_TYPE', 'LATITUDE', 'LONGITUDE', 'COMMUNITY_BOARD', 'COUNCIL_DISTRICT', 'CENSUS_TRACT', 'BIN', 'BBL', 'NTA']


In [6]:
# Initialize deletion tracking column
# We'll mark rows to delete rather than dropping immediately,
# so we can analyze all columns first
df['TO_DELETE'] = False

## 3. Data Cleaning and Validation

### 3.1 INSPECTION_TYPE

We only keep Cycle Inspections (Initial and Re-inspection). Other types like Administrative, Pre-permit, Trans Fat, etc. are dropped.

In [8]:
# Mark rows with null inspection type for deletion
df['TO_DELETE'] |= df.INSPECTION_TYPE.isnull()

# Show distribution of inspection types
display("Inspection types:")
display(df.INSPECTION_TYPE.value_counts())

'Inspection types:'

Unnamed: 0_level_0,count
INSPECTION_TYPE,Unnamed: 1_level_1
Cycle Inspection / Initial Inspection,155654
Cycle Inspection / Re-inspection,60540
Pre-permit (Operational) / Initial Inspection,40936
Pre-permit (Operational) / Re-inspection,10830
Administrative Miscellaneous / Initial Inspection,9031
Pre-permit (Non-operational) / Initial Inspection,3765
Cycle Inspection / Reopening Inspection,2748
Pre-permit (Operational) / Compliance Inspection,2091
Administrative Miscellaneous / Re-inspection,1877
Cycle Inspection / Compliance Inspection,1372


In [9]:
# Keep only cycle inspections (Initial and Re-inspection)
valid_inspection_types = [
    'Cycle Inspection / Initial Inspection',
    'Cycle Inspection / Re-inspection'
]
df['TO_DELETE'] |= ~df.INSPECTION_TYPE.isin(valid_inspection_types)

# Simplify inspection type names
df['INSPECTION_TYPE'] = df['INSPECTION_TYPE'].replace({
    'Cycle Inspection / Initial Inspection': 'Initial Inspection',
    'Cycle Inspection / Re-inspection': 'Re-inspection'
})

print(f"Rows marked for deletion: {df['TO_DELETE'].sum():,}")

Rows marked for deletion: 80,676


### 3.2 Location Fields (BORO, BUILDING, STREET, ZIPCODE)

In [10]:
# Remove entries with missing/invalid borough
df['TO_DELETE'] |= (df.BORO == '0') | df.BORO.isnull()

# Remove entries with missing address components
df['TO_DELETE'] |= df.BUILDING.isnull()
df['TO_DELETE'] |= df.STREET.isnull()
df['TO_DELETE'] |= df.ZIPCODE.isnull()

print(f"Rows marked for deletion: {df['TO_DELETE'].sum():,}")

Rows marked for deletion: 83,991


### 3.3 INSPECTION_DATE

In [11]:
# Convert to datetime
df['INSPECTION_DATE'] = pd.to_datetime(df['INSPECTION_DATE'], format='%m/%d/%Y')

# Drop invalid dates (1900-01-01 is a placeholder for missing dates)
df['TO_DELETE'] |= (df['INSPECTION_DATE'] == '1900-01-01')

# Keep only inspections from 2016 onwards (data quality issues before that)
df['TO_DELETE'] |= (df['INSPECTION_DATE'] < '2016-01-01')

print(f"Date range: {df.INSPECTION_DATE.min()} to {df.INSPECTION_DATE.max()}")
print(f"Rows marked for deletion: {df['TO_DELETE'].sum():,}")

Date range: 1900-01-01 00:00:00 to 2026-01-24 00:00:00
Rows marked for deletion: 84,202


### 3.4 ACTION

In [12]:
# Simplify action descriptions
action_mapping = {
    'Violations were cited in the following area(s).': 'Violations found',
    'No violations were recorded at the time of this inspection.': 'No violations',
    'Establishment Closed by DOHMH. Violations were cited in the following area(s) and those requiring immediate action were addressed.': 'Establishment closed',
    'Establishment re-opened by DOHMH.': 'Establishment re-opened',
    'Establishment re-closed by DOHMH.': 'Establishment re-closed'
}
df['ACTION'] = df['ACTION'].replace(action_mapping)

# Remove re-opened/re-closed entries (scores can be misleading)
df['TO_DELETE'] |= df.ACTION.isin(['Establishment re-closed', 'Establishment re-opened'])
df['TO_DELETE'] |= df.ACTION.isnull()

print("Action distribution:")
print(df[~df.TO_DELETE].ACTION.value_counts())

Action distribution:
ACTION
Violations found        205252
Establishment closed      7041
No violations              375
Name: count, dtype: int64


### 3.5 SCORE

In [13]:
# Convert score to numeric
df['SCORE'] = pd.to_numeric(df['SCORE'], errors='coerce')

# Remove invalid scores
df['TO_DELETE'] |= (df.SCORE < 0)

# Remove inconsistent score/action combinations
df['TO_DELETE'] |= (df.SCORE > 0) & (df.ACTION == 'No violations')
df['TO_DELETE'] |= (df.SCORE == 0) & (df.ACTION == 'Violations found')

print(f"Score statistics (remaining data):")
print(df[~df.TO_DELETE].SCORE.describe())

Score statistics (remaining data):
count    209668.000000
mean         24.335373
std          17.077374
min           0.000000
25%          12.000000
50%          21.000000
75%          32.000000
max         154.000000
Name: SCORE, dtype: float64


### 3.6 GRADE

Grades follow DOH scoring: A (0-13), B (14-27), C (28+)

In [14]:
# Remove inconsistent grade/score combinations
df['TO_DELETE'] |= (df.GRADE == 'A') & (df.SCORE > 13)
df['TO_DELETE'] |= (df.GRADE == 'B') & ((df.SCORE < 14) | (df.SCORE > 27))
df['TO_DELETE'] |= (df.GRADE == 'C') & (df.SCORE < 28)

# Remove non-standard grades (Z=pending, P=reopening, N=not yet graded)
invalid_grades = ['Z', 'P', 'N', 'Not Yet Graded']
df['TO_DELETE'] |= df.GRADE.isin(invalid_grades)

print(f"Rows marked for deletion: {df['TO_DELETE'].sum():,}")

Rows marked for deletion: 92,203


### 3.7 VIOLATION_CODE and VIOLATION_DESCRIPTION

In [15]:
# Remove calorie posting violations (not relevant to food safety)
calorie_violation = 'Caloric content not posted on menus, menu boards or food tags, in a food service establishment that is 1 of 15 or more outlets operating the same type of business nationally under common ownership or control, or as a franchise or doing business under the same name, for each menu item that is served in portions, the size and content of which are standardized.'
df['TO_DELETE'] |= (df.VIOLATION_DESCRIPTION == calorie_violation)

### 3.8 Date Fields Cleanup

In [16]:
# Convert GRADE_DATE and verify it matches INSPECTION_DATE
df['GRADE_DATE'] = pd.to_datetime(df['GRADE_DATE'], format='%m/%d/%Y', errors='coerce')

# Drop GRADE_DATE (redundant with INSPECTION_DATE)
df = df.drop('GRADE_DATE', axis=1)

# Drop RECORD_DATE (single value, not useful)
df = df.drop('RECORD_DATE', axis=1, errors='ignore')

print("Remaining columns:", list(df.columns))

Remaining columns: ['CAMIS', 'DBA', 'BORO', 'BUILDING', 'STREET', 'ZIPCODE', 'PHONE', 'CUISINE_DESCRIPTION', 'INSPECTION_DATE', 'ACTION', 'VIOLATION_CODE', 'VIOLATION_DESCRIPTION', 'CRITICAL_FLAG', 'SCORE', 'GRADE', 'INSPECTION_TYPE', 'LATITUDE', 'LONGITUDE', 'COMMUNITY_BOARD', 'COUNCIL_DISTRICT', 'CENSUS_TRACT', 'BIN', 'BBL', 'NTA', 'TO_DELETE']


### 3.9 Convert Coordinates

In [17]:
# Convert lat/lon to numeric
df['LATITUDE'] = pd.to_numeric(df['LATITUDE'], errors='coerce')
df['LONGITUDE'] = pd.to_numeric(df['LONGITUDE'], errors='coerce')

## 4. Apply Deletions

In [18]:
# Verify we haven't lost rows during cleaning
assert len(df) == initial_size, "Unexpected row count change before filtering"

# Apply the deletion filter
print(f"Removing {df.TO_DELETE.sum():,} rows ({df.TO_DELETE.mean()*100:.1f}%)")
df = df[~df.TO_DELETE].copy()
print(f"Remaining rows: {len(df):,}")

Removing 92,203 rows (31.1%)
Remaining rows: 204,667


## 5. Data Validation

In [19]:
# Validate no nulls in required fields
assert df.INSPECTION_TYPE.isnull().sum() == 0, "Null INSPECTION_TYPE found"
assert df.BORO.isnull().sum() == 0, "Null BORO found"
assert df.BUILDING.isnull().sum() == 0, "Null BUILDING found"
assert df.STREET.isnull().sum() == 0, "Null STREET found"
assert df.CUISINE_DESCRIPTION.isnull().sum() == 0, "Null CUISINE_DESCRIPTION found"
assert df.ACTION.isnull().sum() == 0, "Null ACTION found"

# Validate expected values
assert set(df.INSPECTION_TYPE.unique()) == {'Initial Inspection', 'Re-inspection'}
assert set(df.ACTION.unique()) == {'Violations found', 'No violations', 'Establishment closed'}

# Validate grade/score consistency
assert ((df.GRADE == 'A') & (df.SCORE > 13)).sum() == 0, "Invalid A grade scores"
assert ((df.GRADE == 'B') & ((df.SCORE < 14) | (df.SCORE > 27))).sum() == 0, "Invalid B grade scores"
assert ((df.GRADE == 'C') & (df.SCORE < 28)).sum() == 0, "Invalid C grade scores"

print("✓ All validations passed")

✓ All validations passed


## 6. Convert to Categorical Types

In [20]:
# Convert string columns to categorical for efficiency
df['INSPECTION_TYPE'] = pd.Categorical(df['INSPECTION_TYPE'], ordered=False)
df['BORO'] = pd.Categorical(df['BORO'], ordered=False)
df['CUISINE_DESCRIPTION'] = pd.Categorical(df['CUISINE_DESCRIPTION'], ordered=False)
df['ACTION'] = pd.Categorical(df['ACTION'], ordered=False)
df['GRADE'] = pd.Categorical(df['GRADE'], categories=['A', 'B', 'C'], ordered=True)
df['VIOLATION_CODE'] = pd.Categorical(df['VIOLATION_CODE'], ordered=False)
df['CRITICAL_FLAG'] = pd.Categorical(df['CRITICAL_FLAG'], ordered=False)

## 7. Create Normalized Tables

We normalize the data into four tables:
- **restaurants**: Restaurant information (one row per restaurant)
- **inspections**: Inspection records (one row per inspection)
- **violations**: Violations found during inspections (junction table)
- **violation_codes**: Reference table for violation code descriptions

### 7.1 Violation Codes Table

In [22]:
# Create violation codes reference table
violation_codes = (
    df[['VIOLATION_CODE', 'VIOLATION_DESCRIPTION', 'CRITICAL_FLAG']]
    .drop_duplicates()
    .dropna(subset=['VIOLATION_CODE'])
    .rename(columns={
        'VIOLATION_DESCRIPTION': 'DESCRIPTION',
        'CRITICAL_FLAG': 'CRITICAL'
    })
    .sort_values('VIOLATION_CODE')
    .set_index('VIOLATION_CODE')
)

# Keep only one description per violation code (in case of duplicates)
violation_codes = violation_codes.groupby('VIOLATION_CODE', observed=True).first()

print(f"Violation codes: {len(violation_codes)}")
violation_codes.head()

Violation codes: 72


Unnamed: 0_level_0,DESCRIPTION,CRITICAL
VIOLATION_CODE,Unnamed: 1_level_1,Unnamed: 2_level_1
02A,Food not cooked to required minimum temperature.,Critical
02B,Hot food item not held at or above 140º F.,Critical
02C,Hot TCS food item that has been cooked and co...,Critical
02D,Precooked potentially hazardous food from comm...,Critical
02F,"Meat, fish, molluscan shellfish, unpasteurized...",Critical


### 7.2 Restaurants Table

In [23]:
# Define restaurant columns
restaurant_columns = [
    'CAMIS', 'DBA', 'BUILDING', 'STREET', 'ZIPCODE', 'BORO', 'PHONE',
    'CUISINE_DESCRIPTION', 'LATITUDE', 'LONGITUDE',
    'COMMUNITY_BOARD', 'COUNCIL_DISTRICT', 'CENSUS_TRACT', 'BIN', 'BBL', 'NTA'
]

# Create restaurants table (one row per unique restaurant)
restaurants = df[restaurant_columns].drop_duplicates()

# Verify each CAMIS maps to unique attributes
assert len(restaurants) == restaurants.CAMIS.nunique(), "Duplicate CAMIS values found"

# Fix phone numbers that are too long
restaurants['PHONE'] = restaurants['PHONE'].replace('19292290938', '9292290938')

print(f"Restaurants: {len(restaurants):,}")
restaurants.head()

Restaurants: 21,290


Unnamed: 0,CAMIS,DBA,BUILDING,STREET,ZIPCODE,BORO,PHONE,CUISINE_DESCRIPTION,LATITUDE,LONGITUDE,COMMUNITY_BOARD,COUNCIL_DISTRICT,CENSUS_TRACT,BIN,BBL,NTA
9,41688112,HENNESSY BASELINE BAR,620,ATLANTIC AVENUE,11217,Brooklyn,9176186310,American,40.683447,-73.975691,302,35,12902,3398156,3011180001,BK37
10,50074402,MEGUMI,5102,AVENUE U,11234,Brooklyn,9084321527,Chinese,40.60997,-73.92222,318,46,69800,3238574,3084700050,BK45
49,40664052,SOHO CIGAR BAR,32,WATTS STREET,10013,Manhattan,2129411781,American,40.723578,-74.004517,102,3,4700,1007099,1004760015,MN24
131,50077476,L. M. CAFE,11-11,44 ROAD,11101,Queens,3476124200,American,40.74923,-73.949064,402,26,1900,4005224,4004460023,QN31
139,50012465,TIMES SQUARE DINER & GRILL,807,8 AVENUE,10019,Manhattan,2123152400,American,40.761444,-73.986727,104,3,12700,1025144,1010390032,MN15


### 7.3 Inspections Table

In [24]:
# Drop columns that belong to other tables from main df
df_inspections = df.drop(
    columns=restaurant_columns[1:] + ['VIOLATION_CODE', 'VIOLATION_DESCRIPTION', 'CRITICAL_FLAG', 'TO_DELETE'],
    errors='ignore'
)

# Create inspection table (unique by CAMIS + DATE)
inspection = (
    df_inspections
    .drop_duplicates()
    .sort_values(['INSPECTION_DATE', 'CAMIS'])
    .reset_index(drop=True)
)

# Create inspection ID
inspection = inspection.reset_index().rename(columns={'index': 'INSPECTION_ID'})

# Check for duplicate inspections (same restaurant, same date)
duplicates = inspection.groupby(['INSPECTION_DATE', 'CAMIS']).size()
duplicates = duplicates[duplicates > 1]

if len(duplicates) > 0:
    print(f"Warning: Found {len(duplicates)} duplicate inspection records")
    # Remove duplicates, keeping the first occurrence
    for (date, camis), count in duplicates.items():
        idx_to_drop = inspection[
            (inspection.CAMIS == camis) &
            (inspection.INSPECTION_DATE == date)
        ].index[1:]
        inspection = inspection.drop(idx_to_drop)

print(f"Inspections: {len(inspection):,}")
inspection.head()

Inspections: 61,969


Unnamed: 0,INSPECTION_ID,CAMIS,INSPECTION_DATE,ACTION,SCORE,GRADE,INSPECTION_TYPE
0,0,50041177,2016-01-21,Violations found,6.0,A,Initial Inspection
1,1,41627984,2016-02-14,Violations found,19.0,,Initial Inspection
2,2,50003303,2016-02-17,Violations found,17.0,,Initial Inspection
3,3,50016872,2016-02-25,Violations found,37.0,,Initial Inspection
4,4,41717867,2016-03-09,Violations found,11.0,,Initial Inspection


### 7.4 Violations Table

In [25]:
# Create violations junction table by merging with inspections
violations = pd.merge(
    inspection[['INSPECTION_ID', 'CAMIS', 'INSPECTION_DATE']],
    df[['CAMIS', 'INSPECTION_DATE', 'VIOLATION_CODE']],
    on=['CAMIS', 'INSPECTION_DATE'],
    how='inner'
)

# Keep only the relevant columns and remove duplicates
violations = violations[['INSPECTION_ID', 'VIOLATION_CODE']].drop_duplicates()

print(f"Violations: {len(violations):,}")

Violations: 204,663


### 7.5 Summary

In [26]:
print("=" * 50)
print("NORMALIZED TABLE SUMMARY")
print("=" * 50)
print(f"Restaurants:     {len(restaurants):>10,} rows")
print(f"Inspections:     {len(inspection):>10,} rows")
print(f"Violations:      {len(violations):>10,} rows")
print(f"Violation Codes: {len(violation_codes):>10,} rows")
print("=" * 50)

NORMALIZED TABLE SUMMARY
Restaurants:         21,290 rows
Inspections:         61,969 rows
Violations:         204,663 rows
Violation Codes:         72 rows


## 8. Load to BigQuery

In [27]:
from google.cloud import bigquery
import pandas_gbq

# Configuration
PROJECT_ID = "nyu-datasets"
DATASET_ID = "doh_restaurants"

# Initialize BigQuery client
client = bigquery.Client(project=PROJECT_ID)

In [28]:
# Create the dataset if it doesn't exist
dataset_ref = f"{PROJECT_ID}.{DATASET_ID}"

try:
    client.get_dataset(DATASET_ID)
    print(f"Dataset {DATASET_ID} already exists.")
except Exception:
    dataset = bigquery.Dataset(dataset_ref)
    dataset.location = "US"
    dataset = client.create_dataset(dataset, exists_ok=True)
    print(f"Created dataset {DATASET_ID}")

Dataset doh_restaurants already exists.


In [29]:
# Define schemas for BigQuery tables
restaurants_schema = [
    {"name": "CAMIS", "type": "STRING", "mode": "REQUIRED", "description": "Restaurant unique identifier"},
    {"name": "DBA", "type": "STRING", "mode": "NULLABLE", "description": "Doing Business As (restaurant name)"},
    {"name": "BUILDING", "type": "STRING", "mode": "NULLABLE", "description": "Building number"},
    {"name": "STREET", "type": "STRING", "mode": "NULLABLE", "description": "Street name"},
    {"name": "ZIPCODE", "type": "STRING", "mode": "NULLABLE", "description": "ZIP code"},
    {"name": "BORO", "type": "STRING", "mode": "NULLABLE", "description": "Borough (Manhattan, Brooklyn, Queens, Bronx, Staten Island)"},
    {"name": "PHONE", "type": "STRING", "mode": "NULLABLE", "description": "Phone number"},
    {"name": "CUISINE_DESCRIPTION", "type": "STRING", "mode": "NULLABLE", "description": "Type of cuisine"},
    {"name": "LATITUDE", "type": "FLOAT", "mode": "NULLABLE", "description": "Latitude coordinate"},
    {"name": "LONGITUDE", "type": "FLOAT", "mode": "NULLABLE", "description": "Longitude coordinate"},
    {"name": "COMMUNITY_BOARD", "type": "STRING", "mode": "NULLABLE", "description": "Community Board"},
    {"name": "COUNCIL_DISTRICT", "type": "STRING", "mode": "NULLABLE", "description": "Council District"},
    {"name": "CENSUS_TRACT", "type": "STRING", "mode": "NULLABLE", "description": "Census Tract"},
    {"name": "BIN", "type": "STRING", "mode": "NULLABLE", "description": "Building Identification Number"},
    {"name": "BBL", "type": "STRING", "mode": "NULLABLE", "description": "Borough Block Lot"},
    {"name": "NTA", "type": "STRING", "mode": "NULLABLE", "description": "Neighborhood Tabulation Area"}
]

inspections_schema = [
    {"name": "INSPECTION_ID", "type": "INTEGER", "mode": "REQUIRED", "description": "Unique identifier for each inspection"},
    {"name": "CAMIS", "type": "STRING", "mode": "NULLABLE", "description": "Restaurant unique identifier"},
    {"name": "INSPECTION_DATE", "type": "TIMESTAMP", "mode": "NULLABLE", "description": "Date of inspection"},
    {"name": "ACTION", "type": "STRING", "mode": "NULLABLE", "description": "Action taken during inspection"},
    {"name": "SCORE", "type": "INTEGER", "mode": "NULLABLE", "description": "Inspection score (lower is better)"},
    {"name": "GRADE", "type": "STRING", "mode": "NULLABLE", "description": "Inspection grade (A, B, or C)"},
    {"name": "INSPECTION_TYPE", "type": "STRING", "mode": "NULLABLE", "description": "Type of inspection (Initial Inspection or Re-inspection)"}
]

violation_codes_schema = [
    {"name": "VIOLATION_CODE", "type": "STRING", "mode": "REQUIRED", "description": "Code for the violation"},
    {"name": "DESCRIPTION", "type": "STRING", "mode": "NULLABLE", "description": "Description of the violation"},
    {"name": "CRITICAL", "type": "STRING", "mode": "NULLABLE", "description": "Criticality (Critical, Not Critical, Not Applicable)"}
]

violations_schema = [
    {"name": "INSPECTION_ID", "type": "INTEGER", "mode": "REQUIRED", "description": "Foreign key to inspections table"},
    {"name": "VIOLATION_CODE", "type": "STRING", "mode": "NULLABLE", "description": "Foreign key to violation_codes table"}
]

In [30]:
# Write tables to BigQuery
print("Writing tables to BigQuery...")

pandas_gbq.to_gbq(
    restaurants,
    f"{DATASET_ID}.restaurants",
    project_id=PROJECT_ID,
    if_exists='replace',
    table_schema=restaurants_schema
)
print("✓ restaurants table written")

pandas_gbq.to_gbq(
    inspection,
    f"{DATASET_ID}.inspections",
    project_id=PROJECT_ID,
    if_exists='replace',
    table_schema=inspections_schema
)
print("✓ inspections table written")

pandas_gbq.to_gbq(
    violation_codes.reset_index(),
    f"{DATASET_ID}.violation_codes",
    project_id=PROJECT_ID,
    if_exists='replace',
    table_schema=violation_codes_schema
)
print("✓ violation_codes table written")

pandas_gbq.to_gbq(
    violations,
    f"{DATASET_ID}.violations",
    project_id=PROJECT_ID,
    if_exists='replace',
    table_schema=violations_schema
)
print("✓ violations table written")

print("\nAll tables successfully written to BigQuery!")

Writing tables to BigQuery...


100%|██████████| 1/1 [00:00<00:00, 8097.11it/s]


✓ restaurants table written


100%|██████████| 1/1 [00:00<00:00, 8128.50it/s]


✓ inspections table written


100%|██████████| 1/1 [00:00<00:00, 9238.56it/s]


✓ violation_codes table written


100%|██████████| 1/1 [00:00<00:00, 11618.57it/s]

✓ violations table written

All tables successfully written to BigQuery!





## 9. BigQuery Post-Setup (Run in BigQuery Console)

After loading the tables, run these SQL commands in BigQuery to add table descriptions and relationships:

In [31]:
# SQL commands for BigQuery console (copy and run manually)
bigquery_setup_sql = '''
-- Add table descriptions
ALTER TABLE `nyu-datasets.doh_restaurants.restaurants`
SET OPTIONS (description = 'Information about NYC restaurants, including their location and cuisine.');

ALTER TABLE `nyu-datasets.doh_restaurants.inspections`
SET OPTIONS (description = 'Details of health inspections conducted at restaurants, including date, score, and grade.');

ALTER TABLE `nyu-datasets.doh_restaurants.violation_codes`
SET OPTIONS (description = 'Reference table for violation codes and their descriptions.');

ALTER TABLE `nyu-datasets.doh_restaurants.violations`
SET OPTIONS (description = 'Records of specific violations found during inspections, linking inspections to violation codes.');

-- Add Primary Keys (NOT ENFORCED - for documentation purposes)
ALTER TABLE `nyu-datasets.doh_restaurants.restaurants`
ADD PRIMARY KEY (CAMIS) NOT ENFORCED;

ALTER TABLE `nyu-datasets.doh_restaurants.inspections`
ADD PRIMARY KEY (INSPECTION_ID) NOT ENFORCED;

ALTER TABLE `nyu-datasets.doh_restaurants.violation_codes`
ADD PRIMARY KEY (VIOLATION_CODE) NOT ENFORCED;

-- Add Foreign Keys (NOT ENFORCED - for documentation purposes)
ALTER TABLE `nyu-datasets.doh_restaurants.inspections`
ADD CONSTRAINT fk_inspections_restaurants
FOREIGN KEY (CAMIS) REFERENCES `nyu-datasets.doh_restaurants.restaurants`(CAMIS) NOT ENFORCED;

ALTER TABLE `nyu-datasets.doh_restaurants.violations`
ADD CONSTRAINT fk_violations_inspections
FOREIGN KEY (INSPECTION_ID) REFERENCES `nyu-datasets.doh_restaurants.inspections`(INSPECTION_ID) NOT ENFORCED;

ALTER TABLE `nyu-datasets.doh_restaurants.violations`
ADD CONSTRAINT fk_violations_violation_codes
FOREIGN KEY (VIOLATION_CODE) REFERENCES `nyu-datasets.doh_restaurants.violation_codes`(VIOLATION_CODE) NOT ENFORCED;
'''

print(bigquery_setup_sql)


-- Add table descriptions
ALTER TABLE `nyu-datasets.doh_restaurants.restaurants`
SET OPTIONS (description = 'Information about NYC restaurants, including their location and cuisine.');

ALTER TABLE `nyu-datasets.doh_restaurants.inspections`
SET OPTIONS (description = 'Details of health inspections conducted at restaurants, including date, score, and grade.');

ALTER TABLE `nyu-datasets.doh_restaurants.violation_codes`
SET OPTIONS (description = 'Reference table for violation codes and their descriptions.');

ALTER TABLE `nyu-datasets.doh_restaurants.violations`
SET OPTIONS (description = 'Records of specific violations found during inspections, linking inspections to violation codes.');

-- Add Primary Keys (NOT ENFORCED - for documentation purposes)
ALTER TABLE `nyu-datasets.doh_restaurants.restaurants`
ADD PRIMARY KEY (CAMIS) NOT ENFORCED;

ALTER TABLE `nyu-datasets.doh_restaurants.inspections`
ADD PRIMARY KEY (INSPECTION_ID) NOT ENFORCED;

ALTER TABLE `nyu-datasets.doh_restaurants.

## 10. Sample Queries

Here are some useful queries to get started with the data:

In [32]:
sample_queries = '''
-- Restaurants with their latest inspection grade
WITH latest_inspection AS (
    SELECT CAMIS, MAX(INSPECTION_DATE) AS INSPECTION_DATE
    FROM `nyu-datasets.doh_restaurants.inspections`
    GROUP BY CAMIS
)
SELECT
    R.DBA, R.BORO, R.CUISINE_DESCRIPTION,
    I.INSPECTION_DATE, I.SCORE, I.GRADE
FROM `nyu-datasets.doh_restaurants.restaurants` R
JOIN latest_inspection L ON R.CAMIS = L.CAMIS
JOIN `nyu-datasets.doh_restaurants.inspections` I
    ON I.CAMIS = L.CAMIS AND I.INSPECTION_DATE = L.INSPECTION_DATE
ORDER BY I.SCORE DESC
LIMIT 100;

-- Most common violations
SELECT
    VC.VIOLATION_CODE,
    VC.DESCRIPTION,
    VC.CRITICAL,
    COUNT(*) as violation_count
FROM `nyu-datasets.doh_restaurants.violations` V
JOIN `nyu-datasets.doh_restaurants.violation_codes` VC
    ON V.VIOLATION_CODE = VC.VIOLATION_CODE
GROUP BY VC.VIOLATION_CODE, VC.DESCRIPTION, VC.CRITICAL
ORDER BY violation_count DESC
LIMIT 20;

-- Average score by cuisine type
SELECT
    R.CUISINE_DESCRIPTION,
    COUNT(DISTINCT R.CAMIS) as restaurant_count,
    ROUND(AVG(I.SCORE), 1) as avg_score
FROM `nyu-datasets.doh_restaurants.restaurants` R
JOIN `nyu-datasets.doh_restaurants.inspections` I ON R.CAMIS = I.CAMIS
WHERE I.SCORE IS NOT NULL
GROUP BY R.CUISINE_DESCRIPTION
HAVING restaurant_count >= 100
ORDER BY avg_score;
'''

print(sample_queries)


-- Restaurants with their latest inspection grade
WITH latest_inspection AS (
    SELECT CAMIS, MAX(INSPECTION_DATE) AS INSPECTION_DATE
    FROM `nyu-datasets.doh_restaurants.inspections`
    GROUP BY CAMIS
)
SELECT 
    R.DBA, R.BORO, R.CUISINE_DESCRIPTION,
    I.INSPECTION_DATE, I.SCORE, I.GRADE
FROM `nyu-datasets.doh_restaurants.restaurants` R
JOIN latest_inspection L ON R.CAMIS = L.CAMIS
JOIN `nyu-datasets.doh_restaurants.inspections` I 
    ON I.CAMIS = L.CAMIS AND I.INSPECTION_DATE = L.INSPECTION_DATE
ORDER BY I.SCORE DESC
LIMIT 100;

-- Most common violations
SELECT 
    VC.VIOLATION_CODE,
    VC.DESCRIPTION,
    VC.CRITICAL,
    COUNT(*) as violation_count
FROM `nyu-datasets.doh_restaurants.violations` V
JOIN `nyu-datasets.doh_restaurants.violation_codes` VC 
    ON V.VIOLATION_CODE = VC.VIOLATION_CODE
GROUP BY VC.VIOLATION_CODE, VC.DESCRIPTION, VC.CRITICAL
ORDER BY violation_count DESC
LIMIT 20;

-- Average score by cuisine type
SELECT 
    R.CUISINE_DESCRIPTION,
    COUNT(DI