<a href="https://colab.research.google.com/github/ipeirotis-org/datasets/blob/main/Flight_Stats/Load_Table6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
!pip install google-cloud-bigquery pandas pyarrow db-dtypes



In [9]:
# @title Load DOT Airfare Data to BigQuery
# Prerequisite: the packages from the install cell at the top of the notebook
# (google-cloud-bigquery, pandas, pyarrow, db-dtypes) must be available.

import pandas as pd
from google.cloud import bigquery
from google.colab import auth

# ---------------------------------------------------------------------------
# 1. AUTHENTICATE
# ---------------------------------------------------------------------------
# This cell only works inside Google Colab: `auth` comes from `google.colab`.
# NOTE(review): presumably this prompts for a Google account whose credentials
# the BigQuery client created later will pick up — confirm the account has
# write access to the destination dataset.
print("Authenticating User...")
auth.authenticate_user()
print("Authenticated.")


Authenticating User...
Authenticated.


In [10]:
# ---------------------------------------------------------------------------
# CONFIGURATION
# ---------------------------------------------------------------------------
# Destination BigQuery project, dataset, and table. The upload step joins them
# into the fully qualified table id "<project>.<dataset>.<table>".
PROJECT_ID = "nyu-datasets"
DATASET_ID = "flights"
TABLE_NAME = "raw_table_6_all_markets" # Table 6 is used because it covers all markets

# Source CSV for the Consumer Airfare Report, Table 6: all city-pair markets
# averaging at least 10 passengers per day.
DATA_URL = "https://data.transportation.gov/api/views/yj5y-b2ir/rows.csv?accessType=DOWNLOAD"

# To load the filtered Table 1a (multi-airport cities only) instead, uncomment this line:
# DATA_URL = "https://data.transportation.gov/api/views/tfrh-tu9e/rows.csv?accessType=DOWNLOAD"


In [11]:
# ---------------------------------------------------------------------------
# DATASET & COLUMN DESCRIPTIONS
# ---------------------------------------------------------------------------
# Maps each normalized (lower-case, underscore-separated) column name to the
# description that is attached to the matching BigQuery schema field.
# NOTE(review): the loaded file also contains the columns 'tbl' and 'tbl6pk',
# which have no entry here; the schema step falls back to an empty description
# for any column missing from this dictionary.
COLUMN_DESCRIPTIONS = {
    "year": "The calendar year of the data.",
    "quarter": "The calendar quarter (1-4).",
    "citymarketid_1": "US DOT identification number for the first city market (consolidates airports in the same city).",
    "citymarketid_2": "US DOT identification number for the second city market.",
    "city1": "Name of the first city in the pair (alphabetical order).",
    "city2": "Name of the second city in the pair (alphabetical order).",
    "nsmiles": "Non-stop distance in miles between the two cities.",
    "passengers": "Average number of passengers per day traveling this route.",
    "fare": "Average market fare (price) paid by all passengers.",
    "carrier_lg": "The airline carrier code with the largest market share.",
    "large_ms": "The market share (percentage) of the largest carrier.",
    "fare_lg": "The average fare charged by the largest carrier.",
    "carrier_low": "The airline carrier code with the lowest average fare in the market.",
    "lf_ms": "The market share (percentage) of the lowest fare carrier.",
    "fare_low": "The average fare charged by the lowest fare carrier.",
    "table_1_flag": "1 if the market is part of the top 1,000 markets (Table 1); 0 otherwise.",
    "geocoded_city1": "Geospatial coordinates for City 1.",
    "geocoded_city2": "Geospatial coordinates for City 2."
}

# Table-level description; the upload step writes it to the BigQuery table
# metadata after the data has been loaded.
DATASET_DESCRIPTION = (
    "DOT Consumer Airfare Report - Table 6. "
    "Lists all city-pair markets in the contiguous United States that average at least 10 passengers each day. "
    "All records are aggregated as directionless city pair markets (traffic in both directions is added together)."
)

In [12]:


# ---------------------------------------------------------------------------
# 2. LOAD DATA FROM SOURCE
# ---------------------------------------------------------------------------
print(f"Downloading data from {DATA_URL}...")
# pandas reads the CSV straight from the URL into memory; no local copy is kept.
# 'low_memory=False' helps if the dataset is large and types are mixed.
df = pd.read_csv(DATA_URL, low_memory=False)

# Report the frame's size as a quick sanity check on the download.
print(f"Data loaded into memory. Rows: {len(df)}, Columns: {len(df.columns)}")


Downloading data from https://data.transportation.gov/api/views/yj5y-b2ir/rows.csv?accessType=DOWNLOAD...
Data loaded into memory. Rows: 649604, Columns: 20


In [15]:
# ---------------------------------------------------------------------------
# 3. CLEANUP & FORMATTING
# ---------------------------------------------------------------------------
# Normalize headers: trim surrounding whitespace, turn spaces into underscores,
# drop every remaining character that is not a letter, digit, or underscore,
# and lower-case the result. Trimming first keeps a padded header such as
# " Year" from becoming "_year" and missing the lookups below.
df.columns = (
    df.columns
    .str.strip()
    .str.replace(' ', '_', regex=False)
    .str.replace(r'[^a-zA-Z0-9_]', '', regex=True)
    .str.lower()
)

# Ensure numeric columns are strictly numeric
numeric_cols = ['passengers', 'fare', 'large_ms', 'fare_lg', 'lf_ms', 'fare_low', 'nsmiles', 'year', 'quarter']

# A renamed or dropped source column would otherwise go unnoticed.
missing_numeric = [col for col in numeric_cols if col not in df.columns]
if missing_numeric:
    print(f"Warning: expected numeric column(s) not found: {missing_numeric}")

for col in numeric_cols:
    if col not in df.columns:
        continue
    converted = pd.to_numeric(df[col], errors='coerce')
    # errors='coerce' turns unparseable values into NaN without complaint, so
    # count the values that were present before and are missing afterwards.
    coerced_count = int((converted.isna() & df[col].notna()).sum())
    if coerced_count:
        print(f"Warning: {coerced_count} non-numeric value(s) in '{col}' were coerced to NaN.")
    df[col] = converted

print("Columns normalized:", list(df.columns))

Columns normalized: ['tbl', 'year', 'quarter', 'citymarketid_1', 'citymarketid_2', 'city1', 'city2', 'nsmiles', 'passengers', 'fare', 'carrier_lg', 'large_ms', 'fare_lg', 'carrier_low', 'lf_ms', 'fare_low', 'table_1_flag', 'geocoded_city1', 'geocoded_city2', 'tbl6pk']


In [17]:
# ---------------------------------------------------------------------------
# 4. PREPARE SCHEMA WITH DESCRIPTIONS
# ---------------------------------------------------------------------------
# The schema is assembled by hand so that every field can carry a description.

# Ordered (dtype predicate, BigQuery type) pairs: the first predicate that
# accepts a column decides its type, and a column matching none is a STRING.
type_rules = (
    (pd.api.types.is_integer_dtype, "INTEGER"),
    (pd.api.types.is_float_dtype, "FLOAT"),
    (pd.api.types.is_bool_dtype, "BOOLEAN"),
)

schema = [
    bigquery.SchemaField(
        field_name,
        next(
            (bq_type for matches, bq_type in type_rules if matches(df[field_name])),
            "STRING",
        ),
        # Columns without an entry in the dictionary get an empty description.
        description=COLUMN_DESCRIPTIONS.get(field_name, ""),
    )
    for field_name in df.columns
]

# ---------------------------------------------------------------------------
# 5. UPLOAD TO BIGQUERY
# ---------------------------------------------------------------------------
client = bigquery.Client(project=PROJECT_ID)
table_id = f"{PROJECT_ID}.{DATASET_ID}.{TABLE_NAME}"

# WRITE_TRUNCATE replaces whatever the table already holds, so re-running this
# cell does not append duplicate rows. The explicit schema is what carries the
# per-column descriptions.
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_TRUNCATE",
    schema=schema,
)

print(f"Uploading to BigQuery table: {table_id}...")
job = client.load_table_from_dataframe(df, table_id, job_config=job_config)
job.result()  # Wait for the load job to finish; a failed job raises here.

# Apply the table-level description (requires a separate update for the metadata)
table_ref = client.get_table(table_id)
table_ref.description = DATASET_DESCRIPTION
client.update_table(table_ref, ["description"])

# ---------------------------------------------------------------------------
# 6. VERIFICATION
# ---------------------------------------------------------------------------
# Fetch the table metadata again after the load and the description update,
# then report the row count and description the table now carries.
table = client.get_table(table_id)
print(f"Success! Loaded {table.num_rows} rows.")
print(f"Table Description: {table.description}")

Uploading to BigQuery table: nyu-datasets.flights.raw_table_6_all_markets...
Success! Loaded 649604 rows.
Table Description: DOT Consumer Airfare Report - Table 6. Lists all city-pair markets in the contiguous United States that average at least 10 passengers each day. All records are aggregated as directionless city pair markets (traffic in both directions is added together).
