# ✈️ Flight Delay Analysis Mini Project
## Databricks Community Edition Practice
---

### Step 1: Ingest Airlines Dataset

In [None]:
# Read the first airlines part file (CSV) into a DataFrame.
#   header      - take the column names from the file's first row.
#   inferSchema - let Spark sample the data to choose column types.
#   nullValue   - this dataset appears to mark missing values with the literal
#                 text 'NA'; mapping it to null keeps numeric columns such as
#                 DepDelay/ArrDelay from being inferred as strings and lets
#                 null-based cleaning actually find the missing rows.
#                 (Confirm the resulting types with df.printSchema().)
df = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .option('nullValue', 'NA')
    .csv('/databricks-datasets/airlines/part-00000')
)
# Preview the first five rows.
df.show(5)

### Step 2: Clean Missing Values

In [None]:
# Discard every row in which either delay column is null.
delay_columns = ['DepDelay', 'ArrDelay']
df_clean = df.na.drop(how='any', subset=delay_columns)

# Number of rows that remain after the drop.
df_clean.count()

### Step 3: Save as Delta Table

In [None]:
# Save cleaned data as Delta.
# mode('overwrite') replaces whatever data is already stored at this path, so
# the cell can be re-run. overwriteSchema lets that overwrite replace the stored
# schema as well; without it Delta rejects the write when df_clean's column
# types differ from those of a table left at this path by an earlier run.
(
    df_clean.write
    .format('delta')
    .mode('overwrite')
    .option('overwriteSchema', 'true')
    .save('/tmp/airlines_delta_mini')
)

### Step 4: Create SQL Table from Delta

In [None]:
# Register the Delta files written to /tmp/airlines_delta_mini as a SQL table.
# IF NOT EXISTS makes the statement a no-op when the table is already defined.
spark.sql(
    'CREATE TABLE IF NOT EXISTS airlines_delta_mini '
    'USING DELTA '
    'LOCATION "/tmp/airlines_delta_mini"'
)

### Step 5: Top 10 Origin Airports by Average Departure Delay

In [None]:
# Average departure delay per origin airport, highest first, top ten only.
spark.sql(
    'SELECT Origin, AVG(DepDelay) as AvgDepDelay '
    'FROM airlines_delta_mini '
    'GROUP BY Origin '
    'ORDER BY AvgDepDelay DESC '
    'LIMIT 10'
).show()

### Step 6: Airlines with the Lowest Average Arrival Delay

In [None]:
# Average arrival delay per carrier, lowest first, first ten only.
spark.sql(
    'SELECT UniqueCarrier, AVG(ArrDelay) as AvgArrDelay '
    'FROM airlines_delta_mini '
    'GROUP BY UniqueCarrier '
    'ORDER BY AvgArrDelay ASC '
    'LIMIT 10'
).show()

### Step 7: (Bonus) Visualize Delay Distribution

In [None]:
# Optional if Databricks CE UI allows
# Histogram of departure delays drawn from a ~10% sample of the cleaned data;
# sampling before toPandas() limits how much data is pulled into pandas.
# NOTE(review): pandas' hist needs a numeric column — confirm DepDelay's dtype.
dep_delay_sample = (
    df_clean
    .select('DepDelay')
    # Fixed seed so re-running the cell draws the same sample and the same plot.
    .sample(withReplacement=False, fraction=0.1, seed=42)
    .toPandas()
)
# Trailing ';' hides the array-of-axes repr so only the figure is shown.
dep_delay_sample.hist(column='DepDelay', bins=50);