# MAGIC %md
# 01 – Explore NYC Taxi Data
A basic exploration of the dataset: its structure, missing values, summary statistics, the trip-distance distribution, and feature correlations.

In [None]:
# Load required packages
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml

In [None]:
# Fetch the 'nyc-taxi-green-dec-2016' dataset (version 1) from OpenML as an
# example data source; replace this call to read from the actual source.
# No fallback is implemented here, so a failed fetch raises from this cell.
data = fetch_openml(
    name='nyc-taxi-green-dec-2016',
    version=1,
    as_frame=True,
)

# With as_frame=True the fetched table is exposed as a pandas DataFrame.
df = data.frame

# Preview the first rows (rendered as this cell's output)
df.head()

In [None]:
# Report the dataset dimensions as a (rows, columns) tuple
n_rows, n_cols = df.shape
print("Shape:", (n_rows, n_cols))

In [None]:
# Count missing values per column, then show the ten columns with the most
null_counts = df.isnull().sum()
null_counts.sort_values(ascending=False).head(10)

In [None]:
# Summary statistics of the dataset (rendered as this cell's output)
summary_stats = df.describe()
summary_stats

In [None]:
# Visualize trip distance distribution
#
# The plot only shows distances from 0 to `max_distance`. Restrict the data to
# that range *before* binning: by default seaborn spreads the bins (and
# evaluates the KDE) over the full range of the data, so any values beyond the
# displayed range would leave only a handful of very wide bars visible after
# the x-axis is clipped.
max_distance = 20  # upper limit of the displayed x-axis (miles)
trip_distance = df["trip_distance"]
# `between` is inclusive on both ends and is False for missing values.
displayed_distance = trip_distance[trip_distance.between(0, max_distance)]

plt.figure(figsize=(10, 4))
# `binrange` pins the 100 bins to exactly the displayed range, so every bin
# has the same width regardless of the observed min/max.
sns.histplot(
    displayed_distance,
    bins=100,
    binrange=(0, max_distance),
    kde=True,
)
plt.title("Trip Distance Distribution")
plt.xlim(0, max_distance)
plt.xlabel("Distance (miles)")
plt.ylabel("Frequency")
plt.grid(True)
plt.show()

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated
# heatmap with the diverging colormap centred on zero.
corr = df.corr(numeric_only=True)

plt.figure(figsize=(10, 8))
heatmap_ax = sns.heatmap(
    corr,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    center=0,
)
# sns.heatmap returns the Axes it drew on; set the title on it directly.
heatmap_ax.set_title("Feature Correlation Heatmap")
plt.show()