# EDA for `porto.csv`

This notebook performs a lightweight exploratory data analysis (EDA) on the `porto.csv` dataset:

- Load CSV and inspect shape, dtypes, head
- Check missing values
- Summary statistics for numeric columns
- Correlation heatmap
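- Distributions for numeric columns (capped at the first 20 numeric columns)
- Pairplot of the first 6 numeric columns, on a random sample of up to 1,000 complete rows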

The overview text and any notes are saved to `eda_reports/`, and figures to `eda_reports/figures/`, for reuse in reports.


In [1]:
# Setup
import os
from typing import Optional

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Inline plotting
%matplotlib inline
sns.set_theme(context="notebook", style="whitegrid")

OUTPUT_DIR = "eda_reports"
FIG_DIR = os.path.join(OUTPUT_DIR, "figures")

os.makedirs(FIG_DIR, exist_ok=True)

CSV_PATH = "porto.csv"  # update if needed


In [10]:
# Helpers

def save_text(content: str, out_path: str) -> None:
    """Write ``content`` to ``out_path`` as UTF-8, replacing any existing file.

    Missing parent directories are created first, so callers can target a
    report folder that has not been created yet.

    Args:
        content: Text to write.
        out_path: Destination file path.
    """
    parent_dir = os.path.dirname(out_path)
    if parent_dir:  # empty when out_path is a bare filename in the working directory
        os.makedirs(parent_dir, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(content)


def basic_overview(df: pd.DataFrame) -> str:
    """Render a plain-text overview of ``df``.

    The report contains, in order: shape, dtypes, the first five rows, missing
    value counts per column (largest first), the names of the numeric columns,
    and, when at least one numeric column exists, their ``describe()`` table.

    Args:
        df: Frame to summarise.

    Returns:
        The report sections joined with newlines.
    """
    missing_counts = df.isna().sum().sort_values(ascending=False)
    numeric_names = df.select_dtypes(include=[np.number]).columns.tolist()

    report = [
        "=== Shape ===",
        str(df.shape),
        "",
        "=== Dtypes ===",
        str(df.dtypes),
        "",
        "=== Head (first 5 rows) ===",
        str(df.head()),
        "",
        "=== Missing values per column ===",
        str(missing_counts),
        "",
        "=== Numeric columns ===",
        ", ".join(numeric_names) if numeric_names else "<none>",
        "",
    ]

    # The describe section is omitted entirely when there is nothing numeric.
    if numeric_names:
        report += [
            "=== Describe (numeric) ===",
            str(df[numeric_names].describe(include="all")),
        ]

    return "\n".join(report)


def correlation_heatmap(df: pd.DataFrame, out_file: str, max_cols: int = 40) -> Optional[str]:
    """Save a correlation heatmap of the numeric columns of ``df`` to ``out_file``.

    Only the first ``max_cols`` numeric columns are used. The figure is written
    to disk and then closed, so nothing is left open in the pyplot registry.

    Args:
        df: Source frame; non-numeric columns are ignored.
        out_file: Path of the image file to write.
        max_cols: Maximum number of numeric columns to include.

    Returns:
        ``None`` when every numeric column was plotted; otherwise a short note
        (no numeric columns at all, or the column list was truncated).
    """
    num_df = df.select_dtypes(include=[np.number])
    total_cols = num_df.shape[1]
    if total_cols == 0:
        return "No numeric columns for correlation heatmap."

    note = None
    if total_cols > max_cols:
        num_df = num_df.iloc[:, :max_cols]
        # Report the truncation instead of dropping columns silently.
        note = f"Correlation heatmap uses first {max_cols} of {total_cols} numeric columns."

    # Scale the canvas with the number of columns, but keep it large enough for
    # the title and colour bar when there are only a few of them.
    side = 1 + num_df.shape[1] * 0.6
    fig, ax = plt.subplots(figsize=(max(4, min(24, side)), max(4, min(18, side))))
    try:
        corr = num_df.corr(numeric_only=True)
        sns.heatmap(corr, cmap="coolwarm", center=0, square=False, ax=ax)
        ax.set_title("Correlation Heatmap")
        fig.tight_layout()
        fig.savefig(out_file, dpi=200)
    finally:
        # Close this specific figure even if plotting or saving failed.
        plt.close(fig)
    return note


def distributions(df: pd.DataFrame, out_dir: str, max_cols: int = 20) -> str:
    """Save one histogram (with KDE overlay) per numeric column of ``df``.

    Each of the first ``max_cols`` numeric columns is written to
    ``<out_dir>/dist_<column>.png``. Characters in the column name that cannot
    appear in a file name are replaced with ``_``; other names are used as-is.
    ``out_dir`` is created if it does not exist.

    Args:
        df: Source frame; non-numeric columns are ignored.
        out_dir: Directory that receives the image files.
        max_cols: Maximum number of numeric columns to plot.

    Returns:
        An empty string when every numeric column was plotted; otherwise a short
        note (no numeric columns at all, or the column list was truncated).
    """
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if not num_cols:
        return "No numeric columns for distributions."

    os.makedirs(out_dir, exist_ok=True)

    cols_to_plot = num_cols[:max_cols]
    for col in cols_to_plot:
        # Path separators and reserved characters in a column name would
        # otherwise redirect or break the output path.
        safe_name = "".join(
            "_" if ch in '\\/:*?"<>|' or ord(ch) < 32 else ch for ch in str(col)
        )
        fig, ax = plt.subplots(figsize=(8, 5))
        try:
            sns.histplot(df[col].dropna(), kde=True, bins=30, ax=ax)
            ax.set_title(f"Distribution: {col}")
            ax.set_xlabel(col)
            ax.set_ylabel("Count")
            fig.tight_layout()
            fig.savefig(os.path.join(out_dir, f"dist_{safe_name}.png"), dpi=200)
        finally:
            # Close this specific figure even on failure so figures do not pile up.
            plt.close(fig)

    if len(num_cols) > max_cols:
        return f"Plotted first {max_cols} of {len(num_cols)} numeric columns."
    return ""


def pairplot_sample(df: pd.DataFrame, out_file: str, max_cols: int = 6, sample_size: int = 1000) -> str:
    """Save a corner pairplot of a sample of complete rows to ``out_file``.

    The first ``max_cols`` numeric columns are selected, rows with a missing
    value in any of them are dropped, and at most ``sample_size`` of the
    remaining rows are drawn with a fixed seed.

    Args:
        df: Source frame; non-numeric columns are ignored.
        out_file: Path of the image file to write.
        max_cols: Maximum number of numeric columns to include.
        sample_size: Maximum number of rows to plot.

    Returns:
        An empty string when the plot was written; otherwise a short note
        explaining why it was skipped.
    """
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()[:max_cols]
    if len(num_cols) < 2:
        return "Not enough numeric columns for pairplot."

    sample_df = df[num_cols].dropna()
    if sample_df.empty:
        # Columns that are never populated together leave nothing after dropna();
        # report that instead of plotting an empty frame.
        return "No rows without missing values in the selected numeric columns; pairplot skipped."
    if sample_df.shape[0] > sample_size:
        sample_df = sample_df.sample(sample_size, random_state=42)

    g = sns.pairplot(sample_df, corner=True, diag_kind="hist")
    try:
        g.fig.suptitle("Pairplot (sample)", y=1.02)
        g.fig.tight_layout()
        g.savefig(out_file, dpi=200)
    finally:
        # Close the grid's own figure rather than whichever figure is current.
        plt.close(g.fig)
    return ""


In [11]:
# Load data
# The path is checked first so that a wrong location produces a message that
# points back at the CSV_PATH setting.
if os.path.exists(CSV_PATH):
    df = pd.read_csv(CSV_PATH)
else:
    raise FileNotFoundError(f"CSV not found at {CSV_PATH}. Update CSV_PATH above.")

df.head()


Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE
0,1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[..."
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[..."
2,1372636951620000320,C,,,20000320,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-..."
3,1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[..."
4,1372637091620000337,C,,,20000337,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-..."


In [12]:
# Overview text
overview = basic_overview(df)
print(overview)

# Save to reports through the shared helper instead of repeating the file write here
save_text(overview, os.path.join(OUTPUT_DIR, "overview.txt"))


=== Shape ===
(1710670, 9)

=== Dtypes ===
TRIP_ID           int64
CALL_TYPE        object
ORIGIN_CALL     float64
ORIGIN_STAND    float64
TAXI_ID           int64
TIMESTAMP         int64
DAY_TYPE         object
MISSING_DATA       bool
POLYLINE         object
dtype: object

=== Head (first 5 rows) ===
               TRIP_ID CALL_TYPE  ORIGIN_CALL  ORIGIN_STAND   TAXI_ID  \
0  1372636858620000589         C          NaN           NaN  20000589   
1  1372637303620000596         B          NaN           7.0  20000596   
2  1372636951620000320         C          NaN           NaN  20000320   
3  1372636854620000520         C          NaN           NaN  20000520   
4  1372637091620000337         C          NaN           NaN  20000337   

    TIMESTAMP DAY_TYPE  MISSING_DATA  \
0  1372636858        A         False   
1  1372637303        A         False   
2  1372636951        A         False   
3  1372636854        A         False   
4  1372637091        A         False   

                  

In [13]:
# Correlation heatmap
# The helper hands back a message only when there is something to report.
note_corr = correlation_heatmap(
    df,
    out_file=os.path.join(FIG_DIR, "correlation_heatmap.png"),
)
if note_corr:
    print(note_corr)


In [14]:
# Distributions
# One image per numeric column is written into FIG_DIR; an empty note means
# there is nothing to report.
note_dist = distributions(df, out_dir=FIG_DIR)
if note_dist:
    print(note_dist)


In [15]:
# Pairplot (sample)
# An empty note means there is nothing to report.
note_pair = pairplot_sample(
    df,
    out_file=os.path.join(FIG_DIR, "pairplot.png"),
)
if note_pair:
    print(note_pair)


In [16]:
# Save notes if any
# Helpers return None or "" when they have nothing to report, so keep only truthy messages.
notes = [msg for msg in (note_corr, note_dist, note_pair) if msg]

if notes:
    # Written through the shared helper instead of repeating the file write here
    save_text("\n".join(notes), os.path.join(OUTPUT_DIR, "notes.txt"))

print(f"EDA complete. Reports saved to: {OUTPUT_DIR}")


EDA complete. Reports saved to: eda_reports
