In [None]:
# -----------------------------
# Name: Harsh Siddharth Brahmecha
# PRN: 20220802003
# -----------------------------

import kagglehub
import pandas as pd
import numpy as np
from scipy import stats
import os

# -----------------------------
# Download dataset from Kaggle
# -----------------------------
# The returned value is used below as a local directory to search for CSV files.
path = kagglehub.dataset_download("julianbloise/winners-formula-1-1950-to-2025")
print("Path to dataset files:", path)

# Find CSV file in dataset path.
# os.listdir() yields entries in arbitrary order, so the matches are sorted to
# make the "first CSV" choice reproducible across machines and runs. The suffix
# test is case-insensitive so files named e.g. "DATA.CSV" are not missed.
csv_files = sorted(f for f in os.listdir(path) if f.lower().endswith('.csv'))
if not csv_files:
    raise FileNotFoundError("No CSV file found in dataset path.")
dataset_file = os.path.join(path, csv_files[0])
print("Using dataset file:", dataset_file)

# -----------------------------
# Read the chosen CSV into a DataFrame
# -----------------------------
df = pd.read_csv(dataset_file)

# -----------------------------
# Partition column labels by dtype:
#   numeric_cols     -> columns with a numeric dtype
#   categorical_cols -> columns with object or category dtype
# -----------------------------
numeric_cols = df.select_dtypes(include=np.number).columns
categorical_cols = df.select_dtypes(include=['object', 'category']).columns

# Report which columns landed in each group
print(f"\nNumeric Columns: {numeric_cols.tolist()}")
print(f"Categorical Columns: {categorical_cols.tolist()}")

# -----------------------------
# Central Tendency for Numeric Features
# -----------------------------
# One row per numeric column; one column per statistic.
desc = pd.DataFrame(index=numeric_cols)
desc['Mean'] = df[numeric_cols].mean()
desc['Median'] = df[numeric_cols].median()

# DataFrame.mode() returns one row per tied mode; row 0 holds the first mode of
# each column. When there are no numeric columns, or every numeric value is
# missing, the result has no rows and .iloc[0] would raise IndexError, so fall
# back to NaN in that case.
modes = df[numeric_cols].mode()
desc['Mode'] = modes.iloc[0] if not modes.empty else np.nan

print("\n--- Mean, Median, Mode (Numeric Features) ---")
print(desc)

# -----------------------------
# Balance Check (Categorical Features)
# -----------------------------
print("\n--- Balance Check (Categorical Features) ---")
for col in categorical_cols:
    # Percentage share of each distinct value. value_counts() sorts by
    # frequency (descending) and leaves out missing values by default, so
    # position 0 is the most frequent non-null category.
    counts = df[col].value_counts(normalize=True) * 100
    print(f"\n{col} Distribution (%):")
    print(counts.head(10))  # top 10 values to avoid clutter

    # A column with no non-null values has no top category; indexing
    # counts.iloc[0] would raise IndexError, so report it and move on.
    if counts.empty:
        print(f"⚠️ {col} has no non-null values; balance check skipped")
        continue

    # Simple balance check: if the top category >50%, dataset is imbalanced
    if counts.iloc[0] > 50:
        print(f"⚠️ {col} is IMBALANCED (Top category = {counts.index[0]}: {counts.iloc[0]:.2f}%)")
    else:
        print(f"✅ {col} is relatively BALANCED")

Path to dataset files: /root/.cache/kagglehub/datasets/julianbloise/winners-formula-1-1950-to-2025/versions/4
Using dataset file: /root/.cache/kagglehub/datasets/julianbloise/winners-formula-1-1950-to-2025/versions/4/winners_f1_1950_2025_v2.csv

Numeric Columns: ['laps', 'year']
Categorical Columns: ['date', 'continent', 'grand_prix', 'circuit', 'winner_name', 'team', 'time']

--- Mean, Median, Mode (Numeric Features) ---
             Mean  Median    Mode
laps    64.617338    64.0    53.0
year  1992.999124  1995.0  2024.0

--- Balance Check (Categorical Features) ---

date Distribution (%):
date
1951-07-01    0.175131
1956-01-22    0.175131
1957-07-20    0.175131
2006-08-27    0.087566
2006-08-06    0.087566
2006-07-30    0.087566
2006-07-16    0.087566
2006-07-02    0.087566
2007-09-30    0.087566
2006-06-11    0.087566
Name: proportion, dtype: float64
✅ date is relatively BALANCED

continent Distribution (%):
continent
Europe           60.070053
Asia             14.273205
North Ameri