# 2.4 Pandas: EDA, очистка данных, groupby

Цели:
- Загрузить небольшой датасет (NYC Taxi или UCI Adult).
- Провести базовый EDA: типы, пропуски, дубликаты.
- Очистка: обработка пропусков, исправление типов.
- Группировки с несколькими агрегатами и краткое резюме инсайтов.


In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
pd.__version__

'3.0.0'

## 1. Загрузка датасета (локальный путь)

In [2]:
# Load the first locally available dataset into `df`.
# Preference order: NYC Yellow Taxi Parquet, NYC Yellow Taxi CSV, UCI Adult.
#
# The datasets folder is looked up under the current working directory and
# under each of its parents, so the cell works both when the kernel runs from
# the repository root and when it runs from a subfolder of the repository.
rel_data_dir = Path('2_numpy_pandas/datasets')
cwd = Path.cwd()
data_dir = next(
    (root / rel_data_dir for root in (cwd, *cwd.parents) if (root / rel_data_dir).is_dir()),
    rel_data_dir,  # nothing found: fall back to the plain cwd-relative path
)

# Option A: NYC Yellow Taxi Parquet/CSV
nyc_parquet = data_dir / 'yellow_tripdata_2019-01.parquet'
nyc_csv = data_dir / 'yellow_tripdata_2019-01.csv'
# Option B: UCI Adult (raw comma-separated file without a header row)
adult = data_dir / 'adult.data'

df = None
if nyc_parquet.is_file():
    df = pd.read_parquet(nyc_parquet)
elif nyc_csv.is_file():
    df = pd.read_csv(nyc_csv)
elif adult.is_file():
    cols = [
        'age', 'workclass', 'fnlwgt', 'education', 'education-num',
        'marital-status', 'occupation', 'relationship', 'race', 'sex',
        'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income',
    ]
    # '?' marks a missing value in this file; skipinitialspace drops the blank
    # that follows every comma so values are not read with a leading space.
    df = pd.read_csv(adult, names=cols, na_values=['?', ' ?'], skipinitialspace=True)

if df is None:
    raise FileNotFoundError('Dataset not found. Place NYC Taxi or UCI Adult files in 2_numpy_pandas/datasets/.')
df.head()

FileNotFoundError: Dataset not found. Place NYC Taxi or UCI Adult files in 2_numpy_pandas/datasets/.

## 2. Базовый EDA: info, describe, пропуски

In [None]:
# Structural overview: column dtypes, non-null counts and memory usage.
df.info()
# Summary statistics for every column (numeric and non-numeric alike),
# transposed so that each row describes one column; show the first 20.
df.describe(include='all').transpose().head(20)

## 3. Очистка: дубликаты, типы, пропуски

In [None]:
# Example cleaning pipeline: drop duplicate rows, parse date/time columns,
# then fill missing values (median for numeric, 'Unknown' for text/categorical).
before = len(df)
df = df.drop_duplicates()

# Type fixes: parse text columns whose name suggests a date/time
# (e.g. the pickup/dropoff timestamps of NYC Taxi).
# - Only object/string columns are parsed: numeric columns would otherwise be
#   reinterpreted as epoch offsets, and datetime columns need no conversion.
# - errors='ignore' is deprecated in pandas, so a column that cannot be parsed
#   is kept unchanged by catching the parsing error explicitly instead.
for col in df.columns:
    col_name = str(col).lower()
    if 'date' not in col_name and 'time' not in col_name:
        continue
    col_dtype = df[col].dtype
    if not (pd.api.types.is_object_dtype(col_dtype) or pd.api.types.is_string_dtype(col_dtype)):
        continue
    try:
        df[col] = pd.to_datetime(df[col])
    except (ValueError, TypeError):
        # Not a parseable date/time column after all; leave it as it is.
        pass

# Missing values handling
num_cols = df.select_dtypes(include=[np.number]).columns
# Text-like columns are detected from the dtype itself so that object columns,
# dedicated string-dtype columns and categoricals are all covered.
text_like = [
    pd.api.types.is_object_dtype(dt)
    or pd.api.types.is_string_dtype(dt)
    or isinstance(dt, pd.CategoricalDtype)
    for dt in df.dtypes
]
cat_cols = df.columns[text_like]

if len(num_cols) > 0:
    # Column-wise median; a column that is entirely NaN stays NaN.
    df[num_cols] = df[num_cols].fillna(df[num_cols].median())
for c in cat_cols:
    values = df[c]
    if (
        isinstance(values.dtype, pd.CategoricalDtype)
        and values.isna().any()
        and 'Unknown' not in values.cat.categories
    ):
        # A categorical rejects fill values that are not among its categories.
        values = values.cat.add_categories('Unknown')
    df[c] = values.fillna('Unknown')

after = len(df)
before, after

## 4. GroupBy и агрегаты

In [None]:
# Grouped summary; the grouping key depends on which dataset was loaded.
# Optional aggregates are added only when their source column exists, so a
# result column never carries a statistic other than the one its name states.
# `display` is the helper provided by IPython/Jupyter.
if 'income' in df.columns and 'education' in df.columns:
    # Example for UCI Adult: row count and mean weekly hours per education level
    aggregates = {'rows': ('income', 'size')}
    if 'hours-per-week' in df.columns:
        aggregates['hours_mean'] = ('hours-per-week', 'mean')
    res = df.groupby(['education']).agg(**aggregates).sort_values('rows', ascending=False)
    display(res.head(10))
elif 'passenger_count' in df.columns:
    # Example for NYC Taxi: trip count and mean fare per passenger count
    aggregates = {'trips': ('passenger_count', 'size')}
    if 'fare_amount' in df.columns:
        aggregates['fare_mean'] = ('fare_amount', 'mean')
    res = df.groupby(['passenger_count']).agg(**aggregates).sort_values('trips', ascending=False)
    display(res.head(10))
else:
    # A bare expression inside a branch is not echoed by the notebook,
    # so the preview has to be displayed explicitly.
    display(df.head())