# 02 - Statistical Analysis (WineQT)

Testes de normalidade, análise de outliers e variância, e importância inicial de features.



In [3]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from pathlib import Path
from huggingface_hub import hf_hub_download

# Carrega variáveis do .env
from dotenv import load_dotenv
load_dotenv(override=True)

def get_env_strip_quotes(key: str) -> str:
    val = os.getenv(key, '')
    return val.strip('\'"')

HF_TOKEN = get_env_strip_quotes('HF_TOKEN')
HF_PROCESSED_REPO = os.getenv('HF_PROCESSED_REPO', 'henriquebap/wine-ml-processed')

# Carrega df_capped do HF Hub (fallback local)
try:
    df_path = hf_hub_download(
        repo_id=HF_PROCESSED_REPO,
        filename='processed/full.csv',
        repo_type='dataset',
        token=HF_TOKEN if HF_TOKEN else None
    )
    df = pd.read_csv(df_path)
    print('✅ Dados carregados do HF Hub:', HF_PROCESSED_REPO)
except Exception as e:
    print('⚠️ Fallback para dados locais:', e)
    proc = Path.cwd().parent / 'data' / 'processed' / 'df_capped.csv'
    if proc.exists():
        df = pd.read_csv(proc)
    else:
        from src.data_ingestion import load_wine_dataframe
        HF_REPO = os.getenv("HF_DATASET_REPO", "henriquebap/wine-ml-dataset")
        FILENAME = os.getenv("HF_DATASET_FILENAME", "WineQT.csv")
        df = load_wine_dataframe(repo_id=HF_REPO, filename=FILENAME)

# target
if 'quality_class' not in df.columns and 'quality' in df.columns:
    df['quality_class'] = df['quality'].apply(lambda x: 'Baixa (3-4)' if x <= 4 else 'Média (5-6)' if x <= 6 else 'Alta (7-8)')
order = ['Baixa (3-4)', 'Média (5-6)', 'Alta (7-8)']
ordinal_map = {'Baixa (3-4)': 0, 'Média (5-6)': 1, 'Alta (7-8)': 2}
y_ord = df['quality_class'].map(ordinal_map)

# selected features (fallback para todas as numéricas)
selected_features = [c for c in df.select_dtypes(include=[np.number]).columns if c != 'quality']

print('Usando', len(selected_features), 'features.')
df[selected_features + ['quality_class']].head()


full.csv: 0.00B [00:00, ?B/s]

✅ Dados carregados do HF Hub: henriquebap/wine-ml-processed
Usando 11 features.


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality_class
0,2.128232,0.7,0.0,1.064711,0.07325,2.484907,3.555348,0.9978,3.51,0.444686,9.4,Média (5-6)
1,2.174752,0.88,0.0,1.280934,0.09349,3.258097,4.219508,0.9968,3.2,0.518794,9.8,Média (5-6)
2,2.174752,0.76,0.04,1.193922,0.088011,2.772589,4.007333,0.997,3.26,0.500775,9.8,Média (5-6)
3,2.501436,0.28,0.56,1.064711,0.072321,2.890372,4.110874,0.998,3.16,0.457425,9.8,Média (5-6)
4,2.128232,0.66,0.0,1.029619,0.072321,2.639057,3.713572,0.9978,3.51,0.444686,9.4,Média (5-6)


In [None]:
# Kruskal-Wallis por classe (não-paramétrico)
res = []
for col in selected_features:
    groups = [df.loc[df['quality_class']==lv, col].dropna() for lv in order]
    if min(map(len, groups)) < 2:
        continue
    H, p = stats.kruskal(*groups)
    n = sum(len(g) for g in groups); k = len(groups)
    e2 = (H - k + 1) / (n - k) if (n - k) > 0 else np.nan
    res.append((col, H, p, e2))
kruskal_rank = pd.DataFrame(res, columns=['feature','H','p_value','epsilon_sq']).sort_values('p_value')
display(kruskal_rank.head(12))


Unnamed: 0,feature,H,p_value,epsilon_sq
10,alcohol,152.205262,8.892978e-34,0.147985
1,volatile acidity,128.372431,1.3313150000000002e-28,0.124505
9,sulphates,96.437685,1.145046e-21,0.093042
2,citric acid,64.770323,8.615947e-15,0.061843
4,chlorides,39.246959,3.003525e-09,0.036697
7,density,30.249029,2.70089e-07,0.027832
6,total sulfur dioxide,29.219345,4.519598e-07,0.026817
0,fixed acidity,14.181182,0.0008329048,0.012001
8,pH,13.052292,0.00146464,0.010889
5,free sulfur dioxide,8.631994,0.01335323,0.006534


- p baixo e epsilon² maior indicam maior separação entre classes.

In [None]:
# Spearman vs alvo ordinal (monotonicidade)
spear = df[selected_features].apply(lambda s: s.corr(y_ord, method='spearman')).sort_values(ascending=False)
display(spear.to_frame('spearman'))


Unnamed: 0,spearman
alcohol,0.356021
sulphates,0.303313
citric acid,0.251804
fixed acidity,0.116804
residual sugar,0.045988
free sulfur dioxide,-0.045
pH,-0.10275
total sulfur dioxide,-0.109447
density,-0.152001
chlorides,-0.163428


- maior |spearman| sugere relação monotônica mais forte.


In [7]:
# Export e conclusões rápidas
from pathlib import Path
out_dir = Path('reports/stats'); out_dir.mkdir(parents=True, exist_ok=True)
kruskal_rank.to_csv(out_dir/'kruskal_rank.csv', index=False)
spear.to_frame('spearman').to_csv(out_dir/'spearman_selected.csv')

top_feats = list(kruskal_rank.head(8)['feature'])
print('Top (Kruskal) features:', top_feats)



Top (Kruskal) features: ['alcohol', 'volatile acidity', 'sulphates', 'citric acid', 'chlorides', 'density', 'total sulfur dioxide', 'fixed acidity']
