# 02 - Statistical Analysis (WineQT)

Testes de normalidade, análise de outliers e variância, e importância inicial de features.



In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

from src.data_ingestion import load_wine_dataframe

# Dataset coordinates (presumably a Hugging Face dataset repo -- confirm against
# the ingestion helper). Each value can be overridden via its environment variable.
HF_REPO = os.environ.get("HF_DATASET_REPO", "henriquebap/wine-ml-dataset")
FILENAME = os.environ.get("HF_DATASET_FILENAME", "WineQT.csv")

# Load the data through the project's ingestion helper and preview the first rows.
df = load_wine_dataframe(repo_id=HF_REPO, filename=FILENAME)
df.head()


In [None]:
# Normality tests: Shapiro-Wilk per numeric feature, run on a reproducible
# random sample capped at 500 observations.
# NOTE(review): if the loader keeps an identifier column (e.g. an 'Id' field),
# it is tested like any other feature -- confirm whether it should be excluded.
normality_results = {}
# Restrict to numeric columns (the same selection the outlier analysis uses) so
# a text/categorical column cannot crash the test; 'quality' is left out.
numeric_features = [
    c for c in df.select_dtypes(include=[np.number]).columns if c != 'quality'
]
for col in numeric_features:
    series = df[col].dropna()
    if len(series) < 3:
        # scipy's Shapiro-Wilk requires at least three observations; keep the
        # feature visible in the table instead of raising.
        normality_results[col] = {'stat': float('nan'), 'pvalue': float('nan')}
        continue
    sample = series.sample(min(500, len(series)), random_state=42)
    stat, p = stats.shapiro(sample)
    normality_results[col] = {'stat': float(stat), 'pvalue': float(p)}

# One row per feature, smallest p-value first. The explicit index keeps the
# 'stat'/'pvalue' columns even when no feature was tested.
pd.DataFrame(normality_results, index=['stat', 'pvalue']).T.sort_values('pvalue')


In [None]:
# One-way ANOVA: compare each numeric feature's mean across the discrete
# 'quality' groups.
anova_results = {}
# Split once; the grouping does not depend on the feature being tested.
quality_groups = [group for _, group in df.groupby('quality')]
# Numeric columns only (same selection the outlier analysis uses), minus the
# grouping column itself.
numeric_features = [
    c for c in df.select_dtypes(include=[np.number]).columns if c != 'quality'
]
for col in numeric_features:
    # Drop missing values per group (as the normality tests do); otherwise a
    # single NaN turns the F statistic and p-value into NaN.
    groups = [group[col].dropna().values for group in quality_groups]
    stat, p = stats.f_oneway(*groups)
    anova_results[col] = {'F': float(stat), 'pvalue': float(p)}

# One row per feature, smallest p-value first. The explicit index keeps the
# 'F'/'pvalue' columns even when no feature was tested.
pd.DataFrame(anova_results, index=['F', 'pvalue']).T.sort_values('pvalue')


In [None]:
# Outlier analysis using z-scores: count, per numeric column, the values lying
# more than 3 standard deviations from that column's mean.
numeric_df = df.select_dtypes(include=[np.number])  # select once, reuse below
# nan_policy='omit' computes mean/std from the non-missing values. With the
# default ('propagate') a single NaN makes the whole column's z-scores NaN and
# the column would silently report zero outliers.
z = np.abs(
    stats.zscore(
        numeric_df.to_numpy(dtype=float, na_value=np.nan),
        nan_policy='omit',
    )
)
# Missing entries stay NaN in z and compare False, so they are never counted.
outlier_counts = (z > 3).sum(axis=0)
pd.Series(outlier_counts, index=numeric_df.columns).sort_values(ascending=False)
