# TASK 3: Przygotowanie Zbiorów Treningowych

W tym notebooku:
1. Wczytamy dane GTZAN
2. Wyodrębnimy 59 cech z librosa
3. Podzielimy dane na train/val/test (60/20/20)
4. Przeanalizujemy cechy

## Setup

In [None]:
import sys
from pathlib import Path

# Make the project's src/ directory importable. The location is resolved from
# the kernel's working directory (presumably the notebooks/ folder — confirm).
SRC_DIR = str(Path.cwd().parent / 'src')
if SRC_DIR not in sys.path:  # re-running this cell must not stack duplicate entries
    sys.path.insert(0, SRC_DIR)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from feature_extractor import FeatureExtractor
from dataset_splitter import DatasetSplitter

# Plot appearance: matplotlib style sheet plus the seaborn colour palette.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

# Status message (previously printed as mis-decoded UTF-8).
print("✅ Setup ukończony!")

## KROK 1: Ekstrakcja 59 Cech

In [None]:
# Input/output locations, resolved relative to the kernel's working directory
# (the parent of the current directory is used as the project root).
DATA_DIR = Path.cwd().parent / 'data' / 'raw' / 'genres'
OUTPUT_DIR = Path.cwd().parent / 'data' / 'processed'

# Create the output directory up front: later cells write figures into it with
# plt.savefig, which fails when the target directory does not exist.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print(f"Data dir: {DATA_DIR}")
print(f"Output dir: {OUTPUT_DIR}")
print(f"\nData dir istnieje: {DATA_DIR.exists()}")

In [None]:
# Run feature extraction over the directory held in DATA_DIR.
# Banner text repaired: it was previously printed as mis-decoded UTF-8.
print("🎵 Ekstrakcja 59 cech z GTZAN...\n")

# NOTE(review): sr=22050 is presumably the audio sampling rate in Hz — confirm
# against FeatureExtractor.
extractor = FeatureExtractor(sr=22050)
features_df = extractor.extract_from_directory(str(DATA_DIR))

In [None]:
# Dimensions of the extracted feature table, then a preview of its first rows
# (the head() call is the cell's displayed output).
print(f"Shape: {features_df.shape}", "\nPierwsze 5 wierszy:", sep="\n")
features_df.head()

In [None]:
# Column inventory, followed by the per-column dtypes as the cell's displayed output.
print(f"Kolumny: {list(features_df.columns)}")
# Label corrected from the typo "DatTypy".
print("\nTypy danych:")
features_df.dtypes

In [None]:
# Summary statistics, limited to the first 10 columns of the describe() table.
# Label repaired: it was previously printed as mis-decoded UTF-8.
print("📊 Statystyka cech (pierwsze 10):")
features_df.describe().iloc[:, :10]

## KROK 2: Analiza Rozkładu Gatunków

In [None]:
# Number of samples per genre, ordered by genre label.
genre_counts = features_df['genre'].value_counts().sort_index()

# Output strings repaired: they were previously printed as mis-decoded UTF-8.
print("🎼 Rozkład gatunków:")
print(genre_counts)
print(f"\nRazem: {genre_counts.sum()} próbek")

In [None]:
# Two-panel figure of the per-genre sample counts: bar chart (left) and
# pie chart (right). Titles and labels are repaired from mis-decoded UTF-8 so
# the saved PNG carries readable Polish text.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: absolute counts.
genre_counts.plot(kind='bar', ax=axes[0], color='steelblue')
axes[0].set_title('Rozkład Próbek po Gatunkach', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Gatunek')
axes[0].set_ylabel('Liczba Próbek')
axes[0].grid(axis='y', alpha=0.3)

# Right panel: share of each genre, annotated with percentages.
genre_counts.plot(kind='pie', ax=axes[1], autopct='%1.1f%%')
axes[1].set_title('Procent Próbek po Gatunkach', fontsize=14, fontweight='bold')
axes[1].set_ylabel('')  # clear the y-axis label on the pie panel

plt.tight_layout()
plt.savefig(str(OUTPUT_DIR / 'genre_distribution.png'), dpi=150, bbox_inches='tight')
plt.show()

print("✅ Wykres zapisany!")

## KROK 3: Podział na Train/Val/Test

In [None]:
# Split the feature table into train/validation/test subsets, using 'genre'
# as the label column.
# NOTE(review): whether val_size=0.2 is a share of the whole set or of the
# non-test remainder depends on DatasetSplitter — confirm it yields 60/20/20.
split_params = {'test_size': 0.2, 'val_size': 0.2, 'random_state': 42}
splitter = DatasetSplitter(**split_params)
train_df, val_df, test_df = splitter.split(features_df, label_column='genre')

In [None]:
# Hand the three splits to the splitter for saving, targeting OUTPUT_DIR.
processed_dir = str(OUTPUT_DIR)
splitter.save_splits(train_df, val_df, test_df, output_dir=processed_dir)

## KROK 4: Analiza Wyodrębnionych Cech

In [None]:
# Keep only the feature columns: everything except the label and file-name columns,
# in their original column order.
non_feature_cols = {'genre', 'filename'}
feature_cols = [name for name in features_df.columns if name not in non_feature_cols]
print(f"Liczba cech: {len(feature_cols)}")
print(f"\nPierwsze 10 cech: {feature_cols[:10]}")

In [None]:
# Histograms of the first nine feature columns on a 3x3 grid. "Top" here means
# the first nine in column order; no ranking is computed.
# Labels and titles are repaired from mis-decoded UTF-8 so the saved PNG is readable.
fig, axes = plt.subplots(3, 3, figsize=(15, 12))
axes = axes.ravel()  # flatten the 3x3 grid so panels can be walked in order

top_features = feature_cols[:9]

for ax, feature in zip(axes, top_features):
    ax.hist(features_df[feature], bins=30, color='steelblue', alpha=0.7, edgecolor='black')
    ax.set_title(f'{feature}', fontweight='bold')
    ax.set_xlabel('Wartość')
    ax.set_ylabel('Liczba Próbek')
    ax.grid(axis='y', alpha=0.3)

# Hide panels that would stay empty when fewer than nine feature columns exist.
for ax in axes[len(top_features):]:
    ax.set_visible(False)

plt.suptitle('Rozkład Top 9 Cech', fontsize=16, fontweight='bold', y=1.00)
plt.tight_layout()
plt.savefig(str(OUTPUT_DIR / 'feature_distributions.png'), dpi=150, bbox_inches='tight')
plt.show()

print("✅ Wykresy zapisane!")

In [None]:
# Correlation heatmap over the first 20 feature columns (column order, not a ranking).
fig, ax = plt.subplots(figsize=(12, 10))

top_20_features = feature_cols[:20]
corr_matrix = features_df[top_20_features].corr()

# Diverging colormap centred on zero; cell annotations are switched off.
sns.heatmap(
    corr_matrix,
    cmap='coolwarm',
    center=0,
    annot=False,
    ax=ax,
    square=True,
    linewidths=0.5,
    cbar_kws={'label': 'Korelacja'},
)
ax.set_title('Macierz Korelacji Top 20 Cech', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig(str(OUTPUT_DIR / 'feature_correlation.png'), dpi=150, bbox_inches='tight')
plt.show()

# Status message repaired: it was previously printed as mis-decoded UTF-8.
print("✅ Heatmapa zapisana!")

## KROK 5: Podsumowanie

In [None]:
# Closing summary: feature/sample counts, split sizes, genres, and the next step.
# All strings are repaired from mis-decoded UTF-8, and "Otworz" gets its diacritic.
separator = "=" * 70

print("\n" + separator)
print("✅ TASK 3 UKOŃCZONY")
print(separator)

print("\n📊 STATYSTYKA:")
print("\n  Cechy:")
print(f"    • Liczba cech: {len(feature_cols)}")
print(f"    • Próbki całość: {len(features_df)}")

# One entry per split: heading, data frame, and the CSV name reported for it.
# NOTE(review): percentages and file names are hard-coded here, not read back
# from the splitter — confirm they match what DatasetSplitter produces.
split_summary = [
    ("Zbiór treningowy (60%)", train_df, "train_features.csv"),
    ("Zbiór walidacyjny (20%)", val_df, "val_features.csv"),
    ("Zbiór testowy (20%)", test_df, "test_features.csv"),
]
for heading, split_df, csv_name in split_summary:
    print(f"\n  {heading}:")
    print(f"    • Próbki: {len(split_df)}")
    print(f"    • Plik: data/processed/{csv_name}")

print(f"\n  Gatunki: {features_df['genre'].nunique()}")
print(f"    • {', '.join(sorted(features_df['genre'].unique()))}")

print("\n💡 Następny krok:")
print("   TASK 4: Trening kNN classifier")
print("   Otwórz: notebooks/Task4_KNNBenchmark.ipynb")
print("\n" + separator)