# Exploratory Data Analysis (EDA)

This notebook performs comprehensive exploratory data analysis on the brain activity data for music classification.

## Objectives:
- Load and inspect the dataset
- Understand data structure and quality
- Visualize feature distributions
- Analyze target variable distribution
- Identify correlations and patterns
- Detect outliers and missing values

## 1. Import Required Libraries

In [None]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Statistical analysis
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Utilities
import warnings
warnings.filterwarnings('ignore')

# Presentation defaults used by every cell below.
# The matplotlib style sheet is applied first and the seaborn palette second,
# so the palette is the last thing to touch the colour settings.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Show every column when a wide DataFrame is displayed.
pd.options.display.max_columns = None

print("Libraries imported successfully!")

## 2. Load Dataset

In [None]:
# Load the datasets
# All three raw CSV files live in the same directory, so the location is
# defined once and reused.
RAW_DATA_DIR = '../data/raw'

train_data = pd.read_csv(f'{RAW_DATA_DIR}/train_data.csv')
train_labels = pd.read_csv(f'{RAW_DATA_DIR}/train_labels.csv')
test_data = pd.read_csv(f'{RAW_DATA_DIR}/test_data.csv')

# Later cells attach the label column to the features by index. If the two
# files had different row counts that join would silently produce missing or
# misaligned targets, so fail loudly here instead.
if len(train_data) != len(train_labels):
    raise ValueError(
        f"Row count mismatch: train_data has {len(train_data)} rows but "
        f"train_labels has {len(train_labels)}"
    )

print(f"Training data shape: {train_data.shape}")
print(f"Training labels shape: {train_labels.shape}")
print(f"Test data shape: {test_data.shape}")

## 3. Basic Data Inspection

In [None]:
# Preview the top of the feature table and of the label table.
# Each entry pairs the heading to print with the frame whose head is shown.
head_previews = (
    ("Training Data - First 5 rows:", train_data),
    ("\nTraining Labels - First 5 rows:", train_labels),
)

for heading, frame in head_previews:
    print(heading)
    display(frame.head())

In [None]:
# Data info and statistics
print("Training Data Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
train_data.info()

print("\nBasic Statistics:")
display(train_data.describe())

In [None]:
# Count nulls per column in each table.
# The two feature tables are ordered with the worst columns first; the label
# table is left in its natural column order.
missing_train = train_data.isna().sum().sort_values(ascending=False)
missing_labels = train_labels.isna().sum()
missing_test = test_data.isna().sum().sort_values(ascending=False)

# Report only the columns that actually contain missing values.
null_reports = [
    ("Missing values in training data:", missing_train),
    ("\nMissing values in training labels:", missing_labels),
    ("\nMissing values in test data:", missing_test),
]

for caption, null_counts in null_reports:
    print(caption)
    print(null_counts[null_counts > 0])

## 4. Target Variable Analysis

In [None]:
# The final column of the label table is treated as the target.
target_column = train_labels.columns[-1]
target_values = train_labels[target_column]

# Absolute class frequencies and the same frequencies as percentages.
target_counts = target_values.value_counts()
target_percentages = target_values.value_counts(normalize=True) * 100

print(f"Target variable: {target_column}")

print(f"\nTarget distribution:")
print(target_counts)

print(f"\nTarget distribution (percentages):")
print(target_percentages)

In [None]:
# Class balance shown two ways side by side: absolute counts (bar chart) on
# the left, share of the total (pie chart) on the right.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
bar_ax, pie_ax = axes

# Left panel: counts per class.
target_counts.plot.bar(ax=bar_ax, color='skyblue')
bar_ax.set(
    title='Target Variable Distribution',
    xlabel='Classes',
    ylabel='Count',
)
bar_ax.tick_params(axis='x', rotation=45)

# Right panel: percentage per class.
pie_ax.pie(
    target_counts.values,
    labels=target_counts.index,
    autopct='%1.1f%%',
    startangle=90,
)
pie_ax.set_title('Target Variable Distribution (Pie Chart)')

plt.tight_layout()
plt.show()

## 5. Feature Analysis

In [None]:
# Collect the names of all numeric columns; later cells slice this index to
# pick the features they analyse.
numeric_features = train_data.select_dtypes(include='number').columns

numeric_feature_count = len(numeric_features)
first_feature_names = list(numeric_features[:10])  # preview only the first 10
print(f"Number of numeric features: {numeric_feature_count}")
print(f"Feature names: {first_feature_names}...")

# How many columns there are of each dtype.
dtype_counts = train_data.dtypes.value_counts()
print("\nData types:")
print(dtype_counts)

In [None]:
# Feature distribution analysis
# Plot at most the first 16 numeric features on a fixed 4x4 grid.
features_to_plot = numeric_features[:16]

fig, axes = plt.subplots(4, 4, figsize=(20, 16))
axes = axes.ravel()

for ax, feature in zip(axes, features_to_plot):
    # Missing values are dropped explicitly so the histogram is built from
    # observed values only, independent of how the plotting call treats NaN.
    ax.hist(train_data[feature].dropna(), bins=30, alpha=0.7,
            color='lightblue', edgecolor='black')
    ax.set_title(f'Distribution of {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')

# With fewer than 16 numeric features the remaining panels would be drawn as
# empty frames; hide them.
for ax in axes[len(features_to_plot):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Box plots to identify outliers
# Uses the same feature subset as the histogram grid (features_to_plot).
fig, axes = plt.subplots(4, 4, figsize=(20, 16))
axes = axes.ravel()

for ax, feature in zip(axes, features_to_plot):
    # Drop missing values first: the box statistics are computed from the raw
    # array, and NaN entries would make the quartiles undefined.
    ax.boxplot(train_data[feature].dropna())
    ax.set_title(f'Box Plot of {feature}')
    ax.set_ylabel(feature)

# Hide panels that have no feature assigned (fewer than 16 numeric features).
for ax in axes[len(features_to_plot):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

## 6. Correlation Analysis

In [None]:
# Pairwise correlations are computed on the first 20 numeric features only.
correlation_features = numeric_features[:20]
correlation_matrix = train_data[correlation_features].corr()

# Diverging colour map centred on zero, without per-cell annotations.
heatmap_options = dict(
    annot=False,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
)

plt.figure(figsize=(15, 12))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

In [None]:
# Find highly correlated features
def find_highly_correlated_features(df, threshold=0.8):
    """Return feature pairs whose absolute correlation exceeds ``threshold``.

    Args:
        df: DataFrame of candidate features. Only numeric and boolean columns
            are correlated; any other columns are ignored.
        threshold: Strict lower bound on the absolute correlation coefficient.

    Returns:
        A list of ``(column, row, correlation)`` tuples. Each unordered pair
        appears once and self-correlations are excluded. ``column`` is the
        feature that comes later in the column order, ``row`` the earlier one,
        and ``correlation`` is the absolute coefficient. Tuples are ordered by
        ``column`` and then by ``row``. Pairs with an undefined (NaN)
        correlation are omitted.
    """
    # Restrict to columns that can be correlated so a stray text column does
    # not make corr() fail.
    numeric = df.select_dtypes(include=['number', 'bool'])
    corr_matrix = numeric.corr().abs()
    labels = corr_matrix.columns

    # Work positionally on the raw array instead of doing one label-based
    # lookup per cell. The matrix is transposed so that it is indexed as
    # strength[column_position, row_position]; scanning it row-major then
    # yields pairs ordered by column first, row second.
    strength = corr_matrix.to_numpy().T

    # Strict lower triangle of the transposed matrix == pairs where the row
    # feature precedes the column feature (each pair once, no diagonal).
    row_precedes_column = np.tril(np.ones(strength.shape, dtype=bool), k=-1)

    # NaN compares False against the threshold, which drops undefined pairs.
    with np.errstate(invalid='ignore'):
        exceeds = row_precedes_column & (strength > threshold)

    col_positions, row_positions = np.nonzero(exceeds)
    return [
        (labels[c], labels[r], strength[c, r])
        for c, r in zip(col_positions, row_positions)
    ]

# Scan the same feature subset that was used for the heatmap.
correlation_subset = train_data[correlation_features]
high_corr_features = find_highly_correlated_features(correlation_subset)

high_corr_count = len(high_corr_features)
print(f"Highly correlated feature pairs (correlation > 0.8): {high_corr_count}")

# List at most the first 10 pairs together with their correlation strength.
for first_feature, second_feature, strength in high_corr_features[:10]:
    print(f"{first_feature} - {second_feature}: {strength:.3f}")

## 7. Feature vs Target Analysis

In [None]:
# Merge training data with labels for analysis
# (the label column is attached by index alignment with train_data).
train_combined = train_data.copy()
train_combined['target'] = train_labels[target_column]

# Analyze feature distributions by target class (first 8 numeric features).
features_for_analysis = numeric_features[:8]

# The class list does not depend on the feature, so it is computed once.
# Missing labels are excluded: NaN never compares equal to anything, so a NaN
# "class" could only contribute an empty histogram and a bogus legend entry.
target_classes = train_combined['target'].dropna().unique()

fig, axes = plt.subplots(2, 4, figsize=(20, 10))
axes = axes.ravel()

for ax, feature in zip(axes, features_for_analysis):
    for target_class in target_classes:
        class_rows = train_combined['target'] == target_class
        class_values = train_combined.loc[class_rows, feature].dropna()
        ax.hist(class_values, alpha=0.6, label=f'Class {target_class}', bins=20)

    ax.set_title(f'{feature} Distribution by Target')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    ax.legend()

# Hide panels that have no feature assigned (fewer than 8 numeric features).
for ax in axes[len(features_for_analysis):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

## 8. Dimensionality Reduction Visualization

In [None]:
# PCA for visualization
# Seed for the decomposition. scikit-learn documents that PCA uses
# random_state when a randomized or ARPACK solver is selected, so fixing it
# keeps the projection reproducible across runs.
RANDOM_STATE = 42

# Standardise first so every feature contributes on the same scale.
# NOTE(review): this step presumably needs NaN-free input - confirm that
# missing values (checked in section 3) are handled before running it.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(train_data[numeric_features])

# Apply PCA (2 components, for a 2-D scatter plot)
pca = PCA(n_components=2, random_state=RANDOM_STATE)
pca_features = pca.fit_transform(scaled_features)

# Create PCA DataFrame; the label column is attached by index alignment.
pca_df = pd.DataFrame(data=pca_features, columns=['PC1', 'PC2'])
pca_df['target'] = train_labels[target_column]

# Plot PCA, one scatter series per class so each gets its own legend entry.
plt.figure(figsize=(10, 8))
for target_class in pca_df['target'].dropna().unique():
    subset = pca_df[pca_df['target'] == target_class]
    plt.scatter(subset['PC1'], subset['PC2'], alpha=0.6, label=f'Class {target_class}')

plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%} variance)')
plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%} variance)')
plt.title('PCA Visualization of Brain Activity Data')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

print(f"Total variance explained by first 2 components: {sum(pca.explained_variance_ratio_):.2%}")

## 9. Statistical Tests

In [None]:
# Perform statistical tests for feature importance
from scipy.stats import f_oneway

# One-way ANOVA per feature, with the target classes as groups.
# Note: these are simply the FIRST 20 numeric features, not a pre-ranked
# "top 20"; the ranking is produced by the sort on p-value below.
top_features = numeric_features[:20]

# The class list is the same for every feature, so build it once. Rows with a
# missing label are excluded (they cannot be matched to any class).
target_classes = train_combined['target'].dropna().unique()

anova_results = []
for feature in top_features:
    # Drop missing feature values inside each group so the test is computed
    # on observed data only.
    groups = [
        train_combined.loc[train_combined['target'] == target_class, feature].dropna()
        for target_class in target_classes
    ]

    f_stat, p_value = f_oneway(*groups)
    anova_results.append({
        'feature': feature,
        'f_statistic': f_stat,
        'p_value': p_value
    })

# Create DataFrame and sort by p-value (most significant first). The columns
# are given explicitly so the sort also works when no feature was tested.
anova_df = pd.DataFrame(anova_results, columns=['feature', 'f_statistic', 'p_value'])
anova_df = anova_df.sort_values('p_value')

print("ANOVA Test Results (Top 10 most significant features):")
display(anova_df.head(10))

## 10. Data Quality Assessment

In [None]:
# Variance below which a feature is reported as "low variance".
LOW_VARIANCE_THRESHOLD = 0.01

numeric_frame = train_data[numeric_features]

# Check for constant features
# nunique() ignores NaN, so a column holding a single distinct value counts 1
# and an entirely empty column counts 0. Both carry no information; using
# "<= 1" also flags the all-NaN case, which would otherwise escape both this
# check and the variance check (its variance is NaN).
distinct_counts = numeric_frame.nunique()
constant_features = distinct_counts[distinct_counts <= 1].index.tolist()

print(f"Constant features: {len(constant_features)}")
if constant_features:
    print(constant_features[:10])  # Show first 10

# Check for near-zero variance features (computed for all columns at once).
feature_variances = numeric_frame.var()
low_variance_features = feature_variances[
    feature_variances < LOW_VARIANCE_THRESHOLD
].index.tolist()

print(f"\nLow variance features (var < {LOW_VARIANCE_THRESHOLD}): {len(low_variance_features)}")
if low_variance_features:
    print(low_variance_features[:10])  # Show first 10

## 11. Summary and Key Findings

In [None]:
# Summary statistics
# Class frequencies are computed once and reused for every target-related line.
class_counts = train_labels[target_column].value_counts()

# Heuristic: classes count as balanced when the spread of the class sizes is
# below 10% of the mean class size.
is_balanced = class_counts.std() < class_counts.mean() * 0.1

print("=== EXPLORATORY DATA ANALYSIS SUMMARY ===")
print(f"\nDataset Shape:")
print(f"  Training data: {train_data.shape}")
print(f"  Training labels: {train_labels.shape}")
print(f"  Test data: {test_data.shape}")

print(f"\nData Quality:")
print(f"  Missing values in training data: {train_data.isnull().sum().sum()}")
print(f"  Missing values in labels: {train_labels.isnull().sum().sum()}")
print(f"  Constant features: {len(constant_features)}")
print(f"  Low variance features: {len(low_variance_features)}")

print(f"\nTarget Variable:")
print(f"  Number of classes: {train_labels[target_column].nunique()}")
# to_dict() yields plain Python ints; dict(series) would keep NumPy scalars,
# which newer NumPy versions print as np.int64(...).
print(f"  Class distribution: {class_counts.to_dict()}")
print(f"  Is balanced: {'Yes' if is_balanced else 'No'}")

print(f"\nFeature Characteristics:")
print(f"  Number of numeric features: {len(numeric_features)}")
print(f"  Highly correlated pairs (>0.8): {len(high_corr_features)}")
print(f"  PCA variance explained (2 components): {sum(pca.explained_variance_ratio_):.2%}")

print("\n=== RECOMMENDATIONS FOR NEXT STEPS ===")
print("1. Consider removing constant and low variance features")
print("2. Handle highly correlated features (feature selection or PCA)")
print("3. Consider feature scaling for distance-based algorithms")
print("4. Investigate class imbalance if present")
print("5. Consider outlier detection and treatment")
print("6. Feature engineering based on domain knowledge")

## 12. Save Analysis Results

In [None]:
# Save analysis results for next notebooks
import json
from pathlib import Path

# Both output files go to the same directory. Create it if needed: neither
# to_csv() nor open(..., 'w') creates missing parent directories.
PROCESSED_DIR = Path('../data/processed')
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# Save feature importance ranking (ANOVA results, sorted by p-value)
anova_df.to_csv(PROCESSED_DIR / 'feature_importance_anova.csv', index=False)

# Save list of problematic features. Only the two feature names of each
# correlated pair are stored; the correlation value itself is dropped.
problematic_features = {
    'constant_features': constant_features,
    'low_variance_features': low_variance_features,
    'highly_correlated_features': [list(pair[:2]) for pair in high_corr_features]
}

with open(PROCESSED_DIR / 'problematic_features.json', 'w') as f:
    json.dump(problematic_features, f, indent=2)

print("Analysis results saved to data/processed/ directory")