# Initial Exploratory Data Analysis

**Competition**: [Competition Name]  
**Author**: [Your Name]  
**Date**: [Date]  
**Objective**: Initial exploration and understanding of the competition dataset

## Table of Contents
1. [Data Loading and Basic Info](#1-data-loading-and-basic-info)
2. [Target Variable Analysis](#2-target-variable-analysis)
3. [Feature Overview](#3-feature-overview)
4. [Missing Values Analysis](#4-missing-values-analysis)
5. [Correlation Analysis](#5-correlation-analysis)
6. [Distribution Analysis](#6-distribution-analysis)
7. [Initial Insights](#7-initial-insights)

In [None]:
# Standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Configuration: raise pandas' display limits so wide competition tables are
# shown in full instead of being truncated.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

# The 'seaborn-v0_8' style sheet only exists in matplotlib >= 3.6; older
# releases ship it as 'seaborn'. Use whichever this installation provides and
# fall back to matplotlib's default style instead of raising on an unknown name.
plot_style = next(
    (name for name in ('seaborn-v0_8', 'seaborn') if name in plt.style.available),
    'default',
)
plt.style.use(plot_style)
sns.set_palette('husl')

# Suppress warnings
# NOTE: this silences every warning category, including deprecation notices
# from pandas/NumPy; comment it out when debugging unexpected results.
import warnings
warnings.filterwarnings('ignore')

# Random seed for reproducibility (seeds NumPy's global generator only)
np.random.seed(42)

## 1. Data Loading and Basic Info

In [None]:
# Read the three competition CSVs (train, test, sample submission) from the
# raw-data folder, then report each frame's (rows, columns) shape.
train_df, test_df, sample_submission = map(
    pd.read_csv,
    (
        '../data/raw/train.csv',
        '../data/raw/test.csv',
        '../data/raw/sample_submission.csv',
    ),
)

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print(f"Sample submission shape: {sample_submission.shape}")

In [None]:
# Structural overview of the training frame, in three parts:
# info() output, describe() statistics, and a head() preview.
print("=== TRAIN DATA INFO ===")
train_df.info()

print("\n=== TRAIN DATA DESCRIPTION ===")
train_description = train_df.describe()
display(train_description)

print("\n=== FIRST FEW ROWS ===")
train_preview = train_df.head()
display(train_preview)

In [None]:
# Identify target column (update based on competition)
target_col = 'target'  # Update this

# Fail fast with an actionable message: if the placeholder name above was not
# updated, every later cell would otherwise die with a bare KeyError.
if target_col not in train_df.columns:
    raise KeyError(
        f"Target column {target_col!r} not found in train_df; update target_col. "
        f"First columns available: {list(train_df.columns)[:10]}"
    )

# Every column other than the target is treated as a feature. Nothing else is
# filtered here, so identifier-like columns (if any) are included.
feature_cols = [col for col in train_df.columns if col != target_col]

print(f"Target column: {target_col}")
print(f"Number of features: {len(feature_cols)}")
print(f"Features: {feature_cols[:10]}...")  # Show first 10 features

## 2. Target Variable Analysis

In [None]:
# Summarise the target column: descriptive statistics, number of missing
# entries and number of distinct values.
print("=== TARGET VARIABLE STATISTICS ===")
target_values = train_df[target_col]
n_unique_targets = target_values.nunique()

print(target_values.describe())
print(f"\nMissing values: {target_values.isnull().sum()}")
print(f"Unique values: {n_unique_targets}")

# Heuristic used throughout the notebook: at most 20 distinct values means the
# target is treated as categorical, so the per-class counts are worth listing.
if not n_unique_targets > 20:
    print(f"\nValue counts:\n{target_values.value_counts()}")

In [None]:
# Two-panel view of the target. Which pair of plots is drawn depends on the
# same "more than 20 distinct values => continuous" heuristic used elsewhere.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
left_ax, right_ax = axes
target_values = train_df[target_col]

if target_values.nunique() <= 20:
    # Categorical target: class counts as bars plus class shares as a pie.
    value_counts = target_values.value_counts()
    left_ax.bar(value_counts.index.astype(str), value_counts.values)
    left_ax.set(title='Target Distribution', xlabel=target_col, ylabel='Count')

    right_ax.pie(value_counts.values, labels=value_counts.index, autopct='%1.1f%%')
    right_ax.set(title='Target Proportion')
else:
    # Continuous target: histogram for the overall distribution, box plot for spread.
    left_ax.hist(target_values, bins=50, alpha=0.7, edgecolor='black')
    left_ax.set(title='Target Distribution', xlabel=target_col, ylabel='Frequency')

    right_ax.boxplot(target_values)
    right_ax.set(title='Target Box Plot', ylabel=target_col)

plt.tight_layout()
plt.show()

## 3. Feature Overview

In [None]:
# Categorize features by data type: numeric dtypes versus object (string-like)
# columns. Columns of any other dtype end up in neither list.
numerical_features = train_df.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = train_df.select_dtypes(include=['object']).columns.tolist()

# The target is not a feature, whichever dtype group it landed in. A target
# with string labels is an object column, so it has to be removed from the
# categorical list as well as from the numerical one.
numerical_features = [col for col in numerical_features if col != target_col]
categorical_features = [col for col in categorical_features if col != target_col]

print(f"Numerical features ({len(numerical_features)}): {numerical_features[:10]}...")
print(f"Categorical features ({len(categorical_features)}): {categorical_features[:10]}...")

In [None]:
# Per-column summary table: dtype, missing-value count/percentage and
# distinct-value count/percentage, ordered by share of missing values.
n_rows = len(train_df)
missing_counts = train_df.isnull().sum()
unique_counts = train_df.nunique()

feature_summary = pd.DataFrame({
    'Feature': train_df.columns,
    'Type': train_df.dtypes,
    'Missing_Count': missing_counts,
    'Missing_Percent': (missing_counts / n_rows) * 100,
    'Unique_Values': unique_counts,
    'Unique_Percent': (unique_counts / n_rows) * 100
}).sort_values('Missing_Percent', ascending=False)

# Only the 20 columns with the highest missing share are displayed.
display(feature_summary.head(20))

## 4. Missing Values Analysis

In [None]:
# Missing values: bar chart of missing counts per feature, followed by a
# row-level pattern heatmap when few enough features are affected.
missing_data = (
    train_df.isnull().sum()
    .sort_values(ascending=False)
    .loc[lambda counts: counts > 0]
)

if missing_data.empty:
    print("No missing values found in the training data!")
else:
    plt.figure(figsize=(12, 8))
    sns.barplot(x=missing_data.values, y=missing_data.index)
    plt.title('Missing Values by Feature')
    plt.xlabel('Number of Missing Values')
    plt.tight_layout()
    plt.show()

    # Missing values pattern: one heatmap column per affected feature, one
    # heatmap row per training row. Skipped when more than 20 features are
    # affected, to keep the figure readable.
    if len(missing_data) <= 20:
        plt.figure(figsize=(12, 6))
        null_mask = train_df[missing_data.index].isnull()
        sns.heatmap(null_mask, cbar=True, yticklabels=False, cmap='viridis')
        plt.title('Missing Values Pattern')
        plt.tight_layout()
        plt.show()

## 5. Correlation Analysis

In [None]:
# Correlation with target
if len(numerical_features) > 0:
    # corrwith() correlates each numerical feature with the target directly.
    # Building the full feature-by-feature matrix with .corr() only to read one
    # column scales quadratically with the number of features.
    correlations = (
        train_df[numerical_features]
        .corrwith(train_df[target_col])
        .drop(target_col, errors='ignore')  # Remove self-correlation if present
        .sort_values(ascending=False)
    )

    # Constant or all-missing columns have an undefined (NaN) correlation.
    # sort_values() places NaN last, so leaving them in would make them show up
    # as the "bottom" correlations below.
    undefined_features = correlations[correlations.isna()].index.tolist()
    if undefined_features:
        print(f"Skipping {len(undefined_features)} feature(s) with undefined correlation "
              f"(constant or all-missing): {undefined_features[:10]}...")
        correlations = correlations.dropna()

    if correlations.empty:
        print("No numerical feature has a defined correlation with the target.")
    else:
        # Figure height grows with the number of bars so labels stay legible.
        plt.figure(figsize=(10, max(6, len(correlations) * 0.3)))
        sns.barplot(x=correlations.values, y=correlations.index)
        plt.title('Feature Correlation with Target')
        plt.xlabel('Correlation Coefficient')
        plt.tight_layout()
        plt.show()

        print("Top 10 correlations with target:")
        print(correlations.head(10))
        print("\nBottom 10 correlations with target:")
        print(correlations.tail(10))

In [None]:
# Pairwise correlation heatmap restricted to the 15 features whose correlation
# with the target is largest in absolute value, plus the target itself.
if numerical_features:
    ranked_by_strength = correlations.abs().sort_values(ascending=False)
    top_features = ranked_by_strength.head(15).index.tolist()

    plt.figure(figsize=(12, 10))
    correlation_matrix = train_df[top_features + [target_col]].corr()
    heatmap_options = dict(
        annot=True,
        cmap='coolwarm',
        center=0,  # centre the diverging colormap on zero correlation
        square=True,
        fmt='.2f',
        cbar_kws={'shrink': 0.8},
    )
    sns.heatmap(correlation_matrix, **heatmap_options)
    plt.title('Correlation Heatmap - Top Features')
    plt.tight_layout()
    plt.show()

## 6. Distribution Analysis

In [None]:
# Numerical features distribution: histograms of the first (at most 12)
# numerical features, laid out on a grid three plots wide.
if len(numerical_features) > 0:
    n_features_to_plot = min(12, len(numerical_features))
    features_to_plot = numerical_features[:n_features_to_plot]

    n_cols = 3
    n_rows = (n_features_to_plot + n_cols - 1) // n_cols  # ceiling division

    # squeeze=False makes subplots() return a 2-D Axes array for every grid
    # shape, so ravel() always yields a flat sequence of individual Axes.
    # Wrapping the single-row result in a list (as before) produced a list
    # holding one ndarray, which broke plotting for three or fewer features.
    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols,
                             figsize=(15, 5 * n_rows), squeeze=False)
    axes = axes.ravel()

    for ax, feature in zip(axes, features_to_plot):
        train_df[feature].hist(bins=30, alpha=0.7, ax=ax, edgecolor='black')
        ax.set_title(f'{feature} Distribution')
        ax.set_xlabel(feature)
        ax.set_ylabel('Frequency')

    # Hide empty subplots left over in the last grid row
    for ax in axes[n_features_to_plot:]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
# Categorical features analysis: for each of the first five categorical
# features, plot category frequencies and (when meaningful) the mean target
# per category.
if len(categorical_features) > 0:
    # A per-category mean only exists for a numeric target; with string class
    # labels groupby(...).mean() raises, so the right-hand panel is skipped.
    target_is_numeric = pd.api.types.is_numeric_dtype(train_df[target_col])
    if not target_is_numeric:
        print(f"Target '{target_col}' is not numeric; skipping average-target-by-category plots.")

    for feature in categorical_features[:5]:  # Show first 5 categorical features
        plt.figure(figsize=(12, 4))

        value_counts = train_df[feature].value_counts().head(20)  # Top 20 categories

        plt.subplot(1, 2, 1)
        value_counts.plot(kind='bar')
        plt.title(f'{feature} - Value Counts')
        plt.xlabel(feature)
        plt.ylabel('Count')
        plt.xticks(rotation=45)

        # Target by category (if not too many categories)
        if target_is_numeric and train_df[feature].nunique() <= 20:
            plt.subplot(1, 2, 2)
            target_by_cat = train_df.groupby(feature)[target_col].mean().sort_values()
            target_by_cat.plot(kind='bar')
            plt.title(f'Average {target_col} by {feature}')
            plt.xlabel(feature)
            plt.ylabel(f'Average {target_col}')
            plt.xticks(rotation=45)

        plt.tight_layout()
        plt.show()

## 7. Initial Insights

In [None]:
# Summary of key insights: prints a textual recap of the dataset size, target
# type, data quality and target correlations gathered in the cells above.
print("=== INITIAL EDA INSIGHTS ===")
print("\n1. Dataset Overview:")
print(f"   - Training samples: {len(train_df):,}")
print(f"   - Test samples: {len(test_df):,}")
print(f"   - Total features: {len(feature_cols)}")
print(f"   - Numerical features: {len(numerical_features)}")
print(f"   - Categorical features: {len(categorical_features)}")

print("\n2. Target Variable:")
# Same heuristic as the plotting cells: more than 20 distinct values is
# treated as a continuous target.
if train_df[target_col].nunique() > 20:
    print("   - Type: Continuous (Regression problem)")
    print(f"   - Range: {train_df[target_col].min():.3f} to {train_df[target_col].max():.3f}")
    print(f"   - Mean: {train_df[target_col].mean():.3f}")
    print(f"   - Std: {train_df[target_col].std():.3f}")
else:
    print("   - Type: Categorical (Classification problem)")
    print(f"   - Classes: {train_df[target_col].nunique()}")
    class_dist = train_df[target_col].value_counts(normalize=True)
    print(f"   - Distribution: {dict(class_dist)}")

print("\n3. Data Quality:")
# Count missing values once and reuse the result instead of rescanning the
# whole frame for every line of the report.
missing_per_feature = train_df.isnull().sum()
total_missing = missing_per_feature.sum()
print(f"   - Total missing values: {total_missing:,}")
print(f"   - Features with missing values: {(missing_per_feature > 0).sum()}")
if total_missing > 0:
    worst_feature = missing_per_feature.idxmax()
    worst_missing = missing_per_feature.max()
    print(f"   - Worst feature: {worst_feature} ({worst_missing} missing, {worst_missing/len(train_df)*100:.1f}%)")

if len(numerical_features) > 0:
    print("\n4. Feature Correlations:")
    # Constant columns yield NaN correlations, which sort last and would be
    # reported as the "strongest negative" value; drop them and re-sort so the
    # first/last entries really are the maximum and minimum.
    valid_correlations = correlations.dropna().sort_values(ascending=False)
    if valid_correlations.empty:
        print("   - No numerical feature has a defined correlation with the target")
    else:
        print(f"   - Strongest positive correlation: {valid_correlations.iloc[0]:.3f} ({valid_correlations.index[0]})")
        print(f"   - Strongest negative correlation: {valid_correlations.iloc[-1]:.3f} ({valid_correlations.index[-1]})")
        strong_correlations = valid_correlations[valid_correlations.abs() > 0.5]
        print(f"   - Features with |correlation| > 0.5: {len(strong_correlations)}")

print("\n5. Next Steps:")
print("   - Detailed feature engineering analysis")
print("   - Missing value imputation strategy")
print("   - Outlier detection and treatment")
print("   - Feature selection and dimensionality analysis")
print("   - Train/Test distribution comparison")

In [None]:
# Save insights to file for future reference
import json
import os

# Computed here rather than reused from the summary cell so this cell still
# works when run on its own.
missing_per_feature = train_df.isnull().sum()

# Every value is converted to a built-in int/str/tuple because json cannot
# serialise NumPy scalar types.
insights = {
    'dataset_shape': {'train': train_df.shape, 'test': test_df.shape},
    'target_info': {
        'name': target_col,
        'type': 'continuous' if train_df[target_col].nunique() > 20 else 'categorical',
        'unique_values': int(train_df[target_col].nunique()),
        'missing_values': int(train_df[target_col].isnull().sum())
    },
    'feature_counts': {
        'total': len(feature_cols),
        'numerical': len(numerical_features),
        'categorical': len(categorical_features)
    },
    'data_quality': {
        'total_missing': int(missing_per_feature.sum()),
        'features_with_missing': int((missing_per_feature > 0).sum())
    }
}

# NOTE(review): the data-loading cell reads from '../data/raw', one level up,
# while this path goes two levels up — confirm which one matches the
# repository layout before relying on the output location.
insights_path = '../../configs/eda_insights.json'

# Create the target directory if needed so a fresh checkout does not fail with
# FileNotFoundError after the whole analysis has already run.
os.makedirs(os.path.dirname(insights_path), exist_ok=True)
with open(insights_path, 'w', encoding='utf-8') as f:
    json.dump(insights, f, indent=2)

print("Initial EDA insights saved to configs/eda_insights.json")