# Academic Performance Prediction - Enhanced Data Exploration

This notebook provides interactive exploration of the academic performance dataset with enhanced visualizations.

In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from scipy import stats
from src.data_loader import DataLoader
from src.enhanced_visualizer import EnhancedVisualizer

# Load data
# `df` and `enhanced_viz` are notebook-level names that the later cells rely on.
# NOTE(review): load_data() presumably returns a pandas DataFrame (the cells below
# call .shape, .head(), .info() and .describe() on it) -- confirm in src/data_loader.py.
data_loader = DataLoader()
df = data_loader.load_data()
enhanced_viz = EnhancedVisualizer()

# Quick sanity check: dimensions first, then the first rows as the cell's rich output.
print(f"Dataset shape: {df.shape}")
df.head()

## Basic Statistics and Data Overview

In [None]:
# Basic statistics
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()
print("\nBasic Statistics:")
# Last expression of the cell: rendered as a rich table of summary statistics.
df.describe()

## Interactive Data Distribution Analysis

In [None]:
# Interactive data distribution
# Delegates to the project's EnhancedVisualizer helper.
# NOTE(review): save=False presumably means "display only, do not write the figure
# to disk" -- confirm against src/enhanced_visualizer.py.
enhanced_viz.plot_interactive_data_distribution(df, save=False)

## Feature Correlation Analysis

In [None]:
# Enhanced correlation heatmap
# Delegates to the project's EnhancedVisualizer helper; the full DataFrame is passed,
# so any column selection/encoding happens inside the helper.
# NOTE(review): save=False presumably means "display only, do not write the figure
# to disk" -- confirm against src/enhanced_visualizer.py.
enhanced_viz.plot_correlation_heatmap(df, save=False)

## Performance Analysis by Categories

In [None]:
# Numeric feature columns; the target column is left out if it happens to be numeric.
# `numerical_cols` is reused by later cells, so it stays a notebook-level name.
numerical_cols = (
    df.select_dtypes(include=[np.number])
      .columns
      .drop('performance', errors='ignore')
)

# One interactive box plot per numeric feature, grouped and coloured by performance level.
for feature in numerical_cols:
    box_title = f'{feature.title()} Distribution by Performance Level'
    fig = px.box(df, x='performance', y=feature, color='performance', title=box_title)
    fig.show()

In [None]:
# Interactive violin plots: same per-feature breakdown as the box plots, with the
# density shape shown and a box plot embedded inside each violin (box=True).
for feature in numerical_cols:
    violin_title = f'{feature.title()} Distribution by Performance Level (Violin Plot)'
    fig = px.violin(
        df,
        x='performance',
        y=feature,
        color='performance',
        box=True,
        title=violin_title,
    )
    fig.show()

## Categorical Feature Analysis

In [None]:
# Performance by categorical features
categorical_cols = ['family_income', 'parent_education', 'extracurricular']
# Performance levels drawn in the stacked bars, in stacking order.
performance_levels = ['Poor', 'Average', 'Good']

for col in categorical_cols:
    # Cross-tabulation: within each category of `col`, the share (in %) of rows at
    # every performance level (normalize='index' makes each row sum to 100).
    crosstab = pd.crosstab(df[col], df['performance'], normalize='index') * 100

    # px.bar raises when a requested y column does not exist, which would happen if
    # one of the levels never occurs in the data. Reindex the plotting copy so every
    # level is present (0% when absent); the printed table below stays untouched.
    plot_data = crosstab.reindex(columns=performance_levels, fill_value=0.0).reset_index()

    # Interactive stacked bar chart
    fig = px.bar(plot_data, x=col,
                 y=performance_levels,
                 title=f'Performance Distribution by {col.title()}',
                 labels={'value': 'Percentage', 'variable': 'Performance'})
    fig.show()

    print(f"\nCross-tabulation for {col}:")
    print(crosstab.round(2))

## 3D Scatter Plot Analysis

In [None]:
# 3D scatter plot: three numeric features on the axes, performance level as colour,
# and study hours doubling as the marker size. Family income and parent education
# are added to the hover tooltip.
scatter_title = '3D Relationship: Study Hours vs Attendance vs Previous Grade'
fig = px.scatter_3d(
    df,
    x='study_hours',
    y='attendance_rate',
    z='previous_grade',
    color='performance',
    size='study_hours',
    hover_data=['family_income', 'parent_education'],
    title=scatter_title,
)
fig.show()

## Parallel Coordinates Plot

In [None]:
# Prepare data for parallel coordinates
# Work on a copy so the encoded helper columns never leak into `df`.
df_viz = df.copy()

# Encode categorical variables for visualization (parallel coordinates needs numeric axes)
categorical_mappings = {
    'family_income': {'Low': 0, 'Medium': 1, 'High': 2},
    'parent_education': {'High School': 0, 'Bachelor': 1, 'Master': 2, 'PhD': 3},
    'performance': {'Poor': 0, 'Average': 1, 'Good': 2}
}

for col, mapping in categorical_mappings.items():
    # Series.map leaves NaN for any label that is not a key of the mapping.
    df_viz[col + '_encoded'] = df_viz[col].map(mapping)

# Sample for better performance. The size is capped at the number of available rows
# (DataFrame.sample raises ValueError when asked for more rows than exist without
# replacement) and the seed is fixed so a rerun draws the same lines.
sample_size = min(200, len(df_viz))
df_sample = df_viz.sample(n=sample_size, random_state=42)

# Create parallel coordinates plot
fig = px.parallel_coordinates(df_sample,
                             dimensions=['study_hours', 'attendance_rate', 'previous_grade',
                                       'family_income_encoded', 'extracurricular', 'parent_education_encoded'],
                             color='performance_encoded',
                             title='Parallel Coordinates Plot - Feature Relationships')
fig.show()

## Statistical Tests and Insights

In [None]:
# ANOVA test for numerical features
# For every numeric feature, a one-way ANOVA checks whether its mean differs between
# performance levels. `stats` is scipy.stats, imported in the notebook's import cell.
print("ANOVA Test Results (F-statistic, p-value):")
print("=" * 50)

for col in numerical_cols:
    # groupby skips rows whose performance label is missing, and dropna() removes
    # missing feature values; either would otherwise make the F-statistic NaN.
    # sort=False keeps the groups in order of first appearance.
    groups = [
        values.dropna().to_numpy()
        for _, values in df.groupby('performance', sort=False, observed=True)[col]
    ]
    groups = [group for group in groups if group.size > 0]

    # f_oneway needs at least two samples; report instead of raising.
    if len(groups) < 2:
        print(f"{col:20s}: skipped (fewer than two performance groups with data)")
        continue

    f_stat, p_value = stats.f_oneway(*groups)
    significance = "***" if p_value < 0.001 else "**" if p_value < 0.01 else "*" if p_value < 0.05 else "ns"
    print(f"{col:20s}: F={f_stat:8.3f}, p={p_value:8.6f} {significance}")

print("\nSignificance levels: *** p<0.001, ** p<0.01, * p<0.05, ns = not significant")