#!/usr/bin/env python
# coding: utf-8
# Student Dropout Prediction: Exploratory Data Analysis

This notebook performs comprehensive exploratory data analysis on the "Dropout or Academic Success" dataset to understand the data structure, distributions, and relationships between features.
## 1. Setup and Data Loading

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
import lightgbm as lgb
import joblib
import os

# Global plot styling: seaborn's white-grid theme, plus a larger default
# figure size and font applied through a single rcParams update.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
})

Import necessary libraries
Set up plotting

Load the dataset
## 2. Basic Data Overview

In [None]:
# Location of the CSV that every later cell reads through `df`.
data_path = '../data/dataset.csv'  # Update this path if needed

print("Loading dataset...")
df = pd.read_csv(filepath_or_buffer=data_path)

Display basic information about the dataset

In [None]:
# Report the (rows, columns) shape, then preview the leading rows.
print("Dataset Shape:", df.shape)
print("\nFirst 5 rows:", df.head(n=5), sep="\n")

Display data types

In [None]:
# One line per column showing the dtype pandas assigned to it.
print("Data types:", df.dtypes, sep="\n")

Basic statistics
## 3. Missing Values Analysis

In [None]:
# Descriptive statistics as produced by DataFrame.describe() with its defaults.
print("Basic statistics:")
stats_table = df.describe()
print(stats_table)

Check for missing values
Calculate missing value percentages

In [None]:
# Count null cells per column; only columns with at least one gap are printed.
print("Missing values per column:")
missing_values = df.isna().sum()
print(missing_values.loc[missing_values > 0])

# The same counts expressed as a percentage of all rows.
missing_percent = missing_values / len(df) * 100
missing_data = missing_values.to_frame('Missing Values').assign(Percentage=missing_percent)

# Summary table restricted to affected columns, worst first.
print("\nMissing values summary:")
affected = missing_data['Missing Values'] > 0
print(missing_data.loc[affected].sort_values(by='Percentage', ascending=False))

Visualize missing values if any exist
## 4. Target Variable Analysis

In [None]:
# Draw and save a null-cell heatmap only when the dataset has gaps;
# otherwise just say so.
if missing_values.sum() == 0:
    print("No missing values found in the dataset.")
else:
    plt.figure(figsize=(12, 6))
    null_mask = df.isnull()
    sns.heatmap(null_mask, cbar=True, cmap='viridis')
    plt.title('Missing Values Heatmap')
    plt.tight_layout()
    plt.savefig('../data/missing_values_heatmap.png')
    plt.show()

Analyze target variable distribution

In [None]:
print("Target variable distribution:")
target_col = 'Target'  # Update this to the actual target column name

# Rows per class, and each class's share of all rows in the frame.
target_counts = df[target_col].value_counts()
target_percent = target_counts.div(len(df)).mul(100)

print("Target distribution:")
target_table = pd.DataFrame({
    'Count': target_counts,
    'Percentage': target_percent,
})
print(target_table)

Plot target distribution
Add percentage labels on bars
## 5. Feature Analysis

In [None]:
# Bar chart of class counts for the target column.
plt.figure(figsize=(10, 6))
ax = sns.countplot(data=df, x=target_col)
plt.title('Distribution of Target Classes')

# Label every bar with its share of all rows, horizontally centered and
# anchored at the bar's top edge.
total_rows = len(df)
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2
    label = f'{100 * bar_height / total_rows:.1f}%'
    ax.annotate(label, (bar_center, bar_height), ha='center', va='bottom')

plt.savefig('../data/target_distribution.png')
plt.show()

Identify numerical and categorical features
Remove target column from feature lists

In [None]:
# Split column names by dtype: int64/float64 count as numerical,
# object/category as categorical.
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

# The target is not a feature, so drop it from whichever list picked it up.
for feature_list in (numerical_cols, categorical_cols):
    if target_col in feature_list:
        feature_list.remove(target_col)

print(f"Number of numerical features: {len(numerical_cols)}")
print(f"Numerical features: {numerical_cols[:10]}...")  # Show first 10

print(f"\nNumber of categorical features: {len(categorical_cols)}")
print(f"Categorical features: {categorical_cols}")

Analyze numerical feature distributions

In [None]:
# Histogram (with KDE overlay) for up to nine numerical features on a 3x3 grid.
if numerical_cols:
    plt.figure(figsize=(20, 15))
    num_plots = min(len(numerical_cols), 9)
    for panel, feature in zip(range(1, num_plots + 1), numerical_cols):
        plt.subplot(3, 3, panel)
        sns.histplot(df[feature], kde=True)
        plt.title(f'Distribution of {feature}')
    plt.tight_layout()
    plt.savefig('../data/numerical_distributions.png')
    plt.show()

Analyze categorical feature distributions
## 6. Correlation Analysis

In [None]:
# Frequency bar chart for up to nine categorical features on a 3x3 grid.
# Features with more than ten distinct values get a text placeholder instead
# of bars.
if categorical_cols:
    plt.figure(figsize=(20, 15))
    num_plots = min(len(categorical_cols), 9)
    for panel, feature in zip(range(1, num_plots + 1), categorical_cols):
        plt.subplot(3, 3, panel)
        value_counts = df[feature].value_counts()
        n_levels = len(value_counts)
        if n_levels > 10:
            plt.text(0.5, 0.5, f'{feature}\nToo many categories\n({n_levels} unique values)',
                     ha='center', va='center', transform=plt.gca().transAxes)
        else:
            sns.barplot(x=value_counts.index, y=value_counts.values)
            plt.xticks(rotation=45)
        plt.title(f'Distribution of {feature}')
    plt.tight_layout()
    plt.savefig('../data/categorical_distributions.png')
    plt.show()

Calculate correlation matrix for numerical features
Plot correlation heatmap
Find highly correlated features
## 7. Feature Relationships with Target

In [None]:
# Absolute correlation above which a feature pair is reported.
high_corr_threshold = 0.8
# Bound before the branch so the summary cell can read it even when the
# correlation analysis is skipped (previously that path raised NameError).
high_corr_pairs = []

if len(numerical_cols) > 1:
    print("Calculating correlation matrix...")
    correlation = df[numerical_cols].corr()

    plt.figure(figsize=(14, 12))
    sns.heatmap(correlation, annot=True, cmap='coolwarm', linewidths=0.5, fmt=".2f")
    plt.title('Correlation Matrix of Numerical Features')
    plt.tight_layout()
    plt.savefig('../data/correlation_heatmap.png')
    plt.show()

    # Walk the strict upper triangle so each unordered pair is tested once
    # and the diagonal (self-correlation) is never visited.
    corr_columns = correlation.columns
    for i, first in enumerate(corr_columns):
        for j in range(i + 1, len(corr_columns)):
            coefficient = correlation.iloc[i, j]
            if abs(coefficient) > high_corr_threshold:
                high_corr_pairs.append((first, corr_columns[j], coefficient))

    if high_corr_pairs:
        # Print the threshold actually used instead of a hard-coded copy of it.
        print(f"Highly correlated feature pairs (|correlation| > {high_corr_threshold}):")
        for first, second, coefficient in high_corr_pairs:
            print(f"{first} and {second}: {coefficient:.3f}")
    else:
        print("No highly correlated feature pairs found.")
else:
    print("Not enough numerical features for correlation analysis.")

Analyze relationship between numerical features and target

In [None]:
# Box plot of each numerical feature grouped by target class, for up to nine
# features on a 3x3 grid.
if numerical_cols:
    plt.figure(figsize=(20, 15))
    num_plots = min(len(numerical_cols), 9)
    for panel, feature in zip(range(1, num_plots + 1), numerical_cols):
        plt.subplot(3, 3, panel)
        sns.boxplot(data=df, x=target_col, y=feature)
        plt.title(f'{feature} by {target_col}')
        plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig('../data/numerical_vs_target.png')
    plt.show()

Analyze relationship between categorical features and target
Create a crosstab
Plot
## 8. Statistical Summary

In [None]:
# Stacked percentage bars showing how the target classes split within each
# level of a categorical feature. Only the first five categorical features
# are considered, and those with more than ten distinct values are skipped.
for col in categorical_cols[:5]:
    if df[col].nunique() > 10:
        continue

    # Row-normalize the contingency table so every bar sums to 100%.
    ct = pd.crosstab(df[col], df[target_col])
    ct_pct = ct.div(ct.sum(axis=1), axis=0) * 100

    # Pass the axes explicitly: DataFrame.plot opens its own figure when no
    # `ax` is given, which left the sized figure empty and saved a chart
    # that ignored the requested figsize.
    fig, ax = plt.subplots(figsize=(12, 6))
    ct_pct.plot(kind='bar', stacked=True, ax=ax)
    ax.set_title(f'{col} vs {target_col}')
    ax.set_ylabel('Percentage')
    ax.tick_params(axis='x', labelrotation=45)
    fig.tight_layout()

    # Column names may contain path separators, which would otherwise be
    # read as directories in the output path.
    safe_name = str(col).replace('/', '_').replace('\\', '_')
    fig.savefig(f'../data/{safe_name}_vs_target.png')
    plt.show()

In [None]:
# Final recap: dataset dimensions, feature counts, target make-up, the
# headline findings, and the figure files written by the cells above.
total_missing = missing_values.sum()

print("=== EXPLORATORY DATA ANALYSIS SUMMARY ===")
print(f"Dataset shape: {df.shape}")
print(f"Number of numerical features: {len(numerical_cols)}")
print(f"Number of categorical features: {len(categorical_cols)}")
print(f"Missing values: {total_missing} total")
print(f"Target variable: {target_col}")
print(f"Target classes: {df[target_col].unique()}")
print(f"Target distribution: {df[target_col].value_counts().to_dict()}")

print("\nKey insights:")
if high_corr_pairs:
    print(f"- Found {len(high_corr_pairs)} highly correlated feature pairs")
else:
    print("- No highly correlated features detected")

if total_missing > 0:
    columns_with_gaps = (missing_values > 0).sum()
    print(f"- {columns_with_gaps} features have missing values")
else:
    print("- No missing values found")

print("\nVisualizations saved to the data directory:")
for saved_line in (
    "- target_distribution.png",
    "- correlation_heatmap.png",
    "- numerical_distributions.png",
    "- categorical_distributions.png",
    "- numerical_vs_target.png",
    "- [feature_name]_vs_target.png (for categorical features)",
):
    print(saved_line)

print("\nExploratory analysis complete!")