# Student Performance Predictor - Model Training

This notebook contains the complete pipeline for:
1. Data Loading and Exploration
2. Data Visualization
3. Data Preprocessing
4. Model Training and Comparison
5. Best Model Selection and Evaluation
6. Saving the Model and Preprocessing Objects
7. Feature Importance


In [5]:
# Import necessary libraries
import warnings
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
warnings.filterwarnings('ignore')

# Plot defaults for the whole notebook: seaborn's white-grid theme and a
# 12x8 inch default figure size (individual cells override the size as needed)
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 8)})


ModuleNotFoundError: No module named 'pandas'

## 1. Load and Explore Data


In [None]:
# Read the student records from the CSV file (relative path)
df = pd.read_csv('../data/student_data.csv')

# Report the dimensions, then let the notebook render a preview of the top rows
print(f"Dataset Shape: {df.shape}")
print("\nFirst few rows:")
df.head()


In [None]:
# Structural overview of the frame (printed by DataFrame.info itself)
print("Dataset Info:")
df.info()

# Number of missing entries in each column
print("\nMissing Values:")
print(df.isna().sum())

# Summary statistics, rendered as the cell's output
print("\nDataset Statistics:")
df.describe()


In [None]:
# Check target variable distribution
result_counts = df['final_result'].value_counts()

print("Target Variable Distribution:")
print(result_counts)
print("\nPercentage:")
print(df['final_result'].value_counts(normalize=True) * 100)

# Visualize target distribution.
# value_counts() orders the labels by frequency, so a positional colour list
# would paint whichever class happens to be most common green. Look the colour
# up by label instead so 'Pass' is always green and 'Fail' is always red; any
# other label falls back to gray.
result_colors = {'Pass': 'green', 'Fail': 'red'}
bar_colors = [result_colors.get(label, 'gray') for label in result_counts.index]

plt.figure(figsize=(8, 6))
result_counts.plot(kind='bar', color=bar_colors)
plt.title('Distribution of Final Results')
plt.xlabel('Result')
plt.ylabel('Count')
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()


## 2. Data Visualization


In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap
numeric_cols = df.select_dtypes(include=[np.number]).columns
correlation_matrix = df[numeric_cols].corr()

plt.figure(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,      # write the coefficient inside every cell
    fmt='.2f',
    cmap='coolwarm',
    center=0,        # centre the diverging colormap on zero correlation
)
plt.title('Correlation Heatmap of Numerical Features')
plt.tight_layout()
plt.show()


In [None]:
# Overlaid Pass/Fail histograms, one panel per feature on a 2x3 grid
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

features = ['attendance_percentage', 'hours_studied', 'previous_score', 
            'assignments_submitted', 'internal_marks', 'age']

# Split the rows once by outcome instead of re-filtering inside the loop
passed = df[df['final_result'] == 'Pass']
failed = df[df['final_result'] == 'Fail']

# axes.ravel() walks the grid row by row, matching the order of `features`
for ax, feature in zip(axes.ravel(), features):
    passed[feature].hist(alpha=0.5, label='Pass', ax=ax, bins=20)
    failed[feature].hist(alpha=0.5, label='Fail', ax=ax, bins=20)
    ax.set_title(f'Distribution of {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    ax.legend()

plt.tight_layout()
plt.show()


## 3. Data Preprocessing


In [None]:
# Create a copy for preprocessing so the raw frame `df` stays untouched
df_processed = df.copy()

# Handle missing values (if any)
print("Missing values before handling:")
print(df_processed.isnull().sum())

# Numeric columns: fill gaps with the column mean.
# NOTE(review): the fill values are computed on the full dataset, before the
# train/test split — confirm this is acceptable for the evaluation.
df_processed = df_processed.fillna(df_processed.mean(numeric_only=True))

# Encode categorical variables
label_encoders = {}
categorical_cols = ['gender', 'parent_education', 'internet_access']

# Categorical columns: the numeric-only mean fill above leaves them untouched,
# so fill any gaps with the most frequent value before encoding.
for col in categorical_cols:
    if df_processed[col].isnull().any():
        most_frequent = df_processed[col].mode()
        # mode() is empty when the whole column is missing; leave it as-is then
        if not most_frequent.empty:
            df_processed[col] = df_processed[col].fillna(most_frequent.iloc[0])

# One encoder per column, kept so the same mapping can be reused later
for col in categorical_cols:
    le = LabelEncoder()
    df_processed[col] = le.fit_transform(df_processed[col])
    label_encoders[col] = le

# Encode target variable
target_encoder = LabelEncoder()
df_processed['final_result'] = target_encoder.fit_transform(df_processed['final_result'])

# Cast the codes to plain ints so the printed mapping is readable
target_mapping = {
    label: int(code)
    for label, code in zip(target_encoder.classes_,
                           target_encoder.transform(target_encoder.classes_))
}

print("\nCategorical encoding completed.")
print(f"Target encoding: {target_mapping}")


In [None]:
# Feature matrix: every column except student_id and the label
X = df_processed.drop(columns=['student_id', 'final_result'])
# Target vector: the encoded final_result column
y = df_processed['final_result']

print(f"Features: {X.columns.tolist()}")
print(f"\nFeature shape: {X.shape}")
print(f"Target shape: {y.shape}")


In [None]:
# Stratified 80/20 split with a fixed seed, so both parts keep the class ratio of y
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

# Class counts in each part
print("\nTraining set target distribution:")
print(pd.Series(y_train).value_counts())
print("\nTest set target distribution:")
print(pd.Series(y_test).value_counts())


In [None]:
# Normalize features: fit the scaler on the training split only, then apply
# the same transformation to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert back to DataFrame for better handling.
# Keep the original row index as well as the column names: without it the
# scaled frames get a fresh 0..n-1 index and no longer align row-for-row with
# y_train / y_test, which still carry the index produced by the split.
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

print("Feature scaling completed.")


## 4. Model Training and Comparison


In [None]:
# Function to evaluate model
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Fit ``model`` on the training data and score it on both splits.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            fitted in place.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.
        model_name: Display name of the model.

    Returns:
        dict with the keys:
            'Model Name'     -- ``model_name``
            'Train Accuracy' -- accuracy on the training data
            'Test Accuracy'  -- accuracy on the test data
            'Precision', 'Recall', 'F1-Score' -- weighted averages on the
                test data
            'Model'          -- the fitted estimator (the object passed in)
    """
    # Train model
    model.fit(X_train, y_train)

    # Predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Calculate metrics
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    precision = precision_score(y_test, y_test_pred, average='weighted')
    recall = recall_score(y_test, y_test_pred, average='weighted')
    f1 = f1_score(y_test, y_test_pred, average='weighted')

    # The dict previously listed 'Model' twice, so the name was silently
    # overwritten by the estimator. 'Model' keeps holding the estimator
    # (callers read it back from there); the name gets its own key.
    return {
        'Model Name': model_name,
        'Train Accuracy': train_accuracy,
        'Test Accuracy': test_accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'Model': model
    }


In [None]:
# Candidate classifiers, keyed by the name used in printouts and result tables
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10),
    'XGBoost': xgb.XGBClassifier(random_state=42, eval_metric='logloss')
}

# Fit and score each candidate on the scaled splits, collecting the metric
# dicts in `results` and the fitted estimators in `trained_models`
results = []
trained_models = {}

for name, model in models.items():
    print(f"\nTraining {name}...")
    result = evaluate_model(
        model=model,
        X_train=X_train_scaled,
        X_test=X_test_scaled,
        y_train=y_train,
        y_test=y_test,
        model_name=name,
    )
    results.append(result)
    trained_models[name] = result['Model']
    test_accuracy = result['Test Accuracy']
    print(f"{name} - Test Accuracy: {test_accuracy:.4f}")


In [None]:
# Create results DataFrame.
# Each entry of `results` stores the fitted estimator under 'Model', so simply
# dropping that key leaves the table without a model-name column and later
# lookups of results_df['Model'] fail. `trained_models` was filled in the same
# loop as `results`, so its keys are the model names in matching order.
metric_columns = ['Train Accuracy', 'Test Accuracy', 'Precision', 'Recall', 'F1-Score']
results_df = pd.DataFrame([
    {'Model': name, **{metric: result[metric] for metric in metric_columns}}
    for name, result in zip(trained_models, results)
])

print("\nModel Comparison Results:")
print(results_df.to_string(index=False))


In [None]:
# One bar chart per test metric on a 2x2 grid, one bar per model
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

metrics = ['Test Accuracy', 'Precision', 'Recall', 'F1-Score']
bar_colors = ['skyblue', 'lightgreen', 'salmon']

# axes.ravel() walks the grid row by row, matching the order of `metrics`
for ax, metric in zip(axes.ravel(), metrics):
    scores = results_df[metric]

    ax.bar(results_df['Model'], scores, color=bar_colors)
    ax.set_title(f'{metric} Comparison')
    ax.set_ylabel(metric)
    ax.set_ylim([0, 1])
    ax.tick_params(axis='x', rotation=45)

    # Print each score just above its bar
    for position, score in enumerate(scores):
        ax.text(position, score + 0.01, f'{score:.3f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()


## 5. Select Best Model


In [None]:
# The winner is the row with the highest F1-Score in the comparison table
best_model_idx = results_df['F1-Score'].idxmax()
best_row = results_df.loc[best_model_idx]

# Map the winning row back to its fitted estimator via the model name
best_model_name = best_row['Model']
best_model = trained_models[best_model_name]

print(f"Best Model: {best_model_name}")
print("\nBest Model Performance:")
print(best_row)


In [None]:
# Predictions of the selected model on the scaled test split
y_pred_best = best_model.predict(X_test_scaled)

# Original (pre-encoding) class labels, reused for the report and the plot
class_names = target_encoder.classes_

print(f"\nDetailed Classification Report for {best_model_name}:")
print(classification_report(y_test, y_pred_best, target_names=class_names))

# Confusion matrix drawn as an annotated heatmap
cm = confusion_matrix(y_test, y_pred_best)
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title(f'Confusion Matrix - {best_model_name}')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.show()


## 6. Save Model and Preprocessing Objects


In [None]:
# Save best model and the fitted preprocessing objects.
# Writing into a folder that does not exist fails, so create the output
# directory first (a no-op when it is already there).
MODEL_DIR = Path('../models')
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# File name -> object to persist; listed once so the dump calls and the
# printed summary cannot drift apart
artifacts = {
    'best_model.pkl': best_model,
    'scaler.pkl': scaler,
    'label_encoders.pkl': label_encoders,
    'target_encoder.pkl': target_encoder,
}

for filename, artifact in artifacts.items():
    joblib.dump(artifact, MODEL_DIR / filename)

print("Model and preprocessing objects saved successfully!")
print("\nSaved files:")
for filename in artifacts:
    print(f"- {filename}")


## 7. Feature Importance (for tree-based models)


In [None]:
# Only estimators exposing `feature_importances_` get this plot; for any
# other best model the cell does nothing
if hasattr(best_model, 'feature_importances_'):
    # Pair every feature name with its importance, most important first
    feature_importance = pd.DataFrame({
        'Feature': X.columns,
        'Importance': best_model.feature_importances_
    })
    feature_importance = feature_importance.sort_values('Importance', ascending=False)

    plt.figure(figsize=(10, 6))
    sns.barplot(data=feature_importance, x='Importance', y='Feature')
    plt.title(f'Feature Importance - {best_model_name}')
    plt.tight_layout()
    plt.show()

    print("\nFeature Importance:")
    print(feature_importance)
