In [24]:
import numpy as np
import pandas as pd
from scipy.special import boxcox1p
from scipy.stats import boxcox
import os
from pathlib import Path

def _shift_for_positive(series, stored_values, shift_key, is_train):
    """Return the offset (0 or 1) added to ``series`` before a log/Box-Cox transform.

    On training data the offset is 1 when the column contains any value <= 0,
    and the decision is recorded under ``shift_key`` so that later splits are
    transformed with the same offset. On other splits the recorded offset is
    reused; if ``stored_values`` has no such entry (e.g. it was produced before
    offsets were recorded) the decision falls back to the split's own data.
    """
    has_non_positive = bool((series <= 0).any())
    if is_train:
        shift = int(has_non_positive)
        stored_values[shift_key] = shift
        return shift
    if shift_key not in stored_values:
        return int(has_non_positive)
    shift = stored_values[shift_key]
    if has_non_positive and not shift:
        # The training offset is kept for consistency, so log/Box-Cox of these
        # values will not be finite; make that visible instead of silent.
        print(f"Warning: non-positive values found but no shift was fitted ({shift_key})")
    return shift


def _boxcox_column(frame, col, key_prefix, stored_values, is_train):
    """Box-Cox transform ``frame[col]`` in place, fitting lambda on train only."""
    lambda_key = f"{key_prefix}_lambda"
    shift = _shift_for_positive(frame[col], stored_values, f"{key_prefix}_shift", is_train)
    # Adding the offset builds a new Series, so the column is untouched until
    # the final assignment below.
    series = frame[col] + shift

    if is_train:
        print(f"\nOriginal skewness {col}: {series.skew():.4f}")
        transformed, lambda_val = boxcox(series)
        stored_values[lambda_key] = lambda_val
        print(f"Box-Cox lambda for {col}: {lambda_val:.4f}")
    else:
        lambda_val = stored_values[lambda_key]
        print(f"Box-Cox lambda for {col}: {lambda_val:.4f}")
        # boxcox1p(x, lmbda) is Box-Cox of (1 + x), so passing (series - 1)
        # applies Box-Cox with the stored lambda to ``series`` itself.
        transformed = boxcox1p(series - 1, lambda_val)

    frame[col] = transformed
    print(f"Transformed skewness {col}: {frame[col].skew():.4f}")


def fix_skewness(dataframes: dict, stored_values=None, is_train=True):
    """Reduce skewness of selected numeric columns, printing before/after skew.

    Transforms applied (each only if the table and column are present):
      * ``student_info['num_of_prev_attempts']`` - Box-Cox
      * ``student_info['studied_credits']``      - natural log
      * ``student_vle['sum_click']``             - Box-Cox
      * ``assessments['weight']``                - ``log1p``

    Box-Cox lambdas and the "+1" offsets used to make a column positive are
    fitted when ``is_train`` is true and written to ``stored_values``; with
    ``is_train=False`` they are read back so every split gets the same
    transformation.

    Args:
        dataframes: Mapping that may contain the keys ``'student_info'``,
            ``'student_vle'`` and ``'assessments'``. The DataFrames are
            modified in place.
        stored_values: Dict of fitted parameters. ``None`` starts a new dict.
        is_train: Fit parameters (True) or apply stored ones (False).

    Returns:
        A tuple ``(frames, stored_values)`` where ``frames`` maps the three
        table names to the (transformed) DataFrames, or ``None`` for tables
        that were not supplied.

    Raises:
        KeyError: If ``is_train`` is false and a required lambda is missing
            from ``stored_values``.
    """
    if stored_values is None:
        # The training branch writes into this dict, so None cannot be used as is.
        stored_values = {}

    student_info = dataframes.get('student_info')
    student_vle = dataframes.get('student_vle')
    assessments = dataframes.get('assessments')

    # --- Fixing skewness in student_info['num_of_prev_attempts']
    if student_info is not None and 'num_of_prev_attempts' in student_info.columns:
        _boxcox_column(
            student_info,
            'num_of_prev_attempts',
            'student_info_num_of_prev_attempts',
            stored_values,
            is_train,
        )

    # --- Fixing skewness in student_info['studied_credits']
    if student_info is not None and 'studied_credits' in student_info.columns:
        col = 'studied_credits'
        # np.log itself has no fitted parameters, but the offset must match
        # the one chosen on train for the splits to be comparable.
        shift = _shift_for_positive(
            student_info[col], stored_values, 'student_info_studied_credits_shift', is_train
        )
        series = student_info[col] + shift

        print(f"\nOriginal skewness {col}: {series.skew():.4f}")
        student_info[col] = np.log(series)
        print(f"Transformed skewness {col}: {student_info[col].skew():.4f}")

    # --- Fixing skewness in student_vle['sum_click']
    if student_vle is not None and 'sum_click' in student_vle.columns:
        _boxcox_column(
            student_vle,
            'sum_click',
            'student_vle_sum_click',
            stored_values,
            is_train,
        )

    # --- Fixing skewness in assessments['weight'] using log(x + 1)
    if assessments is not None and 'weight' in assessments.columns:
        col = 'weight'
        series = assessments[col].copy()
        print(f"\nOriginal skewness {col}: {series.skew():.4f}")
        # log1p always adds 1, so no per-split decision is involved here.
        assessments[col] = np.log1p(series)  # log(x + 1)
        print(f"Transformed skewness {col}: {assessments[col].skew():.4f}")

    return {
        'student_info': student_info,
        'student_vle': student_vle,
        'assessments': assessments
    }, stored_values

# Source directory holding the original (still skewed) data
INPUT_DIR = Path.home().joinpath("Desktop", "Logistic Regression", "constant")

# Destination directory for the skewness-corrected copies
OUTPUT_DIR = Path.home().joinpath("Desktop", "Logistic Regression Skewness Fixed", "constant")

def process_and_save(input_dir: Path, output_dir: Path):
    """Fix skewness for the train/val/test splits and write the results as CSV.

    For each split, ``student_info.csv`` and ``student_vle.csv`` are read from
    ``input_dir/<split>/``, passed through ``fix_skewness`` and written to
    ``output_dir/<split>/``. Parameters are fitted on ``train`` and reused for
    ``val`` and ``test``. The transformed assessments table is written to
    ``output_dir/assessments.csv`` (rewritten on every split).

    Args:
        input_dir: Directory containing the ``train``, ``val`` and ``test``
            sub-directories.
        output_dir: Directory that receives the transformed files; created
            if missing.
    """
    stored_values = {}

    # Prefer the assessments file that sits with the rest of the input data;
    # fall back to the current working directory, which is where it was
    # previously always read from.
    assessments_path = input_dir / "assessments.csv"
    if not assessments_path.exists():
        assessments_path = Path("assessments.csv")

    for split in ['train', 'val', 'test']:
        split_in = input_dir / split
        dataframes = {
            'student_info': pd.read_csv(split_in / "student_info.csv"),
            'student_vle': pd.read_csv(split_in / "student_vle.csv"),
            'assessments': pd.read_csv(assessments_path),
        }

        # Fitting on train, applying on val/test
        is_train = split == 'train'
        transformed_dataframes, fitted_values = fix_skewness(
            dataframes, stored_values, is_train=is_train
        )
        if is_train:
            stored_values = fitted_values

        split_out = output_dir / split
        split_out.mkdir(parents=True, exist_ok=True)
        transformed_dataframes['student_info'].to_csv(split_out / "student_info.csv", index=False)
        transformed_dataframes['student_vle'].to_csv(split_out / "student_vle.csv", index=False)
        # Save the frame returned by fix_skewness rather than relying on the
        # input object having been mutated in place.
        transformed_dataframes['assessments'].to_csv(output_dir / "assessments.csv", index=False)

# Run the pipeline: read the splits from INPUT_DIR and write the transformed CSVs to OUTPUT_DIR
process_and_save(INPUT_DIR, OUTPUT_DIR)



Original skewness num_of_prev_attempts: 3.8729
Box-Cox lambda for num_of_prev_attempts: -9.7674
Transformed skewness num_of_prev_attempts: 2.2182

Original skewness studied_credits: 2.0017
Transformed skewness studied_credits: 0.1654

Original skewness sum_click: 97.0345
Box-Cox lambda for sum_click: -0.7439
Transformed skewness sum_click: 0.3474

Original skewness weight: 1.9616
Transformed skewness weight: -0.0695
Box-Cox lambda for num_of_prev_attempts: -9.7674
Transformed skewness num_of_prev_attempts: 2.1993

Original skewness studied_credits: 1.7662
Transformed skewness studied_credits: 0.0948
Box-Cox lambda for sum_click: -0.7439
Transformed skewness sum_click: 0.3487

Original skewness weight: 1.9616
Transformed skewness weight: -0.0695
Box-Cox lambda for num_of_prev_attempts: -9.7674
Transformed skewness num_of_prev_attempts: 2.2838

Original skewness studied_credits: 1.5965
Transformed skewness studied_credits: 0.1037
Box-Cox lambda for sum_click: -0.7439
Transformed skewnes