In [27]:
import numpy as np
import pandas as pd
from scipy.stats import boxcox
import os
from pathlib import Path

def fix_skewness(dataframes: dict):
    student_info = dataframes.get('student_info')
    student_vle = dataframes.get('student_vle')
   # assessments = dataframes.get('assessments')
    #student_assessment = dataframes.get('student_assessment')

    # --- Fix skewness in student_info['num_of_prev_attempts']
    if student_info is not None and 'num_of_prev_attempts' in student_info.columns:
        col = 'num_of_prev_attempts'
        series = student_info[col]
        if (series <= 0).any():
            series += 1
        print(f"\nOriginal skewness {col}: {series.skew():.4f}")
        transformed, lambda_val = boxcox(series)
        print(f"Box-Cox lambda for {col}: {lambda_val:.4f}")
        student_info[col] = transformed
        print(f"Transformed skewness {col}: {student_info[col].skew():.4f}")

    # --- Fix skewness in student_info['studied_credits']
    if student_info is not None and 'studied_credits' in student_info.columns:
        col = 'studied_credits'
        series = student_info[col]
        if (series <= 0).any():
            series += 1
        print(f"\nOriginal skewness {col}: {series.skew():.4f}")
        transformed = np.log(series)
        student_info[col] = transformed
        print(f"Transformed skewness {col}: {student_info[col].skew():.4f}")

    # --- Fix skewness in student_vle['sum_click']
    if student_vle is not None and 'sum_click' in student_vle.columns:
        col = 'sum_click'
        series = student_vle[col]
        if (series <= 0).any():
            series += 1
        print(f"\nOriginal skewness {col}: {series.skew():.4f}")
        transformed, lambda_val = boxcox(series)
        print(f"Box-Cox lambda for {col}: {lambda_val:.4f}")
        student_vle[col] = transformed
        print(f"Transformed skewness {col}: {student_vle[col].skew():.4f}")

    # --- Fix skewness in assessments['weight'] using log(x + 1)
    # if assessments is not None and 'weight' in assessments.columns:
    #     col = 'weight'
    #     series = assessments[col]
    #     print(f"\nOriginal skewness {col}: {series.skew():.4f}")
    #     transformed = np.log1p(series)  # log(x + 1)
    #     assessments[col] = transformed
    #     print(f"Transformed skewness {col}: {assessments[col].skew():.4f}")

    # --- Fix left-skewed student_assessment['score'] using reflect + sqrt
    # if student_assessment is not None and 'score' in student_assessment.columns:
    #     col = 'score'
    #     series = student_assessment[col]
    #     print(f"\nOriginal skewness {col}: {series.skew():.4f}")
    #     max_val = series.max()
    #     reflected = np.sqrt(max_val - series)
    #     student_assessment[col] = reflected
    #     print(f"Transformed skewness {col}: {student_assessment[col].skew():.4f}")

# Input paths (original skewed data)
INPUT_DIR = Path.home() / "Desktop" / "Logistic Regression" / "constant"

# Output paths (fixed skewness)
OUTPUT_DIR = Path.home() / "Desktop" / "Logistic Regression Skewness Fixed" / "constant"

def process_and_save(input_dir: Path, output_dir: Path):
    for split in ['train', 'val', 'test']:
        student_info = pd.read_csv(input_dir / f"{split}/student_info.csv")
        student_vle = pd.read_csv(input_dir / f"{split}/student_vle.csv")
       # student_assessment = pd.read_csv(input_dir / f"{split}/student_assessment.csv")

        fix_skewness({
            'student_info': student_info,
            'student_vle': student_vle,
            #'student_assessment': student_assessment,
            #'assessments': assessments
        })

        os.makedirs(output_dir / split, exist_ok=True)
        student_info.to_csv(output_dir / f"{split}/student_info.csv", index=False)
        student_vle.to_csv(output_dir / f"{split}/student_vle.csv", index=False)
        #student_assessment.to_csv(output_dir / f"{split}/student_assessment.csv", index=False)
        #assessments.to_csv(output_dir / "assessments.csv", index=False)

process_and_save(INPUT_DIR, OUTPUT_DIR)



Original skewness num_of_prev_attempts: 3.8729
Box-Cox lambda for num_of_prev_attempts: -9.7674
Transformed skewness num_of_prev_attempts: 2.2182

Original skewness studied_credits: 2.0017
Transformed skewness studied_credits: 0.1654

Original skewness sum_click: 97.0345
Box-Cox lambda for sum_click: -0.7439
Transformed skewness sum_click: 0.3474

Original skewness num_of_prev_attempts: 3.7939
Box-Cox lambda for num_of_prev_attempts: -9.4988
Transformed skewness num_of_prev_attempts: 2.1993

Original skewness studied_credits: 1.7662
Transformed skewness studied_credits: 0.0948

Original skewness sum_click: 227.6219
Box-Cox lambda for sum_click: -0.7442
Transformed skewness sum_click: 0.3484

Original skewness num_of_prev_attempts: 3.6156
Box-Cox lambda for num_of_prev_attempts: -10.0070
Transformed skewness num_of_prev_attempts: 2.2838

Original skewness studied_credits: 1.5965
Transformed skewness studied_credits: 0.1037

Original skewness sum_click: 39.7380
Box-Cox lambda for sum_cl