In [9]:
import os
import pandas as pd
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [10]:
# 1. split data function
def split_data_by_group(data, group_col, train_ratio=0.8, val_ratio=0.2, test_ratio=None):
    """
    Split a dataset chronologically on 'namhoc', then split the historical part per group.

    - Rows with namhoc <= 2020 ("historical") are split, group by group, into
      train and validation sets using ``val_ratio`` as the validation share.
    - Rows with namhoc > 2020 become the test set.
    - Missing namhoc values are filled with 0 and therefore count as historical.
    - Groups with fewer than 2 rows (or groups that cannot be split) go entirely
      to the training set.
    - The columns listed in ``drop_cols`` are removed from every returned set.
    - Empty result sets keep the column schema of the (column-dropped) input, so
      they can still be written to CSV with a header and read back.

    The caller's DataFrame is not modified.

    Parameters:
    - data (pd.DataFrame): The dataset to split; must contain a 'namhoc' column
    - group_col (str): The column name to group the historical rows by
    - train_ratio (float): Proportion of historical data for training (default 0.8)
    - val_ratio (float): Proportion of historical data for validation (default 0.2)
    - test_ratio: Not used, kept for compatibility

    Returns:
    - train_set (pd.DataFrame): Training set
    - val_set (pd.DataFrame): Validation set
    - test_set (pd.DataFrame): Testing set

    Raises:
    - AssertionError: if 'namhoc' is missing or train_ratio + val_ratio != 1
    """
    assert 'namhoc' in data.columns, "'namhoc' column must exist in the dataset"
    assert abs(train_ratio + val_ratio - 1.0) < 1e-5, "Train and validation ratios must sum to 1"

    # Columns to drop; names that are not present are ignored.
    drop_cols = ['Unnamed_0', 'diem_qt', 'diem_th', 'diem_gk', 'diem_ck']
    # drop() returns a new frame, so later assignments never touch the caller's data.
    data = data.drop(columns=drop_cols, errors='ignore')

    # Validate namhoc values
    print(f"\nnamhoc value counts:\n{data['namhoc'].value_counts().sort_index()}")
    if data['namhoc'].isna().any():
        print("Warning: Found NaN values in namhoc column. Filling with 0...")
        data['namhoc'] = data['namhoc'].fillna(0)

    # Zero-row frame carrying the full column schema. Used wherever a set ends up
    # empty, because a column-less pd.DataFrame() is written as a header-less CSV.
    empty_like = data.iloc[0:0]

    # First split: separate historical and future data
    historical_data = data[data['namhoc'] <= 2020].copy()
    test_set = data[data['namhoc'] > 2020].copy()

    print("\nInitial split sizes:")
    print(f"Historical data (<=2020): {len(historical_data)} samples")
    print(f"Future data (>2020): {len(test_set)} samples")

    if len(historical_data) == 0:
        # Nothing to train on from the past: fall back to splitting everything
        # into train/validation and leave the test set empty.
        print(
            "Warning: No historical data found. "
            f"Using {train_ratio:.0%}/{val_ratio:.0%} train/validation split on all data."
        )
        historical_data = data.copy()
        test_set = empty_like.copy()

    # Collect the per-group pieces and concatenate once at the end; concatenating
    # inside the loop copies the accumulated frame on every iteration.
    train_parts = []
    val_parts = []

    # Process historical data by groups
    grouped = historical_data.groupby(group_col)
    print(f"\nNumber of groups: {len(grouped)}")

    for group, group_data in grouped:
        n_samples = len(group_data)
        if n_samples < 2:
            # A single row cannot be split; keep it for training.
            train_parts.append(group_data)
            print(f"Group '{group}' has only {n_samples} sample(s). Assigned to training set.")
            continue

        try:
            # Split into train and validation (fixed seed for reproducibility)
            train, val = train_test_split(
                group_data,
                test_size=val_ratio,
                random_state=42,
                shuffle=True
            )
        except ValueError as e:
            print(f"Error splitting group '{group}': {e}. Assigning all to training set.")
            train_parts.append(group_data)
            continue

        train_parts.append(train)
        val_parts.append(val)
        print(f"Group '{group}' split into {len(train)} train and {len(val)} validation samples.")

    train_set = pd.concat(train_parts, ignore_index=True) if train_parts else empty_like.copy()
    val_set = pd.concat(val_parts, ignore_index=True) if val_parts else empty_like.copy()

    # Validate final sets
    if len(train_set) == 0:
        print(f"Warning: Empty training set. Using {train_ratio:.0%} of all data for training.")
        train_set = data.sample(frac=train_ratio, random_state=42)
        val_set = data.drop(train_set.index)
        test_set = empty_like.copy()
    else:
        # groupby skips rows whose group key is missing, so they would otherwise
        # vanish from train/validation without any trace in the log.
        unassigned = len(historical_data) - len(train_set) - len(val_set)
        if unassigned > 0:
            print(
                f"Warning: {unassigned} historical row(s) were not assigned to train/validation "
                f"(rows with a missing '{group_col}' value are skipped by groupby)."
            )

    print("\nFinal split summary:")
    print(f"Training set: {len(train_set)} samples")
    print(f"Validation set: {len(val_set)} samples")
    print(f"Test set: {len(test_set)} samples")

    if len(train_set) == 0 or len(test_set) == 0:
        print("\nWarning: One or more sets are empty!")
        print(f"Training set columns: {train_set.columns.tolist()}")
        print(f"Test set columns: {test_set.columns.tolist()}")

    return train_set, val_set, test_set

In [11]:
# 2. Define the column cleaning functions
def clean_column_names(df):
    """
    Normalise the column names of ``df`` in place and return the same DataFrame.

    Steps applied to every column label:
    - The label is converted to ``str`` (so numeric labels are accepted).
    - Each run of non-word characters is replaced by a single underscore, and
      leading/trailing underscores are stripped.
    - A name that does not start with an ASCII letter is prefixed with 'f_'.
    - Repeated names get a numeric suffix ('name_1', 'name_2', ...). A suffix is
      skipped if the resulting name is already used by another column, so the
      final names are guaranteed to be unique.

    Parameters:
    - df (pd.DataFrame): Frame whose ``columns`` are rewritten (mutated in place)

    Returns:
    - df (pd.DataFrame): The same object, with cleaned column names
    """
    cleaned = []
    for col in df.columns:
        # str() lets non-string labels (e.g. integer headers) go through re.sub.
        name = re.sub(r'\W+', '_', str(col)).strip('_')
        # Ensure the name starts with a letter by prefixing with 'f_' if necessary.
        if not re.match(r'^[A-Za-z]', name):
            name = f'f_{name}'
        cleaned.append(name)

    # Reserve every cleaned name up front so a generated suffix such as 'a_1'
    # can never collide with a column that is genuinely called 'a_1'.
    taken = set(cleaned)
    seen = {}
    new_columns = []
    for name in cleaned:
        if name not in seen:
            # First occurrence keeps its name.
            seen[name] = 0
            new_columns.append(name)
            continue

        # Later occurrences: advance the counter until the candidate is free.
        while True:
            seen[name] += 1
            candidate = f"{name}_{seen[name]}"
            if candidate not in taken:
                break
        taken.add(candidate)
        new_columns.append(candidate)

    df.columns = new_columns

    return df

def verify_column_names(df):
    """
    Return the column names of ``df`` that are not model-friendly.

    A name is accepted when it starts with an ASCII letter and is followed only
    by word characters (letters, digits, underscores). Every other name is
    collected, in column order, and returned as a list; an empty list means all
    names passed the check.
    """
    rejected = []
    for name in df.columns:
        if re.match(r'^[A-Za-z]\w*$', name) is None:
            rejected.append(name)
    return rejected

In [12]:
# Load both datasets, normalise their column names, then split each one and
# save the train/val/test parts as CSV files.
def _load_and_clean(csv_path, label):
    """Read one CSV, report its shape, then clean and verify its column names.

    Returns a ``(frame, problematic_columns)`` tuple; the list is empty when
    every column name passed ``verify_column_names``.
    """
    frame = pd.read_csv(csv_path)
    print(f"\n{label} shape: {frame.shape}")
    # Clean column names
    frame = clean_column_names(frame)
    # Verify the names of the frame that was just cleaned
    problematic = verify_column_names(frame)
    if problematic:
        print(f"Problematic columns after cleaning: {problematic}")
        print("Further cleaning or renaming may be required.")
    else:
        print("All column names are clean and compatible with models.")
    return frame, problematic


def _save_splits(train, val, test, out_dir):
    """Write the three splits to ``out_dir`` as train.csv, val.csv and test.csv."""
    train.to_csv(os.path.join(out_dir, "train.csv"), index=False)
    val.to_csv(os.path.join(out_dir, "val.csv"), index=False)
    test.to_csv(os.path.join(out_dir, "test.csv"), index=False)


# NOTE(review): these are absolute, machine-specific paths; consider moving them
# to a configurable base directory.
data1, problematic_columns = _load_and_clean(
    "/home/dev/project/modelling/preprocessing/dataset1.csv", "Dataset 1"
)
# Dataset 5 is now verified against its own columns (previously data1 was re-checked).
data5, problematic_columns = _load_and_clean(
    "/home/dev/project/modelling/preprocessing/dataset5.csv", "Dataset 5"
)

# Define results directory
results_dir = "/home/dev/project/modelling/preprocessing/results"

# Create results directories for each dataset
dataset1_dir = os.path.join(results_dir, "dataset1")
dataset5_dir = os.path.join(results_dir, "dataset5")
os.makedirs(dataset1_dir, exist_ok=True)
os.makedirs(dataset5_dir, exist_ok=True)

# Split and save dataset1
train1, val1, test1 = split_data_by_group(data1, 'hocky_monhoc_count')
_save_splits(train1, val1, test1, dataset1_dir)

# Split and save dataset5
train5, val5, test5 = split_data_by_group(data5, 'hocky_monhoc_count')
_save_splits(train5, val5, test5, dataset5_dir)

print("Data split and saved successfully")


Dataset 1 shape: (354290, 71)
All column names are clean and compatible with models.

Dataset 5 shape: (36751, 71)
All column names are clean and compatible with models.

namhoc value counts:
namhoc
2013.0    13056
2014.0    24521
2015.0    34429
2016.0    41593
2017.0    45974
2018.0    51925
2019.0    59435
2020.0    44194
2021.0    29837
2022.0     9326
Name: count, dtype: int64

Initial split sizes:
Historical data (<=2020): 315127 samples
Future data (>2020): 39163 samples

Number of groups: 22
Group '0' split into 1 train and 1 validation samples.
Group '1' split into 45041 train and 11261 validation samples.
Group '2' split into 41328 train and 10332 validation samples.
Group '3' split into 33400 train and 8351 validation samples.
Group '4' split into 35388 train and 8848 validation samples.
Group '5' split into 23596 train and 5899 validation samples.
Group '6' split into 22887 train and 5722 validation samples.
Group '7' split into 17988 train and 4498 validation samples.
Gro