In [1]:
import pandas as pd
import numpy as np
import os
import csv
from sklearn.model_selection import train_test_split
from utils.utils import clean_row

In [2]:
# Input/output locations used by the rest of the notebook (relative paths).
path_save = "data/synthetic/processed"  # cleaned Train_*/Test_* files are written here
path_raw = "data/synthetic/raw"  # raw CSV files are read from here


In [3]:
# Read all DataFrames
# Every file in path_raw whose name does not contain "med" is loaded as a
# header-less CSV with a label column followed by seven feature columns.
column_names = ['label', 't7', 't6', 't5', 't4', 't3', 't2', 't1']
loaded_names = []
dfs = []
# os.listdir returns entries in arbitrary order; sorting makes dfs[0]
# (and everything derived from it) the same on every run and machine.
for f in sorted(os.listdir(path_raw)):
    if "med" in f:
        continue
    try:
        df = pd.read_csv(os.path.join(path_raw, f), header=None, names=column_names)
    except Exception as e:
        # Best effort: report the unreadable file and carry on with the rest.
        print(f"Error reading {f}: {e}")
        continue
    dfs.append(df)
    loaded_names.append(f)

# Later cells pair each DataFrame with its file name by position in `listdir`,
# so it must contain exactly the files that were loaded, in the same order
# (no "med" files, no files that failed to parse).
listdir = loaded_names

if not dfs:
    raise ValueError("No valid data files found. Please check the data directory.")


In [4]:
# Preview the first loaded DataFrame: its first two and last two rows.
first_rows = dfs[0].head(2)
last_rows = dfs[0].tail(2)
display(pd.concat([first_rows, last_rows]))

Unnamed: 0,label,t7,t6,t5,t4,t3,t2,t1
0,1,0,0,0,0,2,2,3
1,1,0,0,0,0,2,0,3
398,0,0,0,0,0,0,3,2
399,0,0,2,0,0,0,0,2


In [6]:
# Use the first DataFrame to generate train/test indices
label_col = 'label'
df = dfs[0]
X = df.drop(label_col, axis=1)
y = df[label_col]

# Generate train/test indices with stratified sampling
# (20% of the rows go to the test set; random_state fixes the shuffle so the
# split is reproducible; stratify=y keeps the label proportions in both parts).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Row labels (index values) of each split; reused below to slice every DataFrame
indices_train = X_train.index
indices_test = X_test.index

# The label files are written one level above path_save. On POSIX systems the
# ".." component only resolves if path_save itself exists, so create it first
# instead of failing on a fresh checkout.
os.makedirs(path_save, exist_ok=True)
y_train.to_csv(f"{path_save}/../y_train.csv")
y_test.to_csv(f"{path_save}/../y_test.csv")

In [7]:
# Sanity check: show the first ten row labels assigned to each split.
train_preview = indices_train[:10]
test_preview = indices_test[:10]
print(f"train index: {train_preview} \ntest index: {test_preview}")

train index: Index([190, 126, 339, 319, 277, 116, 102, 291, 289, 314], dtype='int64') 
test index: Index([271, 51, 96, 392, 351, 252, 250, 307, 396, 182], dtype='int64')


In [8]:
# Apply the same split to all DataFrames and clean rows
def _write_cleaned_rows(split_df, filename):
    """Clean every row of ``split_df`` with ``clean_row`` and write it to ``filename``.

    Each output line starts with the value of the row's first column (the
    label), followed by whatever ``clean_row`` returns for the remaining
    columns. One CSV line is written per input row.
    """
    # newline='' hands line-ending control to csv.writer, so the explicit
    # "\n" terminator is written as-is on every platform.
    with open(filename, 'w', newline='') as file:
        writer = csv.writer(file, delimiter=',', lineterminator='\n')
        for _, row in split_df.iterrows():
            label = row.iloc[0]
            cleaned = clean_row(row.iloc[1:])  # cleaned once per row and reused
            if len(cleaned) > 1:
                row_cleaned = np.concatenate([[label], cleaned])
                # Strip surrounding whitespace (e.g. a trailing newline) from each field
                row_cleaned = [str(x).strip() for x in row_cleaned]
                writer.writerow(row_cleaned)
            else:
                # NOTE(review): when the cleaned part has 0 or 1 values only the
                # label is written, so a single remaining value is dropped --
                # confirm that `> 1` (rather than `> 0`) is intended.
                writer.writerow([label])


# `dfs` only holds files whose name does not contain "med", so output names
# must come from the same filtered list; indexing the raw directory listing
# would pair a DataFrame with the wrong file name.
source_names = [name for name in listdir if "med" not in name]
if len(source_names) != len(dfs):
    # Refuse to write files under the wrong name (e.g. after a failed read).
    raise ValueError(
        f"{len(dfs)} DataFrames were loaded but {len(source_names)} candidate "
        "file names were found; cannot match outputs to their source files."
    )

# Make sure the output directory exists before writing into it
os.makedirs(path_save, exist_ok=True)

for i, df in enumerate(dfs):
    # Select rows by index label; every DataFrame must contain the labels
    # that were drawn from the first one.
    train_df = df.loc[indices_train]
    test_df = df.loc[indices_test]

    # Clean and save the train set
    train_filename = os.path.join(path_save, f'Train_{source_names[i]}')
    _write_cleaned_rows(train_df, train_filename)

    # Clean and save the test set
    test_filename = os.path.join(path_save, f'Test_{source_names[i]}')
    _write_cleaned_rows(test_df, test_filename)

    print(f"train set cleaned and saved to {train_filename}")
    print(f"test set cleaned and saved to {test_filename}")
    

train set cleaned and saved to data/synthetic/processed/Train_feature1.csv
test set cleaned and saved to data/synthetic/processed/Test_feature1.csv
