In [38]:
import pandas as pd
import os
import math
import numpy as np	

In [143]:
def remove_highly_correlated_features(X_train, threshold=0.7):
    """
    Select a non-redundant subset of feature columns, one group at a time.

    Columns are grouped by base name, i.e. the part of the column name
    before its last underscore (a name without an underscore forms its own
    group). Within each group the absolute pairwise correlation matrix is
    computed, and a column is discarded when its absolute correlation with
    any column that precedes it in the group (including columns that are
    themselves discarded) is strictly greater than ``threshold``.

    ``X_train`` is not modified; only the names of the surviving columns are
    returned. A per-group and an overall before/after summary is printed.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature table whose columns are to be filtered.
    threshold : float, default 0.7
        Absolute-correlation cut-off above which a later column is dropped.

    Returns
    -------
    list
        Column names to keep, ordered by group (first-seen order) and, inside
        each group, by their original column order.
    """

    # Group column names by base name, preserving first-seen order.
    dist_dict = {}
    for feature in X_train.columns:
        # rsplit(..., 1)[0] accepts names with zero, one or several
        # underscores; tuple-unpacking a plain split("_") would raise for
        # anything but exactly one underscore (e.g. a "label" column).
        dist_name = feature.rsplit("_", 1)[0]
        dist_dict.setdefault(dist_name, []).append(feature)

    keep_features = []

    # Filter each group independently.
    for key, features in dist_dict.items():
        corr_matrix = X_train[features].corr().abs()

        # Keep only the strict upper triangle so every pair is examined once
        # and each column is compared only with the columns before it.
        mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
        upper_triangle = corr_matrix.where(mask)

        # NaN entries (masked cells, undefined correlations) compare False.
        exceeds = (upper_triangle > threshold).any(axis=0)
        to_drop = set(exceeds[exceeds].index)

        # Build the kept list instead of dropping in place on a slice of
        # X_train, which would trigger chained-assignment warnings.
        kept = [feature for feature in features if feature not in to_drop]
        keep_features.extend(kept)

        print(f"Category '{key}':")
        print(f"  Number of features before removal: {len(features)}")
        print(f"  Number of features after removal: {len(kept)}")

    print(f"\nTotal number of features before removal: {X_train.shape[1]}")
    print(f"Total number of features after removal: {len(keep_features)}")

    return keep_features

 - Step 1: Convert shapelets to features
     - Transform mined shapelets into feature vectors by measuring distances to the original time series.
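 - Step 2: Remove redundancy
     - Ensure extracted patterns are informative, relevant, and not excessively similar: within each distance-measure group, drop any feature whose absolute correlation with an earlier feature in that group exceeds the chosen threshold.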


In [139]:
# Directory that holds the per-distance Train_*/Test_* metrics CSV files.
path_res = 'csv_results'
# os.listdir returns entries in arbitrary order; sorting makes the order in
# which files are processed (and therefore the feature column order)
# reproducible across runs and machines.
listdir = sorted(os.listdir(path_res))

# Feature tables filled column by column by the loading loop; re-running this
# cell resets them to empty frames.
Train = pd.DataFrame()
Test = pd.DataFrame()

## Step 1: Convert shapelets to features

In [140]:
# Per file, keep at most this many shapelets (smallest training p_val first).
MAX_SHAPELETS = 100

for dist_file in listdir:
    if "Train" not in dist_file:
        continue

    train_dist_path = os.path.join(path_res, dist_file)

    try:
        # The second underscore-separated token of the file name is used as
        # the feature group name. Kept inside the try so that an unexpected
        # file name is reported like any other per-file failure instead of
        # aborting the whole loop.
        dist_name = dist_file.split('_')[1]
        test_dist_path = os.path.join(path_res, f"Test_{dist_name}_2_4_metrics.csv")

        df_train = pd.read_csv(train_dist_path)
        df_test = pd.read_csv(test_dist_path)

        # Order rows by training p-value and apply the same row order to the
        # test table. NOTE(review): this assumes row k of the test file
        # describes the same shapelet as row k of the train file - confirm.
        df_train = df_train.sort_values('p_val')
        df_test = df_test.reindex(df_train.index)

        df_train = df_train.reset_index(drop=True)
        df_test = df_test.reset_index(drop=True)

        # Parse BOTH tables completely before touching Train/Test, so that a
        # read or parse failure in either file cannot leave Train with
        # columns that Test lacks (which would break Test[keep_feat] later).
        parsed_columns = []
        for df in (df_train, df_test):
            columns = {}
            for i in range(min(len(df), MAX_SHAPELETS)):
                # Shapelet text such as "[2, 3]" becomes the suffix "23".
                # NOTE(review): digits are joined without a separator, so
                # e.g. [1, 23] and [12, 3] both yield "123" and the later
                # one overwrites the earlier column.
                nums = [int(num) for num in df.loc[i, "shapelet"][1:-1].split(', ')]
                string = ''.join(map(str, nums))

                # Distances text such as "[nan, 0.0, 1.0]": NaN is kept,
                # every other value is truncated to an integer.
                list_ = df.loc[i, 'distances'][1:-1].split(',')
                dist = [np.nan if math.isnan(value) else int(value)
                        for value in map(float, list_)]

                columns[f'{dist_name}_{str(string)}'] = dist
            parsed_columns.append(columns)

        # Commit the parsed columns to the shared feature tables.
        for save_df, columns in zip([Train, Test], parsed_columns):
            for column_name, dist in columns.items():
                save_df[column_name] = dist
    except Exception as e:
        print(f"Error processing {train_dist_path}: {e}")


# Train['label'] = (pd.read_csv("data/synthetic/y_train.csv")["label"]).astype(int)
# Test['label'] = (pd.read_csv("data/synthetic/y_test.csv")["label"]).astype(int)

In [141]:
# Peek at the raw test metrics table. df_test is the variable left over from
# the loading loop, so this shows only the last file that loop read.
df_test.head()

Unnamed: 0,threshold,acc,p_val,contingency,shapelet,distances
0,0,0.675,0.000335,"[17, 25, 3, 39]","[2, 3]","[nan, 0.0, nan, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, ..."
1,1,0.675,0.000335,"[17, 25, 3, 39]","[1, 3]","[nan, 1.0, nan, 2.0, 5.0, 2.0, 4.0, 2.0, 2.0, ..."


In [142]:
# Show the first two and the last two rows of the assembled test feature table.
display(pd.concat([Test.iloc[:2], Test.iloc[-2:]]))

Unnamed: 0,feature1_23,feature1_13
0,,
1,0.0,1.0
78,0.0,1.0
79,1.0,2.0


## Step 2: Remove redundant (highly correlated) features

In [146]:
# Choose the columns to keep from the training table only, using a 0.6
# absolute-correlation cut-off.
keep_feat = remove_highly_correlated_features(Train, threshold=0.6)

# Apply the same column selection to both splits.
X_train = Train.loc[:, keep_feat]
X_test = Test.loc[:, keep_feat]

# Re-select the test columns using the training frame's column index.
X_test = X_test.loc[:, X_train.columns]


Category 'feature1':
  Number of features before removal: 2
  Number of features after removal: 1

Total number of features before removal: 2
Total number of features after removal: 1


In [147]:
# Show the first two and the last two rows of the filtered test feature table.
display(pd.concat([X_test.iloc[:2], X_test.iloc[-2:]]))

Unnamed: 0,feature1_23
0,
1,0.0
78,0.0
79,1.0


In [148]:
# Destination folder for the final feature matrices; created when missing.
save_path = os.path.join('data', 'synthetic')
os.makedirs(save_path, exist_ok=True)

# Optional: if static patient details (e.g. age, gender) are available, append
# them here. Their rows must be in the same patient order as X_train / X_test.

# X_train_updated = pd.concat([X_train, static_features_train], axis=1)
# X_test_updated = pd.concat([X_test, static_features_test], axis=1)

# X_train_updated.to_csv(os.path.join(save_path, 'X_train.csv'), index=False)
# X_test_updated.to_csv(os.path.join(save_path, 'X_test.csv'), index=False)

# Write the train split first, then the test split, without the index column.
for _file_name, _frame in (('X_train.csv', X_train), ('X_test.csv', X_test)):
    _frame.to_csv(os.path.join(save_path, _file_name), index=False)