In [None]:
import pandas as pd
import numpy as np
import os

In [None]:
# Accumulator shared by every process_file() call: one dict per accepted file
data = []


def _header_value(line):
    """Return the whitespace-stripped text after the first '=' in ``line``, or None if there is no '='."""
    _, separator, value = line.partition("=")
    return value.strip() if separator else None


# function to process individual file
def process_file(path, filename):
    """Parse one spectrum file and append its contents to the module-level ``data`` list.

    A file is accepted only if its first line starts with '##' and some line
    contains 'XYDATA'. Header lines seen before the XYDATA line that contain
    'TITLE', 'XUNITS' or 'YUNITS' populate the 'label', 'xunits' and 'yunits'
    keys; a key is absent when its header is missing or has no '='. Values are
    everything after the first '=' with surrounding whitespace removed, so a
    title containing '=' is kept whole and ' MICROMETERS' compares equal to
    'MICROMETERS'.

    Parameters
    ----------
    path : str
        Directory containing the file.
    filename : str
        Name of the file inside ``path``.

    Returns
    -------
    None
        The record is appended to ``data``; rejected files (undecodable, empty,
        not starting with '##', or without XYDATA) are skipped silently apart
        from the message printed for decode errors.
    """
    full_path = os.path.join(path, filename)
    try:
        # NOTE(review): no encoding is passed, so decoding uses the platform default,
        # which is not necessarily UTF-8 despite the message below — confirm intent.
        with open(full_path) as f:
            lines = f.readlines()
    except UnicodeDecodeError:
        print('Skipping non-UTF8 file: ', filename)
        return

    # An empty file has no first line to inspect; treat it like any other non-matching file
    if not lines or not lines[0].startswith('##'):
        return

    file_data = {'filename': filename}
    header_keys = (("TITLE", 'label'), ("XUNITS", 'xunits'), ("YUNITS", 'yunits'))

    # 1-based number of the line that contains XYDATA (None until found)
    xydata_line = None
    for line_number, line in enumerate(lines, start=1):
        for marker, key in header_keys:
            if marker in line:
                value = _header_value(line)
                # A marker line without '=' carries no value; leave the key unset
                if value is not None:
                    file_data[key] = value
        if "XYDATA" in line:
            xydata_line = line_number
            break

    # Skip the file if it does not contain XYDATA
    if xydata_line is None:
        return

    print(filename)
    # Skips every line through the XYDATA line plus one further line, and drops the
    # last two lines of the file.
    # NOTE(review): presumably those extra lines are non-numeric in the input files —
    # confirm, otherwise the first/last data rows are being discarded.
    df = pd.read_csv(full_path, header=None, sep=' ', skiprows=xydata_line + 1, skipfooter=2, engine='python')

    # Column 0 is X; every remaining column is kept as its own Y series
    file_data['xdata'] = df[0].tolist()
    file_data['ydata'] = [df[i].tolist() for i in range(1, len(df.columns))]

    data.append(file_data)

# Walk a directory tree, handing every regular file to process_file
def process_directory(path):
    """Recursively visit ``path`` and call ``process_file`` on each regular file.

    Entries are visited in the order returned by ``os.listdir``; a
    sub-directory is descended into at the point where it is encountered.
    Entries that are neither a file nor a directory are ignored.
    """
    for entry in os.listdir(path):
        entry_path = os.path.join(path, entry)
        if os.path.isdir(entry_path):
            process_directory(entry_path)
        elif os.path.isfile(entry_path):
            process_file(path, entry)

# Start processing from the current working directory. Every file accepted by
# process_file is appended to the module-level `data` list as a side effect.
process_directory('.')

In [None]:
# Keep only the first record seen for each title (label)
unique_data = []
unique_titles = set()
for record in data:
    title = record['label']
    if title in unique_titles:
        continue
    unique_titles.add(title)
    unique_data.append(record)

# Map each label to the number of Y series stored for it
xy_pair_counts = {record['label']: len(record['ydata']) for record in unique_data}

# Report the per-label counts
for label, count in xy_pair_counts.items():
    print(f"Label: {label}, Number of XY pairs: {count}")


In [None]:
import numpy as np
from scipy import interpolate
import pandas as pd

# Interpolation on X direction: every spectrum is resampled onto this common grid
# (integers 700..3499, step 1)
x_new = np.arange(700, 3500, 1)

interpolated_data = []
for d in unique_data:
    temp_xx = np.array(d['xdata'])

    # Convert units (presumably micrometers -> wavenumbers in cm^-1; verify against the data source)
    if d['xunits'] == 'MICROMETERS':
        temp_xx = 10000 / temp_xx

    # Decide ONCE, before touching temp_xx, whether the axis is descending.
    # The same decision must be applied to X and to every Y series: re-testing
    # temp_xx after it has been flipped is always False and would leave Y reversed
    # relative to X.
    descending = temp_xx[0] > temp_xx[-1]
    if descending:
        temp_xx = np.flip(temp_xx)

    for y_data in d['ydata']:
        temp_yy = np.array(y_data)

        # Scale so the largest value is 1, then invert transmittance-type data
        temp_yy = temp_yy / np.max(temp_yy)
        if d['yunits'] == 'TRANSMITTANCE':
            temp_yy = 1 - temp_yy

        # Keep Y aligned with the (possibly flipped) X axis
        if descending:
            temp_yy = np.flip(temp_yy)

        # Fit a spline through the points and evaluate it on the common grid.
        # NOTE(review): grid points outside the measured X range are extrapolated by
        # splev's default behaviour — confirm that is acceptable for short spectra.
        tck = interpolate.splrep(temp_xx, temp_yy)
        y_bspline = interpolate.splev(x_new, tck)

        # Append to the list
        interpolated_data.append({'label': d['label'], 'ydata': y_bspline})

# Convert to DataFrame in one step: one row per interpolated spectrum, columns
# 'label', 0, 1, ..., len(x_new) - 1, with a fresh 0..n-1 row index.
# (DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.)
df = pd.DataFrame([spectrum['ydata'] for spectrum in interpolated_data])
df.insert(0, 'label', [spectrum['label'] for spectrum in interpolated_data])


In [None]:
...

In [None]:
...

In [None]:
...

In [None]:
# Per-label hold-out: one randomly chosen row of each label goes to the test set,
# all remaining rows of that label go to the training set
train_data, test_data = [], []

for current_label in df['label'].unique():
    rows_for_label = df.loc[df['label'] == current_label]
    held_out = rows_for_label.sample(n=1)

    test_data.append(held_out)
    train_data.append(rows_for_label.drop(index=held_out.index))

# Stitch the per-label pieces back together
train_df = pd.concat(train_data)
test_df = pd.concat(test_data)


In [None]:
def custom_aug_function1(row):
    """Return a randomly augmented copy of one spectrum row.

    Parameters
    ----------
    row : pandas.Series
        Must have a 'label' entry; every value after the first position is
        treated as the spectrum.

    Returns
    -------
    pandas.Series
        Same index as ``row``: the label followed by the augmented spectrum.

    Steps, in order: block-wise random elimination, block-wise random scaling,
    min-max normalisation, additive noise (currently disabled, intensity 0),
    normalisation again, and a random shift by -25..24 points with zero fill.
    """
    label = row['label']
    theoretical = row.values[1:]
    copy_theo = np.copy(theoretical)  # work on a copy so the caller's data is untouched
    n_points = len(copy_theo)

    # Random elimination: 300 keep(1)/drop(0) flags, 80 % keep, each stretched over
    # n_points // 300 consecutive points. Points past the last full block get a 0
    # flag, i.e. the remainder at the end of the spectrum is always eliminated.
    dum1 = np.repeat(np.random.choice([0, 1], 300, p=[0.2, 0.8]), n_points // 300)
    dum1 = np.append(dum1, np.zeros([n_points - len(dum1), ]))
    copy_theo = np.multiply(copy_theo, dum1)

    # Random scaling: 150 uniform factors in [0, 1), each stretched over
    # n_points // 150 points; the remainder is again multiplied by 0.
    dum2 = np.repeat(np.random.rand(150,), n_points // 150)
    dum2 = np.append(dum2, np.zeros([n_points - len(dum2), ]))
    copy_theo = np.multiply(copy_theo, dum2)

    # Normalize the data to [0, 1); the 1e-9 avoids division by zero on flat input
    copy_theo_elimination_scaling = copy_theo
    copy_theo_elimination_scaling = (copy_theo_elimination_scaling - np.min(copy_theo_elimination_scaling)) / (
        np.max(copy_theo_elimination_scaling) - np.min(copy_theo_elimination_scaling) + 1e-9)

    # Add noise. With intensity 0 the noise is all zeros, but the draw is kept so the
    # random stream is consumed exactly as before.
    noise_intensity = 0
    noise = np.random.normal(0, noise_intensity * np.max(copy_theo_elimination_scaling), len(copy_theo_elimination_scaling))
    copy_theo_elimination_scaling += noise

    # Normalize again
    copy_theo_elimination_scaling = (copy_theo_elimination_scaling - np.min(copy_theo_elimination_scaling)) / (
        np.max(copy_theo_elimination_scaling) - np.min(copy_theo_elimination_scaling) + 1e-9)

    # Left-right shifting process; randint's upper bound is exclusive, so shift is in [-25, 24]
    shift = np.random.randint(-25, 25)
    if shift >= 0:
        # Drop the first `shift` points and zero-pad the end: content moves toward index 0
        copy_theo_elimination_scaling = np.append(copy_theo_elimination_scaling[shift:], np.zeros([shift, ]))
    else:
        # Zero-pad the start and drop the last |shift| points: content moves toward the end.
        # (Padding the end here, as before, only blanked the tail and shifted nothing.)
        copy_theo_elimination_scaling = np.append(np.zeros([-shift, ]),
                                                  copy_theo_elimination_scaling[:n_points + shift])

    # Construct augmented row data with label
    augmented_row = np.insert(copy_theo_elimination_scaling, 0, label)
    return pd.Series(augmented_row, index=row.index)


In [None]:
#Our Augmentation Method
def custom_aug_function2(row):
    """Return a randomly augmented copy of one spectrum row.

    Parameters
    ----------
    row : pandas.Series or sequence
        First position is the label, the rest is the spectrum.

    Returns
    -------
    numpy.ndarray
        The label followed by the augmented spectrum (same length as ``row``).

    Steps, in order: random elimination in runs of 7-8 points, random scaling in
    runs of 14-16 points, min-max normalisation, additive noise (currently
    disabled, intensity 0), normalisation again, and a random shift by -25..24
    points with zero fill.
    """
    # Access by POSITION. On a Series whose index mixes 'label' with integer column
    # names, row[0] is resolved as the label 0 (the first spectral value), and the
    # meaning of row[1:] depends on the pandas version.
    if isinstance(row, pd.Series):
        label = row.iloc[0]
        theoretical = row.iloc[1:]
    else:
        label = row[0]
        theoretical = row[1:]

    copy_theo = np.copy(theoretical)

    # Random elimination: consecutive runs of 7 or 8 points share one keep(1)/drop(0)
    # flag drawn with 80 % keep probability
    dum1 = []
    while len(dum1) < len(copy_theo):
        repeat_count = np.random.randint(7, 9)
        dum1 = np.append(dum1, np.repeat(np.random.choice([0, 1], 1, p=[0.2, 0.8]), repeat_count))
    dum1 = dum1[:len(copy_theo)]  # if dum1 is longer than copy_theo, truncate it
    copy_theo = np.multiply(copy_theo, dum1)

    # Random scaling: consecutive runs of 14 to 16 points share one uniform factor in [0, 1)
    dum2 = []
    while len(dum2) < len(copy_theo):
        repeat_count = np.random.randint(14, 17)
        dum2 = np.append(dum2, np.repeat(np.random.rand(1), repeat_count))
    dum2 = dum2[:len(copy_theo)]  # if dum2 is longer than copy_theo, truncate it
    copy_theo = np.multiply(copy_theo, dum2)

    # Normalize the data to [0, 1); the 1e-9 avoids division by zero on flat input
    copy_theo_elimination_scaling = copy_theo
    copy_theo_elimination_scaling = (copy_theo_elimination_scaling - np.min(copy_theo_elimination_scaling)) / (
        np.max(copy_theo_elimination_scaling) - np.min(copy_theo_elimination_scaling) + 1e-9)

    # Add noise. With intensity 0 the noise is all zeros, but the draw is kept so the
    # random stream is consumed exactly as before.
    noise_intensity = 0
    noise = np.random.normal(0, noise_intensity * np.max(copy_theo_elimination_scaling), len(copy_theo_elimination_scaling))
    copy_theo_elimination_scaling += noise

    # Normalize again
    copy_theo_elimination_scaling = (copy_theo_elimination_scaling - np.min(copy_theo_elimination_scaling)) / (
        np.max(copy_theo_elimination_scaling) - np.min(copy_theo_elimination_scaling) + 1e-9)

    # Left-right shifting process; randint's upper bound is exclusive, so shift is in [-25, 24]
    shift = np.random.randint(-25, 25)
    if shift >= 0:
        # Drop the first `shift` points and zero-pad the end: content moves toward index 0
        copy_theo_elimination_scaling = np.append(copy_theo_elimination_scaling[shift:], np.zeros([shift,]))
    else:
        # Zero-pad the start and drop the last |shift| points: content moves toward the end
        copy_theo_elimination_scaling = np.append(np.zeros([abs(shift),]), copy_theo_elimination_scaling[:len(copy_theo) + shift])

    # Construct augmented row data with label
    augmented_row = np.insert(copy_theo_elimination_scaling, 0, label)
    return augmented_row


In [None]:
# Number of augmented samples to generate for every label
target_num = 50

# Distinct labels present in the training set, in order of first appearance
unique_labels = train_df['label'].unique()

# For each label, the positional (0-based) row numbers in train_df that carry it
label_positions = {}
_train_labels = train_df['label'].to_numpy()
for _current in unique_labels:
    label_positions[_current] = np.flatnonzero(_train_labels == _current)

In [None]:
# Build target_num augmented samples per label from randomly chosen training rows
# NOTE(review): tqdm is not imported in any cell visible here — confirm an earlier
# cell provides it, otherwise this loop raises NameError on a fresh kernel.
augmented_data_list = []
for label in tqdm(unique_labels, desc="Processing labels"):
    candidate_positions = label_positions[label]

    # Each draw picks one training row of this label (by position) and augments it
    augmented_data_list.extend(
        custom_aug_function2(train_df.iloc[np.random.choice(candidate_positions)])
        for _ in range(target_num)
    )

# One row per augmented sample, reusing the training frame's column names
augmented_df = pd.DataFrame(augmented_data_list, columns=train_df.columns)


In [None]:
# Put 'label' first and keep every other column in its current relative order
new_columns_order = ['label', *(col for col in augmented_df.columns if col != 'label')]

# Select the columns in that order
augmented_df = augmented_df[new_columns_order]

In [None]:
# Save the train and test dataframes