In [11]:
import os
import glob
import numpy as np
import pandas as pd
import re


# Specify the directory containing the .dat files
directory_path = './LHeC_Ee60/'

# Construct the full pattern for .dat files
file_pattern = os.path.join(directory_path, '*.dat')

# Use glob to get a list of all .dat files in the directory
dat_files = glob.glob(file_pattern)

In [12]:

##################################################################################################################
# Function to check if a line starts with a digit
def starts_with_digit(line):
    return line.strip() and (line.strip()[0].isdigit() or line.strip()[0] == '-' or line.strip()[0] == '.')

##################################################################################################################
# Function to read the file, skip lines starting with any string, and return a DataFrame

def expand_column_names(column_names_raw):
    column_names = []
    parts = column_names_raw.split(',')
    for part in parts:
        part = part.strip().replace("'", "")
        match = re.match(r"(\d+)\*(.+)", part)
        if match:
            count = int(match.group(1))
            string = match.group(2)
            column_names.extend([string] * count)
        else:
            column_names.append(part)
    return column_names
##################################################################################################################

def read_dat_file(file_path):
    from io import StringIO
    with open(file_path, 'r') as file:
        lines = file.readlines()
    
    # Find the line with column names
    column_names = None
    data_lines = []
    for line in lines:
        if line.startswith("  ColumnName = "):
            column_names_raw = line[len("  ColumnName = "):].strip()
            # Expand column names to handle patterns like 3*'string'
            column_names = expand_column_names(column_names_raw)
        elif starts_with_digit(line):
            data_lines.append(line)
    
    # Convert the filtered lines to a DataFrame
    filtered_content = '\n'.join(data_lines)
    data = pd.read_csv(StringIO(filtered_content), delim_whitespace=True, header=None)
    
    # Assign column names if found
    if column_names and len(column_names) == data.shape[1]:
        data.columns = column_names
    
    return data

##################################################################################################################
# function to generate Gaussian random number
def generate_random_number(mean, std_dev):
    return np.random.normal(mean, std_dev)

##################################################################################################################
# function to Get unique values in the 'column' column and produce corrsponding shifts
def generate_corr_err_shift(df , column):
    sysErrDict = {key: value for key, value in []}
    unique_values = df[column].unique()
    for elem in unique_values:
        sysErrDict[elem] = generate_random_number(0,elem)
    return sysErrDict

##################################################################################################################
#  function Find column names that start with 'esyst'
def find_corr_err(df):
    norm_columns = [col for col in df.columns if (col.startswith('e') or col.startswith('radco') or col.startswith('ccsys'))]
    return norm_columns
##################################################################################################################
# Function to write the filefrom a DataFrame
def write_dataframe_with_preamble(df, preamble_file, output_file):
    """
    Write lines from a preamble file and a Pandas DataFrame to a TSV file.
    
    Parameters:
    df (pd.DataFrame): The DataFrame to write to file.
    preamble_file (str): The name of the file to read the preamble lines from.
    output_file (str): The name of the output file to write to.
    """
    # Read lines from the preamble file
    preamble_lines = []
    with open(preamble_file, 'r') as file:
        lines = file.readlines()
    
    # Find the line which start with a string : preamble
    for line in lines:
        if starts_with_digit(line):
            continue
        else:
            preamble_lines.append(line)
    ###########################################################################
    # Open the output file in write mode
    with open(output_file, 'w') as file:
        # Write the preamble lines to the output file
        file.writelines(preamble_lines)
    
    # Write the DataFrame to the output file in TSV format, appending mode, w/o header
    df.to_csv(output_file, sep='\t', index=False, mode='a', header=False)

    print(f"DataFrame has been written to {output_file} with preamble from {preamble_file}")

In [14]:
# Iterate through the list of .dat files and process each one
for file_path in dat_files:
    try:
        df = read_dat_file(file_path)
        # Get the file name from the file path
        file_name = os.path.basename(file_path)
        file_name = file_name.replace(".dat","")
        
        # Loop through the column and update the column that needs changing
        sysErrDict = {key: value for key, value in []} 
        for column in find_corr_err(df):
            sysErrDict.update(generate_corr_err_shift(df , column))
     
        column_to_modify = 'reduced x-section'

        for i in range(len(df)):
            std  = np.sqrt(df.at[i, 'stat']**2 + df.at[i, 'uncor']**2) 
            std *= df.at[i, column_to_modify]/100.0 
            df.at[i, column_to_modify] += generate_random_number(0,std)

            sys_shift = 0.0
            for col in find_corr_err(df):
                print(f'row: {i} and error: {df.at[i, col]} and column: {col}, sysErrDict[df.at[i, col]] is:{sysErrDict[df.at[i, col]]}')
                sys_shift += sysErrDict[df.at[i, col]]**2
            
            sys_shift  = sys_shift**0.5
            sys_shift *= df.at[i, column_to_modify]/100.0    # all errors are in percent
            df.at[i, column_to_modify] += sys_shift * 000.0
        
    except Exception as e:
        print(f"An error occurred while processing {file_path}: {e}")

    preamble_file  = directory_path + file_name + ".dat"
    outputFileName = directory_path + "smeared/" + file_name + "_smeared.dat"
    write_dataframe_with_preamble(df, preamble_file, outputFileName)

row: 0 and error: 2.0 and column: ccsys, sysErrDict[df.at[i, col]] is:-2.8500431562316093
row: 1 and error: 2.0 and column: ccsys, sysErrDict[df.at[i, col]] is:-2.8500431562316093
row: 2 and error: 2.0 and column: ccsys, sysErrDict[df.at[i, col]] is:-2.8500431562316093
row: 3 and error: 2.0 and column: ccsys, sysErrDict[df.at[i, col]] is:-2.8500431562316093
row: 4 and error: 2.0 and column: ccsys, sysErrDict[df.at[i, col]] is:-2.8500431562316093
row: 5 and error: 2.0 and column: ccsys, sysErrDict[df.at[i, col]] is:-2.8500431562316093
row: 6 and error: 2.0 and column: ccsys, sysErrDict[df.at[i, col]] is:-2.8500431562316093
row: 7 and error: 2.0 and column: ccsys, sysErrDict[df.at[i, col]] is:-2.8500431562316093
row: 8 and error: 2.0 and column: ccsys, sysErrDict[df.at[i, col]] is:-2.8500431562316093
row: 9 and error: 2.0 and column: ccsys, sysErrDict[df.at[i, col]] is:-2.8500431562316093
row: 10 and error: 2.0 and column: ccsys, sysErrDict[df.at[i, col]] is:-2.8500431562316093
row: 11 a