In [None]:
import numpy as np
import pandas as pd
import os
from config import *
def filelist(p):
    """
    Build the paths of the five imputed CSV blocks stored under a directory.

    Parameters:
    p (str): Directory that holds the imputed CSV files. It is joined to each
        file name with a literal '/', so a trailing slash in `p` is kept as-is.

    Returns:
    list: Five path strings in a fixed order: baseline, lifestyle,
        measurement, environment, genetic.
    """
    # The order of this tuple is the order of the returned paths
    block_names = ("baseline", "lifestyle", "measurement", "environment", "genetic")
    return [f"{p}/{name}_imputed.csv" for name in block_names]


In [None]:
# Folder (relative to this notebook) that holds the 'Process_missingness' results
p = '../../results/Process_missingness/'

# Display the folder together with its current contents as a quick sanity check
(p, os.listdir(p))


In [None]:
# Load every imputed CSV as a DataFrame, keeping the order returned by filelist()
Xdata = [pd.read_csv(csv_path) for csv_path in filelist(p)]

In [None]:
# Create a priority list for each field

# Place all DataFrames in Xdata side by side (axis=1) and drop every 'eid' column
t = pd.concat(Xdata, axis=1).drop('eid', axis=1)

# Read the showcase table once and reuse it for both columns,
# then map each FieldID to its Priority
showcase = pd.read_csv('../../data/Preprocess/showcase.csv')
pridict = dict(zip(showcase['FieldID'], showcase['Priority']))

# One priority value per column of 't', in column order
priority = []

for i in list(t.columns):
    if 'PRSPC' in i:
        # Columns whose name contains 'PRSPC' get a fixed priority of 3
        # instead of a showcase lookup
        priority.append(3)
    else:
        # The FieldID is the leading integer of the column name: the text
        # before the first '#', then before the first '-'
        field_id = int(i.split('#')[0].split('-')[0])
        # Raises KeyError if the FieldID is not present in the showcase table
        priority.append(pridict[field_id])

# Display the 'pridict' dictionary
pridict


In [None]:
# Re-load the blocks as plain NumPy arrays (this rebinds Xdata to a list of arrays)
Xdata = []

# Built once: position k holds the priority of feature column k
priority_arr = np.array(priority)

# Stack the blocks cumulatively and, for every priority level, save the matching columns
for i, csv_path in enumerate(filelist(p)):
    # [:, 1:] drops the first column of each file (assumed to be 'eid')
    Xdata.append(pd.read_csv(csv_path).to_numpy()[:, 1:])

    # All blocks loaded so far, side by side. This does not depend on the
    # priority level, so it is built once per block rather than once per level.
    temp = np.concatenate(Xdata, axis=1)
    block_priority = priority_arr[:temp.shape[1]]

    # Iterate over priority levels 1, 2, and 3
    for j in [1, 2, 3]:
        # Keep only the columns whose priority is less than or equal to j
        out = temp[:, np.where(block_priority <= j)[0]]

        # makedirs also creates the intermediate 'priority/' directory
        os.makedirs(f'{Xblocklocation}priority/{j}', exist_ok=True)

        # np.save appends '.npy' to the given file name
        np.save(f'{Xblocklocation}priority/{j}/blk{i+1}', out)


In [None]:
import numpy as np  # Import the NumPy library for numerical operations
import pandas as pd  # Import the Pandas library for data manipulation and analysis

# Not read or filled anywhere in this cell
plist = []

# Per-file feature arrays, in filelist() order
Xdata = []

# Write one cumulative block per input file: block k holds the columns of files 1..k
for i, csv_path in enumerate(filelist(p)):
    # Load the file as a NumPy array without its first column (assumed to be 'eid')
    data_array = pd.read_csv(csv_path).to_numpy()[:, 1:]
    Xdata.append(data_array)

    # Side-by-side stack (axis=1) of everything loaded so far
    concatenated_data = np.concatenate(Xdata, axis=1)

    # Target path without extension; np.save adds '.npy' on its own
    filename = f'{Xblocklocation}blk{i+1}'

    # Create the parent directory if needed, then write the array
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    np.save(filename, concatenated_data)

    # Progress message for each saved block
    print(f"Saved concatenated data to {filename}")
