#### Normalizing each `.pcl` file (z-score per experiment column, written back in place)

In [None]:
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Folder whose .pcl files are standardized and rewritten in place
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/normalized_pcls'

# Annotation columns that the scaler must leave untouched
annotation_columns = ('GWEIGHT', 'NAME', 'IDENTIFIER', 'Description', 'GENE')

files_processed = 0

for file_name in os.listdir(pcl_directory):
    # Ignore anything in the folder that is not a .pcl file
    if not file_name.endswith(".pcl"):
        continue

    file_path = os.path.join(pcl_directory, file_name)
    try:
        # Tab-separated table; the first column becomes the row index
        df = pd.read_csv(file_path, sep="\t", index_col=0)

        # Every column that is not a known annotation column is treated as an experiment
        experiment_columns = [col for col in df.columns if col not in annotation_columns]

        # Fresh scaler per file: each experiment column is z-scored within that file only
        scaler = StandardScaler()
        df[experiment_columns] = scaler.fit_transform(df[experiment_columns])

        # Overwrite the input file with the standardized values
        df.to_csv(file_path, sep="\t")
    except Exception as e:
        # Best effort: report the failing file and continue with the remaining ones
        print(f"Error processing file '{file_name}': {e}")
    else:
        # Only files that were read, scaled and written without error are counted
        files_processed += 1

print(f"Processed {files_processed} files.")

#### Merging the `.pcl` files into one master matrix

In [2]:
import os
import pandas as pd

# Directory containing the .pcl files to merge
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/test_pcls'

# Destination of the merged matrix (written tab-separated, despite the .csv extension)
output_path = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/output/test.csv'

# Annotation columns that are not experiments. 'GENE' is listed so this agrees with
# the exclusion list used by the normalization step; an annotation column left in
# would collide between files and make the join below fail for that file.
non_experiment_columns = ['GWEIGHT', 'NAME', 'IDENTIFIER', 'Description', 'GENE']

# Create the output folder up front so the final save cannot fail on a missing
# directory after all the merging work has been done.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Prepare an empty DataFrame to store the master matrix
master_df = pd.DataFrame()

# os.listdir returns names in arbitrary order; sorting makes the column order of
# the master matrix reproducible from run to run.
for file_name in sorted(os.listdir(pcl_directory)):
    if file_name.endswith(".pcl"):
        file_path = os.path.join(pcl_directory, file_name)
        try:
            # Load the .pcl file (tab-separated, first column used as the row index)
            df = pd.read_csv(file_path, sep="\t", index_col=0)

            # Keep only the experiment columns
            experiment_columns = [col for col in df.columns if col not in non_experiment_columns]
            df = df[experiment_columns]

            # Merge into the master matrix, aligning rows on the index.
            # NOTE: `.empty` is also True for a frame with no rows or no columns,
            # so such a frame is replaced by the next file rather than joined.
            if master_df.empty:
                master_df = df
            else:
                master_df = master_df.join(df, how='outer')  # 'outer' join keeps every row label

        except Exception as e:
            # Best effort: report the failing file and continue with the remaining ones
            print(f"Error processing file '{file_name}': {e}")

# Save the master matrix to a new file
master_df.to_csv(output_path, sep="\t")
print("Master matrix created and saved.")

Master matrix created and saved.


In [5]:
import pandas as pd

# Three hand-written .pcl-like tables, each indexed by gene name.
# Consecutive tables share exactly one gene (gene3, then gene5).
df1 = pd.DataFrame({
    'GWEIGHT': [1, 1, 1],
    'NAME': ['gene1', 'gene2', 'gene3'],
    'EXPT1': [3.5, 4.2, 5.1],
    'EXPT2': [2.1, 3.1, 4.0]
}).set_index('NAME')

df2 = pd.DataFrame({
    'GWEIGHT': [1, 1, 1],
    'NAME': ['gene3', 'gene4', 'gene5'],
    'EXPT3': [3.6, 4.1, 5.0],
    'EXPT4': [2.2, 3.2, 4.1]
}).set_index('NAME')

df3 = pd.DataFrame({
    'GWEIGHT': [1, 1, 1],
    'NAME': ['gene5', 'gene6', 'gene7'],
    'EXPT5': [2.5, 3.5, 4.4],
    'EXPT6': [2.3, 3.3, 4.2]
}).set_index('NAME')

# Toy datasets, in the order they are merged
dfs = [df1, df2, df3]

# Columns that do not hold experiment values
toy_annotation_columns = ('GWEIGHT', 'NAME')

# Accumulator for the merged matrix; starts out empty
master_df = pd.DataFrame()

for df in dfs:
    # Drop the annotation columns, keeping the experiment columns only
    experiment_columns = [col for col in df.columns if col not in toy_annotation_columns]
    df_filtered = df[experiment_columns]

    # The first frame seeds the matrix; later frames are outer-joined on the
    # gene-name index so that every gene is kept
    master_df = df_filtered if master_df.empty else master_df.join(df_filtered, how='outer')

master_df.to_csv('/home/logs/jtorresb/Geneformer/yeast/yeast_data/output/toy_test.csv', sep="\t")
print("Master matrix created and saved.")

Master matrix created and saved.
