#### Normalizing each pcl file

In [None]:
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Folder holding the .pcl files; each one is standardized and overwritten in place
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/normalized_pcls'

# Number of files that were read, scaled and written back without an exception
files_processed = 0

# Visit every directory entry whose name ends in ".pcl"
for file_name in os.listdir(pcl_directory):
    if not file_name.endswith(".pcl"):
        continue

    file_path = os.path.join(pcl_directory, file_name)
    try:
        # Tab-separated table; the first column becomes the row index
        df = pd.read_csv(file_path, sep="\t", index_col=0)

        # Every column that is not a known annotation column is treated as an experiment
        experiment_columns = [
            col
            for col in df.columns
            if col not in ['GWEIGHT', 'NAME', 'IDENTIFIER', 'Description', 'GENE']
        ]

        # A fresh scaler per file, fitted only on that file's experiment columns
        scaler = StandardScaler()
        df[experiment_columns] = scaler.fit_transform(df[experiment_columns])

        # Write the scaled table over the file it was read from
        df.to_csv(file_path, sep="\t")
    except Exception as e:
        # Report the failure and carry on with the remaining files
        print(f"Error processing file '{file_name}': {e}")
    else:
        files_processed += 1


print(f"Processed {files_processed} files.")

#### Merging

In [None]:
import os
import pandas as pd

# Directory containing .pcl files
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/test_pcls'

# Annotation columns that are not experiments. 'GENE' is listed so that this
# cell excludes the same columns as the normalization cell of this notebook.
non_experiment_columns = ['GWEIGHT', 'NAME', 'IDENTIFIER', 'Description', 'GENE']

# Prepare an empty DataFrame to store the master matrix
master_df = pd.DataFrame()

# Iterate over each .pcl file in the directory.
# os.listdir() returns entries in arbitrary order; sorting makes the column
# order of the master matrix reproducible between runs and machines.
for file_name in sorted(os.listdir(pcl_directory)):
    if file_name.endswith(".pcl"):
        file_path = os.path.join(pcl_directory, file_name)
        try:
            # Load the .pcl file (tab-separated, first column used as the row index)
            df = pd.read_csv(file_path, sep="\t", index_col=0)

            # Keep only the experiment columns
            experiment_columns = [col for col in df.columns if col not in non_experiment_columns]
            df = df[experiment_columns]

            # Merge this dataframe with the master dataframe (join on the row index)
            if master_df.empty:
                master_df = df
            else:
                # Without a suffix, join() raises when both frames share a column
                # name; the except below would then drop the whole file from the
                # matrix. Shared names are disambiguated with the file's stem.
                overlapping = master_df.columns.intersection(df.columns)
                if len(overlapping) > 0:
                    print(
                        f"Warning: {len(overlapping)} column name(s) in '{file_name}' "
                        f"already exist in the master matrix; suffixing them with the file name."
                    )
                file_stem = os.path.splitext(file_name)[0]
                # 'outer' join keeps every row label seen in any file;
                # rsuffix is only applied to the overlapping column names.
                master_df = master_df.join(df, how='outer', rsuffix=f"_{file_stem}")

        except Exception as e:
            print(f"Error processing file '{file_name}': {e}")

# Save the master matrix to a new file
master_df.to_csv('/home/logs/jtorresb/Geneformer/yeast/yeast_data/output/test.csv', sep="\t")
print("Master matrix created and saved.")

In [None]:
import pandas as pd

# Path to the tab-separated master matrix written by the merge step
csv_file_path = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/output/test.csv'

# Load the file into a DataFrame, using the first column as the row index
df = pd.read_csv(csv_file_path, delimiter="\t", index_col=0)

# Get the number of rows and columns
num_rows, num_columns = df.shape

# Print the results
print(f"Number of rows: {num_rows}")
print(f"Number of columns: {num_columns}")

# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()


#### GWEIGHT: checking that every value equals 1

In [None]:
import os
import pandas as pd

# Folder holding the .pcl files whose GWEIGHT column is inspected
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/normalized_pcls'

# Names of files in which at least one GWEIGHT entry does not compare equal to 1
files_with_non_uniform_gweight = []

# Visit every directory entry whose name ends in ".pcl"
for file_name in os.listdir(pcl_directory):
    if not file_name.endswith(".pcl"):
        continue

    file_path = os.path.join(pcl_directory, file_name)
    try:
        # Tab-separated table; the first column becomes the row index
        df = pd.read_csv(file_path, sep="\t", index_col=0)

        # Files without a GWEIGHT column are not checked and never reported
        if 'GWEIGHT' not in df.columns:
            continue

        # Flag the file unless every GWEIGHT entry compares equal to 1
        gweight_all_ones = (df['GWEIGHT'] == 1).all()
        if not gweight_all_ones:
            files_with_non_uniform_gweight.append(file_name)

    except Exception as e:
        # Report the failure and carry on with the remaining files
        print(f"Error processing file '{file_name}': {e}")

# Summarize the findings
if not files_with_non_uniform_gweight:
    print("All files have GWEIGHT values uniformly equal to 1.")
else:
    print("Files with non-uniform GWEIGHT values:")
    for file in files_with_non_uniform_gweight:
        print(file)

#### Total number of columns

In [7]:
import os
import pandas as pd

# Directory containing .pcl files
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/normalized_pcls'

# Annotation columns that are not experiments. 'GENE' is listed so that this
# count uses the same definition of "experiment column" as the normalization
# cell of this notebook; otherwise a GENE column would be counted as an experiment.
non_experiment_columns = ['GWEIGHT', 'NAME', 'IDENTIFIER', 'Description', 'GENE']

# Counter for total number of experiment columns
total_experiment_columns = 0

# Iterate over each .pcl file in the directory
for file_name in os.listdir(pcl_directory):
    if file_name.endswith(".pcl"):
        file_path = os.path.join(pcl_directory, file_name)
        try:
            # Load the .pcl file (tab-separated, first column used as the row index)
            df = pd.read_csv(file_path, sep="\t", index_col=0)

            # Count every column that is not a known annotation column
            experiment_columns = [col for col in df.columns if col not in non_experiment_columns]
            total_experiment_columns += len(experiment_columns)

        except Exception as e:
            # A file that cannot be read contributes nothing to the total
            print(f"Error processing file '{file_name}': {e}")

# Print the total number of experiment columns
print(f"Total number of experiment columns across all files: {total_experiment_columns}")


Total number of experiment columns across all files: 11889


##### Type of columns: casting experiment columns to float32

In [2]:
import os
import pandas as pd

# Directory containing .pcl files
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/normalized_pcls'

# Annotation columns that are not experiments. 'GENE' is listed so that this
# cell skips the same columns as the normalization cell of this notebook;
# without it a GENE column would be included in the float32 cast.
non_experiment_columns = ['GWEIGHT', 'NAME', 'IDENTIFIER', 'Description', 'GENE']

# Files that raised while being read, converted or written
files_failed = []

# Iterate over each .pcl file in the directory
for file_name in os.listdir(pcl_directory):
    if file_name.endswith(".pcl"):
        file_path = os.path.join(pcl_directory, file_name)
        try:
            # Load the .pcl file (tab-separated, first column used as the row index)
            df = pd.read_csv(file_path, sep="\t", index_col=0)

            # Select every column that is not a known annotation column
            experiment_columns = [col for col in df.columns if col not in non_experiment_columns]

            # Convert experiment columns to float32.
            # NOTE(review): the file is written back as text, which does not
            # record a dtype, so this cast only affects the values written out.
            # Confirm that this is the intended effect.
            df[experiment_columns] = df[experiment_columns].astype('float32')

            # Save the updated DataFrame, overwriting the original file
            df.to_csv(file_path, sep="\t")

        except Exception as e:
            files_failed.append(file_name)
            print(f"Error processing file '{file_name}': {e}")

# Only claim success when no file failed; previously the success message was
# printed even if every file had raised.
if files_failed:
    print(f"Finished with errors: {len(files_failed)} file(s) could not be updated.")
else:
    print("All files processed and updated.")

All files processed and updated.
