### Preprocessing Yeast Matrix From Microarray Expression Data

#### 0. Misc Explorations

##### GWEIGHTs

In [1]:
import os
import pandas as pd

# Folder holding the .pcl expression files to inspect
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_pcls'

# Names of the files whose GWEIGHT column holds anything other than 1
files_with_non_uniform_gweight = []

for file_name in os.listdir(pcl_directory):
    # Only .pcl files are of interest
    if not file_name.endswith(".pcl"):
        continue

    file_path = os.path.join(pcl_directory, file_name)
    try:
        df = pd.read_csv(file_path, sep="\t", index_col=0)

        # Files that lack a GWEIGHT column are never flagged
        if 'GWEIGHT' not in df.columns:
            continue

        gweight_is_one = df['GWEIGHT'] == 1
        if not gweight_is_one.all():
            files_with_non_uniform_gweight.append(file_name)

    except Exception as e:
        print(f"Error processing file '{file_name}': {e}")

# Summary: either the all-clear message or the list of offending files
if not files_with_non_uniform_gweight:
    print("All files have GWEIGHT values uniformly equal to 1.")
else:
    print("Files with non-uniform GWEIGHT values:")
    for file in files_with_non_uniform_gweight:
        print(file)

All files have GWEIGHT values uniformly equal to 1.


##### Total Number of Columns

In [None]:
import os
import pandas as pd

# Folder holding the .pcl expression files
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_pcls'

# Running total of experiment (non-annotation) columns over every file
total_experiment_columns = 0

for file_name in os.listdir(pcl_directory):
    if not file_name.endswith(".pcl"):
        continue

    file_path = os.path.join(pcl_directory, file_name)
    try:
        df = pd.read_csv(file_path, sep="\t", index_col=0)

        # Every column that is not one of the known annotation columns
        # is counted as an experiment.
        experiment_columns = []
        for col in df.columns:
            if col not in ['GWEIGHT', 'NAME', 'IDENTIFIER', 'Description']:
                experiment_columns.append(col)

        total_experiment_columns = total_experiment_columns + len(experiment_columns)

    except Exception as e:
        print(f"Error processing file '{file_name}': {e}")

# Report the grand total
print(f"Total number of experiment columns across all files: {total_experiment_columns}")

##### Files with More Genes than Genome

In [None]:
import os
import pandas as pd

# Directory containing .pcl files
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_pcls'

# Threshold for the size of the yeast genome
threshold = 7337

# Number of files whose count of unique rows exceeds the threshold
count_exceeding_files = 0

for file_name in os.listdir(pcl_directory):
    if not file_name.endswith(".pcl"):
        continue

    file_path = os.path.join(pcl_directory, file_name)
    try:
        df = pd.read_csv(file_path, sep="\t", index_col=0)

        # drop_duplicates() compares column values only (the index is not
        # part of the comparison), so this counts distinct value rows.
        # NOTE(review): if the goal is "distinct gene identifiers", consider
        # df.index.nunique() instead -- confirm the intended definition.
        unique_count = df.drop_duplicates().shape[0]

        if unique_count > threshold:
            count_exceeding_files += 1

            # Number of rows whose index label starts with 'SGD'. This was
            # previously computed but never shown; it is now part of the report.
            sgd_count = int(df.index.astype(str).str.startswith('SGD').sum())

            print(
                f"File: {file_name}, Number of unique rows: {unique_count}, "
                f"SGD-prefixed indexes: {sgd_count}"
            )

    except Exception as e:
        print(f"Error processing file '{file_name}': {e}")

# Print the total count of files exceeding the threshold
print(f"Number of files with unique rows larger than {threshold}: {count_exceeding_files}")

#### 1. Normalizing Columns in Each .pcl File

In [None]:
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Folder holding the .pcl expression files (each file is rewritten in place)
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_pcls'

files_processed = 0

for file_name in os.listdir(pcl_directory):
    if not file_name.endswith(".pcl"):
        continue

    file_path = os.path.join(pcl_directory, file_name)
    try:
        df = pd.read_csv(file_path, sep="\t", index_col=0)

        # Everything except the two annotation columns gets normalized
        experiment_columns = [col for col in df.columns if col not in ['GWEIGHT', 'NAME']]

        # Store the expression values as float32 first
        as_float32 = df[experiment_columns].astype('float32')
        df[experiment_columns] = as_float32

        # Column-wise z-score: a fresh scaler is fitted for every file
        scaler = StandardScaler()
        normalized = scaler.fit_transform(df[experiment_columns])
        df[experiment_columns] = normalized

        # Overwrite the source file with the normalized table
        df.to_csv(file_path, sep="\t")
        files_processed += 1

    except Exception as e:
        print(f"Error processing file '{file_name}': {e}")

print(f"Processed {files_processed} files.")

#### 2. Translating All Indexes to YORFs

##### Dealing with SGD Indexes

In [1]:
import os
import pandas as pd

# Locations of the .pcl files and of the gene table used for the translation
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_pcls'
genes_file = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_yeast_genes.tsv'

# Lookup table: primary DBID -> systematic name
genes_df = pd.read_csv(genes_file, sep='\t')
gene_mapping = dict(zip(genes_df['Gene > Primary DBID'], genes_df['Gene > Systematic Name']))


def _translate_sgd_label(label):
    """Map a label starting with 'SGD' through gene_mapping; leave every other label unchanged."""
    if label.startswith('SGD'):
        return gene_mapping.get(label, label)
    return label


files_processed = 0

for file_name in os.listdir(pcl_directory):
    if not file_name.endswith(".pcl"):
        continue

    file_path = os.path.join(pcl_directory, file_name)
    try:
        df = pd.read_csv(file_path, sep="\t", index_col=0)

        # Rewrite the index label by label, then save over the original file
        df.index = df.index.to_series().apply(_translate_sgd_label)
        df.to_csv(file_path, sep="\t")

        files_processed += 1

    except Exception as e:
        print(f"Error processing file '{file_name}': {e}")

print(f"Processed {files_processed} files.")

Processed 556 files.


In [4]:
import os
import pandas as pd

# Folder holding the .pcl expression files
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_pcls'

files_checked = 0   # files that were read successfully
sgd_found = 0       # files that still contain an 'SGD...' index label

for file_name in os.listdir(pcl_directory):
    if not file_name.endswith(".pcl"):
        continue

    file_path = os.path.join(pcl_directory, file_name)
    try:
        df = pd.read_csv(file_path, sep="\t", index_col=0)

        # Does at least one index label begin with 'SGD'?
        has_sgd_label = df.index.str.startswith('SGD').any()
        if has_sgd_label:
            print(f"Found 'SGD' in indices of file: {file_name}")
            sgd_found += 1

        files_checked += 1

    except Exception as e:
        print(f"Error processing file '{file_name}': {e}")

print(f"Checked {files_checked} files.")
print(f"Found 'SGD' indices in {sgd_found} files.")

Checked 556 files.
Found 'SGD' indices in 0 files.


##### Dealing with Standard Indexes

In [5]:
import os
import pandas as pd

# Paths
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_pcls'
yeast_genes_file = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_yeast_genes_rest_of_problematic_update.tsv' # Contains some manual additions

# Load the yeast genes data
yeast_genes_df = pd.read_csv(yeast_genes_file, sep='\t')

# Build the (case-insensitive) map: Gene > Standard Name -> Gene > Systematic Name.
#
# The two columns must be filtered TOGETHER. Calling .dropna() on each column
# separately and zipping the results pairs values by position in two lists of
# different length, so every gene after the first row with a missing standard
# name would be mapped to the systematic name of a different gene.
name_pairs = yeast_genes_df[['Gene > Standard Name', 'Gene > Systematic Name']].dropna()
standard_to_systematic = dict(
    zip(
        name_pairs['Gene > Standard Name'].astype(str).str.upper(),
        name_pairs['Gene > Systematic Name'].astype(str).str.upper(),
    )
)

# Iterate over each .pcl file in the directory
for file_name in os.listdir(pcl_directory):
    if not file_name.endswith(".pcl"):
        continue

    pcl_file_path = os.path.join(pcl_directory, file_name)
    try:
        # Load the PCL file
        pcl_df = pd.read_csv(pcl_file_path, sep='\t', index_col=0)

        # Becomes True as soon as one label is translated
        modified = False

        new_index = []
        for index in pcl_df.index:
            # Non-string labels (e.g. NaN) cannot be standard names; keep them
            # untouched instead of letting .upper() abort the whole file.
            systematic = (
                standard_to_systematic.get(index.upper())
                if isinstance(index, str)
                else None
            )
            if systematic is None:
                # No mapping exists: keep the label as is
                new_index.append(index)
            else:
                new_index.append(systematic)
                modified = True

        # Only rewrite files in which at least one label changed
        if modified:
            # Keep the index name so the first header cell survives the rewrite
            pcl_df.index = pd.Index(new_index, name=pcl_df.index.name)

            # Overwrite the original file
            pcl_df.to_csv(pcl_file_path, sep='\t')

            print(f"File '{file_name}' updated successfully.")

    except Exception as e:
        print(f"Error processing file '{file_name}': {e}")

File 'GSE24802_final.pcl' updated successfully.
File 'GSE66521.mapped.pcl' updated successfully.
File 'GSE25081_final.pcl' updated successfully.
File 'GSE30168_final.pcl' updated successfully.
File 'GSE54539.remapped.final.pcl' updated successfully.
File 'GSE33427.remapped.final.pcl' updated successfully.
File 'GSE55223.final.pcl' updated successfully.
File 'GSE22832_final.pcl' updated successfully.
File 'GSE44871.remapped.final.pcl' updated successfully.
File 'GSE44085.remapped.final.pcl' updated successfully.
File 'GSE63663.final.pcl' updated successfully.
File 'GSE27234_final.pcl' updated successfully.
File '2010.Bulik03.filter.flt.knn.avg.div.log.pcl' updated successfully.
File 'GSE89875.mapped.pcl' updated successfully.
File 'GSE49340.remapped.final.pcl' updated successfully.
File 'GSE81480.mapped.pcl' updated successfully.
File 'GSE40351.remapped.final.pcl' updated successfully.
File '2010.Bro03.filter.flt.knn.avg.div.log.pcl' updated successfully.
File '2010.Martin04.filter.flt.

##### Dealing with the Rest

In [None]:
import os
import pandas as pd
from collections import Counter

# Input locations
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_pcls'
yeast_genes_file = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_yeast_genes_rest_of_problematic_update.tsv'

# Gene table; its systematic names define what counts as a valid index label
yeast_genes_df = pd.read_csv(yeast_genes_file, sep='\t')
valid_systematic_names = set(yeast_genes_df['Gene > Systematic Name'])

# label -> number of rows, summed over all files, carrying that unknown label
problematic_gene_counts = Counter()

for file_name in os.listdir(pcl_directory):
    if not file_name.endswith(".pcl"):
        continue

    pcl_file_path = os.path.join(pcl_directory, file_name)
    try:
        pcl_df = pd.read_csv(pcl_file_path, sep='\t', index_col=0)

        # Labels of this file that are not known systematic names
        problematic_indexes = []
        for index in pcl_df.index:
            if index not in valid_systematic_names:
                problematic_indexes.append(index)

        problematic_gene_counts.update(problematic_indexes)

    except Exception as e:
        print(f"Error processing file '{file_name}': {e}")

# Report only the labels that occur between 5 and 9 times (inclusive)
for gene, count in problematic_gene_counts.items():
    if 5 <= count < 10:
        print(f"{gene}: {count} occurrences")


###### For now, remove a problematic index when a row with a valid YORF and identical values exists

In [6]:
import os
import pandas as pd

# Paths to directories and files
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_pcls'
gene_mapping_path = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_yeast_genes.tsv'

# Load the valid systematic names from the mapping file
gene_mapping_df = pd.read_csv(gene_mapping_path, sep='\t')
valid_systematic_names = set(gene_mapping_df['Gene > Systematic Name'].dropna())

# List the directory once; it was previously re-listed for every progress message.
directory_entries = os.listdir(pcl_directory)
total_entries = len(directory_entries)

# Process each .pcl file
for file_count, file_name in enumerate(directory_entries, start=1):
    if not file_name.endswith(".pcl"):
        continue

    file_path = os.path.join(pcl_directory, file_name)
    progress = f"({file_count}/{total_entries})"
    try:
        # Load the PCL file
        pcl_df = pd.read_csv(file_path, sep='\t', index_col=0)

        # Rows whose label is not a valid systematic name are "problematic".
        # They are tracked by position so that duplicated labels are handled
        # row by row (label-based .loc returns a DataFrame for a duplicated
        # label, which used to make the comparison raise and skip the file).
        is_valid = pcl_df.index.isin(valid_systematic_names)
        problematic_positions = [pos for pos, ok in enumerate(is_valid) if not ok]
        if not problematic_positions:
            continue

        problematic_indexes = [pcl_df.index[pos] for pos in problematic_positions]
        print(f"Processing file {progress}: {file_name}")
        print(f"Problematic indexes: {problematic_indexes}")

        # Compare on the expression columns only
        values = pcl_df.drop(columns=['NAME', 'GWEIGHT'], errors='ignore')
        valid_values = values[is_valid]
        valid_is_missing = valid_values.isna()

        # A problematic row is redundant when some valid row has the same value
        # in every column. Two missing values in the same column count as equal;
        # with a plain == they never would, so rows containing NaN could never match.
        positions_to_drop = set()
        for pos in problematic_positions:
            row = values.iloc[pos]
            same_cell = valid_values.eq(row) | (valid_is_missing & row.isna())
            if same_cell.all(axis=1).any():
                positions_to_drop.add(pos)

        # Remove the redundant rows and overwrite the file
        if positions_to_drop:
            keep = [pos for pos in range(len(pcl_df)) if pos not in positions_to_drop]
            pcl_df = pcl_df.iloc[keep]
            pcl_df.to_csv(file_path, sep='\t')
            print(f"Fixed and overwritten file {progress}: {file_name}")
        else:
            print(f"No matching rows found for problematic indexes in file {progress}: {file_name}")

    except Exception as e:
        print(f"Error processing file {progress}: {file_name}: {e}")

Processing file (8/556): GSE24802_final.pcl
Problematic indexes: ['ADR6', 'END12', 'TFP1', 'END13', 'ADR2', 'END11', 'TFP3', 'MSS10', 'PAS20', 'PAS21', 'PAS22', 'FAM1', 'XTC1', 'CVT7', 'CVT8', 'CVT1', 'CVT2', 'CVT4', 'CVT9', 'UGA5', 'PAS14', 'PAS11', 'PAS12', 'PAS10', 'SWP59', 'PBA4', 'PBA3', 'PBA2', "AAP1'", 'MDV2', 'ENG1', 'SLI2', 'SWP61', 'ENG2', 'MDS1', 'SLK1', 'LOM3', 'POS9', 'SWP73', 'MDT1', 'SLK2', 'PAT2', 'BCL21', 'SLD4', 'ATG6', 'NET2', 'MEC2', 'YGE1', 'YMR44', 'END1', 'END2', 'END9', 'END7', 'END8', 'END5', 'END6', 'END4', 'POX2', 'PIK120', 'TFG3', 'YMR26', 'TFC2', 'TFC5', 'CST13', 'CST17', 'DID7', 'DID1', 'DID6', 'PAY2', 'LOT3', 'LOT1', 'LOT2', 'FAH1', 'DIE1', 'SLC2', 'SLC4', 'RKS2', 'CFI1', 'PPD1', 'PPC1', 'DHE4', 'UREP1', 'YGL023', 'SHR10', 'YCRX13W', 'FSP2', 'HCS77', 'BUR1', 'BUR3', 'BUR5', 'MDF1', 'FSR2', 'MEP80', 'PPH1', 'YHB4', 'PPH2', 'SLY2', 'NEP1', 'YHC8', 'PPF3', 'PPF2', 'TRM61', 'TUP7', 'SWP29', 'QCR1', 'KIS4', 'KIS3', 'SLU2', 'PBF2', 'SLU1', 'BEE1', 'SLU4', 'HCS2

###### Apparently, most of the remaining problematic genes are "LTRs". Remove them

In [7]:
import os
import pandas as pd

# Paths to directories and files
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_pcls'

# Substrings (matched case-insensitively) that mark a row for removal
ltr_markers = ['delta', 'sigma', 'tau', 'omega']

# List the directory once; it was previously re-listed for every progress message.
directory_entries = os.listdir(pcl_directory)
total_entries = len(directory_entries)

# Process each .pcl file
for file_count, file_name in enumerate(directory_entries, start=1):
    if not file_name.endswith(".pcl"):
        continue

    file_path = os.path.join(pcl_directory, file_name)
    progress = f"({file_count}/{total_entries})"
    try:
        # Load the PCL file
        pcl_df = pd.read_csv(file_path, sep='\t', index_col=0)

        # A row is removed when its label contains any marker as a substring.
        # Non-string labels (e.g. NaN) are kept instead of aborting the file
        # on .lower().
        rows_to_remove = [
            index for index in pcl_df.index
            if isinstance(index, str) and any(marker in index.lower() for marker in ltr_markers)
        ]

        if rows_to_remove:
            print(f"Processing file {progress}: {file_name}")
            print(f"Rows to remove: {rows_to_remove}")

            # Remove the rows and overwrite the file
            pcl_df = pcl_df.drop(index=rows_to_remove)
            pcl_df.to_csv(file_path, sep='\t')
            print(f"Fixed and overwritten file {progress}: {file_name}")
        else:
            print(f"No rows to remove in file {progress}: {file_name}")

    except Exception as e:
        print(f"Error processing file {progress}: {file_name}: {e}")

No rows to remove in file (1/556): GSE8897_setA_family.pcl
No rows to remove in file (2/556): GSE12222_set5_family.pcl
No rows to remove in file (3/556): GSE5283_setA_family.pcl
No rows to remove in file (4/556): GSE3815_set0_family.pcl
No rows to remove in file (5/556): GSE6302_set04_family.pcl
No rows to remove in file (6/556): GSE10269_set0_family.pcl
No rows to remove in file (7/556): 2010.MMSresponse.pcl
No rows to remove in file (8/556): GSE24802_final.pcl
No rows to remove in file (9/556): GSE3182_setA_family.pcl
No rows to remove in file (10/556): GSE9482_setA_family.pcl
No rows to remove in file (11/556): GSE6302_set08_family.pcl
No rows to remove in file (12/556): 2010.Causton01_acid.filter.flt.knn.avg.div.log.pcl
No rows to remove in file (13/556): GSE8825_set5_family.pcl
No rows to remove in file (14/556): 2010.Gasch00_hyper-osmotic.flt.knn.avg.pcl
No rows to remove in file (15/556): 2010.Saldanha04_LeucineBatchChem.flt.knn.avg.pcl
No rows to remove in file (16/556): 2010.d

###### Removing more Retrotransposons

In [8]:
import os
import pandas as pd

# Paths to directories and files
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_pcls'

# List of gene names to remove
# NOTE(review): the last entry ends with an apostrophe inside the string
# ("YCLWTy5-1'"); confirm this matches the label as it appears in the data.
genes_to_remove = [
    "YGRWTy2-2", "YLRCTy2-2", "YLRCTy1-1", "YMRCTy1-4", "YORCTy2-1", "YPLCTy4-1", "YPLWTy1-1",
    "YILWTy3-1", "YGRCTy1-3", "YBRWTy1-2", "YDRWTy1-5", "YFLWTy2-1", "YLRWTy1-3", "YOLWTy1-1",
    "YLRWTy1-2", "YGRWTy1-1", "YBLWTy1-1", "YPRWTy1-3", "YDRCTy1-3", "YJLWTy4-1", "YNLCTy2-1",
    "YJRWTy1-2", "YDRCTy1-1", "YPRCTy1-4", "YPRCTy1-2", "YMLWTy1-2", "YDRCTy2-1", "YARCTy1-1",
    "YDRWTy2-2", "YNLWTy1-2", "YNLCTy1-1", "YDRWTy1-4", "YGRCTy1-2", "YBLWTy2-1", "YORWTy2-2",
    "YCLWTy2-1", "YORWTy1-2", "YGRWTy3-1", "YDRWTy2-3", "YDRCTy1-2", "YJRWTy1-1", "YHRCTy1-1",
    "YMLWTy1-1", "YERCTy1-1", "YMRCTy1-3", "YLRWTy2-1", "YGRCTy2-1", "YCLWTy5-1'"
]

# Set copy for constant-time membership tests inside the per-row loop
genes_to_remove_lookup = set(genes_to_remove)

# List the directory once; it was previously re-listed for every progress message.
directory_entries = os.listdir(pcl_directory)
total_entries = len(directory_entries)

# Process each .pcl file
for file_count, file_name in enumerate(directory_entries, start=1):
    if not file_name.endswith(".pcl"):
        continue

    file_path = os.path.join(pcl_directory, file_name)
    progress = f"({file_count}/{total_entries})"
    try:
        # Load the PCL file
        pcl_df = pd.read_csv(file_path, sep='\t', index_col=0)

        # Labels of this file that appear in the removal list (exact match)
        rows_to_remove = [
            index for index in pcl_df.index if index in genes_to_remove_lookup
        ]

        if rows_to_remove:
            print(f"Processing file {progress}: {file_name}")
            print(f"Rows to remove: {rows_to_remove}")

            # Remove the rows and overwrite the file
            pcl_df = pcl_df.drop(index=rows_to_remove)
            pcl_df.to_csv(file_path, sep='\t')
            print(f"Fixed and overwritten file {progress}: {file_name}")
        else:
            print(f"No rows to remove in file {progress}: {file_name}")

    except Exception as e:
        print(f"Error processing file {progress}: {file_name}: {e}")


No rows to remove in file (1/556): GSE8897_setA_family.pcl
No rows to remove in file (2/556): GSE12222_set5_family.pcl
No rows to remove in file (3/556): GSE5283_setA_family.pcl
No rows to remove in file (4/556): GSE3815_set0_family.pcl
No rows to remove in file (5/556): GSE6302_set04_family.pcl
No rows to remove in file (6/556): GSE10269_set0_family.pcl
No rows to remove in file (7/556): 2010.MMSresponse.pcl
No rows to remove in file (8/556): GSE24802_final.pcl
No rows to remove in file (9/556): GSE3182_setA_family.pcl
No rows to remove in file (10/556): GSE9482_setA_family.pcl
No rows to remove in file (11/556): GSE6302_set08_family.pcl
No rows to remove in file (12/556): 2010.Causton01_acid.filter.flt.knn.avg.div.log.pcl
No rows to remove in file (13/556): GSE8825_set5_family.pcl
No rows to remove in file (14/556): 2010.Gasch00_hyper-osmotic.flt.knn.avg.pcl
No rows to remove in file (15/556): 2010.Saldanha04_LeucineBatchChem.flt.knn.avg.pcl
No rows to remove in file (16/556): 2010.d

###### Remove the remaining ones, which seem irrelevant and do not occur a significant number of times

In [9]:
import os
import pandas as pd

# Input locations
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_pcls'
yeast_genes_file = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_yeast_genes.tsv'

# Upper-cased systematic names from the gene table: the only labels allowed to stay
yeast_genes_df = pd.read_csv(yeast_genes_file, sep='\t')
valid_systematic_names = set(yeast_genes_df['Gene > Systematic Name'].str.upper())

for file_name in os.listdir(pcl_directory):
    if not file_name.endswith(".pcl"):
        continue

    pcl_file_path = os.path.join(pcl_directory, file_name)
    try:
        pcl_df = pd.read_csv(pcl_file_path, sep='\t', index_col=0)

        # Labels whose upper-cased form is not a known systematic name
        invalid_indexes = []
        for index in pcl_df.index:
            if index.upper() not in valid_systematic_names:
                invalid_indexes.append(index)

        # Nothing to remove: leave the file untouched
        if not invalid_indexes:
            continue

        # Drop every row carrying an invalid label and overwrite the file
        pcl_df = pcl_df.drop(index=invalid_indexes)
        pcl_df.to_csv(pcl_file_path, sep='\t')
        print(f"File '{file_name}' updated: Removed {len(invalid_indexes)} invalid rows.")

    except Exception as e:
        print(f"Error processing file '{file_name}': {e}")

File '2010.Bulik03.filter.flt.knn.avg.div.log.pcl' updated: Removed 97 invalid rows.
File 'GSE89875.mapped.pcl' updated: Removed 17 invalid rows.
File '2010.Bro03.filter.flt.knn.avg.div.log.pcl' updated: Removed 97 invalid rows.
File '2010.Martin04.filter.flt.knn.avg.div.log.pcl' updated: Removed 94 invalid rows.
File 'GSE32196GPL15778.sfp.pcl' updated: Removed 139 invalid rows.
File 'GSE94945.mapped.pcl' updated: Removed 17 invalid rows.
File 'GSE76985.mapped.pcl' updated: Removed 17 invalid rows.
File '2010.Orlandi04.filter.flt.knn.avg.div.log.pcl' updated: Removed 75 invalid rows.
File 'GSE34330GPL8154.sfp.pcl' updated: Removed 475 invalid rows.
File '2010.Bernstein00_TSA.filter.flt.knn.avg.div.log.pcl' updated: Removed 50 invalid rows.
File '2010.Bernstein00_HDACsin3sap30ume6hda1hos2hos3.filter.flt.knn.avg.div.log.pcl' updated: Removed 50 invalid rows.
File 'GSE96849.mapped.pcl' updated: Removed 17 invalid rows.
File '2010.Caba05.filter.flt.knn.avg.div.log.pcl' updated: Removed 1 i

##### Final Check

In [23]:
import os
import pandas as pd

# Input locations
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_pcls'
yeast_genes_file = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_yeast_genes.tsv'

# Upper-cased systematic names from the gene table
yeast_genes_df = pd.read_csv(yeast_genes_file, sep='\t')
valid_systematic_names = set(yeast_genes_df['Gene > Systematic Name'].str.upper())

# Read-only pass: report any file that still has labels outside the valid set
for file_name in os.listdir(pcl_directory):
    if not file_name.endswith(".pcl"):
        continue

    pcl_file_path = os.path.join(pcl_directory, file_name)
    try:
        pcl_df = pd.read_csv(pcl_file_path, sep='\t', index_col=0)

        invalid_indexes = []
        for index in pcl_df.index:
            if index.upper() not in valid_systematic_names:
                invalid_indexes.append(index)

        # Files with only valid labels produce no output
        if not invalid_indexes:
            continue

        print(f"File '{file_name}' has {len(invalid_indexes)} invalid rows: {', '.join(invalid_indexes)}.")

    except Exception as e:
        print(f"Error processing file '{file_name}': {e}")

#### 3. Dealing with Duplicates

In [None]:
import os
import pandas as pd

# Input location
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_pcls'

# Report the files in which more than 1000 distinct labels occur more than once
for file_name in os.listdir(pcl_directory):
    if not file_name.endswith(".pcl"):
        continue

    pcl_file_path = os.path.join(pcl_directory, file_name)
    try:
        pcl_df = pd.read_csv(pcl_file_path, sep='\t', index_col=0)

        # Distinct labels that appear on at least two rows
        repeated_mask = pcl_df.index.duplicated()
        duplicate_indexes = pcl_df.index[repeated_mask].unique()
        duplicate_total = len(duplicate_indexes)

        if duplicate_total > 1000:
            print(f"File: {file_name}")
            print(f"Number of duplicate indexes: {len(duplicate_indexes)}")

    except Exception as e:
        print(f"Error processing file '{file_name}': {e}")


##### Direct duplicates

In [16]:
import os
import pandas as pd

# Folder holding the PCL files
pcl_directory = "/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_pcls"

# Read-only pass: list the genes that have fully identical rows in a file
for file_name in os.listdir(pcl_directory):
    if not file_name.endswith(".pcl"):
        continue

    pcl_file_path = os.path.join(pcl_directory, file_name)

    try:
        pcl_df = pd.read_csv(pcl_file_path, sep='\t', index_col=0)

        # Name the index "YORF" so it becomes a regular column after the reset
        pcl_df.index.name = "YORF"

        # Everything except the annotation columns is an experiment
        experimental_cols = [col for col in pcl_df.columns if col not in ["NAME", "GWEIGHT"]]

        # Rows count as duplicates when both the YORF and all experiment values agree
        pcl_df_reset = pcl_df.reset_index()
        comparison_columns = ["YORF"] + experimental_cols
        duplicate_mask = pcl_df_reset.duplicated(subset=comparison_columns, keep=False)
        duplicate_df = pcl_df_reset[duplicate_mask]

        # Nothing duplicated in this file: stay silent
        if duplicate_df.empty:
            continue

        print(f"\nDuplicates found in {file_name}:")
        duplicate_genes = duplicate_df["YORF"].unique()
        print(f"Duplicate genes: {', '.join(duplicate_genes)}")

    except Exception as e:
        print(f"❌ Error processing file '{file_name}': {e}")

In [None]:
import os
import pandas as pd

# Folder holding the PCL files (each file is rewritten in place)
pcl_directory = "/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_pcls"

for file_name in os.listdir(pcl_directory):
    if not file_name.endswith(".pcl"):
        continue

    pcl_file_path = os.path.join(pcl_directory, file_name)

    try:
        pcl_df = pd.read_csv(pcl_file_path, sep='\t', index_col=0)

        # Name the index "YORF" so it becomes a regular column after the reset
        pcl_df.index.name = "YORF"

        # Everything except the annotation columns is an experiment
        experimental_cols = [col for col in pcl_df.columns if col not in ["NAME", "GWEIGHT"]]

        # Keep the first of each set of rows sharing YORF and all experiment values
        pcl_df_reset = pcl_df.reset_index()
        comparison_columns = ["YORF"] + experimental_cols
        pcl_df_cleaned = pcl_df_reset.drop_duplicates(subset=comparison_columns, keep="first")

        # YORF is now an ordinary column, so the frame is written without its index
        pcl_df_cleaned.to_csv(pcl_file_path, sep='\t', index=False)

        print(f"✅ Cleaned duplicates from {file_name}")

    except Exception as e:
        print(f"❌ Error processing file '{file_name}': {e}")

##### Pseudoduplicates

###### Aux

In [None]:
import os
import pandas as pd

# Paths
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/test_pcls'

# Becomes True once the first pseudoduplicate family (over all files) has been
# captured, so that only that one family is printed as an example.
first_pseudoduplicate_printed = False

# Iterate over all .pcl files
for file_name in os.listdir(pcl_directory):
    if not file_name.endswith(".pcl"):
        continue

    pcl_file_path = os.path.join(pcl_directory, file_name)

    try:
        # Load the PCL file
        pcl_df = pd.read_csv(pcl_file_path, sep='\t', index_col=0)

        # Identify experimental columns (excluding NAME and GWEIGHT)
        experimental_cols = [col for col in pcl_df.columns if col not in ["NAME", "GWEIGHT"]]

        pseudoduplicate_count = 0
        first_pseudoduplicate_family = None

        # A pseudoduplicate is a label carried by several rows whose
        # experiment values are not all identical.
        for yorf, group in pcl_df.groupby(pcl_df.index):
            if len(group) < 2:
                continue
            if len(group[experimental_cols].drop_duplicates()) < 2:
                continue

            pseudoduplicate_count += 1

            # Capture only the very first family encountered. The flag used
            # to be set to None, which is falsy, so the stored family was
            # overwritten by every later group and printed for every file.
            if not first_pseudoduplicate_printed:
                first_pseudoduplicate_family = group
                first_pseudoduplicate_printed = True

        # Print count for the file if there are pseudoduplicates
        if pseudoduplicate_count > 0:
            print(f"File: {file_name} - Pseudoduplicate YORFs: {pseudoduplicate_count}")

            # Print the first found pseudoduplicate family
            if first_pseudoduplicate_family is not None:
                print("\nFirst Pseudoduplicate Family:")
                print(first_pseudoduplicate_family.index)
                print("-----------------------------------")

    except Exception as e:
        print(f"Error processing file '{file_name}': {e}")

In [None]:
import os
import pandas as pd

# Paths
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_pcls'

# Set once a file containing a label with three or more rows has been found;
# the scan stops before the next file is opened.
done = False

# Iterate over all .pcl files
for file_name in os.listdir(pcl_directory):
    if done:
        break
    if not file_name.endswith(".pcl"):
        continue

    pcl_file_path = os.path.join(pcl_directory, file_name)

    try:
        # Load the PCL file
        pcl_df = pd.read_csv(pcl_file_path, sep='\t', index_col=0)

        # Report every label of this file that occurs on three or more rows.
        # (The unused list of experimental columns was removed: only the row
        # count per label matters here.)
        for gene, group in pcl_df.groupby(pcl_df.index):
            if len(group) >= 3:
                print(f"File: {file_name} - Gene: {gene}")
                done = True

    except Exception as e:
        print(f"Error processing file '{file_name}': {e}")

###### Fixing the Issue

In [None]:
import os
import pandas as pd
import numpy as np
from scipy.stats import pearsonr

def load_pcl(file_path):
    """Read a tab-separated .pcl file and return its expression table.

    The first column becomes the index. The 'NAME' and 'GWEIGHT' columns are
    removed when present. If reading or processing fails, the error is printed
    and None is returned.
    """
    try:
        table = pd.read_csv(file_path, sep='\t', index_col=0)
        # errors='ignore' keeps this working for files without these columns
        expression = table.drop(columns=['NAME', 'GWEIGHT'], errors='ignore')
    except Exception as e:
        print(f"Error loading file '{file_path}': {e}")
        return None
    return expression

def compute_correlation_matrix(group):
    """Return the pairwise correlation matrix between the ROWS of `group`.

    The frame is transposed so that each row becomes a column, then
    DataFrame.corr() is applied (Pearson by default).
    """
    return group.T.corr()

def merge_pseudoduplicates(group, threshold=0.8, rng=None):
    """Collapse the rows of one gene into a single row.

    Rows are clustered greedily: each not-yet-used row i collects every later
    unused row j whose correlation with row i is >= `threshold`, and the
    cluster is replaced by its column-wise mean. If more than one cluster
    remains, one of them is picked at random.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to merge (numeric columns only).
    threshold : float, default 0.8
        Minimum correlation for two rows to be averaged together.
    rng : object with a ``choice(n)`` method, optional
        Source of randomness for the final pick, e.g.
        ``numpy.random.default_rng(seed)``. Defaults to the global
        ``numpy.random`` state, which is what was always used before;
        pass a seeded generator to make the result reproducible.

    Returns
    -------
    pandas.DataFrame
        `group` itself when it has exactly one row, otherwise a frame with at
        most one row.
    """
    if len(group) == 1:
        return group  # Only one row, no duplicates

    # Plain ndarray: positional lookups are cheaper than repeated .iloc calls
    corr = compute_correlation_matrix(group).to_numpy()
    n_rows = len(group)
    merged_rows = []
    used = set()

    for i in range(n_rows):
        if i in used:
            continue
        used.add(i)
        correlated = [i]

        for j in range(i + 1, n_rows):
            # A NaN correlation (e.g. a constant row) compares False here,
            # so such rows are treated as uncorrelated.
            if j not in used and corr[i, j] >= threshold:
                correlated.append(j)
                used.add(j)

        merged_rows.append(group.iloc[correlated].mean())

    if len(merged_rows) > 1:
        chooser = np.random if rng is None else rng
        picked = int(chooser.choice(len(merged_rows)))
        return pd.DataFrame([merged_rows[picked]])  # Pick one randomly

    return pd.DataFrame(merged_rows)  # Return merged row

def process_pcl_files(pcl_directory, output_directory):
    """Clean pseudoduplicate rows in every .pcl file of a directory.

    For each ``*.pcl`` file in `pcl_directory` the table is loaded with
    load_pcl(), rows are grouped by index label, each group is reduced with
    merge_pseudoduplicates(), and the result is written as a tab-separated
    file of the same name into `output_directory` (created if missing).
    Files that load_pcl() cannot read are skipped.

    Parameters
    ----------
    pcl_directory : str
        Directory that is scanned for ``.pcl`` files.
    output_directory : str
        Directory the cleaned files are written to. When it equals
        `pcl_directory`, the input files are overwritten.
    """
    os.makedirs(output_directory, exist_ok=True)

    for filename in os.listdir(pcl_directory):
        if not filename.endswith(".pcl"):
            continue

        file_path = os.path.join(pcl_directory, filename)
        df = load_pcl(file_path)
        if df is None:
            continue

        # Keep only experimental columns
        experimental_cols = [col for col in df.columns if col not in ["NAME", "GWEIGHT"]]
        df = df[experimental_cols]

        cleaned_data = []
        for gene, group in df.groupby(df.index):
            cleaned_group = merge_pseudoduplicates(group)
            # Re-label the merged row(s) with the gene they came from, without
            # mutating the object returned by the helper.
            cleaned_group = cleaned_group.set_axis([gene] * len(cleaned_group), axis=0)
            cleaned_data.append(cleaned_group)

        if cleaned_data:
            cleaned_df = pd.concat(cleaned_data)
            # Keep the index header of the input file in the output
            cleaned_df.index.name = df.index.name
        else:
            # No groups at all (e.g. header-only file): pd.concat([]) would
            # raise and abort the whole batch, so write the table back as is.
            cleaned_df = df

        output_path = os.path.join(output_directory, filename)
        cleaned_df.to_csv(output_path, sep='\t')
        print(f"Processed: {filename} -> {output_path}")

# Set directories
# NOTE: output_directory is the same path as pcl_directory, so every cleaned
# file is written over the file it was read from.
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_pcls'
output_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_pcls'

# Run processing: clean pseudoduplicates in every .pcl file of the directory
process_pcl_files(pcl_directory, output_directory)

In [2]:
import pandas as pd

# Master matrix file; note that it has a .csv extension but is read below as
# tab-separated.
file_path = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/output/yeast_master_matrix_sgd.csv'

# The first column of the file is used as the row index
df = pd.read_csv(file_path, sep='\t', index_col=0)

# Show the (rows, columns) shape of the loaded matrix
print(df.shape)

(6800, 11889)
