### Building Yeast Master Matrix From Microarray Expression Data

#### 0. Misc Explorations

##### GWEIGHTs

In [None]:
import os
import pandas as pd

# Location of the .pcl expression files to inspect
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_pcls'

# Names of the files whose GWEIGHT column holds anything other than 1
files_with_non_uniform_gweight = []

for file_name in os.listdir(pcl_directory):
    # Anything that is not a .pcl file is ignored
    if not file_name.endswith(".pcl"):
        continue

    file_path = os.path.join(pcl_directory, file_name)
    try:
        df = pd.read_csv(file_path, sep="\t", index_col=0)

        # Files without a GWEIGHT column are never flagged
        if 'GWEIGHT' not in df.columns:
            continue

        gweight_is_one = df['GWEIGHT'] == 1
        if not gweight_is_one.all():
            files_with_non_uniform_gweight.append(file_name)

    except Exception as e:
        # A file that cannot be read or checked is reported and skipped
        print(f"Error processing file '{file_name}': {e}")

# Summary: either the list of offending files or an all-clear message
if not files_with_non_uniform_gweight:
    print("All files have GWEIGHT values uniformly equal to 1.")
else:
    print("Files with non-uniform GWEIGHT values:")
    print("\n".join(files_with_non_uniform_gweight))

##### Total Number of Columns

In [None]:
import os
import pandas as pd

# Location of the .pcl expression files to inspect
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_pcls'

# Columns that carry annotations rather than experiment measurements
metadata_columns = ['GWEIGHT', 'NAME', 'IDENTIFIER', 'Description']

# Running total of experiment columns over every readable .pcl file
total_experiment_columns = 0

for file_name in os.listdir(pcl_directory):
    if not file_name.endswith(".pcl"):
        continue

    file_path = os.path.join(pcl_directory, file_name)
    try:
        df = pd.read_csv(file_path, sep="\t", index_col=0)

        # Every column that is not a known metadata column counts as an experiment
        total_experiment_columns += sum(
            1 for col in df.columns if col not in metadata_columns
        )

    except Exception as e:
        # Unreadable files contribute nothing to the total
        print(f"Error processing file '{file_name}': {e}")

print(f"Total number of experiment columns across all files: {total_experiment_columns}")

##### Files with More Genes than Genome

In [None]:
import os
import pandas as pd

# Directory containing .pcl files
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_pcls'

# Threshold for the size of the yeast genome
threshold = 7337

# Counter for files with unique rows exceeding the threshold
count_exceeding_files = 0

# Iterate over each .pcl file in the directory
for file_name in os.listdir(pcl_directory):
    if file_name.endswith(".pcl"):
        file_path = os.path.join(pcl_directory, file_name)
        try:
            # Load the .pcl file; the first column becomes the row index
            df = pd.read_csv(file_path, sep="\t", index_col=0)

            # Number of distinct rows.
            # NOTE(review): drop_duplicates compares column values only; the
            # identifier held in the index is not part of the comparison, so
            # this is a count of distinct value rows, not of distinct genes.
            unique_count = df.drop_duplicates().shape[0]

            if unique_count > threshold:
                count_exceeding_files += 1

                # Count rows whose identifier starts with 'SGD'. The cast to
                # str keeps non-string labels (e.g. NaN) from raising.
                sgd_count = int(df.index.astype(str).str.startswith('SGD').sum())

                # Report the SGD count alongside the unique-row count; it was
                # previously computed and then discarded.
                print(
                    f"File: {file_name}, Number of unique rows: {unique_count}, "
                    f"Rows with 'SGD' identifiers: {sgd_count}"
                )

        except Exception as e:
            # A file that cannot be read or analysed is reported and skipped
            print(f"Error processing file '{file_name}': {e}")

# Print the total count of files exceeding the threshold
print(f"Number of files with unique rows larger than {threshold}: {count_exceeding_files}")

#### 1. Normalizing Columns in Each .pcl File

In [None]:
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Directory containing .pcl files
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_pcls'

# Annotation columns that must not be normalized. This is the same list the
# column-count exploration excludes; with only 'GWEIGHT' and 'NAME' excluded,
# a file carrying 'IDENTIFIER' or 'Description' text fails the float cast and
# is left un-normalized.
non_experiment_columns = ['GWEIGHT', 'NAME', 'IDENTIFIER', 'Description']

files_processed = 0

# Iterate over each .pcl file in the directory
for file_name in os.listdir(pcl_directory):
    if file_name.endswith(".pcl"):
        file_path = os.path.join(pcl_directory, file_name)
        try:
            # Load the .pcl file; the first column becomes the row index
            df = pd.read_csv(file_path, sep="\t", index_col=0)

            # Everything that is not an annotation column is an experiment
            experiment_columns = [
                col for col in df.columns if col not in non_experiment_columns
            ]

            # Cast the experiment columns to float32
            df[experiment_columns] = df[experiment_columns].astype('float32')

            # Fit a fresh scaler per file so statistics never leak across files
            scaler = StandardScaler()

            # Apply z-score normalization to each experiment column
            df[experiment_columns] = scaler.fit_transform(df[experiment_columns])

            # NOTE: the source file is overwritten in place; the
            # un-normalized values are not kept.
            df.to_csv(file_path, sep="\t")

            files_processed += 1

        except Exception as e:
            # The file is left untouched when any step above fails
            print(f"Error processing file '{file_name}': {e}")

print(f"Processed {files_processed} files.")

#### 2. Translating All Indexes to YORFs

##### Dealing with SGD Indexes

In [None]:
import os
import pandas as pd

# Directory containing .pcl files and the all_yeast_genes.tsv file
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_pcls'
genes_file = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_yeast_genes.tsv'

# Build the lookup: 'Gene > Primary DBID' -> 'Gene > Systematic Name'
genes_df = pd.read_csv(genes_file, sep='\t')
gene_mapping = dict(zip(genes_df['Gene > Primary DBID'], genes_df['Gene > Systematic Name']))


def _translate_sgd_label(label):
    """Return the mapped name for an 'SGD…' label, or the label unchanged.

    Non-string labels (e.g. NaN produced by a blank identifier cell) and
    'SGD…' labels with no entry in ``gene_mapping`` are passed through as-is.
    """
    if isinstance(label, str) and label.startswith('SGD'):
        return gene_mapping.get(label, label)
    return label


files_processed = 0

# Iterate over each .pcl file in the directory
for file_name in os.listdir(pcl_directory):
    if file_name.endswith(".pcl"):
        file_path = os.path.join(pcl_directory, file_name)
        try:
            # Load the .pcl file; the first column becomes the row index
            df = pd.read_csv(file_path, sep="\t", index_col=0)

            # Replace index values that start with 'SGD' using the mapping.
            # The isinstance guard in the helper keeps one non-string label
            # from aborting the translation of the whole file.
            df.index = df.index.to_series().apply(_translate_sgd_label)

            # Save the modified DataFrame back to the same file
            df.to_csv(file_path, sep="\t")

            files_processed += 1

        except Exception as e:
            # The file is left untouched when loading or translating fails
            print(f"Error processing file '{file_name}': {e}")

print(f"Processed {files_processed} files.")

In [None]:
import os
import pandas as pd

# Location of the .pcl expression files to verify
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_pcls'

# Tallies: files read successfully, and files still holding 'SGD…' labels
files_checked = 0
sgd_found = 0

for file_name in os.listdir(pcl_directory):
    if not file_name.endswith(".pcl"):
        continue

    file_path = os.path.join(pcl_directory, file_name)
    try:
        df = pd.read_csv(file_path, sep="\t", index_col=0)

        # Flag the file when at least one row label begins with 'SGD'
        has_sgd_label = df.index.str.startswith('SGD').any()
        if has_sgd_label:
            print(f"Found 'SGD' in indices of file: {file_name}")
            sgd_found += 1

        # Only files that were read and inspected without error are counted
        files_checked += 1

    except Exception as e:
        print(f"Error processing file '{file_name}': {e}")

print(f"Checked {files_checked} files.")
print(f"Found 'SGD' indices in {sgd_found} files.")

##### Dealing with Standard Indexes

In [None]:
import os
import pandas as pd

# Paths
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_pcls'
yeast_genes_file = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_yeast_genes_v2.tsv' # Contains some manual additions

# Load the yeast genes data
yeast_genes_df = pd.read_csv(yeast_genes_file, sep='\t')

# Keep only genes that have BOTH names, dropping incomplete rows from the two
# columns together. Calling dropna() on each column separately and zipping the
# results shifts the pairing as soon as one gene lacks a standard name, which
# maps standard names to the wrong systematic names.
name_pairs = yeast_genes_df[['Gene > Standard Name', 'Gene > Systematic Name']].dropna()

# Map Gene > Standard Name to Gene > Systematic Name (case-insensitive keys)
standard_to_systematic = {
    standard.upper(): systematic.upper()
    for standard, systematic in zip(
        name_pairs['Gene > Standard Name'],
        name_pairs['Gene > Systematic Name']
    )
}

# Iterate over each .pcl file in the directory
for file_name in os.listdir(pcl_directory):
    if file_name.endswith(".pcl"):
        pcl_file_path = os.path.join(pcl_directory, file_name)
        try:
            # Load the PCL file; the first column becomes the row index
            pcl_df = pd.read_csv(pcl_file_path, sep='\t', index_col=0)

            # Set once at least one label has been translated
            modified = False

            # Build the translated index label by label
            new_index = []
            for index in pcl_df.index:
                # Non-string labels (e.g. NaN) cannot be standard names; keep
                # them as they are instead of failing the whole file.
                upper_index = index.upper() if isinstance(index, str) else None
                if upper_index in standard_to_systematic:
                    # Known standard name: use its systematic name
                    new_index.append(standard_to_systematic[upper_index])
                    modified = True
                else:
                    # No mapping exists: keep the label as is
                    new_index.append(index)

            # Rewrite the file only when something actually changed
            if modified:
                # Preserve the index name so the identifier column keeps its
                # header when the file is written back.
                pcl_df.index = pd.Index(new_index, name=pcl_df.index.name)

                # Overwrite the original file
                pcl_df.to_csv(pcl_file_path, sep='\t')

                print(f"File '{file_name}' updated successfully.")

        except Exception as e:
            # The file is left untouched when loading or translating fails
            print(f"Error processing file '{file_name}': {e}")

##### Dealing with the Rest

In [10]:
import os
import pandas as pd
from collections import Counter

# Input locations
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_pcls'
yeast_genes_file = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_yeast_genes_rest_of_problematic_update.tsv'

# Reference table of yeast genes
yeast_genes_df = pd.read_csv(yeast_genes_file, sep='\t')

# Set of accepted systematic names (a set keeps membership tests cheap)
valid_systematic_names = set(yeast_genes_df['Gene > Systematic Name'])

# Occurrences of each unrecognised row label, accumulated over all files
problematic_gene_counts = Counter()

for file_name in os.listdir(pcl_directory):
    if not file_name.endswith(".pcl"):
        continue

    pcl_file_path = os.path.join(pcl_directory, file_name)
    try:
        pcl_df = pd.read_csv(pcl_file_path, sep='\t', index_col=0)

        # Row labels that are not accepted systematic names
        problematic_indexes = []
        for index in pcl_df.index:
            if index not in valid_systematic_names:
                problematic_indexes.append(index)

        # Only files with a handful (1 to 5) of bad labels are listed in full
        if 0 < len(problematic_indexes) <= 5:
            print(f"File: {file_name}")
            print(f"Problematic indexes: {problematic_indexes}")
            print('-----------------------------------')

        # Every problematic label counts towards the tally, listed or not
        problematic_gene_counts.update(problematic_indexes)

    except Exception as e:
        print(f"Error processing file '{file_name}': {e}")

# Show the 100 most frequent problematic labels
for gene, count in problematic_gene_counts.most_common(100):
    print(f"{gene}: {count} occurrences")


YERWomega2: 19 occurrences
YGRWTy2-2: 18 occurrences
YLRCTy2-2: 18 occurrences
YLRCTy1-1: 18 occurrences
YMRCTy1-4: 18 occurrences
YORCTy2-1: 18 occurrences
YPLCTy4-1: 18 occurrences
YPLWTy1-1: 18 occurrences
YILWTy3-1: 18 occurrences
YGRCTy1-3: 18 occurrences
YBRWTy1-2: 18 occurrences
YDRWTy1-5: 18 occurrences
YFLWTy2-1: 18 occurrences
YLRWTy1-3: 18 occurrences
YOLWTy1-1: 18 occurrences
YLRWTy1-2: 18 occurrences
YGRWTy1-1: 18 occurrences
YBLWTy1-1: 18 occurrences
YPRWTy1-3: 18 occurrences
YDRCTy1-3: 18 occurrences
YJLWTy4-1: 18 occurrences
YNLCTy2-1: 18 occurrences
YJRWTy1-2: 18 occurrences
YDRCTy1-1: 18 occurrences
YPRCTy1-4: 18 occurrences
YPRCTy1-2: 18 occurrences
YMLWTy1-2: 18 occurrences
YDRCTy2-1: 18 occurrences
YARCTy1-1: 18 occurrences
YDRWTy2-2: 18 occurrences
YNLWTy1-2: 18 occurrences
YNLCTy1-1: 18 occurrences
YDRWTy1-4: 18 occurrences
YGRCTy1-2: 18 occurrences
YBLWTy2-1: 18 occurrences
YORWTy2-2: 18 occurrences
YCLWTy2-1: 18 occurrences
YORWTy1-2: 18 occurrences
YGRWTy3-1: 

###### For now, remove a problematic index when a row with a valid YORF and identical values already exists

In [None]:
import os
import pandas as pd

# Paths to directories and files
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_pcls'
gene_mapping_path = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_yeast_genes.tsv'

# Load the valid systematic names from the mapping file
gene_mapping_df = pd.read_csv(gene_mapping_path, sep='\t')
valid_systematic_names = set(gene_mapping_df['Gene > Systematic Name'].dropna())

# List the directory once; the total is only used in the progress messages
directory_entries = os.listdir(pcl_directory)
total_entries = len(directory_entries)

# Process each .pcl file (file_count numbers every directory entry)
for file_count, file_name in enumerate(directory_entries, start=1):
    if file_name.endswith(".pcl"):
        file_path = os.path.join(pcl_directory, file_name)
        try:
            # Load the PCL file; the first column becomes the row index
            pcl_df = pd.read_csv(file_path, sep='\t', index_col=0)

            # Row positions whose label is not a valid systematic name.
            # Working by position rather than by label keeps duplicated
            # problematic labels from breaking the comparison below.
            problem_positions = [
                position for position, index in enumerate(pcl_df.index)
                if index not in valid_systematic_names
            ]
            problematic_indexes = [pcl_df.index[position] for position in problem_positions]

            if problem_positions:
                print(f"Processing file ({file_count}/{total_entries}): {file_name}")
                print(f"Problematic indexes: {problematic_indexes}")

                # Loop-invariant pieces, computed once per file: the values
                # without annotation columns, and the subset of rows that
                # carry a valid systematic name.
                experiment_values = pcl_df.drop(columns=['NAME', 'GWEIGHT'], errors='ignore')
                valid_values = experiment_values[pcl_df.index.isin(valid_systematic_names)]

                # A problematic row is dropped when some valid row holds
                # exactly the same values. NaN never compares equal, so rows
                # with missing values are never matched.
                keep_row = [True] * len(pcl_df)
                for position in problem_positions:
                    problem_row = experiment_values.iloc[position]
                    if (valid_values == problem_row).all(axis=1).any():
                        keep_row[position] = False

                # Remove the matched rows and overwrite the file
                if not all(keep_row):
                    pcl_df = pcl_df.loc[keep_row]
                    pcl_df.to_csv(file_path, sep='\t')
                    print(f"Fixed and overwritten file ({file_count}/{total_entries}): {file_name}")
                else:
                    print(f"No matching rows found for problematic indexes in file ({file_count}/{total_entries}): {file_name}")

        except Exception as e:
            # The file is left untouched when any step above fails
            print(f"Error processing file ({file_count}/{total_entries}): {file_name}: {e}")

###### Apparently, most of the remaining problematic genes are "LTRs". Remove them

In [None]:
import os
import pandas as pd

# Location of the .pcl expression files to clean
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_pcls'

# Substrings that mark a row label for removal (matched case-insensitively)
ltr_markers = ('delta', 'sigma', 'tau', 'omega')

# file_count numbers every directory entry, not only the .pcl files
for file_count, file_name in enumerate(os.listdir(pcl_directory), start=1):
    if not file_name.endswith(".pcl"):
        continue

    file_path = os.path.join(pcl_directory, file_name)
    try:
        pcl_df = pd.read_csv(file_path, sep='\t', index_col=0)

        # Collect every label that contains one of the marker substrings
        rows_to_remove = []
        for index in pcl_df.index:
            if any(marker in index.lower() for marker in ltr_markers):
                rows_to_remove.append(index)

        # Nothing matched: report it and move on without touching the file
        if not rows_to_remove:
            print(f"No rows to remove in file ({file_count}/{len(os.listdir(pcl_directory))}): {file_name}")
            continue

        print(f"Processing file ({file_count}/{len(os.listdir(pcl_directory))}): {file_name}")
        print(f"Rows to remove: {rows_to_remove}")

        # Drop the matched rows and write the result over the source file
        pcl_df = pcl_df.drop(index=rows_to_remove)
        pcl_df.to_csv(file_path, sep='\t')
        print(f"Fixed and overwritten file ({file_count}/{len(os.listdir(pcl_directory))}): {file_name}")

    except Exception as e:
        print(f"Error processing file ({file_count}/{len(os.listdir(pcl_directory))}): {file_name}: {e}")

###### For simplicity, discard the remaining problematic indexes, since resolving them would require looking each one up manually

In [None]:
import os
import pandas as pd

# Input locations
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_pcls'
yeast_genes_file = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_yeast_genes.tsv'

# Reference table of yeast genes
yeast_genes_df = pd.read_csv(yeast_genes_file, sep='\t')

# Upper-cased systematic names, so the membership test ignores case
upper_systematic_names = yeast_genes_df['Gene > Systematic Name'].str.upper()
valid_systematic_names = set(upper_systematic_names)

for file_name in os.listdir(pcl_directory):
    if not file_name.endswith(".pcl"):
        continue

    pcl_file_path = os.path.join(pcl_directory, file_name)
    try:
        pcl_df = pd.read_csv(pcl_file_path, sep='\t', index_col=0)

        # Labels whose upper-cased form is not a known systematic name
        invalid_indexes = []
        for index in pcl_df.index:
            if index.upper() not in valid_systematic_names:
                invalid_indexes.append(index)

        # Files with only valid labels are left untouched
        if not invalid_indexes:
            continue

        # Remove the invalid rows and write the result over the source file
        pcl_df = pcl_df.drop(index=invalid_indexes)
        pcl_df.to_csv(pcl_file_path, sep='\t')
        print(f"File '{file_name}' updated: Removed {len(invalid_indexes)} invalid rows.")

    except Exception as e:
        print(f"Error processing file '{file_name}': {e}")

#### 3. Dealing with Duplicates

In [26]:
import os
import pandas as pd

# Paths
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_pcls'

# Iterate over each .pcl file in the directory
for file_name in os.listdir(pcl_directory):
    if file_name.endswith(".pcl"):
        pcl_file_path = os.path.join(pcl_directory, file_name)
        try:
            # Load the PCL file; the first column becomes the row index
            pcl_df = pd.read_csv(pcl_file_path, sep='\t', index_col=0)

            # Labels that occur more than once, each listed a single time
            duplicate_indexes = pcl_df.index[pcl_df.index.duplicated()].unique()

            if len(duplicate_indexes) > 0:
                print(f"File: {file_name}")
                print(f"Number of duplicate indexes: {len(duplicate_indexes)}")

                # Check whether the rows sharing each duplicated label agree
                for dup_index in duplicate_indexes:
                    duplicate_rows = pcl_df.loc[dup_index]

                    # .loc yields a DataFrame for a repeated label; a Series
                    # would mean the label matched one row only
                    num_duplicates = len(duplicate_rows) if isinstance(duplicate_rows, pd.DataFrame) else 1

                    # Call out labels that occur three or more times
                    if num_duplicates >= 3:
                        print(f"  - Gene '{dup_index}' has {num_duplicates} duplicates.")

                    if isinstance(duplicate_rows, pd.DataFrame):
                        experiment_columns = duplicate_rows.drop(columns=['NAME', 'GWEIGHT'], errors='ignore')

                        # The rows are identical exactly when every column
                        # holds a single distinct value. NaN is counted as a
                        # value (dropna=False) so an all-missing column is not
                        # mistaken for a mismatch, and the test is applied per
                        # column: summing the counts lets a column with 0 and
                        # a column with 2 distinct values cancel out.
                        rows_match = bool((experiment_columns.nunique(dropna=False) == 1).all())

                        if rows_match:
                            print(f"    - Duplicate index '{dup_index}' has matching rows.")
                        else:
                            print(f"    - Duplicate index '{dup_index}' has non-matching rows.")
                    else:
                        print(f"  - Duplicate index '{dup_index}' has only one row (unexpected).")

                print('\n')

        except Exception as e:
            # A file that cannot be read or analysed is reported and skipped
            print(f"Error processing file '{file_name}': {e}")


##### Removing direct duplicates

In [None]:
import os
import pandas as pd

# Paths
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_pcls'

# Iterate over each .pcl file in the directory
for file_name in os.listdir(pcl_directory):
    if file_name.endswith(".pcl"):
        pcl_file_path = os.path.join(pcl_directory, file_name)
        try:
            # Load the PCL file; the first column becomes the row index
            pcl_df = pd.read_csv(pcl_file_path, sep='\t', index_col=0)

            # A "direct" duplicate repeats BOTH the row label and every
            # experiment value (NAME and GWEIGHT are ignored). Deduplicating
            # on the label alone would also discard rows that share a label
            # but hold different measurements.
            experimental_columns = pcl_df.drop(columns=['NAME', 'GWEIGHT'], errors='ignore')

            # Put the label next to the values so both enter the comparison
            comparison_key = experimental_columns.reset_index(drop=True)
            comparison_key.insert(0, '__row_label__', pcl_df.index.to_numpy())

            # True for every repeat after the first occurrence; missing
            # values in the same positions are treated as equal
            is_direct_duplicate = comparison_key.duplicated(keep='first').to_numpy()
            deduplicated_df = pcl_df.loc[~is_direct_duplicate]

            # Save the deduplicated DataFrame back to the same file.
            # Rows that share a label but differ in value are kept and still
            # need to be resolved separately.
            deduplicated_df.to_csv(pcl_file_path, sep='\t')
            print(f"Duplicates removed for file: {file_name}")

        except Exception as e:
            # The file is left untouched when loading or deduplicating fails
            print(f"Error processing file '{file_name}': {e}")

In [28]:
import os
import pandas as pd

# Location of the .pcl expression files to verify
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_pcls'

for file_name in os.listdir(pcl_directory):
    if not file_name.endswith(".pcl"):
        continue

    pcl_file_path = os.path.join(pcl_directory, file_name)
    try:
        pcl_df = pd.read_csv(pcl_file_path, sep='\t', index_col=0)

        # Mask of repeated labels, then the distinct labels behind that mask
        repeated_label_mask = pcl_df.index.duplicated()
        duplicate_indexes = pcl_df.index[repeated_label_mask].unique()

        # Only files that still contain duplicated labels are reported;
        # clean files produce no output
        if len(duplicate_indexes) > 0:
            print(f"File: {file_name} has {len(duplicate_indexes)} duplicate indexes.")

    except Exception as e:
        print(f"Error processing file '{file_name}': {e}")