#### Normalizing each pcl file

In [None]:
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Z-score the experiment columns of every .pcl file, rewriting each file in place.
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/normalized_pcls'

# Annotation columns that are left untouched by the scaler
annotation_columns = ('GWEIGHT', 'NAME', 'IDENTIFIER', 'Description', 'GENE')

# Number of files that were normalized and written back without an error
files_processed = 0

for file_name in os.listdir(pcl_directory):
    # Anything that is not a .pcl file is ignored
    if not file_name.endswith(".pcl"):
        continue

    file_path = os.path.join(pcl_directory, file_name)
    try:
        # First column of the tab-separated file becomes the row index
        df = pd.read_csv(file_path, sep="\t", index_col=0)

        # Everything that is not an annotation column is treated as an experiment
        experiment_columns = [col for col in df.columns if col not in annotation_columns]

        # A fresh scaler per file: each column is standardized on its own
        df[experiment_columns] = StandardScaler().fit_transform(df[experiment_columns])

        # The normalized table replaces the original file
        df.to_csv(file_path, sep="\t")
    except Exception as e:
        print(f"Error processing file '{file_name}': {e}")
    else:
        files_processed += 1


print(f"Processed {files_processed} files.")

#### Merging

In [None]:
import os
import pandas as pd

# Build one master matrix by outer-joining the experiment columns of every .pcl file
# on the row index.
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/test_pcls'

# Annotation columns that are left out of the master matrix
annotation_columns = ('GWEIGHT', 'NAME', 'IDENTIFIER', 'Description')

# Starts empty; the first file that loads successfully replaces it
master_df = pd.DataFrame()

for file_name in os.listdir(pcl_directory):
    if not file_name.endswith(".pcl"):
        continue

    file_path = os.path.join(pcl_directory, file_name)
    try:
        df = pd.read_csv(file_path, sep="\t", index_col=0)

        # Restrict the frame to its experiment columns
        experiment_columns = [col for col in df.columns if col not in annotation_columns]
        df = df[experiment_columns]

        # Outer join keeps every row label seen in any file
        master_df = df if master_df.empty else master_df.join(df, how='outer')

    except Exception as e:
        print(f"Error processing file '{file_name}': {e}")

# Persist the combined matrix (tab-separated despite the .csv extension)
master_df.to_csv('/home/logs/jtorresb/Geneformer/yeast/yeast_data/output/test.csv', sep="\t")
print("Master matrix created and saved.")

In [None]:
import pandas as pd

# Report the dimensions and column summary of the merged master matrix.

# Path to the (tab-separated) master matrix file
csv_file_path = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/output/test.csv'

# Load the file, using its first column as the row index
df = pd.read_csv(csv_file_path, delimiter="\t", index_col=0)

# Get the number of rows and columns
num_rows, num_columns = df.shape

# Print the results
print(f"Number of rows: {num_rows}")
print(f"Number of columns: {num_columns}")

# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would emit a stray "None" line after the report.
df.info()


#### GWEIGHT

In [None]:
import os
import pandas as pd

# List every .pcl file whose GWEIGHT column contains a value other than 1.
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/normalized_pcls'

# Names of the files where GWEIGHT is not 1 on every row
files_with_non_uniform_gweight = []

for file_name in os.listdir(pcl_directory):
    if not file_name.endswith(".pcl"):
        continue

    file_path = os.path.join(pcl_directory, file_name)
    try:
        df = pd.read_csv(file_path, sep="\t", index_col=0)

        # Files without a GWEIGHT column are not flagged
        if 'GWEIGHT' in df.columns and not (df['GWEIGHT'] == 1).all():
            files_with_non_uniform_gweight.append(file_name)

    except Exception as e:
        print(f"Error processing file '{file_name}': {e}")

# Summary: either the offending files, one per line, or an all-clear message
if files_with_non_uniform_gweight:
    print("Files with non-uniform GWEIGHT values:")
    for flagged_name in files_with_non_uniform_gweight:
        print(flagged_name)
else:
    print("All files have GWEIGHT values uniformly equal to 1.")

#### Total number of columns

In [None]:
import os
import pandas as pd

# Add up the number of experiment columns over all .pcl files.
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/normalized_pcls'

# Annotation columns that do not count as experiments
annotation_columns = ('GWEIGHT', 'NAME', 'IDENTIFIER', 'Description')

# Running total over all files that could be read
total_experiment_columns = 0

for file_name in os.listdir(pcl_directory):
    if not file_name.endswith(".pcl"):
        continue

    file_path = os.path.join(pcl_directory, file_name)
    try:
        df = pd.read_csv(file_path, sep="\t", index_col=0)

        # One count per column that is not an annotation column
        total_experiment_columns += sum(
            1 for col in df.columns if col not in annotation_columns
        )

    except Exception as e:
        print(f"Error processing file '{file_name}': {e}")

print(f"Total number of experiment columns across all files: {total_experiment_columns}")


##### Type of columns

In [None]:
import os
import pandas as pd

# Cast the experiment columns of every .pcl file to float32 and rewrite the file in place.
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/normalized_pcls'

# Annotation columns whose dtype is left alone
annotation_columns = ('GWEIGHT', 'NAME', 'IDENTIFIER', 'Description')

for file_name in os.listdir(pcl_directory):
    if not file_name.endswith(".pcl"):
        continue

    file_path = os.path.join(pcl_directory, file_name)
    try:
        df = pd.read_csv(file_path, sep="\t", index_col=0)

        # Every column that is not an annotation column is an experiment column
        experiment_columns = [col for col in df.columns if col not in annotation_columns]

        # Down-cast the experiment values
        df[experiment_columns] = df[experiment_columns].astype('float32')

        # Overwrite the source file with the converted table
        df.to_csv(file_path, sep="\t")

    except Exception as e:
        print(f"Error processing file '{file_name}': {e}")

print("All files processed and updated.")

#### Checking for duplicate rows

In [None]:
import os
import pandas as pd

# Report which .pcl files contain a row label more than once, and how many such files exist.
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/test_pcls'

# Number of files with at least one repeated row label
duplicate_file_count = 0

for file_name in os.listdir(pcl_directory):
    if not file_name.endswith(".pcl"):
        continue

    file_path = os.path.join(pcl_directory, file_name)
    try:
        df = pd.read_csv(file_path, sep="\t", index_col=0)

        # A single repeated label in the index is enough to flag the file
        if df.index.duplicated().any():
            duplicate_file_count += 1
            print(f"File with duplicates: {file_name}")

    except Exception as e:
        print(f"Error processing file '{file_name}': {e}")

print(f"Total files with duplicate rows: {duplicate_file_count}")

In [None]:
import pandas as pd

# List the row labels that occur more than once in one specific .pcl file.

# Path to the specific .pcl file
pcl_file = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/test_pcls/2010.alpha_conc.pcl'

try:
    # Load the .pcl file, using its first column as the row index
    df = pd.read_csv(pcl_file, sep="\t", index_col=0)

    # Every occurrence of a label that appears more than once (keep=False marks them all)
    duplicated_genes = df.index[df.index.duplicated(keep=False)]

    # Test for "is there anything in the selection", not for the truthiness of the
    # labels themselves, which is what Index.any() would evaluate.
    if not duplicated_genes.empty:
        print(f"Duplicate genes in file '{pcl_file}':")
        print(duplicated_genes.unique())  # Print unique duplicate gene names
    else:
        print(f"No duplicate genes found in file '{pcl_file}'.")

except Exception as e:
    print(f"Error processing file '{pcl_file}': {e}")

#### Dealing with duplicate rows

##### Files with more genes than yeast genome

In [None]:
import os
import pandas as pd

# Flag .pcl files that hold more distinct rows than the expected yeast genome size and,
# for each flagged file, report how many row labels start with 'SGD'.

# Directory containing .pcl files
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/normalized_pcls'

# Threshold for the size of the yeast genome
threshold = 7337

# Counter for files with unique rows exceeding the threshold
count_exceeding_files = 0

# Iterate over each .pcl file in the directory
for file_name in os.listdir(pcl_directory):
    if file_name.endswith(".pcl"):
        file_path = os.path.join(pcl_directory, file_name)
        try:
            # Load the .pcl file
            df = pd.read_csv(file_path, sep="\t", index_col=0)

            # Number of rows left after dropping rows whose column values repeat.
            # NOTE(review): drop_duplicates() compares column values only, not the
            # row labels - confirm this is the intended notion of "unique rows".
            unique_count = df.drop_duplicates().shape[0]

            # Check if the unique count exceeds the threshold
            if unique_count > threshold:
                count_exceeding_files += 1

                # Rows whose label starts with 'SGD'; labels are cast to str so
                # non-string labels cannot break the prefix test.
                sgd_count = int(df.index.astype(str).str.startswith('SGD').sum())

                # Previously sgd_count was computed and then never reported
                print(f"File: {file_name}, Number of unique rows: {unique_count}, rows with 'SGD' index: {sgd_count}")

        except Exception as e:
            print(f"Error processing file '{file_name}': {e}")

# Print the total count of files exceeding the threshold
print(f"Number of files with unique rows larger than {threshold}: {count_exceeding_files}")

##### Checking if it's simply a matter of a row being repeated

In [5]:
import os
import pandas as pd

# For every row labelled with an SGD ID, translate the ID to its systematic name (YORF)
# and report the cases where that YORF labels more than one row of the same file.

# Directory containing .pcl files
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/normalized_pcls'
yeast_genes_file_path = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_yeast_genes.tsv'

# Load the yeast genes mapping file
yeast_genes_df = pd.read_csv(yeast_genes_file_path, sep='\t')

# Create a mapping from SGD IDs to Systematic Names (YORFs)
yorf_mapping = dict(zip(yeast_genes_df['Gene > Primary DBID'], yeast_genes_df['Gene > Systematic Name']))

# Process each .pcl file in the directory
for file_name in os.listdir(pcl_directory):
    if not file_name.endswith(".pcl"):
        continue

    pcl_file_path = os.path.join(pcl_directory, file_name)
    try:
        # Load the PCL file
        pcl_df = pd.read_csv(pcl_file_path, sep='\t', index_col=0)

        # Occurrences of every row label, computed once per file instead of
        # rescanning the whole index for each SGD row.
        label_counts = pcl_df.index.value_counts()

        # Only the labels are needed, so iterate the index rather than iterrows()
        for yorfs in pcl_df.index:
            # Non-string labels (e.g. a missing label read as NaN) cannot be SGD IDs;
            # skipping them keeps one odd label from aborting the whole file.
            if not (isinstance(yorfs, str) and yorfs.startswith('SGD')):
                continue

            # dict.get returns None for unknown IDs, so no KeyError handling is needed
            yorf = yorf_mapping.get(yorfs)

            if not yorf:
                print(f"SGD {yorfs} has no corresponding YORF in the mapping.")
                continue

            # Number of rows in this file labelled with the translated YORF
            matching_row_count = label_counts.get(yorf, 0)

            if matching_row_count > 1:
                print(f"\nProcessing file: {file_name}")
                print(f"YORF {yorf} (translated from {yorfs}) has {matching_row_count} associated rows in the PCL file.")

    except Exception as e:
        print(f"Error processing file '{file_name}': {e}")

##### Checking if all SGD rows are equal to the translated YORF row

In [None]:
import os
import pandas as pd

# For every row labelled with an SGD ID whose translated YORF is also present in the
# same file, check that the experiment values of the two rows are identical.

# Directory containing .pcl files
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/normalized_pcls'
yeast_genes_file_path = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_yeast_genes.tsv'

# Load the yeast genes mapping file
yeast_genes_df = pd.read_csv(yeast_genes_file_path, sep='\t')

# Create a mapping from SGD IDs to Systematic Names (YORFs)
yorf_mapping = dict(zip(yeast_genes_df['Gene > Primary DBID'], yeast_genes_df['Gene > Systematic Name']))

# List the directory once; the progress denominator counts every directory entry,
# not only the .pcl files.
directory_entries = os.listdir(pcl_directory)
total_entries = len(directory_entries)

# Process each .pcl file in the directory
for file_count, file_name in enumerate(directory_entries):
    if not file_name.endswith(".pcl"):
        continue

    pcl_file_path = os.path.join(pcl_directory, file_name)
    print(f"\nProcessing file {file_count + 1}/{total_entries}")

    try:
        # Load the PCL file
        pcl_df = pd.read_csv(pcl_file_path, sep='\t', index_col=0)

        # Experiment values only (NAME and GWEIGHT removed), computed once per file
        experiment_df = pcl_df.drop(columns=['NAME', 'GWEIGHT'], errors='ignore')

        # Walk the labels by position so repeated labels still address a single row
        for position, yorfs in enumerate(pcl_df.index):
            # Non-string labels cannot be SGD IDs
            if not (isinstance(yorfs, str) and yorfs.startswith('SGD')):
                continue

            # dict.get returns None for unknown IDs, so no KeyError handling is needed
            yorf = yorf_mapping.get(yorfs)

            if not yorf:
                print(f"SGD {yorfs} has no corresponding YORF in the mapping.")
                continue

            # Nothing to compare against when the YORF is absent from this file
            if yorf not in pcl_df.index:
                continue

            sgd_experiment_data = experiment_df.iloc[position]

            # A list key always yields a DataFrame, even for a single match. A scalar
            # key returns a DataFrame only when the YORF is repeated, which made the
            # Series-vs-DataFrame comparison report a mismatch unconditionally.
            yorf_rows = experiment_df.loc[[yorf]]

            # The SGD row must equal every row carrying the translated YORF
            all_equal = all(
                sgd_experiment_data.equals(yorf_rows.iloc[row_number])
                for row_number in range(len(yorf_rows))
            )

            if not all_equal:
                print(f"Mismatch in file '{file_name}' for YORF: {yorf} (translated from {yorfs})")

    except Exception as e:
        print(f"Error processing file '{file_name}': {e}")

##### Removing SGD Rows

In [None]:
import os
import pandas as pd

# Delete every row whose label starts with 'SGD' from each .pcl file, in place.

# Directory containing .pcl files
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/normalized_pcls'

# List the directory once; the progress denominator counts every directory entry,
# not only the .pcl files.
directory_entries = os.listdir(pcl_directory)
total_entries = len(directory_entries)

# Process each .pcl file in the directory
for file_count, file_name in enumerate(directory_entries):
    if not file_name.endswith(".pcl"):
        continue

    pcl_file_path = os.path.join(pcl_directory, file_name)

    try:
        # Load the PCL file
        pcl_df = pd.read_csv(pcl_file_path, sep='\t', index_col=0)

        # Labels are cast to str first so a missing or numeric label produces a plain
        # False instead of NaN (or an error) in the prefix test.
        is_sgd_row = pcl_df.index.astype(str).str.startswith('SGD')

        # Keep only the rows whose label does not start with 'SGD'
        filtered_df = pcl_df[~is_sgd_row]

        # Overwrite the original file with the filtered data
        filtered_df.to_csv(pcl_file_path, sep='\t')
        print(f"File '{file_name}' ({file_count + 1}/{total_entries}) updated successfully. Removed rows starting with 'SGD'.")

    except Exception as e:
        print(f"Error processing file '{file_name}': {e}")

#### Now we are close to the size of the genome, but some extra rows still remain

In [None]:
import os
import pandas as pd

# Report, per .pcl file, the row labels that are not known systematic gene names,
# then print how many distinct such labels were seen overall.
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/normalized_pcls'
yeast_genes_file = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_yeast_genes.tsv'

# Reference table of yeast genes
yeast_genes_df = pd.read_csv(yeast_genes_file, sep='\t')

# Set of systematic names, for constant-time membership tests
valid_systematic_names = set(yeast_genes_df['Gene > Systematic Name'])

# Union of the unrecognized labels over all files
all_problematic_genes = set()

for file_name in os.listdir(pcl_directory):
    if not file_name.endswith(".pcl"):
        continue

    pcl_file_path = os.path.join(pcl_directory, file_name)
    try:
        pcl_df = pd.read_csv(pcl_file_path, sep='\t', index_col=0)

        # Labels of this file that are missing from the reference set
        problematic_indexes = [
            index for index in pcl_df.index
            if index not in valid_systematic_names
        ]

        # Files without unrecognized labels produce no output
        if not problematic_indexes:
            continue

        print(f"File: {file_name}")
        print(f"Problematic indexes: {problematic_indexes}")
        all_problematic_genes.update(problematic_indexes)
        print('-----------------------------------')

    except Exception as e:
        print(f"Error processing file '{file_name}': {e}")

print(len(all_problematic_genes))

##### For simplicity, let's just discard the remaining rows, since we would otherwise need to look each one up manually

In [None]:
import os
import pandas as pd

# Drop, from every .pcl file, the rows whose label (compared case-insensitively) is not
# a known systematic gene name; only files that lose rows are rewritten.
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/normalized_pcls'
yeast_genes_file = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_yeast_genes.tsv'

# Reference table of yeast genes
yeast_genes_df = pd.read_csv(yeast_genes_file, sep='\t')

# Upper-cased systematic names, so the membership test ignores case
valid_systematic_names = set(yeast_genes_df['Gene > Systematic Name'].str.upper())

for file_name in os.listdir(pcl_directory):
    if not file_name.endswith(".pcl"):
        continue

    pcl_file_path = os.path.join(pcl_directory, file_name)
    try:
        pcl_df = pd.read_csv(pcl_file_path, sep='\t', index_col=0)

        # One entry per row whose upper-cased label is unknown
        invalid_indexes = [index for index in pcl_df.index if index.upper() not in valid_systematic_names]

        # Nothing to remove: leave the file untouched
        if not invalid_indexes:
            continue

        pcl_df = pcl_df.drop(index=invalid_indexes)

        # Replace the file with the reduced table
        pcl_df.to_csv(pcl_file_path, sep='\t')
        print(f"File '{file_name}' updated: Removed {len(invalid_indexes)} invalid rows.")

    except Exception as e:
        print(f"Error processing file '{file_name}': {e}")

##### Problem: if not treated, master matrix would have two different names for the same gene and the matrix will grow

In [None]:
import os
import pandas as pd

# Look for the row label 'YJR084W' and stop at the first .pcl file that contains it.
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/normalized_pcls'

# Becomes True as soon as one file contains the label
found = False

for file_name in os.listdir(pcl_directory):
    if not file_name.endswith(".pcl"):
        continue

    file_path = os.path.join(pcl_directory, file_name)
    try:
        pcl_df = pd.read_csv(file_path, sep="\t", index_col=0)
    except Exception as e:
        print(f"Error processing file {file_name}: {e}")
        continue

    # Files without the label are skipped silently
    if 'YJR084W' not in pcl_df.index:
        continue

    print(f"Found 'YJR084W' in file: {file_name}")
    found = True
    break  # one hit is enough

# Reached the end of the directory without a hit
if not found:
    print("'YJR084W' not found in any file.")

##### Let's look for repeated YORF rows

In [None]:
import os
import pandas as pd

# List, for every .pcl file, the row labels that occur more than once.

# Path to the normalized folder
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/normalized_pcls'

# Set to True as soon as any file with repeated row labels is seen
problem_found = False

# Process each .pcl file in the folder
for file_name in os.listdir(pcl_directory):
    if file_name.endswith(".pcl"):
        file_path = os.path.join(pcl_directory, file_name)
        try:
            # Load the PCL file
            pcl_df = pd.read_csv(file_path, sep="\t", index_col=0)

            # Second and later occurrences of each repeated label
            duplicate_indexes = pcl_df.index[pcl_df.index.duplicated()].tolist()
            if duplicate_indexes:
                print(f"Problematic file: {file_name}")
                print(f"Duplicate indexes: {duplicate_indexes}")
                # Record the finding but keep scanning so every file is reported.
                # Without this the all-clear message below was printed even
                # when duplicates had just been listed.
                problem_found = True

        except Exception as e:
            print(f"Error processing file {file_name}: {e}")

# Only claim uniqueness when no file was flagged
if not problem_found:
    print("All files have unique row indexes.")


##### Apparently rows are repeated again: the corresponding YORF row contains the same values

In [None]:
import os
import pandas as pd

# For each .pcl file, count the rows with an unrecognized label that have no other row,
# labelled with something starting with 'Y', carrying exactly the same experiment values.

# Paths
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/normalized_pcls'
yeast_genes_file = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_yeast_genes.tsv'

# Load the yeast genes data
yeast_genes_df = pd.read_csv(yeast_genes_file, sep='\t')

# Extract valid Gene > Systematic Names
valid_systematic_names = set(yeast_genes_df['Gene > Systematic Name'])

# List the directory once; the progress denominator counts every directory entry,
# not only the .pcl files.
directory_entries = os.listdir(pcl_directory)
total_entries = len(directory_entries)

# Iterate over each .pcl file in the directory
for file_count, file_name in enumerate(directory_entries):
    if not file_name.endswith(".pcl"):
        continue

    pcl_file_path = os.path.join(pcl_directory, file_name)
    try:
        # Load the PCL file
        pcl_df = pd.read_csv(pcl_file_path, sep='\t', index_col=0)

        # Loop-invariant pieces, computed once per file:
        # experiment values only (NAME and GWEIGHT removed) ...
        experiment_df = pcl_df.drop(columns=['NAME', 'GWEIGHT'], errors='ignore')
        # ... and which row labels start with 'Y' (labels cast to str so that a
        # missing label yields False rather than NaN)
        starts_with_y = pcl_df.index.astype(str).str.startswith('Y')

        # Problematic rows for which no matching 'Y' row exists
        unsatisfied_genes = []

        # Address rows by position so a repeated problematic label still selects one row
        for position, label in enumerate(pcl_df.index):
            if label in valid_systematic_names:
                continue

            # Rows whose experiment values all equal those of the problematic row.
            # NOTE(review): '==' never treats NaN as equal to NaN, so a row with
            # missing values cannot match anything - confirm that is acceptable.
            same_values = (experiment_df == experiment_df.iloc[position]).all(axis=1).to_numpy()

            candidates = starts_with_y & same_values
            # A problematic label that itself starts with 'Y' must not count as
            # its own match.
            candidates[position] = False

            # If no matching row is found, add to unsatisfied list
            if not candidates.any():
                unsatisfied_genes.append(label)

        # If there are unsatisfied genes, print the file and how many there are
        if unsatisfied_genes:
            print(f"File ({file_count +1}/{total_entries}): {file_name}")
            print(f"Problematic genes without matching 'Y' rows: {len(unsatisfied_genes)}")
            print('\n')

    except Exception as e:
        print(f"Error processing file '{file_name}': {e}")

##### For now, let's remove the problematic index if YORF available and same values

In [None]:
import os
import pandas as pd

# In every .pcl file, remove each row with an unrecognized label when another row with a
# valid systematic name holds exactly the same experiment values; rewrite changed files.

# Paths to directories and files
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/normalized_pcls'
gene_mapping_path = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_yeast_genes.tsv'

# Load the valid systematic names from the mapping file
gene_mapping_df = pd.read_csv(gene_mapping_path, sep='\t')
valid_systematic_names = set(gene_mapping_df['Gene > Systematic Name'].dropna())

# List the directory once; the progress denominator counts every directory entry,
# not only the .pcl files.
directory_entries = os.listdir(pcl_directory)
total_entries = len(directory_entries)

# Process each .pcl file
for file_count, file_name in enumerate(directory_entries, start=1):
    if not file_name.endswith(".pcl"):
        continue

    file_path = os.path.join(pcl_directory, file_name)
    try:
        # Load the PCL file
        pcl_df = pd.read_csv(file_path, sep='\t', index_col=0)

        # Identify problematic indexes (not in valid systematic names)
        problematic_indexes = [
            index for index in pcl_df.index if index not in valid_systematic_names
        ]

        # Files without problematic labels are left alone and produce no output
        if not problematic_indexes:
            continue

        print(f"Processing file ({file_count}/{total_entries}): {file_name}")
        print(f"Problematic indexes: {problematic_indexes}")

        # Loop-invariant pieces, computed once per file:
        # experiment values only (NAME and GWEIGHT removed) ...
        experiment_df = pcl_df.drop(columns=['NAME', 'GWEIGHT'], errors='ignore')
        # ... and which rows carry a valid systematic name
        has_valid_name = pcl_df.index.isin(valid_systematic_names)

        # One flag per row position. Working by position means a repeated
        # problematic label is judged row by row rather than raising on a
        # multi-row .loc lookup or being dropped wholesale by label.
        drop_flags = [False] * len(pcl_df)
        for position, label in enumerate(pcl_df.index):
            if label in valid_systematic_names:
                continue

            # Rows whose experiment values all equal those of the problematic row.
            # NOTE(review): '==' never treats NaN as equal to NaN, so a row with
            # missing values is never considered matched - confirm that is intended.
            same_values = (experiment_df == experiment_df.iloc[position]).all(axis=1).to_numpy()

            # If a matching valid-name row exists, mark this row for removal
            if (has_valid_name & same_values).any():
                drop_flags[position] = True

        # Remove the marked rows and overwrite the file
        if any(drop_flags):
            pcl_df = pcl_df.loc[[not flagged for flagged in drop_flags]]
            pcl_df.to_csv(file_path, sep='\t')
            print(f"Fixed and overwritten file ({file_count}/{total_entries}): {file_name}")
        else:
            print(f"No matching rows found for problematic indexes in file ({file_count}/{total_entries}): {file_name}")

    except Exception as e:
        print(f"Error processing file ({file_count}/{total_entries}): {file_name}: {e}")
##### For those in Standard Name, replace by YORF

In [None]:
import os
import pandas as pd

# Rename rows labelled with a gene's standard name to the gene's systematic name (YORF)
# and rewrite every .pcl file in which at least one label was replaced.

# Paths
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/normalized_pcls'
yeast_genes_file = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_yeast_genes.tsv'

# Load the yeast genes data
yeast_genes_df = pd.read_csv(yeast_genes_file, sep='\t')

# Keep only the genes that have BOTH names, dropping whole rows. Calling dropna() on
# each column separately and zipping the results shifts one column against the other
# as soon as a gene lacks one of the names, pairing standard names with the
# systematic names of unrelated genes.
named_genes = yeast_genes_df.dropna(subset=['Gene > Standard Name', 'Gene > Systematic Name'])

# Dictionary mapping Gene > Standard Name to Gene > Systematic Name (case-insensitive)
standard_to_systematic = {
    standard.upper(): systematic.upper()
    for standard, systematic in zip(
        named_genes['Gene > Standard Name'],
        named_genes['Gene > Systematic Name']
    )
}

# Iterate over each .pcl file in the directory
for file_name in os.listdir(pcl_directory):
    if not file_name.endswith(".pcl"):
        continue

    pcl_file_path = os.path.join(pcl_directory, file_name)
    try:
        # Load the PCL file
        pcl_df = pd.read_csv(pcl_file_path, sep='\t', index_col=0)

        # Flag to check if the file was modified
        modified = False

        # Create a new index list
        new_index = []
        for index in pcl_df.index:
            upper_index = index.upper()

            # Labels starting with 'Y' or 'Q' are taken to be systematic names already.
            # NOTE(review): a standard name that itself starts with 'Y' or 'Q' is
            # therefore never translated - confirm this heuristic is acceptable.
            if upper_index.startswith(('Y', 'Q')) or upper_index not in standard_to_systematic:
                # Keep the label as is
                new_index.append(index)
            else:
                # Replace the label with the corresponding Gene > Systematic Name
                new_index.append(standard_to_systematic[upper_index])
                modified = True

        # Update the index of the DataFrame if modified
        if modified:
            # Carry the index name over; assigning a bare list would discard it and
            # blank the header of the first column in the rewritten file.
            pcl_df.index = pd.Index(new_index, name=pcl_df.index.name)

            # Overwrite the original file
            pcl_df.to_csv(pcl_file_path, sep='\t')

            print(f"File '{file_name}' updated successfully.")

    except Exception as e:
        print(f"Error processing file '{file_name}': {e}")

#### Now all indexes are correct YORFs, but we need to deal with duplicates

In [26]:
import os
import pandas as pd

# For every .pcl file with repeated row labels, report how many labels are repeated and,
# per repeated label, whether all of its rows carry identical experiment values.

# Paths
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/normalized_pcls'

# Iterate over each .pcl file in the directory
for file_name in os.listdir(pcl_directory):
    if file_name.endswith(".pcl"):
        pcl_file_path = os.path.join(pcl_directory, file_name)
        try:
            # Load the PCL file
            pcl_df = pd.read_csv(pcl_file_path, sep='\t', index_col=0)

            # Distinct labels that occur more than once
            duplicate_indexes = pcl_df.index[pcl_df.index.duplicated()].unique()

            if len(duplicate_indexes) > 0:
                print(f"File: {file_name}")
                print(f"Number of duplicate indexes: {len(duplicate_indexes)}")

                # Check if the rows match for each duplicate index
                for dup_index in duplicate_indexes:
                    # A list key always returns a DataFrame, so no scalar/Series
                    # special case is needed.
                    duplicate_rows = pcl_df.loc[[dup_index]]

                    # Count occurrences of the duplicate index
                    num_duplicates = len(duplicate_rows)

                    # Handle duplicates with 3 or more occurrences
                    if num_duplicates >= 3:
                        print(f"  - Gene '{dup_index}' has {num_duplicates} duplicates.")

                    experiment_columns = duplicate_rows.drop(columns=['NAME', 'GWEIGHT'], errors='ignore')

                    # The rows are identical exactly when every column holds a single
                    # distinct value. dropna=False counts NaN as a value, so a column
                    # that is missing in all rows still counts as matching. Summing
                    # the per-column counts is not equivalent: an all-NaN column
                    # (count 0) could offset a differing column (count 2).
                    rows_match = bool((experiment_columns.nunique(dropna=False) == 1).all())

                    if rows_match:
                        print(f"    - Duplicate index '{dup_index}' has matching rows.")
                    else:
                        print(f"    - Duplicate index '{dup_index}' has non-matching rows.")

                print('\n')

        except Exception as e:
            print(f"Error processing file '{file_name}': {e}")


##### Removing direct duplicates

In [24]:
import os
import pandas as pd

# Remove exact duplicate rows from every .pcl file: a row is dropped only when an earlier
# row has the same label AND the same experiment values. Files are rewritten in place.

# Paths
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/normalized_pcls'

# Iterate over each .pcl file in the directory
for file_name in os.listdir(pcl_directory):
    if file_name.endswith(".pcl"):
        pcl_file_path = os.path.join(pcl_directory, file_name)
        try:
            # Load the PCL file
            pcl_df = pd.read_csv(pcl_file_path, sep='\t', index_col=0)

            # Experiment values only (NAME and GWEIGHT do not take part in the comparison)
            experimental_columns = pcl_df.drop(columns=['NAME', 'GWEIGHT'], errors='ignore')

            # Put the row label next to the values so that both are compared together.
            # Deduplicating on the label alone would also throw away rows that share a
            # label but hold different measurements.
            row_keys = experimental_columns.copy()
            row_keys.insert(0, '__row_label__', pcl_df.index.to_numpy())

            # True for every repeat of an earlier (label, values) combination;
            # converted to a plain array so the mask is applied by position.
            is_repeat = row_keys.duplicated(keep='first').to_numpy()
            deduplicated_df = pcl_df.loc[~is_repeat]

            # Save the deduplicated DataFrame back to the same file
            deduplicated_df.to_csv(pcl_file_path, sep='\t')
            print(f"Duplicates removed for file: {file_name}")

        except Exception as e:
            print(f"Error processing file '{file_name}': {e}")

Duplicates removed for file: GSE8897_setA_family.pcl
Duplicates removed for file: GSE12222_set5_family.pcl
Duplicates removed for file: GSE5283_setA_family.pcl
Duplicates removed for file: GSE3815_set0_family.pcl
Duplicates removed for file: GSE6302_set04_family.pcl
Duplicates removed for file: GSE10269_set0_family.pcl
Duplicates removed for file: 2010.MMSresponse.pcl
Duplicates removed for file: GSE24802_final.pcl
Duplicates removed for file: GSE3182_setA_family.pcl
Duplicates removed for file: GSE9482_setA_family.pcl
Duplicates removed for file: GSE6302_set08_family.pcl
Duplicates removed for file: 2010.Causton01_acid.filter.flt.knn.avg.div.log.pcl
Duplicates removed for file: GSE8825_set5_family.pcl
Duplicates removed for file: 2010.Gasch00_hyper-osmotic.flt.knn.avg.pcl
Duplicates removed for file: 2010.Saldanha04_LeucineBatchChem.flt.knn.avg.pcl
Duplicates removed for file: 2010.deNadal04.flt.knn.avg.pcl
Duplicates removed for file: GSE12222_set2_family.pcl
Duplicates removed for f

In [28]:
import os
import pandas as pd

# Report each .pcl file that still has repeated row labels, with the number of
# distinct repeated labels.
pcl_directory = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/normalized_pcls'

for file_name in os.listdir(pcl_directory):
    if not file_name.endswith(".pcl"):
        continue

    pcl_file_path = os.path.join(pcl_directory, file_name)
    try:
        pcl_df = pd.read_csv(pcl_file_path, sep='\t', index_col=0)

        # Distinct labels that appear more than once in the index
        repeated_mask = pcl_df.index.duplicated()
        duplicate_indexes = pcl_df.index[repeated_mask].unique()

        # Files with unique labels are not mentioned
        if len(duplicate_indexes):
            print(f"File: {file_name} has {len(duplicate_indexes)} duplicate indexes.")

    except Exception as e:
        print(f"Error processing file '{file_name}': {e}")