### Libraries

In [7]:
import os
import pandas as pd
from itertools import combinations

### Pre-checking

#### Example

In [None]:
# Path of a single .pcl file used as a sample to look at the raw layout
example_file = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_spell_exprconn_pcl/Abbott_2008_PMID_18676708/GSE10066_setA_family.pcl'

# Load it as tab-separated text with pandas defaults (no index_col / header
# overrides), so the content shows up exactly as pandas parses it
df_preview = pd.read_csv(example_file, sep="\t")

# Each call prints a caption followed, on the next line, by the matching view:
# the first 5 rows, the column labels, and the row labels (index)
print("First few rows of the file:", df_preview.head(), sep="\n")
print("\nColumn names of the file:", df_preview.columns.tolist(), sep="\n")
print("\nRow names (index) of the file:", df_preview.index.tolist(), sep="\n")

#### Checking .pcl files per folder

In [None]:
# Directory tree to scan
base_dir = "/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_spell_exprconn_pcl"

# Report every folder visited by os.walk (base_dir itself included) whose
# number of .pcl files is anything other than exactly one
for root, _, files in os.walk(base_dir):
    # Keep only the .pcl entries of the folder currently visited
    pcl_files = list(filter(lambda name: name.endswith(".pcl"), files))
    pcl_count = len(pcl_files)

    # Folders with exactly one .pcl file are the expected case: nothing to report
    if pcl_count == 1:
        continue

    print(f"Subfolder: {root} contains {pcl_count} .pcl files.")

#### Take an example and check for duplicate column names

In [None]:
# Folder whose .pcl files are compared against each other
target_dir = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_spell_exprconn_pcl/Friedlander_2006_PMID_16542486'

# List all .pcl files in the directory. os.listdir returns names in arbitrary
# order, so sort them to keep the pairing order (and the report) reproducible.
files = sorted(file for file in os.listdir(target_dir) if file.endswith(".pcl"))

# Map each file name to the set of its column labels
# (the first column of the file is consumed as the index)
file_columns = {}
for file in files:
    file_path = os.path.join(target_dir, file)
    try:
        df = pd.read_csv(file_path, sep="\t", index_col=0)
    except Exception as e:
        # Best-effort check: report the unreadable file and keep going
        print(f"Error reading {file_path}: {e}")
    else:
        file_columns[file] = set(df.columns)

# Print the column labels shared by every pair of files
for (file1, cols1), (file2, cols2) in combinations(file_columns.items(), 2):
    common_columns = cols1 & cols2
    print(f"Common columns between '{file1}' and '{file2}':")
    # A set of strings iterates in an order that can change from one Python
    # process to the next; sort so repeated runs print the same list
    print(sorted(common_columns, key=str))
    print()

#### Among which datasets is there a column name conflict?

In [None]:
# Parent directory: every .pcl file found anywhere below it is compared
parent_dir = '/home/logs/jtorresb/Geneformer/yeast/yeast_data/all_spell_exprconn_pcl'

# Columns to exclude from the check
excluded_columns = {'GWEIGHT', 'NAME'}

# Maps the full path of each .pcl file to its set of column labels
# (minus the excluded ones)
file_columns = {}

# Walk the tree and collect the column labels of every .pcl file
for root, dirs, files in os.walk(parent_dir):
    # os.walk yields entries in arbitrary order; sorting the sub-directory list
    # in place fixes the traversal order so repeated runs give the same report
    dirs.sort()
    for file in sorted(files):
        if not file.endswith(".pcl"):
            continue
        file_path = os.path.join(root, file)
        try:
            # The first column of the file is consumed as the index
            df = pd.read_csv(file_path, sep="\t", index_col=0)
        except Exception as e:
            # Best-effort check: report the unreadable file and keep going
            print(f"Error reading {file_path}: {e}")
        else:
            file_columns[file_path] = set(df.columns) - excluded_columns

# Check for repeated column names between all file pairs
for (file1, cols1), (file2, cols2) in combinations(file_columns.items(), 2):
    common_columns = cols1 & cols2
    if common_columns:  # Only print if there are overlaps
        print(f"'{file1}' and '{file2}':")
        # A set of strings iterates in an order that can change from one
        # Python process to the next; sort so the printed list is stable
        print(sorted(common_columns, key=str))
        print()