In [5]:
import pandas as pd
import numpy as np
import os
import tkinter as tk
from tkinter import filedialog

In [2]:
def filter_gene_list(excel_path):
    """Load the genes of interest from the first column of an Excel sheet.

    The sheet is read with ``pd.read_excel`` defaults, so its first row is
    taken as the column header and is not counted as a gene.  Blank cells
    are discarded and surrounding whitespace is stripped from text entries
    before duplicates are counted, so that " CD34 " and "CD34" are the same
    gene and empty rows never end up in the returned list.

    A summary of how many genes were listed / are unique, followed by the
    symbols that were listed more than once, is printed.

    Parameters
    ----------
    excel_path : str or path-like
        Location of the gene-list workbook.

    Returns
    -------
    pandas.DataFrame
        One column, ``"HGNC.symbol"``, holding each listed gene once
        (first occurrence kept, original row labels preserved).
    """
    gene_table = pd.read_excel(excel_path)

    # Only the first column is used, whatever its header is called.
    first_column = gene_table.iloc[:, 0]

    # Blank cells arrive as NaN; left in, they would be returned as a "gene".
    symbols = first_column.dropna()
    # Strip whitespace on text entries only; non-text cells are left untouched.
    symbols = symbols.map(lambda value: value.strip() if isinstance(value, str) else value)
    symbols = symbols[symbols != ""]

    ignored = len(first_column) - len(symbols)
    if ignored:
        print("Ignored {} blank entries in the gene list.".format(ignored))

    genes_of_interest = symbols.to_frame(name="HGNC.symbol")
    unique_genes_of_interest = genes_of_interest.drop_duplicates(subset=["HGNC.symbol"])
    print("You have listed {} genes of interest, of which {} are unique.".format(len(genes_of_interest), genes_of_interest["HGNC.symbol"].nunique()))

    # Symbols that appear more than once, with their number of occurrences.
    dup_gene_names = genes_of_interest["HGNC.symbol"].value_counts().loc[lambda s: s > 1]
    print(dup_gene_names)
    return unique_genes_of_interest

def process_folder_of_exels(folder_of_excels, unique_genes_of_interest):
    """Filter every ``.xlsx`` workbook in a folder down to the genes of interest.

    For each workbook two CSV files are written next to it:

    * ``duplicates_in_<name>.csv`` - every row whose ``HGNC.symbol`` occurs
      more than once in the workbook, sorted by symbol.
    * ``filtered_<name>.csv`` - the rows whose symbol is a gene of interest
      (``gene_present`` True, ``is_repeated_gene`` flags symbols with several
      rows), followed by one row per gene of interest that is absent from the
      workbook (``gene_present`` False, all other columns empty).

    Workbooks are processed in sorted name order.  Excel lock files
    (``~$...``), which exist while a workbook is open in Excel, are skipped.
    Progress and summary counts are printed for each workbook.

    Parameters
    ----------
    folder_of_excels : str or path-like
        Folder that holds the workbooks; the CSVs are written here too.
    unique_genes_of_interest : pandas.DataFrame
        Frame with an ``"HGNC.symbol"`` column, as returned by
        ``filter_gene_list``.

    Raises
    ------
    KeyError
        If a workbook has no ``"HGNC.symbol"`` column.
    """
    symbol_col = "HGNC.symbol"

    # A blank (NaN) gene of interest could otherwise match every row without
    # a symbol and would also be reported as a "missing" gene.
    wanted = unique_genes_of_interest[symbol_col].dropna()

    excel_files = sorted(
        name for name in os.listdir(folder_of_excels)
        if name.lower().endswith(".xlsx") and not name.startswith("~$")
    )

    for excel_file in excel_files:
        print(f"Processing file: {excel_file}")
        # Swap only the extension; str.replace would touch every ".xlsx" in the name.
        csv_name = os.path.splitext(excel_file)[0] + ".csv"

        df = pd.read_excel(os.path.join(folder_of_excels, excel_file))
        print(df.shape)
        if symbol_col not in df.columns:
            raise KeyError(f"'{symbol_col}' column not found in {excel_file}")
        print("You have {} rows in the sample, of which {} HGNC symbols are unique.".format(len(df), df[symbol_col].nunique()))

        # NOTE: pandas' duplicated() treats missing symbols as equal to each
        # other, so rows without a symbol are listed here as well.
        dupes = df[df[symbol_col].duplicated(keep=False)].sort_values(symbol_col)
        dupes.to_csv(os.path.join(folder_of_excels, f"duplicates_in_{csv_name}"), index=False)

        filtered = df[df[symbol_col].isin(wanted)].copy()
        filtered["is_repeated_gene"] = filtered[symbol_col].duplicated(keep=False)

        missing_genes = wanted[~wanted.isin(df[symbol_col])]
        print("Of your genes of interest, {} are not present in the sample.".format(missing_genes.shape[0]))
        missing_df = pd.DataFrame({symbol_col: missing_genes, "is_repeated_gene": False, "gene_present": False})

        filtered["gene_present"] = True
        final_df = pd.concat([filtered, missing_df], ignore_index=True, sort=False)
        final_df.to_csv(os.path.join(folder_of_excels, f"filtered_{csv_name}"), index=False)
        print("\n")


In [None]:
# Ask the user for the input folder and the gene list through native file
# dialogs, then run the filtering on every workbook in the folder.
# The Tk root window is hidden and flagged "-topmost" so that the dialogs
# open above other windows.
root = tk.Tk()
try:
    root.withdraw()
    root.attributes("-topmost", True)
    root.update()

    folder_of_excels = filedialog.askdirectory(parent=root, title="Select folder containing DE Excel files")
    # Do not ask for the gene list when the folder dialog was cancelled.
    excel_path = (
        filedialog.askopenfilename(parent=root, title="Select Gene List Excel file", filetypes=[("Excel files", "*.xlsx *.xls")])
        if folder_of_excels
        else ""
    )
finally:
    # Always release the Tk window, even if a dialog raised.
    root.destroy()

print("Folder:", folder_of_excels)
print("Gene list:", excel_path)

# A cancelled dialog yields an empty value; running the pipeline on it would
# only produce a confusing file-not-found error.
if not folder_of_excels or not excel_path:
    print("Selection cancelled - nothing was processed.")
else:
    unique_genes_of_interest = filter_gene_list(excel_path)
    process_folder_of_exels(folder_of_excels, unique_genes_of_interest)

Folder: Z:/Marina/Limfoma project/Bulk mRNA seq/DE excels
Gene list: Z:/Marina/Limfoma project/Bulk mRNA seq/Gene list.xlsx
You have listed 65 genes of interest, of which 59 are unique.
HGNC.symbol
ACKR1    2
CD34     2
CA4      2
GJA5     2
PLVAP    2
RGCC     2
Name: count, dtype: int64
Processing file: DE_HUVECLF_vs_HUVECTF_all.xlsx
(22087, 8)
You have 22087 rows in the sample, of which 18818 HGNC symbols are unique.
Of your genes of interest, 4 are not present in the sample.


Processing file: DE_HUVECLF_vs_tonsil_all.xlsx
(22087, 8)
You have 22087 rows in the sample, of which 18818 HGNC symbols are unique.
Of your genes of interest, 4 are not present in the sample.


Processing file: DE_HUVECTF_vs_tonsil_all.xlsx
(22087, 8)
You have 22087 rows in the sample, of which 18818 HGNC symbols are unique.
Of your genes of interest, 4 are not present in the sample.


