In [12]:
import numpy as np
import pandas as pd
from glob import glob
import re
import sys
import os
from typing import List, Optional, Dict


In [7]:
# Global configuration.
# Directory handed to get_files(), which scans it for the per-chromosome
# ``*.xlsx`` annotation workbooks. The path is relative, so it is resolved
# against whatever working directory the notebook kernel is running in.
DAVID_ANNOTATION_DIR: str = "../Davids_mega_annotation"

In [8]:
def get_files(file_directory: str) -> List[str]:
    """Collect the per-chromosome annotation workbooks found in a directory.

    A file is selected when it sits directly inside ``file_directory``, has an
    ``.xlsx`` extension, is not a dotfile, and its name contains ``Chr``
    followed by two digits (for example ``Chr06``).

    The directory is listed in place; the process working directory is never
    changed, so a failure here cannot leave the notebook kernel sitting in a
    different directory.

    Parameters
    ----------
    file_directory : str
        path of the directory that holds the MEGA array annotation ``.xlsx``
        files

    Returns
    -------
    List[str]
        ``file_directory`` joined with each matching file name, sorted by file
        name so the result is the same on every run

    Raises
    ------
    OSError
        if ``file_directory`` does not exist or is not a directory
        (``FileNotFoundError`` / ``NotADirectoryError``)
    """
    # the file name must carry the ChrXX tag, where XX are two digits
    chromosome_tag = re.compile(r'Chr\d\d')

    annotation_file_list: List[str] = []

    # sorted() gives a stable order; the raw directory order is arbitrary
    for file_name in sorted(os.listdir(file_directory)):

        # same selection a "*.xlsx" glob makes: dotfiles are skipped and the
        # extension is compared using the platform's case rules
        if file_name.startswith("."):
            continue

        if not os.path.normcase(file_name).endswith(".xlsx"):
            continue

        # only the file name is inspected, never the directory part of the path
        if chromosome_tag.search(file_name):

            annotation_file_list.append(os.path.join(file_directory, file_name))

    return annotation_file_list


In [9]:
def get_gene_list(gene_file: str) -> List[str]:
    """Read the genes of interest from a tab-separated text file.

    Parameters
    ----------
    gene_file : str
        path to a tab-delimited file whose header row names a ``gene`` column

    Returns
    -------
    List[str]
        the values of the ``gene`` column, in file order
    """
    # parse the tab-delimited table and pull out its ``gene`` column
    genes_of_interest: pd.Series = pd.read_csv(gene_file, sep="\t").gene

    return genes_of_interest.values.tolist()


In [128]:
def find_variant_snps(file_list: List[str], gene_list: List[str]) -> pd.DataFrame:
    """Find the missense/nonsense MEGA probes that fall in the genes of interest.

    Each annotation workbook is read, its rows are kept when ``Gene(s)`` is one
    of ``gene_list`` and ``Mutation(s)`` contains ``Missense`` or ``Nonsense``,
    and the surviving rows are appended to ``mega_probes.txt`` (tab separated)
    in the current working directory.

    Workbooks whose path contains ``Chr06`` are read from the ``cleaned`` sheet
    and have their columns renamed (``IlmnID`` -> ``name``, ``RS Name`` ->
    ``RsID``, ``SNP`` -> ``Alleles``, plus an empty ``Transcript`` column)
    before being written.

    Parameters
    ----------
    file_list : List[str]
        paths of the per-chromosome MEGA annotation ``.xlsx`` files

    gene_list : List[str]
        names of the genes of interest

    Returns
    -------
    pd.DataFrame
        every row that was appended to ``mega_probes.txt`` during this call, in
        the order written; an empty dataframe when nothing matched

    Raises
    ------
    SystemExit
        with exit status 1 if a workbook has no ``Gene(s)`` column
    """
    # NOTE(review): the file is opened in append mode and every chunk carries
    # its own header row, so re-running adds to the existing file and header
    # lines are repeated inside it. Left as-is to keep the output format.
    output_path: str = "mega_probes.txt"

    matched_frames: List[pd.DataFrame] = []

    for file in file_list:

        # decided once per file; the Chr06 workbook has its own sheet and columns
        is_chr06: bool = re.match(r".*Chr06.*", file) is not None

        if is_chr06:

            file_df: pd.DataFrame = pd.read_excel(file, sheet_name="cleaned")

        else:

            file_df = pd.read_excel(file)

        if "Gene(s)" not in file_df.columns:

            print("expected a column called Gene(s) to be in the file")
            sys.exit(1)

        # keep only the probes annotated with one of the genes of interest
        filtered_df: pd.DataFrame = file_df.loc[file_df["Gene(s)"].isin(gene_list)]

        if filtered_df.empty:
            continue

        # keep only the coding changes; rows without a mutation annotation drop out
        filtered_df = filtered_df[filtered_df["Mutation(s)"].str.contains(r'Missense|Nonsense', na=False)]

        # nothing survived the mutation filter: do not append a header-only chunk
        if filtered_df.empty:
            continue

        if is_chr06:

            # Build the renamed table from the FILTERED rows. Using the whole
            # workbook here would write every Chr06 probe, ignoring both filters.
            file_info_dict: Dict[str, Optional[list]] = {
                "name": filtered_df["IlmnID"].values.tolist(),
                "RsID": filtered_df["RS Name"].values.tolist(),
                "Chr": filtered_df["Chr"].values.tolist(),
                "MapInfo": filtered_df["MapInfo"].values.tolist(),
                "Alleles": filtered_df["SNP"].values.tolist(),
                "Transcript": None,
                "Gene(s)": filtered_df["Gene(s)"].values.tolist(),
                "In-exon": filtered_df["In-exon"].values.tolist(),
                "Mutation(s)": filtered_df["Mutation(s)"].values.tolist()
                }

            filtered_df = pd.DataFrame.from_dict(file_info_dict)

        filtered_df.to_csv(output_path, sep="\t", mode="a+", index=False)

        matched_frames.append(filtered_df)

    if not matched_frames:

        return pd.DataFrame()

    return pd.concat(matched_frames, ignore_index=True)
                


In [129]:
# Collect the per-chromosome annotation workbooks from the configured directory.
file_list: List[str] = get_files(DAVID_ANNOTATION_DIR)

# Load the genes of interest; the path is relative to the notebook's working directory.
gene_list: List[str] = get_gene_list("./retinopathy_gene_targets.txt")

# Write the matching probes to mega_probes.txt. The file is opened in append
# mode, so re-running this cell adds to whatever an earlier run already wrote.
find_variant_snps(file_list, gene_list)
