In [1]:
import pandas as pd 
import re

In [2]:
# Map the TSV header names onto underscore-separated column names.
renamed_columns = {
    "Approved symbol": "Approved_symbol",
    "Chromosome": "Chromosome",
    "NCBI Gene ID": "NCBI_Gene_ID",
}
alternate_locus_label = "12 alternate reference locus"

chromosome_df = (
    pd.read_csv("input/gene_chromosome_location.tsv", sep="\t")
    .rename(columns=renamed_columns)
    # Discard rows that have no location or no gene ID.
    .dropna(subset=["Chromosome", "NCBI_Gene_ID"])
    # Discard rows whose location is the literal placeholder 'reserved'.
    .query("Chromosome!='reserved'")
    .reset_index(drop=True)
    # Relabel the alternate-locus entry as plain "12"; every other value is
    # kept unchanged.
    # NOTE(review): a bare "12" has no p/q arm, so the location regex in
    # extract_chromosome_info will not match it and the later dropna on
    # Chromosome_number removes these rows -- confirm that is intended.
    .assign(
        Chromosome=lambda frame: frame["Chromosome"].where(
            frame["Chromosome"] != alternate_locus_label, "12"
        )
    )
    # Missing IDs were dropped above, so the column can be cast to int.
    .astype({"NCBI_Gene_ID": int})
)

In [3]:
def extract_chromosome_info(location):
    """Split a cytogenetic location string into chromosome, arm and region.

    The string must start with a chromosome name (one or more digits, ``X``
    or ``Y``), immediately followed by an arm letter (``p`` or ``q``) and a
    region made of digits and dots, e.g. ``"19q13.43"``. Only this leading
    part is parsed; any text after it is ignored.

    Parameters
    ----------
    location : str
        Location string to parse. Non-string values (``None``, float ``NaN``
        from a missing cell, ...) are treated as unparseable.

    Returns
    -------
    dict or None
        ``{"chromosome": ..., "arm": ..., "region": ...}`` with string
        values, or ``None`` when ``location`` does not start with the
        expected pattern or is not a string.
    """
    # re.match raises TypeError on non-string input; report "no match"
    # instead so the function is safe to apply to columns with missing values.
    if not isinstance(location, str):
        return None

    match = re.match(r"([XY]|\d+)([pq])([\d.]+)", location)
    if match is None:
        return None

    chromosome, arm, region = match.groups()
    return {
        "chromosome": chromosome,
        "arm": arm,
        "region": region,
    }

In [4]:
# Parse every location once; entries the parser rejects come back as None.
parsed_info = chromosome_df["Chromosome"].apply(extract_chromosome_info)

# Output column -> key of the parsed dict that supplies its value.
for target_column, info_key in (
    ("Chromosome_number", "chromosome"),
    ("Arm_name", "arm"),
    ("Region_name", "region"),
):
    chromosome_df[target_column] = parsed_info.apply(
        lambda info, key=info_key: info[key] if info else None
    )

# Chromosome name and arm letter joined, e.g. "19" + "q" -> "19q".
chromosome_df["Chromosome_arm"] = parsed_info.apply(
    lambda info: None if not info else info["chromosome"] + info["arm"]
)

# Rows whose location could not be parsed have no chromosome number; drop them.
chromosome_df.dropna(subset=["Chromosome_number"], inplace=True)

chromosome_df.head()

Unnamed: 0,Approved_symbol,Chromosome,NCBI_Gene_ID,Chromosome_number,Arm_name,Region_name,Chromosome_arm
0,A1BG,19q13.43,1,19,q,13.43,19q
1,A1BG-AS1,19q13.43,503538,19,q,13.43,19q
2,A1CF,10q11.23,29974,10,q,11.23,10q
3,A2M,12p13.31,2,12,p,13.31,12p
4,A2M-AS1,12p13.31,144571,12,p,13.31,12p


In [5]:
# Write the processed table as a tab-separated file; the row index is omitted.
chromosome_df.to_csv(
    path_or_buf="output/gene_chromosome_location_processed.tsv",
    sep="\t",
    index=False,
)