### Pulling genes for RNAseq overlay


In [None]:
import os

import pandas as pd
import polars as pl
import gtfparse
from biomart import BiomartServer


In [None]:
"""
Read and filter hg38 rna annotation file for just genes
"""
# Parse the GENCODE v38 annotation GTF. Nothing is filtered at this point:
# the gene-only subset is taken later from the pandas copy of this table.
# NOTE(review): absolute, user-specific path -- only resolves on a machine
# with this directory layout.
hg38_rna_anno = gtfparse.read_gtf("/Users/jkirkland/2023_chavez_rotation/anno/gencode.v38.annotation.gtf")


In [None]:
# Convert the parsed annotation to pandas, then keep only the rows whose
# "feature" column equals "gene".
hg38_rna_anno_df = hg38_rna_anno.to_pandas()
genes_hg38_anno = hg38_rna_anno_df.loc[hg38_rna_anno_df["feature"].eq("gene")]

In [None]:
"""
Read and setup normalized RNAseq data
"""
# Load the tab-separated normalized RNAseq table and give its first column,
# which pandas reads in as "Unnamed: 0", the name "gene".
medullo_rnaseq_norm = pd.read_csv(
    "/Users/jkirkland/2023_chavez_rotation/data/RNAseq/dkfz_RNAseq_v2_rsem_genes_counts.genesymbol.nodup.renamed.norm.txt",
    sep="\t",
).rename(columns={"Unnamed: 0": "gene"})

In [None]:
"""
Pull Ensembl gene IDs
"""

def get_ensembl_mappings():
    """Fetch the Ensembl gene ID / HGNC symbol mapping from BioMart.

    Queries the ``hsapiens_gene_ensembl`` dataset on the Ensembl BioMart
    server for the ``ensembl_gene_id_version`` and ``hgnc_symbol``
    attributes and parses the tab-separated response.

    Returns:
        pandas.DataFrame: one row per non-empty response line, with the
        columns ``ensembl_gene_id`` and ``gene_name``.
    """
    # The file imports the class directly (``from biomart import
    # BiomartServer``); the bare module name ``biomart`` is never bound, so
    # the class has to be referenced by its own name here.
    server = BiomartServer('http://ensembl.org/biomart')
    mart = server.datasets['hsapiens_gene_ensembl']

    # Attributes to request. Presumably the response columns come back in
    # this order, which is what the column names below rely on -- confirm.
    attributes = ["ensembl_gene_id_version", "hgnc_symbol"]
    response = mart.search({'attributes': attributes})

    # Each response line is UTF-8 encoded, tab-separated text. Blank lines
    # carry no record and are skipped.
    rows = [
        raw_line.decode('utf-8').split("\t")
        for raw_line in response.iter_lines()
        if raw_line
    ]

    return pd.DataFrame(rows, columns=["ensembl_gene_id", "gene_name"])

# Runs the BioMart query when this cell executes (needs network access) and
# keeps the resulting mapping table for the merge in the following cell.
ensmbl_ids = get_ensembl_mappings()

In [None]:
# Left-join the Ensembl mapping onto the expression table by gene symbol.
# indicator=True adds a "_merge" column recording where each row matched.
medullo_w_gene_id = pd.merge(
    medullo_rnaseq_norm,
    ensmbl_ids,
    how="left",
    left_on="gene",
    right_on="gene_name",
    indicator=True,
)
# Display the expression rows that found no partner in the mapping table.
medullo_w_gene_id.loc[medullo_w_gene_id["_merge"].eq("left_only")]

In [None]:
"""
Subsetting DataFrame for needed col_names
"""

# Columns to keep in the exported table: the gene symbol, the MB* expression
# columns (presumably one per sample -- confirm), and a selection of
# annotation fields.
col_names = [
    "gene", "MB095", "MB106", "MB170", "MB226", "MB247", "MB248", "MB260", "MB164", "MB166",
    "MB271", "MB277", "MB278", "MB288", "MB091", "MB099", "MB118", "MB174", "MB177",
    "MB199", "MB227", "MB264", "MB265", "MB269", "MB270", "MB281", "MB102", "MB104",
    "MB234", "MB239", "MB244", "MB268", "MB274", "MB275", "MB284", "MB088", "MB136",
    "MB206", "MB266", "MB287", "seqname", "source", "feature", "start", "end", "strand",
    "gene_id", "gene_type", "level", "hgnc_id", "havana_gene",
]

# Left-join the gene annotation onto the expression table by gene symbol.
# indicator=True adds a "_merge" column recording where each row matched.
merged_subset = medullo_rnaseq_norm.merge(
    genes_hg38_anno,
    how="left",
    left_on="gene",
    right_on="gene_name",
    indicator=True,
)

# Expression rows whose gene symbol has no gene record in the annotation.
unmerged = merged_subset[merged_subset["_merge"] != "both"]

# Keep only the wanted columns that are actually present after the merge.
# This assignment has to run: the CSV export cell writes `merge_filtered`.
merge_filtered = merged_subset[merged_subset.columns.intersection(col_names)]

# Last expression of the cell, so the notebook shows a preview of the merge.
merged_subset.head()


In [None]:
# Display the rows of the annotation merge whose "_merge" flag is not "both",
# i.e. the expression rows that found no matching gene record.
unmerged

In [None]:
"""
Export to CSV
"""
# Writes into the current working directory. index=False keeps the pandas
# row index out of the file. `merge_filtered` must have been assigned by an
# earlier cell before this one runs.
merge_filtered.to_csv("medullo_rnaseq_annotated.csv", index=False)