# Translate the Ensembl Gene IDs of the data object into Gene Symbols and add as key, var, or obs

## Load adata after processing by 01 to 04

In [3]:
import scanpy as sc
import matplotlib.pyplot as plt
import anndata as ad


In [23]:
import pandas as pd

In [4]:
#adata = sc.read("/storage/users/sac43cg/res_Samantha_1/outs/per_sample_outs/clustered_adata.h5ad")
#adata = sc.read("/storage/users/sac43cg/res_Samantha_1/outs/per_sample_outs/filtered_preprocessed_combined_adata.h5ad")
#adata = sc.read("/storage/users/sac43cg/res_Samantha_1/outs/per_sample_outs/hvg_adata.h5ad")
adata = sc.read("/storage/users/sac43cg/res_Samantha_1/outs/per_sample_outs/cellcycle_adata.h5ad")
#adata = sc.read("/storage/users/sac43cg/res_Samantha_1/outs/h5as_files/adata_neu.h5ad")  # Replace with your data file

In [5]:
adata

AnnData object with n_obs × n_vars = 10837 × 2000
    obs: 'condition', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'pct_cmo', 'total_counts_mito', 'log1p_total_counts_mito', 'pct_counts_mito', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'leiden', 'leiden_combined', 'louvain', 'louvain_combined', 'S_score', 'G2M_score', 'phase'
    var: 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'CMO', 'mito', 'ribo', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'is_s_gene', 'is_g2m_gene', 'mean', 'std'
    uns: 'condition_colors', 'hvg', 'leiden', 'leiden_combined_colors', 'log1p', 'louvain', 'louvain_combined_colors', 'neighbors', 'pca', 'phase_colors', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'connecti

#### Create dictionary

In [6]:
from biomart import BiomartServer

# Connect to the BioMart server
server = BiomartServer("http://ensembl.org/biomart")

# Select the human genes dataset
dataset = server.datasets['hsapiens_gene_ensembl']

# Query BioMart to get the translation from Ensembl ID to gene symbol
response = dataset.search({
    'attributes': ['ensembl_gene_id', 'external_gene_name'],
})

# Initialize an empty dictionary to store the mapping
ensembl_to_gene_name = {}

# Iterate through the response lines
for line in response.iter_lines():
    # Decode the line and split it into Ensembl ID and gene name
    decoded_line = line.decode('utf-8')
    ensembl_id, gene_name = decoded_line.split('\t')
    
    # Store the mapping in the dictionary
    ensembl_to_gene_name[ensembl_id] = gene_name

# Now, ensembl_to_gene_name is a dictionary with Ensembl IDs as keys and gene names as values.


## Name Var_names or var->gene.symbol

In [7]:
adata.var['gene_symbol'] = [ensembl_to_gene_name.get(gene_id, 'NA') for gene_id in adata.var_names]

In [8]:
adata.var_names = [ensembl_to_gene_name.get(gene_id, gene_id) for gene_id in adata.var_names]

In [9]:
adata

AnnData object with n_obs × n_vars = 10837 × 2000
    obs: 'condition', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'pct_cmo', 'total_counts_mito', 'log1p_total_counts_mito', 'pct_counts_mito', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'leiden', 'leiden_combined', 'louvain', 'louvain_combined', 'S_score', 'G2M_score', 'phase'
    var: 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'CMO', 'mito', 'ribo', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'is_s_gene', 'is_g2m_gene', 'mean', 'std', 'gene_symbol'
    uns: 'condition_colors', 'hvg', 'leiden', 'leiden_combined_colors', 'log1p', 'louvain', 'louvain_combined_colors', 'neighbors', 'pca', 'phase_colors', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    

In [10]:
# optional
adata.var['gene_symbol']

HES4              HES4
ISG15            ISG15
C1QTNF12      C1QTNF12
TNFRSF9        TNFRSF9
LINC01647    LINC01647
               ...    
MT-ND4L        MT-ND4L
MT-ND4          MT-ND4
MT-ND5          MT-ND5
MT-ND6          MT-ND6
MT-CYB          MT-CYB
Name: gene_symbol, Length: 2000, dtype: object

In [11]:
adata.var_names

Index(['HES4', 'ISG15', 'C1QTNF12', 'TNFRSF9', 'LINC01647', 'KAZN',
       'SLC25A34-AS1', 'TMEM82', 'MFAP2', 'PADI2',
       ...
       'MT-CO1', 'MT-CO2', 'MT-ATP8', 'MT-ATP6', 'MT-CO3', 'MT-ND4L', 'MT-ND4',
       'MT-ND5', 'MT-ND6', 'MT-CYB'],
      dtype='object', length=2000)

## Names Sample IDs as obs

In [12]:
adata.obs_names

Index(['AAACCCAGTAGGCTCC-1', 'AAACCCAGTCGCAACC-1', 'AAACCCATCCACAAGT-1',
       'AAACGCTTCACTACGA-1', 'AAACGCTTCAGCCTCT-1', 'AAAGAACAGATACAGT-1',
       'AAAGAACAGCCTCGTG-1', 'AAAGAACAGTAATCCC-1', 'AAAGAACCAACATCGT-1',
       'AAAGAACGTCGCGGTT-1',
       ...
       'TTGTGGATCTAGCAAC-1', 'TTGTTCACAGGTCAAG-1', 'TTGTTTGAGCTCATAC-1',
       'TTTACGTTCAAGGCTT-1', 'TTTACTGCAGCTACAT-1', 'TTTACTGTCATTACTC-1',
       'TTTCACAGTGATATAG-1', 'TTTCACATCGATGCAT-1', 'TTTCATGTCCCTCTAG-1',
       'TTTGACTGTTCCGGTG-1'],
      dtype='object', name='barcode', length=10837)

### Build a new Index by combining Barcode and Conditions

In [13]:
#df = pd.DataFrame(adata.X, index=adata.obs.index, columns=adata.var.index)

In [14]:
#df.index = df.index + '_' + adata.obs['conditions'].values


In [15]:
def short_hash(input_string, length=8):
    """Get a shorter hash representation of the input string."""
    return str(hash(input_string))[:length]

In [16]:
adata.obs['index'] = [short_hash(name) for name in adata.obs_names]

In [17]:
adata.obs['IDs'] = [f"{condition}_{name}" for condition, name in zip(adata.obs['condition'], adata.obs['index'] )]

In [18]:
#optional
adata.obs_names = adata.obs['IDs']

In [19]:
adata.obs['IDs']

IDs
CTRL_1_69505533              CTRL_1_69505533
CTRL_1_57292812              CTRL_1_57292812
CTRL_1_-7137068              CTRL_1_-7137068
CTRL_1_10527883              CTRL_1_10527883
CTRL_1_51088425              CTRL_1_51088425
                                ...         
TGFb1_GEM_2_24816592    TGFb1_GEM_2_24816592
TGFb1_GEM_2_16615797    TGFb1_GEM_2_16615797
TGFb1_GEM_2_88051030    TGFb1_GEM_2_88051030
TGFb1_GEM_2_-2734503    TGFb1_GEM_2_-2734503
TGFb1_GEM_2_-1866562    TGFb1_GEM_2_-1866562
Name: IDs, Length: 10837, dtype: object

In [20]:
adata.obs_names

Index(['CTRL_1_69505533', 'CTRL_1_57292812', 'CTRL_1_-7137068',
       'CTRL_1_10527883', 'CTRL_1_51088425', 'CTRL_1_21946080',
       'CTRL_1_-3450445', 'CTRL_1_58921981', 'CTRL_1_-7803235',
       'CTRL_1_86862279',
       ...
       'TGFb1_GEM_2_20650838', 'TGFb1_GEM_2_-2818654', 'TGFb1_GEM_2_13723599',
       'TGFb1_GEM_2_23092216', 'TGFb1_GEM_2_27754323', 'TGFb1_GEM_2_24816592',
       'TGFb1_GEM_2_16615797', 'TGFb1_GEM_2_88051030', 'TGFb1_GEM_2_-2734503',
       'TGFb1_GEM_2_-1866562'],
      dtype='object', name='IDs', length=10837)

In [21]:
adata.var_names

Index(['HES4', 'ISG15', 'C1QTNF12', 'TNFRSF9', 'LINC01647', 'KAZN',
       'SLC25A34-AS1', 'TMEM82', 'MFAP2', 'PADI2',
       ...
       'MT-CO1', 'MT-CO2', 'MT-ATP8', 'MT-ATP6', 'MT-CO3', 'MT-ND4L', 'MT-ND4',
       'MT-ND5', 'MT-ND6', 'MT-CYB'],
      dtype='object', length=2000)

## Export

In [22]:
#adata.write('/storage/users/sac43cg/res_Samantha_1/outs/per_sample_outs/annotated_adata.h5ad')
adata.write('/storage/users/sac43cg/res_Samantha_1/outs/per_sample_outs/annotated_cellcycle_adata.h5ad')

# Snippets

## If you have a dictionary of Ensembl you can extratc the counts matrix and annotate 
## the columns and rows with a Sample ID and Gene Symbols

In [None]:
# 1. Extracting data
df = pd.DataFrame(adata.X, index=adata.obs.index, columns=adata.var.index)

# 2. Combining barcodes and conditions
df.index = df.index + '_' + adata.obs['conditions'].values

# 3. Translate Ensembl IDs to Gene Symbols
# Assuming ensembl_to_symbol is your dictionary of Ensembl ID -> Gene Symbol
df.columns = df.columns.map(lambda x: ensembl_to_symbol.get(x, x))

In [None]:
counts = pd.DataFrame(adata.X, index=adata.obs_names, columns=adata.var_names)
symbol_adata=sc.AnnData(counts)