# Pre-process the data #

Pre-requisites to using this file
- Run the string_preprocessing.py main function and store the STRING database with known interactions as ../data_processed/STRING_full_filtered.tsv
- Manually retrieve STRING IDs to UniProtIDs and UniProtIDs to GeneIDs

In [1]:
import pandas as pd
from string_preprocessing import *
from preprocess_data import *

Preprocess the DNA, RNA, and Protein data


In [2]:
preprocess_dna_data()
preprocess_rna_data()
preprocess_protein_data()

Original shape: (140258, 76)
Transposed shape: (60, 138892)
Original shape: (26188, 66)
Transposed shape: (60, 26178)
Original shape: (3181, 69)
Transposed shape: (59, 3167)


Run the string_preprocessing.py main function before this!

Get all of the STRING IDs from the full filtered STRING and save to a file

In [3]:
known_STRING = pd.read_csv('../data_processed/STRING_full_filtered.tsv', sep='\t')
# Get all STRING IDs in both columns protein1 and protein2
known_STRING_ids = set(known_STRING['protein1']).union(set(known_STRING['protein2']))
# Save all STRING IDs in a file
with open('../data_processed/known_STRING_ids.txt', 'w') as f:
    for id in known_STRING_ids:
        f.write(id + '\n')

print('Number of known STRING IDs: {}'.format(len(known_STRING_ids)))

Number of known STRING IDs: 18727


### Manually get the STRING id to UniProt ID conversion from https://www.uniprot.org/id-mapping/ ###

Manually retrieved STRING IDs to UniProtKB -- 18,305 IDs were mapped to 18,305 results, but 422 ID were not mapped
- Mapped STRING IDs saved at "../data_processed/stringids_to_uniprotkb.tsv"
- Unmapped STRING IDs saved at "../data_processed/STRING_unmapped_stringids.txt"

In [4]:
mapped_uniprot = pd.read_csv('../data_processed/stringids_to_uniprotkb.tsv', sep='\t')
# Get all UniProtKB IDs in "Entry" column
mapped_uniprot_ids = set(mapped_uniprot['Entry'])
# Save all UniProtKB IDs in a file
with open('../data_processed/STRING_mapped_uniprot_ids.txt', 'w') as f:
    for id in mapped_uniprot_ids:
        f.write(id + '\n')

### Manually get the UniProt ID conversion to GeneID from https://www.uniprot.org/id-mapping/ ###

UniProtKB AC/ID to GeneID -- 17,816 IDs were mapped to 18,018 results, but 489 ID were not mapped
- Mapped UniProtIDs saved at "../data_processed/uniprot_ids_to_gene_id.tsv"
- Unmapped UniProtIDs saved at "../data_processed/STRING_unmapped_uniprotids.txt"

The mapped gene IDs have significant intersection with the DNA/RNA/Protein intersection: found 2638 out of 2667

Find the entrez IDs for the IDs that couldn't be mapped. Go from STRING ID -> Protein Symbol (assume same as gene name) -> Entrez ID

In [5]:
# Get all gene names to entrez IDs possible
gene_names_to_entrez_ids = {}

dna_identifier_gene_name_entrez = pd.read_csv('../data_processed/dnaexome_identifier_gene_name_entrez_id.csv')
# for each gene name in "Gene name (d)" column, if it is not in the dictionary, add it with the entrez ID in "Entrez gene id (e)" column
for index, row in dna_identifier_gene_name_entrez.iterrows():
    gene_name = row['Gene name (d)']
    entrez_id = row['Entrez gene id (e)']
    if gene_name not in gene_names_to_entrez_ids:
        gene_names_to_entrez_ids[gene_name] = entrez_id
    else:
        # Check if the entrez id is the same
        if gene_names_to_entrez_ids[gene_name] != entrez_id:
            print('Error: gene name ' + str(gene_name) + ' has two different entrez IDs: ' + str(gene_names_to_entrez_ids[gene_name]) + ' and ' + str(entrez_id))

rna_gene_name_entrez = pd.read_csv('../data_processed/rna_gene_name_entrez_id.csv')
# for each gene name in "Gene name d" column, if it is not in the dictionary, add it with the entrez ID in "Entrez gene id e" column
for index, row in rna_gene_name_entrez.iterrows():
    gene_name = row['Gene name d']
    entrez_id = row['Entrez gene id e']
    if gene_name not in gene_names_to_entrez_ids:
        gene_names_to_entrez_ids[gene_name] = entrez_id
    else:
        # Check if the entrez id is the same
        if gene_names_to_entrez_ids[gene_name] != entrez_id:
            print('Error: gene name ' + str(gene_name) + ' has two different entrez IDs: ' + str(gene_names_to_entrez_ids[gene_name]) + ' and ' + str(entrez_id))

protein_identifier_gene_name_entrez = pd.read_csv('../data_processed/protein_identifier_gene_name_entrez_id.csv')
# for each gene name in "Gene name d" column, if it is not in the dictionary, add it with the entrez ID in "Entrez gene id e" column
for index, row in protein_identifier_gene_name_entrez.iterrows():
    gene_name = row['Gene name d']
    entrez_id = row['Entrez gene id e']
    if gene_name not in gene_names_to_entrez_ids:
        gene_names_to_entrez_ids[gene_name] = entrez_id
    else:
        # Check if the entrez id is the same
        if gene_names_to_entrez_ids[gene_name] != entrez_id:
            print('Error: gene name ' + str(gene_name) + ' has two different entrez IDs: ' + str(gene_names_to_entrez_ids[gene_name]) + ' and ' + str(entrez_id))

# Print how many gene names are mapped to entrez IDs
print('Number of gene names mapped to entrez IDs: ' + str(len(gene_names_to_entrez_ids)))


Number of gene names mapped to entrez IDs: 27163


- Collect all STRING IDs that could not be mapped to either uniprot or from uniprot to gene id (STRING_unmapped_stringids.txt + STRING_unmapped_uniprot_ids.txt)
- Read in STRING IDs -> protein symbols list
- Convert unmapped STRING IDs -> protein symbols -> entrez_ids
- Keep track of any IDs that still cannot be mapped in a separate list

In [6]:
# Find unmapped STRING IDs -> entrez IDs using protein symbols
unmapped_STRING_ids = set()

with open('../data_processed/STRING_unmapped_stringids.txt', 'r') as f:
    for line in f:
        unmapped_STRING_ids.add(line.strip())
print("No uniprot IDs from string ID: " + str(len(unmapped_STRING_ids)))

unmapped_uniprot_ids = set()
with open('../data_processed/STRING_unmapped_uniprot_ids.txt', 'r') as f:
    for line in f:
        unmapped_uniprot_ids.add(line.strip())
print("No entrez IDs from uniprot: " + str(len(unmapped_uniprot_ids)))

# Get df with mapping from STRING ID to UniProtKB ID
mapped_uniprot = pd.read_csv('../data_processed/stringids_to_uniprotkb.tsv', sep='\t')
for id in unmapped_uniprot_ids:
    string_id = mapped_uniprot[mapped_uniprot['Entry'] == id]['From'].values[0]
    unmapped_STRING_ids.add(string_id)

print("All unmapped STRING IDs: " + str(len(unmapped_STRING_ids)))

stringid_to_protsym = get_protein_id_to_symbol_dict()

# For each unmapped STRING ID, find the corresponding protein symbol, and if it does not exist, add 
# list of string_ids_no_protsym
string_ids_no_protsym = []
mapped_string_ids_to_protsym = {}
for id in unmapped_STRING_ids:
    if id in stringid_to_protsym:
        mapped_string_ids_to_protsym[id] = stringid_to_protsym[id]
    else:
        string_ids_no_protsym.append(id)

print("No protein symbols: " + str(len(string_ids_no_protsym))) # 0
print("Mapped protein symbols: " + str(len(mapped_string_ids_to_protsym.keys()))) # 911


# Map protein symbols to entrez IDs
string_no_entrez = []
string_via_protsym_to_entrez = {}
protsym_derived_entrez = set()
example = 0
for stringid in mapped_string_ids_to_protsym:
    protsym = mapped_string_ids_to_protsym[stringid]
    if protsym in gene_names_to_entrez_ids:
        entrez_id = gene_names_to_entrez_ids[protsym]
        string_via_protsym_to_entrez[stringid] = entrez_id
        protsym_derived_entrez.add(entrez_id)
        if example < 5: # Verified top 5 examples manually
            print(str(stringid) + ' ' + str(protsym) + ' ' + str(entrez_id))
            example += 1
    else:
        string_no_entrez.append(stringid)

print("No entrez for protein symbols: " + str(len(string_no_entrez))) # 390
print("Mapped protein symbols to entrez IDs: " + str(len(string_via_protsym_to_entrez))) # 521

# Save file of string IDs with no entrez ID
with open('../data_processed/STRING_no_entrez.txt', 'w') as f:
    for id in string_no_entrez:
        f.write(id + '\n')


No uniprot IDs from string ID: 422
No entrez IDs from uniprot: 489
All unmapped STRING IDs: 911
No protein symbols: 0
Mapped protein symbols: 911
9606.ENSP00000375822 ZNF222 7673
9606.ENSP00000339992 MYB 4602
9606.ENSP00000345151 RFPL4AL1 729974
9606.ENSP00000369774 ZNF763 284390
9606.ENSP00000312457 BAALC 79870
No entrez for protein symbols: 390
Mapped protein symbols to entrez IDs: 521


There are several uniprot IDs that map to multiple entrez IDs, but we can just add all of these to one big list of entrez IDs, store a mapping of the entrez ID to STRING ID

In [7]:
uniprot_mapped_gene = pd.read_csv('../data_processed/uniprot_ids_to_gene_ids.tsv', sep='\t')

# Figure out which uniprot IDs are mapped to multiple gene IDs
uniprot_ids_to_gene_ids = {}
for index, row in uniprot_mapped_gene.iterrows():
    uniprot_id = row['From']
    gene_id = row['To']
    if uniprot_id not in uniprot_ids_to_gene_ids:
        uniprot_ids_to_gene_ids[uniprot_id] = [gene_id]
    else:
        uniprot_ids_to_gene_ids[uniprot_id].append(gene_id)

# Which uniprot IDs are mapped to multiple gene IDs
uniprots_with_multiple_gene_ids = []
for uniprot_id in uniprot_ids_to_gene_ids:
    if len(uniprot_ids_to_gene_ids[uniprot_id]) > 1:
        uniprots_with_multiple_gene_ids.append(uniprot_id)
print('Number of uniprot IDs mapped to multiple gene IDs: ', len(uniprots_with_multiple_gene_ids))
# This is okay because from protein expression or any other data, we don't mind connecting multiple
# entrez IDs to one STRING protein node in the STRING layer

Number of uniprot IDs mapped to multiple gene IDs:  139


Get all STRING IDs to Entrez IDs, create dataframe with columns STRING ID, Protein Symbol, Uniprot ID, Entrez ID, save to file

In [8]:
all_mapped_string_ids_to_entrez = pd.DataFrame(columns=['STRING_ID', 'Protein_Symbol', 'Uniprot_ID', 'Entrez_Gene_ID'])
stringids_no_entrez = []
for stringid in known_STRING_ids:
    protein_symbol = stringid_to_protsym[stringid]
    entrez_id = ''
    uniprot_id = '-'
    if stringid in mapped_uniprot['From'].values:
        # get the row index of where stringid is in the From column
        row_index = mapped_uniprot[mapped_uniprot['From'] == stringid].index[0]
        uniprot_id = mapped_uniprot['Entry'][row_index]
    
    if stringid in string_via_protsym_to_entrez:
        entrez_id = string_via_protsym_to_entrez[stringid]
        new_row_dict = {'STRING_ID': stringid, 'Protein_Symbol': protein_symbol, 'Uniprot_ID': uniprot_id, 'Entrez_Gene_ID': entrez_id}
        all_mapped_string_ids_to_entrez.loc[len(all_mapped_string_ids_to_entrez)] = new_row_dict
    elif uniprot_id in uniprot_ids_to_gene_ids:
        for gene_id in uniprot_ids_to_gene_ids[uniprot_id]:
            new_row_dict = {'STRING_ID': stringid, 'Protein_Symbol': protein_symbol, 'Uniprot_ID': uniprot_id, 'Entrez_Gene_ID': gene_id}
            all_mapped_string_ids_to_entrez.loc[len(all_mapped_string_ids_to_entrez)] = new_row_dict
    else:
        stringids_no_entrez.append(stringid)
    
print('Number of known STRING IDs not mapped to entrez IDs: ' + str(len(stringids_no_entrez)))
print('Number of mapped STRING IDs to entrez IDs (including multiple entrez IDs for the same STRING): ' + str(len(all_mapped_string_ids_to_entrez)))
print('Number of unique STRING IDs that are mapped to entrez IDs: ' + str(len(all_mapped_string_ids_to_entrez['STRING_ID'].unique())))
print(all_mapped_string_ids_to_entrez.head())




Number of known STRING IDs not mapped to entrez IDs: 390
Number of mapped STRING IDs to entrez IDs (including multiple entrez IDs for the same STRING): 18539
Number of unique STRING IDs that are mapped to entrez IDs: 18337
              STRING_ID Protein_Symbol Uniprot_ID  Entrez_Gene_ID
0  9606.ENSP00000306627         SLC9C1     Q4G0N8          285335
1  9606.ENSP00000367029         POLR1E     Q9GZS1           64425
2  9606.ENSP00000321971          RHOT2     Q8IXI1           89941
3  9606.ENSP00000330658          PAPPA     Q13219            5069
4  9606.ENSP00000286091          PDIA4     P13667            9601


In [9]:
# Save dataframe to file
all_mapped_string_ids_to_entrez.to_csv('../data_processed/string_ids_prot_entrez.csv', index=False)