In [1]:
import src
import pandas as pd

In [2]:
# Location of the family listing text file (under src.external_data_path);
# it is opened and parsed line by line by the loading cell further down.
uniprot_families_path = src.external_data_path / "uniprot_families.txt"

def process_gene(gene_string):
    """Parse one ``ENTRY_SPECIES (ACCESSION)`` token from the family listing.

    Parameters
    ----------
    gene_string : str
        Two whitespace-separated fields, e.g. ``"11S2_SESIN  (Q9XHP0)"``.
        Leading and trailing whitespace is ignored.

    Returns
    -------
    tuple of (str, str, str)
        ``(gene_with_species, species, uniprot_id)`` where
        ``gene_with_species`` is the first field, ``species`` is the segment
        of that field following its first underscore, and ``uniprot_id`` is
        the second field with its first and last character removed (the
        enclosing parentheses in the example above).

    Raises
    ------
    IndexError
        If the token has fewer than two whitespace-separated fields, or the
        first field contains no underscore.
    """
    # str.split() without a separator already discards surrounding
    # whitespace, so a separate strip() is unnecessary.
    fields = gene_string.split()
    gene_with_species, accession = fields[0], fields[1]
    species = gene_with_species.split("_")[1]
    return gene_with_species, species, accession[1:-1]

In [3]:
# Passed as `total` to the progress bar only; it does not affect parsing.
# Presumably the line count of uniprot_families.txt -- TODO confirm and update
# whenever the input file is refreshed.
UNIPROT_FAMILIES_LINE_COUNT = 197289
# Lines listing genes start with at least this much indentation.
GENE_LINE_INDENT = " " * 5

# One dict per (gene, family) pair; turned into a DataFrame once at the end.
family_records = []
with open(uniprot_families_path, 'r') as families_file:
    current_family = None
    for line in src.progress(families_file, total=UNIPROT_FAMILIES_LINE_COUNT):
        line = line.rstrip()
        # A line ending in "family" (this also matches "...subfamily") starts
        # a new family; every following gene line belongs to it.
        if line.endswith("family"):
            current_family = line
        # Indented lines seen before the first family header are skipped.
        if line.startswith(GENE_LINE_INDENT) and (current_family is not None):
            for gene_string in line.split(","):
                gene_string = gene_string.strip()
                if not gene_string:
                    # Empty piece, e.g. produced by a trailing comma.
                    continue
                gene_with_species, species, uniprot_id = process_gene(gene_string)
                family_records.append({
                    'uniprot_gene_id': gene_with_species,
                    'organism': species,
                    'uniprot_id': uniprot_id,
                    'family': current_family
                })
dataset = pd.DataFrame(family_records)
# NOTE: the same path is written again after gene names are merged in.
dataset.to_csv(src.processed_data_path / "uniprot_families_allOrganisms.tsv", sep="\t", index=False, header=True)
dataset.head()

HBox(children=(IntProgress(value=0, max=197289), HTML(value='')))




Unnamed: 0,family,organism,uniprot_gene_id,uniprot_id
0,11S seed storage protein (globulins) family,SESIN,11S2_SESIN,Q9XHP0
1,11S seed storage protein (globulins) family,HELAN,11S3_HELAN,P19084
2,11S seed storage protein (globulins) family,CUCMA,11SB_CUCMA,P13744
3,11S seed storage protein (globulins) family,FAGES,13S1_FAGES,O23878
4,11S seed storage protein (globulins) family,FAGES,13S2_FAGES,O23880


In [4]:
# The mapping file is read as a header-less, tab-separated table; the column
# names are supplied here.
genename_columns = ['uniprot_gene_id', 'uniprot_gene_name']
geneid_to_genename = pd.read_csv(
    src.processed_data_path / "uniprot_geneid_to_genename.txt",
    sep="\t",
    header=None,
    names=genename_columns,
)
geneid_to_genename.head()

Unnamed: 0,uniprot_gene_id,uniprot_gene_name
0,006L_IIV6,IIV6-006L
1,019R_FRG3G,FV3-019R
2,044L_IIV3,IIV3-044L
3,052L_FRG3G,FV3-052L
4,080R_IIV3,IIV3-080R


In [5]:
# Attach the gene name to every family record. The inner join keeps only the
# entries that have a row in geneid_to_genename.
#
# `dataset` is overwritten with the merged result, so executing this cell a
# second time used to merge the names again and yield
# 'uniprot_gene_name_x' / 'uniprot_gene_name_y' columns. Dropping a
# 'uniprot_gene_name' column left by a previous execution avoids that; on a
# first run the column is absent and errors='ignore' makes the drop a no-op.
# (Re-running is a true no-op only if uniprot_gene_id is unique in the mapping.)
dataset = (
    dataset
    .drop(columns=['uniprot_gene_name'], errors='ignore')
    .merge(geneid_to_genename, on='uniprot_gene_id', how='inner')
)
# Overwrites the TSV of the same name written when the families were parsed.
dataset.to_csv(src.processed_data_path / "uniprot_families_allOrganisms.tsv", sep="\t", index=False, header=True)
dataset.head()

Unnamed: 0,family,organism,uniprot_gene_id,uniprot_id,uniprot_gene_name
0,11S seed storage protein (globulins) family,HELAN,11S3_HELAN,P19084,HAG3
1,11S seed storage protein (globulins) family,FAGES,13S1_FAGES,O23878,FA02
2,11S seed storage protein (globulins) family,FAGES,13S2_FAGES,O23880,FA18
3,11S seed storage protein (globulins) family,FAGES,13S3_FAGES,Q9XFM4,FAGAG1
4,11S seed storage protein (globulins) family,LUPAN,CONA1_LUPAN,F5B8V6,CONALPHA


In [6]:
# Restrict the family table to entries whose organism code is 'HUMAN'.
is_human = dataset['organism'] == 'HUMAN'
human_dataset = dataset.loc[is_human].copy()
print("# human genes in uniprot", len(human_dataset))

# Export the accessions alone: one per line, no header, no index.
human_dataset.loc[:, ['uniprot_id']].to_csv(
    src.processed_data_path / "human_uniprot_ids.txt",
    sep="\t",
    index=False,
    header=False,
)

# NOTE(review): human_uniprotid_to_ensg.txt is read here but not produced by
# any visible cell -- presumably generated outside the notebook from the id
# list written above; confirm before a clean re-run.
# The file's first line is skipped and the column names are supplied here.
human_uniprot_to_ensg = pd.read_csv(
    src.processed_data_path / "human_uniprotid_to_ensg.txt",
    sep="\t",
    skiprows=1,
    header=None,
    names=['uniprot_id', 'ensembl_gene_id'],
)

# Left join: every human entry is kept; an accession with several rows in the
# mapping yields one output row per mapped Ensembl id.
human_dataset = human_dataset.merge(human_uniprot_to_ensg, on='uniprot_id', how='left')
human_dataset.head()

# human genes in uniprot 14493


Unnamed: 0,family,organism,uniprot_gene_id,uniprot_id,uniprot_gene_name,ensembl_gene_id
0,14-3-3 family,HUMAN,1433B_HUMAN,P31946,YWHAB,ENSG00000166913
1,14-3-3 family,HUMAN,1433E_HUMAN,P62258,YWHAE,ENSG00000108953
2,14-3-3 family,HUMAN,1433E_HUMAN,P62258,YWHAE,ENSG00000274474
3,14-3-3 family,HUMAN,1433F_HUMAN,Q04917,YWHAH,ENSG00000128245
4,14-3-3 family,HUMAN,1433G_HUMAN,P61981,YWHAG,ENSG00000170027


In [7]:
# Load the gene table and rename its columns positionally.
# NOTE(review): this assumes the file has exactly these columns in this
# order -- confirm against the export that produced ensg_to_coord.txt.
ensg_to_coord = pd.read_csv(src.external_data_path / "ensg_to_coord.txt", sep="\t")
ensg_to_coord.columns = src.coords + ['strand', 'ensembl_gene_id', 'gene_type', 'gene_name']

# Strand value 1 becomes "+", anything else becomes "-"; chromosome names get
# a "chr" prefix. Rows are then ordered by src.coords.
ensg_to_coord = (
    ensg_to_coord
    .assign(
        strand=lambda frame: frame['strand'].map(lambda code: "+" if code == 1 else "-"),
        chr=lambda frame: 'chr' + frame['chr'],
    )
    .sort_values(src.coords)
    .reset_index(drop=True)
)
ensg_to_coord.to_csv(
    src.external_data_path / "ensg_to_coord.tsv",
    sep="\t", index=False, header=True,
)

# get only protein coding
is_protein_coding = ensg_to_coord['gene_type'] == 'protein_coding'
ensg_to_coord = (
    ensg_to_coord.loc[is_protein_coding]
    .sort_values(src.coords)
    .reset_index(drop=True)
)
# gene_position is the row number of each protein-coding gene in the
# src.coords ordering established just above.
ensg_to_coord['gene_position'] = ensg_to_coord.index
ensg_to_coord.to_csv(
    src.external_data_path / "ensg_to_coord_protein_coding.tsv",
    sep="\t", index=False, header=True,
)
ensg_to_coord.head()

Unnamed: 0,chr,start,end,strand,ensembl_gene_id,gene_type,gene_name,gene_position
0,chr1,69091,70008,+,ENSG00000186092,protein_coding,OR4F5,0
1,chr1,134901,139379,-,ENSG00000237683,protein_coding,AL627309.1,1
2,chr1,367640,368634,+,ENSG00000235249,protein_coding,OR4F29,2
3,chr1,621059,622053,-,ENSG00000185097,protein_coding,OR4F16,3
4,chr1,738532,739137,-,ENSG00000269831,protein_coding,AL669831.1,4


In [18]:
# Final column layout: coordinates first, then gene identifiers, then the
# family annotation.
output_columns = src.coords + [
    'strand', 'gene_name', 'ensembl_gene_id', 'gene_position', 'gene_type',
    'family', 'organism', 'uniprot_gene_id', 'uniprot_gene_name', 'uniprot_id',
]
# The inner join keeps only human entries whose ensembl_gene_id appears in
# ensg_to_coord; the result is ordered by family, then by src.coords.
human_dataset_with_coords = (
    human_dataset
    .merge(ensg_to_coord, on='ensembl_gene_id', how='inner')
    .loc[:, output_columns]
    .sort_values(['family'] + src.coords)
    .reset_index(drop=True)
)
human_dataset_with_coords.head()

Unnamed: 0,chr,start,end,strand,gene_name,ensembl_gene_id,gene_position,gene_type,family,organism,uniprot_gene_id,uniprot_gene_name,uniprot_id
0,chr2,9613787,9636672,+,IAH1,ENSG00000134330,10713,protein_coding,'GDSL' lipolytic enzyme family. IAH1 subfamily,HUMAN,IAH1_HUMAN,IAH1,Q2TAA2
1,chr2,28680012,28866654,+,PLB1,ENSG00000163803,10850,protein_coding,'GDSL' lipolytic enzyme family. Phospholipase ...,HUMAN,PLB1_HUMAN,PLB1,Q6P1J6
2,chr11,117014983,117047610,+,PAFAH1B2,ENSG00000168092,3982,protein_coding,'GDSL' lipolytic enzyme family. Platelet-activ...,HUMAN,PA1B2_HUMAN,PAFAH1B2,P68402
3,chr19,42801185,42807698,-,PAFAH1B3,ENSG00000079462,10113,protein_coding,'GDSL' lipolytic enzyme family. Platelet-activ...,HUMAN,PA1B3_HUMAN,PAFAH1B3,Q15102
4,chr1,12704566,12727097,+,AADACL4,ENSG00000204518,171,protein_coding,'GDXG' lipolytic enzyme family,HUMAN,ADCL4_HUMAN,AADACL4,Q5VUY2


In [19]:
# Within each family keep only the first record per genomic location
# (src.coords + strand). Including 'family' in the duplicate key is equivalent
# to de-duplicating each family group separately, and replaces a Python loop
# over every group with a single vectorised call.
dedup_keys = ['family'] + src.coords + ['strand']
human_dataset_with_coords = (
    human_dataset_with_coords
    # groupby('family') skipped rows with a missing family; keep doing so.
    .dropna(subset=['family'])
    .drop_duplicates(subset=dedup_keys)
    # A stable sort by family reproduces the groupby ordering: families in
    # sorted order, rows inside a family in their incoming order.
    .sort_values('family', kind='mergesort')
)
# Unlike pd.concat over an empty list of groups, this also works (and returns
# an empty frame) when there are no rows at all.
human_dataset_with_coords.head()

HBox(children=(IntProgress(value=0, max=4951), HTML(value='')))




Unnamed: 0,chr,start,end,strand,gene_name,ensembl_gene_id,gene_position,gene_type,family,organism,uniprot_gene_id,uniprot_gene_name,uniprot_id
0,chr2,9613787,9636672,+,IAH1,ENSG00000134330,10713,protein_coding,'GDSL' lipolytic enzyme family. IAH1 subfamily,HUMAN,IAH1_HUMAN,IAH1,Q2TAA2
1,chr2,28680012,28866654,+,PLB1,ENSG00000163803,10850,protein_coding,'GDSL' lipolytic enzyme family. Phospholipase ...,HUMAN,PLB1_HUMAN,PLB1,Q6P1J6
2,chr11,117014983,117047610,+,PAFAH1B2,ENSG00000168092,3982,protein_coding,'GDSL' lipolytic enzyme family. Platelet-activ...,HUMAN,PA1B2_HUMAN,PAFAH1B2,P68402
3,chr19,42801185,42807698,-,PAFAH1B3,ENSG00000079462,10113,protein_coding,'GDSL' lipolytic enzyme family. Platelet-activ...,HUMAN,PA1B3_HUMAN,PAFAH1B3,Q15102
4,chr1,12704566,12727097,+,AADACL4,ENSG00000204518,171,protein_coding,'GDXG' lipolytic enzyme family,HUMAN,ADCL4_HUMAN,AADACL4,Q5VUY2


In [20]:
# Write the human family table with coordinates as a tab-separated file:
# header row included, index column omitted.
human_dataset_with_coords.to_csv(
    src.processed_data_path / "uniprot_families_human_with_coords.tsv",
    sep="\t",
    index=False,
    header=True,
)