In [20]:
import requests
import json
import csv
import threading
import time
import pandas as pd

# Shared accumulator filled by extract_functions(): maps each protein
# accession to a dict of {keyword name: 1}.
main_keyword_list = dict()

def extract_functions(protein_list: list, timeout: float = 30.0) -> None:
    """Fetch UniProt keywords for each protein and store them in ``main_keyword_list``.

    For every accession in ``protein_list`` the UniProtKB JSON entry is
    downloaded and ``main_keyword_list[accession]`` is set to a dict that maps
    each keyword name of the entry to ``1``. Entries whose JSON has no
    ``"keywords"`` field are stored with an empty dict.

    Args:
        protein_list: Accession strings to look up.
        timeout: Seconds to wait on each request before giving up, so that a
            stalled connection cannot block the calling thread forever.

    Raises:
        requests.RequestException: If a request times out, fails, or the
            server answers with an HTTP error status.
    """
    for protein in protein_list:
        # extract request
        url = "https://rest.uniprot.org/uniprotkb/" + protein + ".json"
        response = requests.get(url, timeout=timeout)
        # Fail with a clear HTTP error instead of an obscure KeyError later on.
        response.raise_for_status()
        entry = response.json()

        # input 1 if available; tolerate entries that carry no keywords at all
        words = {keyword["name"]: 1 for keyword in entry.get("keywords", [])}

        # append dict to main dict
        main_keyword_list[protein] = words

def extract_proteins(path: str) -> list:
    """Read protein identifiers from a '|'-separated text file.

    The identifier of a line is its first '|'-separated field with the
    surrounding whitespace (including the trailing newline) removed. Lines
    whose first field is empty, such as blank lines, are skipped.

    Args:
        path: Path of the text file to read.

    Returns:
        The identifiers in file order.
    """
    proteins = []

    # read in text file, one line at a time
    with open(path, 'r') as f:
        for line in f:
            # take the first value; strip so a line without '|' does not
            # keep its newline
            accession = line.split('|')[0].strip()
            if accession:
                proteins.append(accession)

    # return list
    return proteins

def explore_json(protein: str, timeout: float = 30.0) -> None:
    """Print every top-level key and value of a protein's UniProtKB JSON entry.

    For each key of the downloaded JSON object this prints the key, then its
    value, then an empty line.

    Args:
        protein: Accession string used to build the request URL.
        timeout: Seconds to wait on the request before giving up, so that a
            stalled connection cannot hang the notebook.
    """
    url = "https://rest.uniprot.org/uniprotkb/" + protein + ".json"
    response = requests.get(url, timeout=timeout)
    entry = response.json()
    for key, value in entry.items():
        print(key)
        print(value)
        print()



# Load the protein identifiers (first '|'-separated field of each line).
proteins = extract_proteins("data/protein_aliases.txt")
res_path = "results/functions.txt"

# Fetch keywords for the first 30 proteins, 10 per worker thread.
t1 = threading.Thread(target=extract_functions, args=(proteins[:10],))
t2 = threading.Thread(target=extract_functions, args=(proteins[10:20],))
t3 = threading.Thread(target=extract_functions, args=(proteins[20:30],))
workers = (t1, t2, t3)
for worker in workers:
    worker.start()

# Wait for every worker to finish: without join() the DataFrame below would be
# built while the threads are still filling main_keyword_list, giving a
# partial (or empty) CSV.
for worker in workers:
    worker.join()

#  turn dict into csv: one row per protein, one column per keyword
df = pd.DataFrame(main_keyword_list)
df = df.transpose()
df.to_csv('sample.csv')

In [24]:
# Number of proteins collected so far (one dict entry per protein).
len(main_keyword_list)

30

In [None]:
# Rebuild the table from the collected keywords and flip it so that each row
# is a protein and each column a keyword name.
df = pd.DataFrame(main_keyword_list)
df = df.transpose()

In [19]:
# Display the current contents of `df` as the cell's output.
df

Unnamed: 0,Acetylation,Activator,Alternative splicing,Biological rhythms,Cytoplasm,DNA-binding,Glycoprotein,Host-virus interaction,Isopeptide bond,Metal-binding,...,mRNA splicing,Amino-acid biosynthesis,Cysteine biosynthesis,Pyridoxal phosphate,Oxidation,Acyltransferase,Carbohydrate metabolism,Glucose metabolism,Lipoyl,Tricarboxylic acid cycle
O89090,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
F1SU18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
F1S4X9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Q9ULI3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
P04049,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Q4GWZ2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Q6AY86,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
C9JJ47,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B4DGK6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A0A0G2K1Y8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# Print, for every column, how many rows have the value 1.
for column in df.columns:
    # Compare-and-sum instead of value_counts()[1]: the latter raises KeyError
    # for a column in which no row equals 1.
    x = int((df[column] == 1).sum())
    print(f"{column}: {x}")

Acetylation: 8
Activator: 3
Alternative splicing: 10
Biological rhythms: 2
Cytoplasm: 8
DNA-binding: 2
Glycoprotein: 7
Host-virus interaction: 1
Isopeptide bond: 4
Metal-binding: 12
Nucleus: 8
Phosphoprotein: 13
Reference proteome: 50
Repeat: 13
Repressor: 2
Transcription: 4
Transcription regulation: 4
Ubl conjugation: 4
Zinc: 5
Zinc-finger: 2
LIM domain: 2
Proteomics identification: 21
ATP-binding: 6
Glutathione biosynthesis: 1
Ligase: 2
Magnesium: 2
Nucleotide-binding: 7
3D-structure: 6
Calcium: 4
Cell junction: 2
Cell membrane: 5
Developmental protein: 1
Disulfide bond: 7
EGF-like domain: 2
Membrane: 17
Secreted: 6
Signal: 7
Transmembrane: 12
Transmembrane helix: 12
Cardiomyopathy: 1
Deafness: 1
Direct protein sequencing: 7
Disease variant: 4
Kinase: 1
Methylation: 3
Mitochondrion: 5
Proto-oncogene: 1
Serine/threonine-protein kinase: 1
Transferase: 4
Host cell receptor for virus entry: 1
Receptor: 2
Ribonucleoprotein: 2
Ribosomal protein: 2
Endosome: 1
Protein transport: 4
Transport