In [2]:
import requests
import pandas as pd
import json

This notebook drafts a script that downloads data from the InterPro API. The code may later be turned into a standalone script.

In [13]:
# Load the tab-separated protein table, then cast every column to the generic
# object dtype (presumably so text can be written into any cell later - confirm).
df = pd.read_csv('./project_pipeline/data/classified_files_3.tsv', sep='\t')
df = df.astype('object')


def add_interpro(df):
    '''
    Function to add interpro data to the dataframe, one column per entry type.

    For every row, the value of the 'uniprot' column is looked up with interpro_get and the response is grouped by
    decode_interpro into {type: [name, ...]}. For instance, {'domain': ['Protein kinase domain', ...]}.
    Each type becomes a column and the names are stored in the row's cell as one ', '-joined string.

    The dataframe is modified in place and is also returned. Rows whose lookup gives no data (interpro_get
    returned None) are left untouched instead of aborting the whole run.
    '''
    # Decoded results per accession, so an accession that appears in several rows is requested only once.
    decoded_by_uniprot = {}

    # Walk the index labels (not positions): .loc/.at below are label-based, so this also works
    # when the dataframe does not carry the default 0..n-1 index.
    for i in df.index:
        uniprot = df.loc[i, 'uniprot']

        if uniprot not in decoded_by_uniprot:
            # Get the query
            query = interpro_get(uniprot)

            # interpro_get returns None for any non-200 response; treat that as "no annotations".
            if query is None:
                decoded_by_uniprot[uniprot] = {}
            else:
                # Decode the query into our dictionary format
                decoded_by_uniprot[uniprot] = decode_interpro(query)

        # Add the data to the dataframe
        results_dict = decoded_by_uniprot[uniprot]
        for key in results_dict:
            df.at[i, key] = ', '.join(results_dict[key])

    return df

def interpro_get(id, timeout=30):
    '''
    Function to query the interpro API for the entries matching a given UniProt id.

    Parameters:
        id: UniProt accession; it is interpolated into the request URL.
        timeout: seconds to wait on each HTTP request before requests raises a timeout error.

    Returns the decoded json object. When the response is paginated, the 'results' of the following pages are
    appended to the first page's 'results'. Returns None when the first request does not answer with HTTP 200.
    '''
    url = f'https://ebi.ac.uk/interpro/api/entry/interpro/protein/uniprot/{id}?format=json'

    # Always pass a timeout: without one, requests waits indefinitely on a stalled connection.
    response = requests.get(url=url, timeout=timeout)

    if response.status_code != 200:
        return None

    data = response.json()

    # The API serves long result lists in pages; 'next' holds the URL of the following page and is
    # empty/absent on the last one. Follow it so proteins with many entries are not truncated.
    next_url = data.get('next') if isinstance(data, dict) else None
    while next_url:
        response = requests.get(url=next_url, timeout=timeout)
        if response.status_code != 200:
            # Keep the pages collected so far; 'next' still points at the page that failed.
            break

        page = response.json()
        data.setdefault('results', []).extend(page.get('results') or [])
        next_url = page.get('next')
        data['next'] = next_url

    return data

def decode_interpro(query):
    '''
    Group the names of every entry in an interpro query by entry type.

    Each item of query['results'] contributes its metadata 'type' (domain, family, etc) as a key and its metadata
    'name' to that key's list, in the order the entries appear (e.g. {'domain': ['Protein kinase domain', ...]}).

    Returns an empty dictionary when query is None (interpro_get returns None for a non-200 response) or when it
    holds no results.
    '''
    results_dict = {}

    # A failed lookup arrives here as None rather than a json object: nothing to decode.
    if not query:
        return results_dict

    for entry in query.get('results') or []:
        metadata = entry['metadata']
        # setdefault creates the list on the first occurrence of a type and reuses it afterwards
        results_dict.setdefault(metadata['type'], []).append(metadata['name'])

    return results_dict

# Annotate the table with the interpro data
df = add_interpro(df)

# Write the annotated table out as a tab-separated file, leaving out the index column
df.to_csv('./project_pipeline/data/interpro.tsv', index=False, sep='\t')


