In [1]:
import pandas as pd
import requests
import os
import numpy as np
from ratelimit import limits, sleep_and_retry

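This notebook calculates a "lineage score" for every UniRef cluster that was used to create the ColabFold clusters. The lineage score of a UniRef cluster is the number of taxa that the lineage of the cluster's common taxon shares with the lineage of the organism of the original UniProt protein.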

In [2]:
# url = "https://rest.uniprot.org/uniref/stream?format=json&query=%28%28id%3AUniRef100_A0A2G8JTM8%29%29"
# response = requests.get(url).json()
# print(response['results'].keys())
# # df = pd.json_normalize(response['results'])
# # taxon = response['results'][0]['commonTaxon']
# # print(taxon)
# # df = pd.json_normalize(taxon)
# # df.head()
# # tax = df.iloc[0]['taxonId']
# # print(tax)

In [3]:
def get_unirefs(fp):
    '''
    Read the UniRef IDs from the header lines of an a3m file

    Parameters
    ----------
    fp : str or path-like
        Path to the a3m file.

    Returns
    -------
    list of str
        One ID per matching header line, in file order (e.g. 'UniRef100_A0A8B9YR53').
        Duplicates are kept.
    '''

    unirefs = []

    with open(fp) as f:
        for line in f:
            # Only header lines carry IDs. A bare substring test would also pick up
            # non-header lines that happen to contain 'UniRef', and those have no '>'
            # prefix to strip.
            if not line.startswith(">") or "UniRef" not in line:
                continue

            # Drop the leading '>' and keep only the ID token, so that any additional
            # whitespace-separated header fields do not end up in the ID
            # (e.g. '>UniRef100_A0A8B9YR53' -> 'UniRef100_A0A8B9YR53').
            for field in line[1:].split():
                if "UniRef" in field:
                    unirefs.append(field)
                    break

    return unirefs

# @sleep_and_retry
# @limits(calls=1, period=timedelta(seconds=1).total_seconds())
def get_uniref_taxonId(uniref):
    '''
    Get the common taxonID for a given UniRef cluster

    Parameters
    ----------
    uniref : str
        UniRef cluster ID, e.g. 'UniRef100_A0A4U8UYC5'.

    Returns
    -------
    int or None
        The 'taxonId' of the 'commonTaxon' of the first search result, or None when
        the search returns no results or the result carries no common taxon.

    Raises
    ------
    requests.HTTPError
        If UniProt answers with an HTTP error status.
    '''

    # UniProt REST API URL
    url = f"https://rest.uniprot.org/uniref/stream?format=json&query=%28%28id%3A{uniref}%29%29"

    # Without a timeout a single stalled connection would block the whole run
    response = requests.get(url, timeout=30)

    # Surface HTTP errors directly instead of as a KeyError on the error payload
    response.raise_for_status()

    results = response.json().get('results') or []
    if len(results) == 0:
        return None

    # Read the ID straight from the JSON; no DataFrame is needed for a single value
    taxon = results[0].get('commonTaxon') or {}
    taxonId = taxon.get('taxonId')

    return taxonId

# @sleep_and_retry
# @limits(calls=1, period=timedelta(seconds=1).total_seconds())
def get_uniprot_taxonId(uniprot):
    '''
    Get the taxonId for a given UniProt ID

    Parameters
    ----------
    uniprot : str
        UniProt accession, e.g. 'O08967'.

    Returns
    -------
    int
        The 'taxonId' of the entry's 'organism'.

    Raises
    ------
    requests.HTTPError
        If UniProt answers with an HTTP error status (e.g. unknown accession).
    KeyError
        If the returned entry has no 'organism' / 'taxonId' field.
    '''

    # UniProt REST API URL
    url = f'https://rest.uniprot.org/uniprotkb/{uniprot}.json'

    # Without a timeout a single stalled connection would block the whole run
    response = requests.get(url, timeout=30)

    # Surface HTTP errors directly instead of as a KeyError on the error payload
    response.raise_for_status()

    taxonId = response.json()['organism']['taxonId']

    return taxonId

# @sleep_and_retry
# @limits(calls=1, period=timedelta(seconds=1).total_seconds())
def get_lineage(taxonId):
    '''
    Get the lineage for a given taxonID

    Parameters
    ----------
    taxonId : int
        Taxon ID to look up in the UniProt taxonomy.

    Returns
    -------
    list
        The 'taxonId' of every entry in the response's 'lineage', in the order
        returned. When the response has no (or an empty) 'lineage', the list
        contains only the queried taxonId.
    '''

    url = f"https://rest.uniprot.org/taxonomy/{taxonId}.json"

    # Without a timeout a single stalled connection would block the whole run.
    # The HTTP status is deliberately not checked: a payload without 'lineage'
    # falls back to [taxonId] below.
    response = requests.get(url, timeout=30).json()

    lineage = response.get('lineage')
    if not lineage:
        # Covers both a missing key and an empty list (an empty list has no
        # 'taxonId' field to extract)
        return [taxonId]

    # Collect the taxon IDs directly from the JSON records
    taxa_list = [taxon['taxonId'] for taxon in lineage]

    return taxa_list

def cluster_lineages(df, path):
    '''
    Collect the lineages needed to calculate the 'lineage score' (i.e. the number of
    shared taxa with the original organism) for each cluster

    Parameters
    ----------
    df : pandas.DataFrame
        One row per UniProt/cluster pair. The 'uniprot', 'cluster', 'state' and
        'conformation' columns are read.
    path : str
        Directory holding one sub-directory per UniProt ID with the cluster files,
        i.e. {path}/{uniprot}/{uniprot}_{cluster}.a3m

    Returns
    -------
    pandas.DataFrame
        One row per UniRef ID found in a cluster file, with the columns 'uniprot',
        'state', 'conformation', 'cluster', 'filename', 'uniref', 'taxonId',
        'uniprot_lineage' and 'cluster_lineage'. When a UniRef ID has no taxon,
        'taxonId' is missing and 'cluster_lineage' is an empty list. The frame is
        empty (but has the columns) when no UniRef IDs were found at all.
    '''

    columns = ['uniprot', 'state', 'conformation', 'cluster', 'filename', 'uniref',
               'taxonId', 'uniprot_lineage', 'cluster_lineage']

    # Keep track of which UniProt ID is being queried to avoid redundant queries.
    # Only the previous row is remembered, so this helps when rows are grouped by UniProt ID.
    current_uniprot = ''
    current_lineage = []

    # One single-row DataFrame per UniRef ID
    lineage_df_list = []

    # Iterate over the rows in the DataFrame
    for index, row in df.iterrows():

        uniprot = row['uniprot']
        cluster = row['cluster']

        print(f'UniProt: {uniprot}, Cluster: {cluster}')

        # If the UniProt ID is different from the last one queried, get the lineage
        if uniprot != current_uniprot:
            current_uniprot = uniprot

            # Get the lineage for the UniProt ID
            uniprot_taxon = get_uniprot_taxonId(uniprot)
            current_lineage = get_lineage(uniprot_taxon)

        # Open the cluster file to get the UniRef IDs
        fn = f'{uniprot}_{cluster}.a3m'
        fp = os.path.join(path, uniprot, fn) # File path looks like data/O08967/O08967_000.a3m
        unirefs = get_unirefs(fp)

        # Iterate over the UniRef IDs to get their lineages
        for uniref in unirefs:
            print(f'UniProt: {uniprot}, Cluster: {cluster}, UniRef: {uniref}')
            cluster_taxon = get_uniref_taxonId(uniref)

            if cluster_taxon is None:
                # No taxon was found for this UniRef ID, so there is no lineage to look up
                cluster_lineage = []
            else:
                print(f'Taxon: {cluster_taxon}')
                cluster_lineage = get_lineage(cluster_taxon)

            # Both cases are recorded with the same row layout
            temp_df = pd.DataFrame([[uniprot, row['state'], row['conformation'], cluster, fn, uniref,
                                     cluster_taxon, current_lineage, cluster_lineage]],
                                   columns=columns)
            lineage_df_list.append(temp_df)

    # pd.concat raises on an empty list, so return an empty frame with the expected columns
    if not lineage_df_list:
        return pd.DataFrame(columns=columns)

    # Concatenate the lineage_df_list into a single DataFrame
    lineage_df = pd.concat(lineage_df_list).reset_index(drop=True)

    return lineage_df

def calculate_lineage_scores(df):
    '''
    Take the dataframe we made in cluster_lineages and get our lineage scores

    Parameters
    ----------
    df : pandas.DataFrame
        Must have the list-valued columns 'uniprot_lineage' and 'cluster_lineage'.

    Returns
    -------
    pandas.DataFrame
        The same DataFrame object (it is modified in place) with three added columns:
        'lineage_score' (number of distinct taxa present in both lineages),
        'uniprot_lineage_length' and 'cluster_lineage_length'.
    '''

    lineage_scores = []
    uniprot_lineage_lengths = []
    cluster_lineage_lengths = []

    # Walk the rows by position rather than by index label, so rows that share an
    # index label cannot overwrite each other's values
    for uniprot_lineage, cluster_lineage in zip(df['uniprot_lineage'], df['cluster_lineage']):

        # Calculate the lineage score
        lineage_scores.append(len(set(uniprot_lineage).intersection(cluster_lineage)))

        # Calculate the lineage lengths
        uniprot_lineage_lengths.append(len(uniprot_lineage))
        cluster_lineage_lengths.append(len(cluster_lineage))

    # Add the results to the DataFrame, one whole column at a time.
    # NOTE(review): stored as floats to keep the written CSV format stable; switch to
    # an integer dtype if downstream readers allow it.
    df['lineage_score'] = np.asarray(lineage_scores, dtype=float)
    df['uniprot_lineage_length'] = np.asarray(uniprot_lineage_lengths, dtype=float)
    df['cluster_lineage_length'] = np.asarray(cluster_lineage_lengths, dtype=float)

    return df

In [4]:
# Table of UniProt/cluster pairs to process (tab-separated). cluster_lineages reads its
# 'uniprot', 'cluster', 'state' and 'conformation' columns.
df = pd.read_csv('./project_pipeline/data/two-state_af_cf.tsv', sep='\t')

# Directory with the cluster alignments, laid out as {path}/{uniprot}/{uniprot}_{cluster}.a3m
path = './project_pipeline/data/input/Colabfold_cif/autoinhibited'

# Retrieve the lineages for every UniRef ID in every cluster file (the scores themselves
# are calculated from these in a later step).
# NOTE: this queries the UniProt REST API for each UniRef ID, so it needs network access.
lineage_df = cluster_lineages(df, path)

UniProt: O08967, Cluster: 007
UniProt: O08967, Cluster: 007, UniRef: UniRef100_A0A4U8UYC5
Taxon: 34508
UniProt: O08967, Cluster: 012
UniProt: O08967, Cluster: 012, UniRef: UniRef100_A0A7M5UZP8
Taxon: 252671
UniProt: O08967, Cluster: 012, UniRef: UniRef100_UPI001F5E8B16
Taxon: 6087
UniProt: O08967, Cluster: 034
UniProt: O08967, Cluster: 034, UniRef: UniRef100_A0A444U1T5
Taxon: 7906
UniProt: O08967, Cluster: U100-006
UniProt: O08967, Cluster: U100-006, UniRef: UniRef100_A0A7R8YUY7
Taxon: 343691
UniProt: O08967, Cluster: U100-006, UniRef: UniRef100_A0A1B0FMZ1
Taxon: 37546
UniProt: O08967, Cluster: U100-006, UniRef: UniRef100_UPI00094E420B
Taxon: 77166
UniProt: O08967, Cluster: U100-006, UniRef: UniRef100_UPI000644D545
Taxon: 1410327
UniProt: O08967, Cluster: U100-006, UniRef: UniRef100_H3D114
Taxon: 99883
UniProt: O08967, Cluster: U100-006, UniRef: UniRef100_A0A0P6A5G2
Taxon: 35525
UniProt: O08967, Cluster: U100-006, UniRef: UniRef100_UPI00094EDD39
Taxon: 109280
UniProt: O08967, Cluster: 

In [5]:
# Get the lineage scores. calculate_lineage_scores adds the 'lineage_score',
# 'uniprot_lineage_length' and 'cluster_lineage_length' columns to lineage_df in place
# and returns that same DataFrame.
lineage_scores = calculate_lineage_scores(lineage_df)

# Save the result so it can be reloaded without repeating the queries
lineage_scores.to_csv('./project_pipeline/data/lineage_scores.csv', index=False)

In [6]:
# Sanity check: how many UniProt/cluster pairs from the input table have no rows in the
# saved lineage scores?
lineage = pd.read_csv('./project_pipeline/data/lineage_scores.csv')

two_state = pd.read_csv('./project_pipeline/data/two-state_af_cf.tsv', sep='\t')

# Unique UniProt/cluster pairs in the input table
two_state_sub = two_state[['uniprot', 'cluster']].drop_duplicates()

# Unique UniProt/cluster pairs that made it into the lineage scores
lineage_sub = lineage[['uniprot', 'cluster']].drop_duplicates()

# Build a single key per pair by concatenating the two columns (no separator)
two_state_sub['id'] = two_state_sub['uniprot'] + two_state_sub['cluster']
lineage_sub['id'] = lineage_sub['uniprot'] + lineage_sub['cluster']

# Pairs present in the input table but absent from the lineage scores
two_statedif = set(two_state_sub['id']) - set(lineage_sub['id'])
print(len(two_statedif))

91
