In [2]:
from Bio.Seq import Seq

In [3]:
### Dump every TAIR10 protein sequence to a text file, one sequence per line
from Bio import SeqIO

# The output is opened once, in write mode, so that re-running this cell
# regenerates the file instead of appending a second copy of every sequence.
with open("data/TAIR10/TAIR10_seqs.txt", "w") as f:
    for record in SeqIO.parse("data/TAIR10/TAIR10proteins.fasta", "fasta"):
        f.write(str(record.seq) + "\n")

In [4]:
# Parse the whole TAIR10 protein FASTA into a list so records can be indexed.
records = list(SeqIO.parse("data/TAIR10/TAIR10proteins.fasta", "fasta"))

# Peek at the first record: print its identifier, then leave its sequence as
# the last expression so the notebook displays it.
first_record = records[0]
print(first_record.id)

first_record.seq

AT1G51370.2


Seq('MVGGKKKTKICDKVSHEEDRISQLPEPLISEILFHLSTKDSVRTSALSTKWRYL...NPS')

In [5]:
### Pipeline to get AlphaFold pLDDT confidence scores

In [6]:
def DL_range(enumerated_values, min_length):
    """Locate runs of consecutive 'D'/'L' values that are long enough.

    Args:
        enumerated_values: iterable of ``(index, value)`` pairs, e.g. the
            result of ``enumerate(...)``. The indices are taken from the
            pairs as-is; they are not checked for contiguity.
        min_length: minimum run length (``end - start + 1``) for a run to be
            reported.

    Returns:
        A list of ``(start, end)`` tuples with both bounds inclusive, in the
        order the runs were encountered.
    """
    disordered = ('D', 'L')
    spans = []
    run_start = None  # None means "not currently inside a run"
    run_end = None

    for position, category in enumerated_values:
        if category in disordered:
            if run_start is None:
                run_start = position  # first element of a new run
            run_end = position        # run now reaches this position
            continue

        # Any other value terminates the run in progress, if there is one.
        if run_start is not None:
            if run_end - run_start + 1 >= min_length:
                spans.append((run_start, run_end))
            run_start = None

    # The input may end while a run is still open.
    if run_start is not None and run_end - run_start + 1 >= min_length:
        spans.append((run_start, run_end))

    return spans

# Demo: the D/L run at positions 1-4 is kept, the one at 6-7 is too short.
min_length = 3
enumerated_values = list(enumerate("ADLDLMDLM"))
filtered_ranges = DL_range(enumerated_values, min_length)
print("Filtered ranges:", filtered_ranges)

Filtered ranges: [(1, 4)]


In [7]:
### API to get IDRs from an AlphaFold ID and a protein sequence
import requests
import json

def alphafold_idr(alphafold_ID, protein_sequence, min_IDR_length = 15):
    """Extract candidate IDRs for one protein from its AlphaFold confidence file.

    Downloads ``AF-<alphafold_ID>-F1-confidence_v4.json`` from
    alphafold.ebi.ac.uk and uses ``DL_range`` to find runs of at least
    ``min_IDR_length`` residues whose ``confidenceCategory`` is 'D' or 'L'
    (the categories this notebook treats as disordered).

    Args:
        alphafold_ID: identifier inserted into the download URL.
        protein_sequence: sequence that is sliced with the detected ranges.
            Assumes it is aligned position-for-position with the confidence
            arrays in the JSON file -- TODO confirm for every entry.
        min_IDR_length: minimum number of consecutive 'D'/'L' positions for a
            run to be reported.

    Returns:
        ``(IDR_starts, IDR_ends, IDR_seqs, IDR_scores)``: four parallel lists
        with one item per region. Starts and ends are 0-based, inclusive
        indices; ``IDR_seqs`` holds the matching slices of
        ``protein_sequence`` and ``IDR_scores`` the matching slices of
        ``confidenceScore``. All four lists are empty when no region is long
        enough. ``(None, None, None, None)`` is returned when the HTTP status
        is not 200 or the JSON lacks either expected key.
    """
    url = f'https://alphafold.ebi.ac.uk/files/AF-{alphafold_ID}-F1-confidence_v4.json'
    # NOTE(review): no timeout is passed, so a stalled connection blocks
    # indefinitely; consider adding one.
    response = requests.get(url)

    if response.status_code != 200:
        return None, None, None, None

    data = response.json()
    if 'confidenceCategory' not in data or 'confidenceScore' not in data:
        return None, None, None, None

    confidence_values = data['confidenceCategory']
    confidence_scores = data['confidenceScore']

    # Positions of sufficiently long, uninterrupted D/L stretches (IDRs).
    enumerated_values = list(enumerate(confidence_values))
    IDR_positions = DL_range(enumerated_values, min_IDR_length)

    IDR_scores = []
    IDR_seqs = []
    IDR_starts = []
    IDR_ends = []
    for start, end in IDR_positions:
        IDR_scores.append(confidence_scores[start:end + 1])
        # Slice the sequence passed in as an argument, not a module-level
        # variable, so the result always belongs to this call's protein.
        IDR_seqs.append(protein_sequence[start:end + 1])
        IDR_starts.append(start)
        IDR_ends.append(end)
    return IDR_starts, IDR_ends, IDR_seqs, IDR_scores

In [8]:
from tqdm import tqdm

In [9]:
import pandas as pd

In [14]:
# Load the UniProt table and keep only rows that carry an AlphaFoldDB value.
id_columns = ["Entry", "AlphaFoldDB", "Sequence"]
uniprot_all = pd.read_csv("data/TAIR10/ArabidopsisUniprot.tsv", sep="\t")
uniprot_with_model = uniprot_all.loc[uniprot_all['AlphaFoldDB'].notna()]

# Reduce to the three columns used downstream and drop repeated rows.
# The printed boolean is True when de-duplication removed nothing.
uniprot_unique = uniprot_with_model[id_columns].drop_duplicates()
print(len(uniprot_unique) == len(uniprot_with_model))
uniprot = uniprot_unique

len(uniprot)

True


132238

In [15]:
uniprot = pd.read_csv("data/TAIR10/ArabidopsisUniprot.tsv", sep="\t")
uniprot = uniprot[uniprot['AlphaFoldDB'].notnull()] #remove empty AlphaFoldDB entries

# Prints True when dropping duplicate (Entry, AlphaFoldDB, Sequence) rows removes nothing
print(len(uniprot[["Entry", "AlphaFoldDB", "Sequence"]].drop_duplicates()) == len(uniprot))
uniprot = uniprot[["Entry", "AlphaFoldDB", "Sequence"]].drop_duplicates()

import os
import csv

# Path to the IDRs.csv file (tab-separated despite the extension)
idr_file_path = "data/TAIR10/IDRs.csv"

# Resume support: if the output already has content, skip entries found in it;
# otherwise start a fresh file with a header row.
# NOTE(review): only entries that produced at least one IDR row are recognised
# as done. Entries with no IDRs, or whose download failed, are fetched again
# on every re-run.
if os.path.exists(idr_file_path) and os.path.getsize(idr_file_path) > 0:
    # Collect the entries that already have rows in IDRs.csv
    with open(idr_file_path, "r") as idr_file:
        reader = csv.DictReader(idr_file, delimiter="\t")
        written_entries = {written_row["Entry"] for written_row in reader}

    # Filter the uniprot DataFrame to exclude already written entries
    uniprot = uniprot[~uniprot["Entry"].isin(written_entries)]
    print(f"Filtered uniprot DataFrame to exclude {len(written_entries)} already written entries.")
else:
    with open(idr_file_path, "w") as w:
        w.write("Entry\tStart\tEnd\tSequence\tConfidenceScore\n")  # Write header for the output file

# Open the output once for the whole run instead of once per protein, and
# write to the same path that the resume check above inspected.
with open(idr_file_path, "a") as f:
    # `total` lets tqdm show a real progress bar and ETA for the generator.
    for index, row in tqdm(uniprot.iterrows(), total=len(uniprot)):
        # NOTE(review): this only strips ';'. A field holding several IDs
        # would be concatenated into one string -- confirm the column always
        # holds a single ID.
        alphafold_ID = row['AlphaFoldDB'].replace(";", "")
        sequence = row['Sequence']
        IDR_starts, IDR_ends, IDR_seq, IDR_scores = alphafold_idr(alphafold_ID, sequence)

        if IDR_starts:  # Non-empty list: at least one IDR was found
            for start, end, seq, score in zip(IDR_starts, IDR_ends, IDR_seq, IDR_scores):
                f.write(f"{row['Entry']}\t{start}\t{end}\t{seq}\t{score}\n")
            # Flush per entry so an interrupted run keeps everything written so far.
            f.flush()


True
Filtered uniprot DataFrame to exclude 85806 already written entries.


67it [00:11,  5.82it/s]


KeyboardInterrupt: 

In [None]:
from localcider.sequenceParameters import SequenceParameters

