This small notebook adds a "filename" column to several important datasets: autoinhibitory AF structures, multi-domain AF structures, and single-domain AF structures. It also writes the corresponding tables for the CF (ColabFold) autoinhibited and multi-domain structures.
TODO: Integrate these steps into the main pipeline.

In [6]:
import pandas as pd
import os

In [7]:
# Define some functions
def add_AF_filename(df, fp):
    """Attach the matching AlphaFold ``.cif`` filename to every row of ``df``.

    This helper is specific to AlphaFold files, where each protein maps to a
    single structure file (ColabFold files additionally carry a cluster; see
    ``add_CF_filename``). The UniProt accession is read from the second
    ``'-'``-separated token of each filename, e.g. ``AF-P12345-F1-model_v4.cif``
    yields ``P12345``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``'uniprot'`` column.
    fp : str
        Directory that contains the ``.cif`` files.

    Returns
    -------
    pandas.DataFrame
        A new frame equal to ``df`` plus a ``'filename'`` column. The input
        frame is left untouched, so passing a column slice of another frame
        is safe.

    Raises
    ------
    KeyError
        If one or more proteins in ``df`` have no ``.cif`` file in ``fp``.
        The message lists every missing accession.
    """
    # Map accession -> filename. Sorting makes the outcome deterministic when
    # two files share an accession (the last name in sorted order wins).
    filename_by_uniprot = {}
    for filename in sorted(os.listdir(fp)):
        if not filename.endswith('.cif'):
            continue
        tokens = filename.split('-')
        if len(tokens) < 2:
            # No '-' separator, so there is no accession token to read;
            # ignore the file instead of crashing the whole scan.
            continue
        filename_by_uniprot[tokens[1]] = filename

    # Report every protein without a structure file at once, rather than
    # failing on the first one with a bare KeyError.
    missing = sorted({str(p) for p in df['uniprot'] if p not in filename_by_uniprot})
    if missing:
        raise KeyError(
            f"No .cif file found in {fp!r} for uniprot(s): {', '.join(missing)}"
        )

    # Build the column in dataframe row order and return a copy with it added.
    return df.assign(filename=[filename_by_uniprot[p] for p in df['uniprot']])

def add_CF_filename(df, fp):
    """Join the rank-1 model filenames (and their clusters) found in ``fp`` onto ``df``.

    Only directory entries whose name contains ``'rank_001'`` are used. Each
    such name is split on ``'_'``: the first token is taken as the uniprot and
    the second token as the cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``'uniprot'`` column.
    fp : str
        Directory to list.

    Returns
    -------
    pandas.DataFrame
        Result of merging ``df`` with the (filename, uniprot, cluster) table on
        ``'uniprot'`` using pandas' default merge behaviour.
    """
    # Rank-1 models only
    rank_one = [name for name in os.listdir(fp) if 'rank_001' in name]

    # Tokenise every name once; token 0 is the uniprot, token 1 the cluster
    tokens = [name.split('_') for name in rank_one]

    file_table = pd.DataFrame({
        'filename': rank_one,
        'uniprot': [parts[0] for parts in tokens],
        'cluster': [parts[1] for parts in tokens],
    })

    return df.merge(file_table, on='uniprot')

Make a dataframe with filenames for the AF autoinhibitory proteins.

In [8]:
# Directory scanned for the AlphaFold .cif files of the autoinhibitory set
fp = './project_pipeline/data/input/Alphafold_cif/'

# Load the classified table (every column as object dtype), keep only the
# protein/region columns, and collapse repeated rows
df = (
    pd.read_csv('./project_pipeline/data/classified_files_3.tsv', sep='\t')
    .astype('object')[['uniprot', 'region_1', 'region_2']]
    .drop_duplicates()
)

# Look up the structure filename for each protein
df = add_AF_filename(df, fp)

# Write the result as a tab-separated file without the index
df.to_csv('./project_pipeline/data/af_autoinhibited_best.tsv', sep='\t', index=False)


Add the filenames to the AF multi-domain and single-domain dataframes.

In [9]:
# Directories scanned for the AlphaFold structure files of each dataset
fpm = './project_pipeline/data/input/Alphafold_multi_domain/'
fps = './project_pipeline/data/input/Alphafold_single_domain/'

# Load both PAE tables with every column as object dtype
multi = pd.read_csv('./project_pipeline/data/multi_domain_pae.tsv', sep='\t').astype('object')
single = pd.read_csv('./project_pipeline/data/single_domain_pae.tsv', sep='\t').astype('object') # TODO add the sequence length to this file

# For each table: discard rows whose mean PAE value is NaN, keep only the
# protein/region columns (the PAE column itself is not carried over), then
# attach the structure filename
multi = add_AF_filename(
    multi.dropna(subset=['mean_pae_1_1'])[['uniprot', 'region_1', 'region_2']],
    fpm,
)
single = add_AF_filename(
    single.dropna(subset=['mean_pae'])[['uniprot', 'region']],
    fps,
)

# Write both results as tab-separated files without the index
multi.to_csv('./project_pipeline/data/af_multi_domain.tsv', sep='\t', index=False)
single.to_csv('./project_pipeline/data/af_single_domain.tsv', sep='\t', index=False)

Make files for the CF autoinhibited and multi-domain proteins.

In [10]:
# cf_autoinhibited is a de-duplicated column subset of cf_pdb
cf = (
    pd.read_csv('./project_pipeline/data/cf_pdb.tsv', sep='\t')
    .astype('object')[['uniprot', 'cluster', 'region_1', 'region_2', 'filename']]
    .drop_duplicates()
)
cf.to_csv('./project_pipeline/data/cf_autoinhibited.tsv', sep='\t', index=False)

# CF multi-domain table: start from the multi-domain regions and merge in the
# filename and cluster of each rank-1 model found in the directory below
fp = './project_pipeline/data/input/Colabfold_cif/multi_domain/'
md = add_CF_filename(
    pd.read_csv('./project_pipeline/data/multi_domain_regions.tsv', sep='\t').astype('object'),
    fp,
)
md.to_csv('./project_pipeline/data/cf_multi_domain.tsv', sep='\t', index=False)