In [2]:
import os
import shutil
import pandas as pd

In [3]:
def read_clusters(path):
    """Parse the centroid table of a maxcluster results file into a DataFrame.

    Every line that follows the first line containing the centroid table
    header is treated as table content, except the very last line of the
    file, which is always discarded (presumably the closing separator of the
    table -- TODO confirm against real maxcluster output).

    Args:
        path: Path of the maxcluster results text file.

    Returns:
        pandas.DataFrame with one row per table line and the columns
        'Cluster size' (int, whitespace-separated token 5), 'path' (token 7),
        'length' and 'sample_id' (ints taken from fields 1 and 3 of the
        '_'-separated file name with '.pdb' removed).
    """
    table_header = 'INFO  : Cluster  Centroid  Size        Spread'
    with open(path, 'r') as f:
        report_lines = f.read().splitlines()

    # Position of the first header line; nothing is captured without one.
    header_idx = next(
        (idx for idx, text in enumerate(report_lines) if table_header in text),
        None,
    )
    table_rows = [] if header_idx is None else report_lines[header_idx + 1:]
    # The final captured line is never a data row.
    table_rows = table_rows[:-1]

    sizes, pdb_paths, lengths, sample_ids = [], [], [], []
    for row in table_rows:
        tokens = row.split()
        size_token = tokens[5]
        pdb_path = tokens[7]
        name_parts = os.path.basename(pdb_path).replace('.pdb', '').split('_')
        sizes.append(int(size_token))
        pdb_paths.append(pdb_path)
        lengths.append(int(name_parts[1]))
        sample_ids.append(int(name_parts[3]))

    clusters_df = pd.DataFrame({
        'Cluster size': sizes,
        'path': pdb_paths,
        'length': lengths,
        'sample_id': sample_ids,
    })
    print(f'Number of clusters {clusters_df.shape[0]}')
    return clusters_df


In [6]:
def save_benchmark_examples(src_dir, dest_dir, cutoff=0.5):
    """Stage benchmark samples per length and print the maxcluster command.

    For each length in (70, 100, 200, 300) this copies
    ``<src_dir>/length_<L>/<sample_name>/sde_1.pdb`` to
    ``<dest_dir>/length_<L>_subset/length_<L>_id_<id>.pdb``, where ``<id>`` is
    the second '_'-separated field of ``sample_name``. Every copied path is
    written, one per line, to ``sc_pdbs.txt`` in the same destination
    directory, and the maxcluster command line for that directory is printed
    (it is not executed).

    Args:
        src_dir: Directory containing the ``length_<L>`` sample directories.
        dest_dir: Directory under which ``length_<L>_subset`` is created.
        cutoff: Value inserted after ``-Tm`` in the printed command.

    Raises:
        FileNotFoundError: If a ``length_<L>`` directory or a sample's
            ``sde_1.pdb`` does not exist.
    """
    for length in [70, 100, 200, 300]:
        length_src_dir = os.path.join(src_dir, f'length_{length}')
        length_dest_dir = os.path.join(dest_dir, f'length_{length}_subset')
        os.makedirs(length_dest_dir, exist_ok=True)
        fname = os.path.join(length_dest_dir, 'sc_pdbs.txt')
        with open(fname, 'w') as f:
            # os.listdir returns entries in arbitrary order; sorting keeps the
            # list file identical between runs.
            for sample_name in sorted(os.listdir(length_src_dir)):
                # Entries containing a dot are treated as files, not samples.
                if '.' in sample_name:
                    continue
                sample_path = os.path.join(length_src_dir, sample_name, 'sde_1.pdb')
                sample_id = sample_name.split('_')[1]
                save_path = os.path.join(length_dest_dir, f'length_{length}_id_{sample_id}.pdb')
                shutil.copy(sample_path, save_path)
                f.write(save_path + '\n')
        # '\\-TM' emits the same literal backslash as before without relying on
        # the invalid escape sequence '\-'.
        print(
            f'./maxcluster64bit -l {length_dest_dir}/sc_pdbs.txt -C 2 -in -Rl {length_dest_dir}/all_by_all_lite \\-TM -Tm {str(cutoff)} > {length_dest_dir}/maxcluster_results.txt'
        )


In [7]:
def save_clustering_examples(src_dir, dest_dir, cutoff=0.5):
    """Stage samples for every length found in src_dir and print the commands.

    Each entry of ``src_dir`` whose name contains no dot is taken to be a
    ``length_<L>`` directory; ``<L>`` is the second '_'-separated field of the
    name. For each such length, ``<src_dir>/length_<L>/<sample_name>/sde_1.pdb``
    is copied to ``<dest_dir>/length_<L>_subset/length_<L>_id_<id>.pdb``
    (``<id>`` is the second '_'-separated field of ``sample_name``), the copied
    paths are listed in ``sc_pdbs.txt`` in that destination directory, and the
    matching maxcluster command line is printed (it is not executed).

    Args:
        src_dir: Directory containing the ``length_<L>`` sample directories.
        dest_dir: Directory under which ``length_<L>_subset`` is created.
        cutoff: Value inserted after ``-Tm`` in the printed command.

    Raises:
        IndexError: If a dot-free entry name contains no '_'.
        FileNotFoundError: If a sample's ``sde_1.pdb`` does not exist.
    """
    # Sorted so commands and list files come out in the same order every run.
    for sample_length in sorted(os.listdir(src_dir)):
        # Entries containing a dot are treated as files, not length directories.
        if '.' in sample_length:
            continue
        length = sample_length.split('_')[1]
        length_src_dir = os.path.join(src_dir, f'length_{length}')
        length_dest_dir = os.path.join(dest_dir, f'length_{length}_subset')
        os.makedirs(length_dest_dir, exist_ok=True)
        fname = os.path.join(length_dest_dir, 'sc_pdbs.txt')
        with open(fname, 'w') as f:
            for sample_name in sorted(os.listdir(length_src_dir)):
                if '.' in sample_name:
                    continue
                sample_path = os.path.join(length_src_dir, sample_name, 'sde_1.pdb')
                sample_id = sample_name.split('_')[1]
                save_path = os.path.join(length_dest_dir, f'length_{length}_id_{sample_id}.pdb')
                shutil.copy(sample_path, save_path)
                f.write(save_path + '\n')
        # '\\-TM' emits the same literal backslash as before without relying on
        # the invalid escape sequence '\-'.
        print(
            f'./maxcluster64bit -l {length_dest_dir}/sc_pdbs.txt -C 2 -in -Rl {length_dest_dir}/all_by_all_lite \\-TM -Tm {str(cutoff)} > {length_dest_dir}/maxcluster_results.txt'
        )


In [None]:
        # NOTE(review): orphaned, never-executed fragment. It repeats the opening
        # of the directory walk used in the "Sample clustering" cell, is indented
        # as if it sat inside a `with` block, and reads `sample_dir`, which no
        # earlier cell in the visible notebook defines. It cannot run as a cell on
        # its own -- candidate for deletion.
        for sample_length in os.listdir(sample_dir):
            if '.' in sample_length:
                continue
            length_dir = os.path.join(sample_dir, sample_length)
            length = sample_length.split('_')[1]

In [5]:
def write_samples(src_dir, dest_dir, cutoff=0.6):
    """Flatten all samples into one directory and print the maxcluster command.

    Walks ``<src_dir>/<length_dir>/<sample_name>/sde_1.pdb`` for every entry
    whose name contains no dot and copies each file to
    ``<dest_dir>/designed_samples/length_<L>_id_<id>.pdb``, where ``<L>`` and
    ``<id>`` are the second '_'-separated fields of the length directory name
    and the sample name respectively. The copied paths are listed in
    ``designed_samples/sc_pdbs.txt`` and the maxcluster command line that
    writes into ``<dest_dir>/designed_samples_clustering`` is printed (it is
    not executed).

    Existing output directories are reused as they are; files left over from
    an earlier run are not removed.

    Args:
        src_dir: Directory containing one sub-directory per sample length.
        dest_dir: Directory under which both output directories are created.
        cutoff: Value inserted after ``-Tm`` in the printed command. Defaults
            to 0.6, the value that used to be hard-coded.

    Raises:
        IndexError: If a dot-free entry name contains no '_'.
        FileNotFoundError: If a sample's ``sde_1.pdb`` does not exist.
    """
    designed_samples_dir = os.path.join(dest_dir, 'designed_samples')
    designed_samples_cluster_dir = os.path.join(dest_dir, 'designed_samples_clustering')

    os.makedirs(designed_samples_dir, exist_ok=True)
    os.makedirs(designed_samples_cluster_dir, exist_ok=True)
    fname = os.path.join(designed_samples_dir, 'sc_pdbs.txt')

    with open(fname, 'w') as f:
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # list file identical between runs.
        for sample_length in sorted(os.listdir(src_dir)):
            # Entries containing a dot are treated as files and skipped.
            if '.' in sample_length:
                continue
            length_dir = os.path.join(src_dir, sample_length)
            length = sample_length.split('_')[1]
            for sample_name in sorted(os.listdir(length_dir)):
                if '.' in sample_name:
                    continue
                sample_path = os.path.join(length_dir, sample_name, 'sde_1.pdb')
                sample_id = sample_name.split('_')[1]
                save_path = os.path.join(designed_samples_dir, f'length_{length}_id_{sample_id}.pdb')
                shutil.copy(sample_path, save_path)
                f.write(save_path + '\n')
    # '\\-TM' emits the same literal backslash as before without relying on the
    # invalid escape sequence '\-'.
    print(
        f'./maxcluster64bit -l {fname} -C 2 -in -Rl {designed_samples_cluster_dir}/all_by_all_lite \\-TM -Tm {str(cutoff)} > {designed_samples_cluster_dir}/maxcluster_results.txt'
    )

# Benchmark clustering

In [8]:
# Noise 0.1: stage the benchmark samples and print one maxcluster command per
# length, with cutoff 0.4 passed through to the command.
save_benchmark_examples(
    src_dir='/data/rsg/chemistry/jyim/projects/protein_diffusion/samples/continue_0/05D_01M_2023Y_21h_15m_16s/unconditional/noise_level_01_rf_benchmark_3',
    dest_dir='/data/rsg/chemistry/jyim/paper/protdiff_icml/noise_level_01_benchmark_alt',
    cutoff=0.4,
)

./maxcluster64bit -l /data/rsg/chemistry/jyim/paper/protdiff_icml/noise_level_01_benchmark_alt/length_70_subset/sc_pdbs.txt -C 2 -in -Rl /data/rsg/chemistry/jyim/paper/protdiff_icml/noise_level_01_benchmark_alt/length_70_subset/all_by_all_lite \-TM -Tm 0.4 > /data/rsg/chemistry/jyim/paper/protdiff_icml/noise_level_01_benchmark_alt/length_70_subset/maxcluster_results.txt
./maxcluster64bit -l /data/rsg/chemistry/jyim/paper/protdiff_icml/noise_level_01_benchmark_alt/length_100_subset/sc_pdbs.txt -C 2 -in -Rl /data/rsg/chemistry/jyim/paper/protdiff_icml/noise_level_01_benchmark_alt/length_100_subset/all_by_all_lite \-TM -Tm 0.4 > /data/rsg/chemistry/jyim/paper/protdiff_icml/noise_level_01_benchmark_alt/length_100_subset/maxcluster_results.txt
./maxcluster64bit -l /data/rsg/chemistry/jyim/paper/protdiff_icml/noise_level_01_benchmark_alt/length_200_subset/sc_pdbs.txt -C 2 -in -Rl /data/rsg/chemistry/jyim/paper/protdiff_icml/noise_level_01_benchmark_alt/length_200_subset/all_by_all_lite \-TM 

In [9]:
# Noise 0.5: stage the benchmark samples and print one maxcluster command per
# length, with cutoff 0.4 passed through to the command.
save_benchmark_examples(
    src_dir='/data/rsg/chemistry/jyim/projects/protein_diffusion/samples/continue_0/05D_01M_2023Y_21h_15m_16s/unconditional/noise_level_05_rf_benchmark',
    dest_dir='/data/rsg/chemistry/jyim/paper/protdiff_icml/noise_level_05_benchmark_alt',
    cutoff=0.4,
)


./maxcluster64bit -l /data/rsg/chemistry/jyim/paper/protdiff_icml/noise_level_05_benchmark_alt/length_70_subset/sc_pdbs.txt -C 2 -in -Rl /data/rsg/chemistry/jyim/paper/protdiff_icml/noise_level_05_benchmark_alt/length_70_subset/all_by_all_lite \-TM -Tm 0.4 > /data/rsg/chemistry/jyim/paper/protdiff_icml/noise_level_05_benchmark_alt/length_70_subset/maxcluster_results.txt
./maxcluster64bit -l /data/rsg/chemistry/jyim/paper/protdiff_icml/noise_level_05_benchmark_alt/length_100_subset/sc_pdbs.txt -C 2 -in -Rl /data/rsg/chemistry/jyim/paper/protdiff_icml/noise_level_05_benchmark_alt/length_100_subset/all_by_all_lite \-TM -Tm 0.4 > /data/rsg/chemistry/jyim/paper/protdiff_icml/noise_level_05_benchmark_alt/length_100_subset/maxcluster_results.txt
./maxcluster64bit -l /data/rsg/chemistry/jyim/paper/protdiff_icml/noise_level_05_benchmark_alt/length_200_subset/sc_pdbs.txt -C 2 -in -Rl /data/rsg/chemistry/jyim/paper/protdiff_icml/noise_level_05_benchmark_alt/length_200_subset/all_by_all_lite \-TM 

In [17]:
# Noise 1.0: stage the benchmark samples and print one maxcluster command per
# length, with cutoff 0.4 passed through to the command.
save_benchmark_examples(
    src_dir='/data/rsg/chemistry/jyim/projects/protein_diffusion/samples/continue_0/05D_01M_2023Y_21h_15m_16s/unconditional/noise_level_10_rf_benchmark',
    dest_dir='/data/rsg/chemistry/jyim/paper/protdiff_icml/noise_level_10_benchmark_alt',
    cutoff=0.4,
)


./maxcluster64bit -l /data/rsg/chemistry/jyim/paper/protdiff_icml/noise_level_10_benchmark_alt/length_70_subset/sc_pdbs.txt -C 2 -in -Rl /data/rsg/chemistry/jyim/paper/protdiff_icml/noise_level_10_benchmark_alt/length_70_subset/all_by_all_lite \-TM -Tm 0.4 > /data/rsg/chemistry/jyim/paper/protdiff_icml/noise_level_10_benchmark_alt/length_70_subset/maxcluster_results.txt
./maxcluster64bit -l /data/rsg/chemistry/jyim/paper/protdiff_icml/noise_level_10_benchmark_alt/length_100_subset/sc_pdbs.txt -C 2 -in -Rl /data/rsg/chemistry/jyim/paper/protdiff_icml/noise_level_10_benchmark_alt/length_100_subset/all_by_all_lite \-TM -Tm 0.4 > /data/rsg/chemistry/jyim/paper/protdiff_icml/noise_level_10_benchmark_alt/length_100_subset/maxcluster_results.txt
./maxcluster64bit -l /data/rsg/chemistry/jyim/paper/protdiff_icml/noise_level_10_benchmark_alt/length_200_subset/sc_pdbs.txt -C 2 -in -Rl /data/rsg/chemistry/jyim/paper/protdiff_icml/noise_level_10_benchmark_alt/length_200_subset/all_by_all_lite \-TM 

# Sample clustering

In [None]:
# For each noise scale: rebuild the flat `designed_samples` directory from the
# per-length sample directories and print the maxcluster command to run on it.
for noise_scale in ['01', '05', '10']:
    print(f'On {noise_scale}')
    # NOTE(review): the placeholder here used to be empty (`{}`), which is a
    # syntax error inside an f-string. `noise_scale` is the only value the loop
    # provides -- confirm the directory name carries no further suffix.
    sample_dir = f'/data/rsg/chemistry/jyim/projects/protein_diffusion/samples/continue_0/05D_01M_2023Y_21h_15m_16s/unconditional/noise_level_{noise_scale}'
    base_dir = f'/data/rsg/chemistry/jyim/paper/protdiff_icml/noise_scale_{noise_scale}_results_alt'
    designed_samples_dir = os.path.join(base_dir, 'designed_samples')
    designed_samples_cluster_dir = os.path.join(base_dir, 'designed_samples_clustering')

    # Start from empty output directories. ignore_errors lets a first run, with
    # nothing to delete yet, proceed; makedirs(exist_ok=False) below still fails
    # loudly if a stale directory could not be removed.
    shutil.rmtree(designed_samples_dir, ignore_errors=True)
    shutil.rmtree(designed_samples_cluster_dir, ignore_errors=True)

    os.makedirs(designed_samples_dir, exist_ok=False)
    os.makedirs(designed_samples_cluster_dir, exist_ok=False)
    fname = os.path.join(designed_samples_dir, 'sc_pdbs.txt')

    with open(fname, 'w') as f:
        # Sorted because os.listdir order is arbitrary; keeps the list file
        # identical between runs.
        for sample_length in sorted(os.listdir(sample_dir)):
            # Entries containing a dot are treated as files and skipped.
            if '.' in sample_length:
                continue
            length_dir = os.path.join(sample_dir, sample_length)
            length = sample_length.split('_')[1]
            for sample_name in sorted(os.listdir(length_dir)):
                if '.' in sample_name:
                    continue
                sample_path = os.path.join(length_dir, sample_name, 'sde_1.pdb')
                sample_id = sample_name.split('_')[1]
                save_path = os.path.join(designed_samples_dir, f'length_{length}_id_{sample_id}.pdb')
                shutil.copy(sample_path, save_path)
                f.write(save_path + '\n')
    # '\\-TM' emits the same literal backslash as before without relying on the
    # invalid escape sequence '\-'.
    print(
        f'./maxcluster64bit -l {fname} -C 2 -in -Rl {designed_samples_cluster_dir}/all_by_all_lite \\-TM -Tm 0.5 > {designed_samples_cluster_dir}/maxcluster_results.txt'
    )

# Read clusters

In [20]:
# Load every benchmark clustering result that exists on disk into
# `all_clusters`, keyed by 'noise_<scale>_length_<L>'. Missing result files
# are skipped silently.
lengths = [70, 100, 200, 300]
noise_scales = ['01', '05', '10']
all_clusters = {}
for noise in noise_scales:
    for length in lengths:
        cluster_name = f'noise_{noise}_length_{length}'
        # Despite its name, `cluster_dir` is the path of the results file.
        cluster_dir = (
            '/data/rsg/chemistry/jyim/paper/protdiff_icml/'
            f'noise_level_{noise}_benchmark_alt/length_{length}_subset/maxcluster_results.txt'
        )
        if os.path.exists(cluster_dir):
            print(f'Reading {cluster_name}')
            all_clusters[cluster_name] = read_clusters(cluster_dir)
    # Blank line between noise scales in the cell output.
    print()


Reading noise_01_length_70
Number of clusters 45
Reading noise_01_length_100
Number of clusters 23
Reading noise_01_length_200
Number of clusters 50
Reading noise_01_length_300
Number of clusters 46

Reading noise_05_length_70
Number of clusters 42
Reading noise_05_length_100
Number of clusters 12
Reading noise_05_length_200
Number of clusters 54
Reading noise_05_length_300
Number of clusters 42

Reading noise_10_length_70
Number of clusters 55
Reading noise_10_length_100
Number of clusters 19
Reading noise_10_length_200
Number of clusters 64
Reading noise_10_length_300
Number of clusters 49



In [57]:
# Noise levels
# The plain `noise_scale_01_results` run is not loaded; the
# `noise_level_01_seqs_100_results` run is read in its place.
def _designed_sample_clusters(run_name):
    """Read the designed-sample clustering results of one run directory."""
    return read_clusters(
        f'/data/rsg/chemistry/jyim/paper/protdiff_icml/{run_name}/designed_samples_clustering/maxcluster_results.txt'
    )


noise_scale_01_seq_100_cluster = _designed_sample_clusters('noise_level_01_seqs_100_results')
noise_scale_05_cluster = _designed_sample_clusters('noise_scale_05_results')
noise_scale_10_cluster = _designed_sample_clusters('noise_scale_10_results')


Number of clusters 484
Number of clusters 503
Number of clusters 485


In [7]:
# Subsets: one clustering result per (noise level, length) pair.
def _subset_clusters(noise, length):
    """Read the subset clustering results for one noise level and length."""
    return read_clusters(
        f'/data/rsg/chemistry/jyim/paper/protdiff_icml/noise_level_{noise}_subset_samples/length_{length}_subset/maxcluster_results.txt'
    )


noise_10_length_70_cluster = _subset_clusters('10', 70)
noise_10_length_100_cluster = _subset_clusters('10', 100)
noise_10_length_200_cluster = _subset_clusters('10', 200)
noise_10_length_300_cluster = _subset_clusters('10', 300)
print()
noise_05_length_70_cluster = _subset_clusters('05', 70)
noise_05_length_100_cluster = _subset_clusters('05', 100)
noise_05_length_200_cluster = _subset_clusters('05', 200)
noise_05_length_300_cluster = _subset_clusters('05', 300)
print()
noise_01_length_70_cluster = _subset_clusters('01', 70)
noise_01_length_100_cluster = _subset_clusters('01', 100)
noise_01_length_200_cluster = _subset_clusters('01', 200)
noise_01_length_300_cluster = _subset_clusters('01', 300)


Number of clusters 90
Number of clusters 81
Number of clusters 86
Number of clusters 80

Number of clusters 67
Number of clusters 52
Number of clusters 67
Number of clusters 52

Number of clusters 72
Number of clusters 59
Number of clusters 49
Number of clusters 62


# Process samples for clustering

In [12]:
# Flatten the noise-0.5 diversity samples and print the maxcluster command.
src_dir = (
    '/data/rsg/chemistry/jyim/projects/protein_diffusion/samples/unconditional_512_0/'
    '27D_11M_2022Y_09h_58m_16s/unconditional/noise_level_05_rf_diversity'
)
dest_dir = '/data/rsg/chemistry/jyim/paper/protdiff_icml/noise_level_05_rf_diversity'
write_samples(src_dir=src_dir, dest_dir=dest_dir)

./maxcluster64bit -l /data/rsg/chemistry/jyim/paper/protdiff_icml/noise_level_05_rf_diversity/designed_samples/sc_pdbs.txt -C 2 -in -Rl /data/rsg/chemistry/jyim/paper/protdiff_icml/noise_level_05_rf_diversity/designed_samples_clustering/all_by_all_lite \-TM -Tm 0.6 > /data/rsg/chemistry/jyim/paper/protdiff_icml/noise_level_05_rf_diversity/designed_samples_clustering/maxcluster_results.txt


# Read number of clusters in RFdiffusion

In [31]:
# Load the S2I analysis table; the path is relative to the notebook's working
# directory.
s2i_df = pd.read_csv('../S2I_analysis.csv')

In [54]:
# Keep only the rows whose `problem` is one of the four `uncond_<N>` problems.
uncond_df = s2i_df[
    s2i_df['problem'].isin([f'uncond_{n}' for n in (70, 100, 200, 300)])
]
# `subset_df` is an alias of the same frame, not a copy.
subset_df = uncond_df

In [56]:
# Print, for every (len, noise) combination, how many distinct `cluster_06`
# values occur. `len_group` is therefore a (len, noise) tuple; a missing value
# in `cluster_06` would count as one extra distinct value.
for len_group, len_df in subset_df.groupby(['len', 'noise']):
    print(len_group, len(len_df['cluster_06'].unique()))

(70.0, 0.0) 16
(70.0, 0.5) 26
(70.0, 1.0) 34
(100.0, 0.0) 26
(100.0, 0.5) 35
(100.0, 1.0) 65
(200.0, 0.0) 29
(200.0, 0.5) 65
(200.0, 1.0) 83
(300.0, 0.0) 17
(300.0, 0.5) 67
(300.0, 1.0) 91


In [10]:
# Column-wise accumulator for the cluster-count summary: every key maps to a
# list, and the lists must grow in lockstep (one entry per recorded result).
# The third column is 'Length' -- the cell that fills this dict appends to it.
centroid_df = {
    'Number of clusters': [],
    'Noise scale': [],
    'Length': [],
    'Method': [],
}

In [None]:
# Record the FrameDiff cluster counts at noise scale 1.0, one row per length.
# Every row appends to all four lists so the columns stay the same length.
# NOTE(review): previously only the first entry set 'Length' (to 1.0) and the
# third entry repeated the length-70 frame; lengths 70/100/200 are assumed
# here -- confirm, and add length 300 if it belongs in the table.
for _cluster_frame, _length in (
    (noise_10_length_70_cluster, 70),
    (noise_10_length_100_cluster, 100),
    (noise_10_length_200_cluster, 200),
):
    centroid_df['Number of clusters'].append(_cluster_frame.shape[0])
    centroid_df['Noise scale'].append(1.0)
    centroid_df['Length'].append(_length)
    centroid_df['Method'].append('FrameDiff')

In [22]:
# Count how many rows carry each distinct `denoiser.noise_scale_frame` value.
s2i_df['denoiser.noise_scale_frame'].value_counts()

1    21600
Name: denoiser.noise_scale_frame, dtype: int64

In [23]:
# Count how many rows carry each distinct `denoiser.noise_scale_ca` value.
# NOTE(review): `s2f_df` is not defined in any cell visible in this notebook
# (only `s2i_df` is loaded) -- it must come from a deleted cell or an earlier
# kernel session; confirm its source before relying on this cell.
s2f_df['denoiser.noise_scale_ca'].value_counts()

1    21600
Name: denoiser.noise_scale_ca, dtype: int64

In [6]:
# Keep only the rows whose `len` is 70, 100, 200 or 300.
# NOTE(review): depends on `s2f_df`, which no visible cell defines.
relevant_lengths_df = s2f_df[s2f_df.len.isin([70,100,200,300])]

In [7]:
# Draw 100 rows per `len` group. The fixed random_state makes the draw -- and
# every number computed from it -- repeatable across kernel restarts.
# Note: this rebinds `subset_df`, which an earlier cell set to `uncond_df`.
subset_df = relevant_lengths_df.groupby('len').sample(100, random_state=0)

In [None]:
# Display the `model` attribute of `subset_df` (presumably a column -- TODO
# confirm). NOTE(review): this cell has no execution count, and the output
# stored beneath it looks like value_counts() results from other cells.
subset_df.model

1    21600
Name: denoiser.noise_scale_frame, dtype: int64

1    21600
Name: denoiser.noise_scale_ca, dtype: int64

In [13]:
# Print, for every `len` group, how many distinct `tm_cluster_0.60` values
# occur. Which rows `subset_df` holds depends on which cell last assigned it.
for len_group, len_df in subset_df.groupby('len'):
    print(len_group, len(len_df['tm_cluster_0.60'].unique()))

70 52
100 55
200 59
300 61


In [29]:
# Load the CATH metadata table, keep the rows marked 'monomeric', then keep the
# rows of those assigned to the 'test' split.
cath_csv = pd.read_csv('/data/rsg/chemistry/jyim/large_data/cath/metadata.csv')
monomer_csv = cath_csv.loc[cath_csv['oligomeric_detail'].isin(['monomeric'])]
test_csv = monomer_csv.loc[monomer_csv['cath_split'] == 'test']

In [31]:
# Write the monomeric test split to the current working directory; index=False
# keeps the row index out of the file.
test_csv.to_csv('monomer_test_cath.csv', index=False)

In [33]:
# Display the test split ordered by `modeled_seq_len`, ascending. The sorted
# frame is only shown; `test_csv` itself is left unchanged.
test_csv.sort_values('modeled_seq_len')

Unnamed: 0.1,Unnamed: 0,chain_name,cath_code,cath_split,processed_path,raw_path,oligomeric_count,oligomeric_detail,resolution,structure_method,num_chains,seq_len,modeled_seq_len,coil_percent,helix_percent,strand_percent,radius_gyration,split
44,44,1waz.A,['1.10.287'],test,/data/rsg/chemistry/jyim/large_data/cath/wa/1w...,/data/rsg/chemistry/jyim/large_data/pdb/30_08_...,1,monomeric,0.00,solution nmr,1,46,46,0.326087,0.673913,0.000000,1.286347,test
161,161,2uux.A,['4.10.410'],test,/data/rsg/chemistry/jyim/large_data/cath/uu/2u...,/data/rsg/chemistry/jyim/large_data/pdb/30_08_...,1,monomeric,1.40,x-ray diffraction,1,135,55,0.672727,0.090909,0.236364,1.153543,test
329,329,1vib.A,['1.10.287'],test,/data/rsg/chemistry/jyim/large_data/cath/vi/1v...,/data/rsg/chemistry/jyim/large_data/pdb/30_08_...,1,monomeric,0.00,solution nmr,1,55,55,0.436364,0.563636,0.000000,1.333609,test
274,274,1gyz.A,['1.10.1900'],test,/data/rsg/chemistry/jyim/large_data/cath/gy/1g...,/data/rsg/chemistry/jyim/large_data/pdb/30_08_...,1,monomeric,0.00,solution nmr,1,60,60,0.350000,0.650000,0.000000,1.064580,test
388,388,1i2t.A,['1.10.1900'],test,/data/rsg/chemistry/jyim/large_data/cath/i2/1i...,/data/rsg/chemistry/jyim/large_data/pdb/30_08_...,1,monomeric,1.04,x-ray diffraction,1,191,61,0.131148,0.868852,0.000000,1.188198,test
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
343,343,4amm.A,['3.40.366'],test,/data/rsg/chemistry/jyim/large_data/cath/am/4a...,/data/rsg/chemistry/jyim/large_data/pdb/30_08_...,1,monomeric,1.40,x-ray diffraction,1,715,384,0.375000,0.447917,0.177083,2.256468,test
642,642,3oc9.A,['3.90.550'],test,/data/rsg/chemistry/jyim/large_data/cath/oc/3o...,/data/rsg/chemistry/jyim/large_data/pdb/30_08_...,1,monomeric,1.80,x-ray diffraction,1,782,400,0.430000,0.310000,0.260000,2.239021,test
585,585,3qcp.A,['3.40.30'],test,/data/rsg/chemistry/jyim/large_data/cath/qc/3q...,/data/rsg/chemistry/jyim/large_data/pdb/30_08_...,1,monomeric,2.30,x-ray diffraction,1,596,410,0.458537,0.465854,0.075610,2.341498,test
768,768,4cit.A,['1.10.606'],test,/data/rsg/chemistry/jyim/large_data/cath/ci/4c...,/data/rsg/chemistry/jyim/large_data/pdb/30_08_...,1,monomeric,1.80,x-ray diffraction,1,723,410,0.397561,0.578049,0.024390,2.243544,test
