In [1]:
import pandas as pd
import numpy as np
import random
from random import sample, randrange
from collections import Counter,defaultdict
from statistics import mean

import matplotlib as mpl
from matplotlib import cm,style
from matplotlib.animation import FuncAnimation
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from IPython import display
import seaborn as sns

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, scale
from sklearn.model_selection import train_test_split,GridSearchCV,KFold, cross_val_score
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report,roc_curve, roc_auc_score, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer

import scipy.cluster.hierarchy as hc
from scipy.cluster.hierarchy import linkage,dendrogram
from scipy.spatial.distance import squareform,pdist

import pyrepseq as rs
import pyrepseq.plotting as rsp
import pyrepseq.distance as rsd
import pyrepseq.stats as rss

from Levenshtein import hamming as hamming_distance
from Levenshtein import distance as levenshtein_distance

import warnings
warnings.simplefilter('ignore')

## KNN

In [2]:
def find_neighbours(x_train, y_train, x_test, k, distance_metric = levenshtein_distance):
    '''
    Ranks every training sequence by its distance to one test sequence
    and keeps the k closest ones.
    Inputs:
        - x_train: training sequences (indexable by 0..len-1)
        - y_train: epitopes, aligned with x_train
        - x_test: the single test sequence
        - k: number of neighbours to keep
        - distance_metric: callable(test_seq, train_seq) -> distance
          (default = Levenshtein)
    Returns:
        - list of at most k 3-tuples (sequence, distance, epitope),
          ordered by ascending distance; ties keep training order
    '''
    scored = []
    for idx in range(len(x_train)):
        train_seq = x_train[idx]
        scored.append((train_seq, distance_metric(x_test, train_seq), y_train[idx]))

    # sorted() is stable, so equally distant sequences stay in training order
    ranked = sorted(scored, key=lambda entry: entry[1])
    return ranked[:k]

def majority_vote(neighbours):
    '''
    Finds the majority class of k-nearest neighbours (unweighted vote)
    Input:
        - neighbours: non-empty list of 3-tuples (sequence, distance, epitope)
    Returns:
        - maj: majority class; on a tie, the class that appears first
          in neighbours wins
        - maj_prob: fraction of the votes that went to the majority class
    Raises:
        - ValueError: if neighbours is empty
    '''
    if len(neighbours) == 0:
        raise ValueError("majority_vote needs at least one neighbour")

    # Each neighbour contributes one vote for its epitope. A distance-weighted
    # variant would add 1 / (dist**2 + 1) per neighbour instead of 1.
    class_counter = Counter(neighbour[2] for neighbour in neighbours)

    maj, maj_votes = class_counter.most_common(1)[0]
    maj_prob = maj_votes / sum(class_counter.values())

    return maj, maj_prob

def KNN(x_train, x_test, y_train, y_test, k = 9, 
        distance_metric=levenshtein_distance):
    '''
    K-NN classification of every sequence in x_test, scored against y_test.
    Inputs:
        - x_train, y_train: training sequences and their epitopes
        - x_test, y_test: testing sequences and their true epitopes
        - k: number of nearest neighbours that vote
        - distance_metric: distance used to rank the neighbours
    Returns:
        - accuracy: fraction of test epitopes predicted correctly
          (sklearn accuracy_score)
        - y_pred: list of predicted epitopes, in x_test order
    '''
    # majority_vote returns (epitope, vote share); only the epitope is kept
    y_pred = [
        majority_vote(find_neighbours(x_train, y_train, test_seq, k, distance_metric))[0]
        for test_seq in x_test
    ]

    return (accuracy_score(y_test, y_pred), y_pred)

In [3]:
def cross_validation_split(dataset, test_rat = 0.2):
    '''
    Randomly partitions the rows of a dataset into equally sized folds.
    Inputs:
        - dataset: pandas DataFrame, one sample per row
        - test_rat: fraction of the data in each fold; the number of
          folds is int(1/test_rat)
    Returns:
        - list of int(1/test_rat) numpy arrays, each holding
          int(len(dataset)/n_folds) rows drawn without replacement.
          Rows left over when the division is not exact are not used.
    '''
    n_folds = int(1/test_rat)
    fold_size = int(len(dataset) / n_folds)

    # Sample row *positions* rather than index labels so that datasets with
    # duplicated index labels are split correctly. For a unique index this
    # makes exactly the same randrange() draws as label-based dropping would.
    remaining = list(range(len(dataset)))
    dataset_split = []

    for _ in range(n_folds):
        fold = []
        while len(fold) < fold_size:
            position = remaining.pop(randrange(len(remaining)))  # pop avoids re-selection
            fold.append(dataset.iloc[position].values.tolist())
        dataset_split.append(np.asarray(fold))

    return dataset_split

def kfoldCV(dataset, k, test_rat = 0.2):
    '''
    Find cross-validated accuracy of K-NN classifier
    Inputs:
        - dataset: DataFrame whose first column holds the sequences and
          whose second column holds the epitopes
        - k: number of neighbours
        - test_rat: fraction of the data per fold; int(1/test_rat) folds
          are produced by cross_validation_split
    Returns:
        - kfold_acc: accuracies from each fold
        - mean_acc: mean accuracy between all folds
    Raises:
        - ValueError: if fewer than two folds are produced
    '''
    split_data = cross_validation_split(dataset, test_rat = test_rat)
    n_folds = len(split_data)
    if n_folds < 2:
        raise ValueError("kfoldCV needs at least two folds; use test_rat <= 0.5")

    kfold_acc = []

    # evaluate K-NN performance for each fold
    for i, test_fold in enumerate(split_data):
        # every fold except the i-th one forms the training set
        cv_train = np.concatenate(
            [fold for j, fold in enumerate(split_data) if j != i], axis=0)

        # 1-D columns: sequences must reach the distance metric as plain
        # strings, and the labels must come from the same fold as the inputs
        x_train, y_train = cv_train[:, 0], cv_train[:, 1]
        x_test, y_test = test_fold[:, 0], test_fold[:, 1]

        accuracy, _ = KNN(x_train, x_test, y_train, y_test, k)
        kfold_acc.append(accuracy)

    mean_acc = mean(kfold_acc) # average accuracy between all k-folds

    return kfold_acc, mean_acc

In [4]:
def plot_KNN_accs(two_seqs, test_neighbours_no, cv_show = False, n_iter = 1, 
                  test_rat = 0.2, rand_state = 1):
    '''
    Evaluates the K-NN classifier for several values of k and plots
    accuracy against k.
    Inputs:
        - two_seqs: DataFrame with 'CDR3', 'Epitope' and 'Chain Length' columns
        - test_neighbours_no: k values to evaluate (iterated twice, so it
          must be re-iterable, e.g. a list)
        - cv_show: if True, also compute and plot the accuracy from kfoldCV
        - n_iter: not used by this function
        - test_rat: test fraction given to train_test_split
        - rand_state: random_state given to train_test_split
    Prints the accuracy for each k and shows the figure; returns nothing.
    '''
    seq_train, seq_test, epi_train, epi_test = (
        part.to_numpy()
        for part in train_test_split(two_seqs['CDR3'], two_seqs['Epitope'],
                                     test_size = test_rat, random_state = rand_state)
    )

    n_folds = int(1/test_rat)  # not used below
    cv_dataset = two_seqs.drop(columns=['Chain Length'])

    knn_tts_accs, cv_accs = [], []

    for k in test_neighbours_no:
        train_test_split_acc, _ = KNN(seq_train, seq_test, epi_train, epi_test, k)
        knn_tts_accs.append(train_test_split_acc)
        print(f"k={k} \t\t train_test_split accuracy = {train_test_split_acc:.6f}")

        if not cv_show:
            continue
        _, cv_acc = kfoldCV(cv_dataset, k=k)
        cv_accs.append(cv_acc)
        print(f"CV accuracy = {cv_acc:.6f}")

    epitope_counts = two_seqs.value_counts('Epitope')
    epitope_list = ', '.join(epitope_counts.index.tolist())
    # baseline: uniform guessing among the distinct epitopes
    random_accuracy = 1/epitope_counts.size

    plt.figure()
    plt.grid(alpha=0.2)
    plt.title(f"K-NN Classifier for {epitope_list} epitopes")
    plt.xlabel("k")
    plt.ylabel("Accuracy")
    plt.plot(test_neighbours_no, knn_tts_accs, '.-', label='train test split', linewidth=1.2)
    plt.axhline(y = random_accuracy, color = 'r', linestyle = '--', label = 'random classifier', alpha = 0.4)
    if cv_show:
        plt.plot(test_neighbours_no, cv_accs, '.-', label='cross-validation', linewidth=1.2)
    plt.legend()
    plt.show()

## Alignment

In [5]:
def _pad_both_sides(seq, width):
    """Returns (seq padded with '-' on the right, seq padded with '-' on the left) up to width."""
    gap = '-' * max(0, int(width) - len(seq))
    return seq + gap, gap + seq


def lr_align(sequences,showmaxlength=True, set_maxlength = 25, global_maxlength = 19, include_local = False): 
    """
    Pads CDR3 sequences with '-' to a common length, once with the gaps
    appended on the right ('lalg') and once with the gaps prepended on
    the left ('ralg').

    Parameters
    ----------
    sequences : DataFrame with 'CDR3', 'Epitope' and 'Chain Length' columns
    showmaxlength : print the maximum chain length before and after filtering
    set_maxlength : rows whose 'Chain Length' exceeds this value are discarded
    global_maxlength : target length of the 'Glob lalg' / 'Glob ralg' columns;
        sequences already at least this long are left unpadded (not truncated)
    include_local : also pad to the longest remaining 'Chain Length' and add
        the 'Loc lalg', 'Loc ralg' and 'Loc alg length' columns

    Returns
    -------
    DataFrame with columns 'Epitope', 'Old length', 'Glob lalg', 'Glob ralg',
    'Glob alg length' (plus the 'Loc ...' columns when include_local is True).
    """
    oldmaxlength = sequences['Chain Length'].max()
    # boolean mask instead of drop-by-label, so duplicated index labels are safe
    new_seqs = sequences[~(sequences['Chain Length'] > set_maxlength)]
    newmaxlength = new_seqs['Chain Length'].max()

    if showmaxlength:
        print(f"Old max. length = {oldmaxlength}, new local max. length = {newmaxlength}.")

    cdr3_seqs = list(new_seqs['CDR3'])

    # Global and local padding are computed independently from the raw
    # sequence, so enabling the local alignment cannot shorten the global one.
    glob_pairs = [_pad_both_sides(seq, global_maxlength) for seq in cdr3_seqs]

    d = {'Epitope':new_seqs['Epitope'],
         'Old length':new_seqs['Chain Length'],
         'Glob lalg': [left for left, _ in glob_pairs],
         'Glob ralg': [right for _, right in glob_pairs],
         'Glob alg length': global_maxlength
        }

    if include_local:
        loc_pairs = [_pad_both_sides(seq, newmaxlength) for seq in cdr3_seqs]
        d['Loc lalg'] = [left for left, _ in loc_pairs]
        d['Loc ralg'] = [right for _, right in loc_pairs]
        d['Loc alg length'] = newmaxlength

    return pd.DataFrame(data=d)


## Clustering

In [6]:
def similarity_clustermap(df, dist_thres=6,epitope_name = 'epitope', alpha_column='cdr3a', beta_column='cdr3b',
                          norm=None,
                          linkage_kws=dict(method='average', optimal_ordering=True),
                          #cluster_kws=dict(t=6, criterion='distance'),
                          cbar_kws=dict(label='Sequence Distance',
                              format='%d', orientation='horizontal'),
                          meta_columns=None,
                          meta_to_colors=None,
                          **kws):
    """
    Plots a sequence-similarity clustermap.

    Parameters
    ----------
    df : pandas DataFrame with data
    dist_thres : distance threshold used to cut the linkage into flat clusters
        (fcluster with criterion='distance'); also the upper bound of the
        default discrete colour scale
    epitope_name : text written onto the heatmap
    alpha_column, beta_column: column name with alpha and beta amino acid information
    norm: function to normalize distances; if None a BoundaryNorm with unit
        steps from 0 to dist_thres is used
    linkage_kws: keyword arguments for linkage algorithm
    cbar_kws: keyword arguments for colorbar
    meta_columns: list-like or dict
        metadata to plot alongside the cluster assignment; for a dict the
        keys are column names and the values are the displayed names
    meta_to_colors: list-like
        list of functions mapping metadata labels to colors
        first element of list is for clusters
    kws: keyword arguments passed on to the clustermap.

    Returns
    -------
    distances : condensed distance vector (alpha distances + beta distances)
    cg : the object returned by rsp.clustermap_split
    linkage : linkage matrix
    cluster : flat cluster assignment from fcluster
    """

    print("Distance threshold: ",dist_thres)

    # The defaults are dict objects shared between calls; work on copies so
    # that neither they nor dicts supplied by the caller are modified.
    linkage_kws = dict(linkage_kws)
    cbar_kws = dict(cbar_kws)

    if meta_to_colors is None:
        n_mappers = 1 if meta_columns is None else len(meta_columns) + 1
        meta_to_colors = [rsp.labels_to_colors_hls] * n_mappers

    sequences_alpha = df[alpha_column]
    sequences_beta = df[beta_column]

    distances_alpha = rsd.pdist(sequences_alpha)
    distances_beta = rsd.pdist(sequences_beta)
    distances = distances_alpha + distances_beta
    linkage = hc.linkage(distances, **linkage_kws)
    cluster = hc.fcluster(linkage, t=dist_thres, criterion='distance')

    # reversed viridis
    cmap = plt.cm.viridis
    cmaplist = [cmap(i) for i in range(cmap.N)]
    cmap = mpl.colors.LinearSegmentedColormap.from_list(
        'Custom cmap', list(reversed(cmaplist)), cmap.N)

    # Remember whether the default norm is in use: `norm` is overwritten just
    # below, so testing `norm is None` after that point could never succeed.
    uses_default_norm = norm is None
    if uses_default_norm:
        bounds = np.arange(0, dist_thres+1, 1)
        norm = mpl.colors.BoundaryNorm(bounds, cmap.N)
        # plot tick in the middle of the discretized colormap
        cbar_kws.update(dict(ticks=bounds[:-1]+0.5))

    cluster_colors = pd.Series(meta_to_colors[0](cluster, min_count=2),
                               name='Cluster')
    if meta_columns is not None:
        if isinstance(meta_columns, dict):
            display_names = meta_columns
        else:
            display_names = {col: col for col in meta_columns}
        meta_colors = [pd.Series(mapper(df[col]), name=display_names[col])
                       for col, mapper in zip(meta_columns, meta_to_colors[1:])]
        row_colors = pd.concat([cluster_colors] + meta_colors, axis=1)
    else:
        row_colors = cluster_colors

    # default clustermap kws
    clustermap_kws = dict(cbar_kws=cbar_kws,
                          dendrogram_ratio=0.12, colors_ratio=0.04,
                          cbar_pos=(0.38, .99, .4, .02),
                          rasterized=True, figsize=(4.2, 4.2),
                          xticklabels=[], yticklabels=[],
                          )
    clustermap_kws.update(kws)

    cg = rsp.clustermap_split(pd.DataFrame(squareform(distances_alpha)),
                          pd.DataFrame(squareform(distances_beta)),
                          row_linkage=linkage, col_linkage=linkage,
                          cmap=cmap, norm=norm,
                          row_colors=row_colors,
                          **clustermap_kws)

    cax = getattr(cg, 'cax', None)
    if uses_default_norm and cax is not None and len(bounds) > 1:
        # label each colour bin with its lower bound; the last bin is open-ended
        cbar_labels = [str(b) for b in bounds[:-1]]
        cbar_labels[-1] = '>' + cbar_labels[-1]
        if cbar_kws.get('orientation', 'vertical') == 'horizontal':
            cax.set_xticklabels(cbar_labels)
        else:
            cax.set_yticklabels(cbar_labels)

    # NOTE(review): both axes are labelled CDR3 beta although the two halves
    # are built from alpha_column and beta_column - confirm this is intended.
    cg.ax_heatmap.set_xlabel(r'CDR3$\beta$ Sequence')
    cg.ax_heatmap.set_ylabel(r'CDR3$\beta$ Sequence')

    cg.ax_heatmap.text(len(df)*0.6,len(df)*0.15,f'{epitope_name}', fontsize=12,color='white')
    cg.ax_heatmap.text(len(df)*0.63,len(df)*0.23,f'threshold: {dist_thres}', fontsize=10,color='white')
    cg.ax_col_dendrogram.set_visible(False)

    return distances, cg, linkage, cluster

## Probability matrices

In [7]:
def pos_count(sequences, 
              default_pseudocount = 0.2,
              states_alph = ["-","A","C","D","E","F","G","H","I","K","L","M","N","P","Q","R","S","T","V","W","Y"]
             ):
    """
    Creates frequency/probability matrix of AAs at each position from a set of sequences.
    Includes a frequency pseudocount set by 'default_pseudocount'.

    Parameters
    ----------
    sequences : iterable of strings (normally aligned to a common length)
    default_pseudocount : value added to every cell of the count matrix
    states_alph : symbols that form the rows of the matrices, in this order;
        symbols occurring in the sequences but missing from states_alph
        are left out

    Returns
    -------
    new_freq_df : DataFrame (rows = states_alph, columns = positions 0..L-1)
        with counts + pseudocount
    prob_df : new_freq_df with every column normalised to sum to 1
    """
    # one row per sequence, one column per position; shorter sequences are
    # filled with missing values, which value_counts ignores
    per_position = pd.DataFrame([list(seq) for seq in sequences])
    freq_df = per_position.apply(pd.Series.value_counts)

    # Always lay the rows out on the full alphabet so the result has the same
    # shape whether or not every symbol happens to occur in the sequences.
    new_freq_df = freq_df.reindex(states_alph).fillna(0) + default_pseudocount

    prob_df = new_freq_df.apply(lambda column: column.div(column.sum()))

    return new_freq_df, prob_df

def clusters_prob_mat(clustered_seqs, alignment = 'Glob lalg', default_pseudocount = 0.2):
    """
    Builds one probability matrix per cluster.

    clustered_seqs is indexed by 'Group 0', 'Group 1', ... (one entry per
    key); for each group the sequences found under `alignment` are passed
    to pos_count and the resulting probability matrix is stored under the
    same 'Group n' key of the returned dictionary.
    """
    group_labels = [f'Group {n}' for n in range(len(clustered_seqs.keys()))]

    # pos_count returns (frequencies, probabilities); only the latter is kept
    return {
        label: pos_count(clustered_seqs[label][alignment],
                         default_pseudocount = default_pseudocount)[1]
        for label in group_labels
    }

def make_probability_dict(variables, lDict,rDict, 
                          epitope_names = ['YLQPRTFLL','GILGFVFTL'], var_name = 'Threshold',
                          default_pseudocount = 0.2, default_threshold = 7):
    
    """
    Builds nested dictionaries of cluster probability matrices for the
    clusters in lDict and rDict, one entry per tested variable value.
    Inputs:
        - variables: values of the variable being scanned
        - lDict, rDict: clustered sequences, indexed as
          [f'Threshold {t}'][epitope]
        - epitope_names: epitopes to process
        - var_name: 'Threshold' -> output keys 'Threshold {value}', clusters
          read from 'Threshold {value}', pseudocount = default_pseudocount;
          'Pseudocount' -> output keys 'Pseudocount {value}', clusters read
          from 'Threshold {default_threshold}', pseudocount = value.
          Any other name yields two empty dictionaries.
    Returns:
        - (lprob_Dict, rprob_Dict), each indexed as [output key][epitope]
          and holding the result of clusters_prob_mat
    """
    
    lprob_Dict, rprob_Dict = {}, {}
    
    if var_name != 'Threshold' and var_name != 'Pseudocount':
        return(lprob_Dict, rprob_Dict)
    
    scanning_threshold = (var_name == 'Threshold')
    
    for value in variables:
        if scanning_threshold:
            out_key = f"Threshold {value}"
            source_key = f'Threshold {value}'
            pseudocount = default_pseudocount
        else:
            out_key = f"Pseudocount {value}"
            source_key = f'Threshold {default_threshold}'
            pseudocount = value
        
        left_entry, right_entry = {}, {}
        lprob_Dict[out_key] = left_entry
        rprob_Dict[out_key] = right_entry
        
        for ep in epitope_names:
            lalg, ralg = lDict[source_key][ep], rDict[source_key][ep]
            
            left_entry[ep] = clusters_prob_mat(clustered_seqs = lalg, alignment = 'Glob lalg', default_pseudocount = pseudocount)
            right_entry[ep] = clusters_prob_mat(clustered_seqs = ralg, alignment = 'Glob ralg', default_pseudocount = pseudocount)
    
    return(lprob_Dict, rprob_Dict)


def seq_prob(seq_prob, test_seq):
    """
    Probability of test_seq under a position-specific probability matrix.

    seq_prob is looked up as seq_prob.loc[symbol, position] for every
    symbol of test_seq; the result is the product of those entries
    (1.0 for an empty sequence).
    """
    per_position = [
        seq_prob.loc[residue, position]
        for position, residue in enumerate(test_seq)
    ]
    return np.prod(per_position)


## Probability Model

In [8]:
def _best_group(prob_mats, n_groups, test_seq):
    """Scores test_seq against 'Group 0'..'Group n_groups-1'; returns (highest probability, name of the first group reaching it)."""
    probs = [seq_prob(prob_mats[f'Group {n}'], test_seq) for n in range(n_groups)]
    if not probs:
        raise ValueError("no cluster probability matrices to score against")
    best = max(range(n_groups), key=probs.__getitem__)  # first maximum wins
    return probs[best], f'Group {best}'


def _pick_epitope(prob_1, prob_2, epitope_names):
    """Returns the epitope with the larger probability; chooses at random when they are equal."""
    if prob_1 > prob_2:
        return epitope_names[0]
    if prob_1 < prob_2:
        return epitope_names[1]
    return random.choice(epitope_names)


def classify(variable, test_seqs, 
             lprobs, rprobs, ep1_name = 'YLQPRTFLL',ep2_name ='GILGFVFTL',
             var_name = 'Threshold',
             default_pseudocount = 0.2, default_threshold = 7,
             lalg_dict = None, ralg_dict = None):
    """
    Predicts the epitope specificity of each sequence in test_seqs based on the maximum
    probability from all clusters from each epitope. 
        - variable: value of variable being tested (t,pseudocount)
        - test_seqs: DataFrame with 'Epitope', 'Glob lalg' and 'Glob ralg'
          columns; rows are read by position, so any index is accepted
        - lprobs: dict of left padded probability matrices.
        - rprobs: dict of right padded probability matrices
        - ep1_name, ep2_name: the two epitopes to choose between
        - var_name: 'Threshold' or 'Pseudocount'; selects which entry of
          lprobs/rprobs is used
        - default_pseudocount: not used by this function
        - default_threshold: threshold whose clusters are used to look up
          cluster sizes when var_name is 'Pseudocount'
        - lalg_dict, ralg_dict: clustered left/right padded sequences, indexed
          as [f'Threshold {t}'][epitope]['Group n']; len() of the winning
          group is reported as its size. When omitted, the notebook-level
          variables lalg_Dict / ralg_Dict are used.
        
    Returns (lclustersizes1, rclustersizes1, lclustersizes2, rclustersizes2,
    left_accuracy, right_accuracy): per test sequence, the size of the best
    scoring cluster of each epitope for the left/right padding, followed by
    the accuracies of the left/right padded sequences.
    
    Raises ValueError for an unknown var_name or an empty test_seqs.
    """
    
    if var_name == 'Threshold': 
        prob_key = f'Threshold {variable}'
        size_key = f'Threshold {variable}'
    elif var_name == 'Pseudocount':
        prob_key = f'Pseudocount {variable}'
        size_key = f'Threshold {default_threshold}'
    else:
        raise ValueError(f"var_name must be 'Threshold' or 'Pseudocount', got {var_name!r}")
    
    n_test = len(test_seqs.index)
    if n_test == 0:
        raise ValueError("test_seqs is empty")
    
    # fall back to the notebook globals the function historically relied on
    if lalg_dict is None:
        lalg_dict = lalg_Dict
    if ralg_dict is None:
        ralg_dict = ralg_Dict
    
    lprob_dict = lprobs[prob_key]
    rprob_dict = rprobs[prob_key]
    
    epitope_names = [ep1_name, ep2_name]
    # only groups available for both paddings are scored
    n_groups = {ep: min(len(lprob_dict[ep]), len(rprob_dict[ep])) for ep in epitope_names}
    
    lcorrect_count,rcorrect_count = 0,0
    lclustersizes1,rclustersizes1,lclustersizes2,rclustersizes2 = ([] for i in range(4))
    size_lists = [(lclustersizes1, rclustersizes1), (lclustersizes2, rclustersizes2)]
    
    for k in range(n_test):
        # positional access throughout, so the sequence and its true epitope
        # always come from the same row regardless of the DataFrame index
        row = test_seqs.iloc[k]
        true_ep = row['Epitope']
        l_test = row['Glob lalg']
        r_test = row['Glob ralg']
        
        print(f"Seq {k} ({true_ep}): lalg = {l_test}, ralg = {r_test}")
        
        lmax_probs, rmax_probs = [], []
        for ep_name, (lsizes, rsizes) in zip(epitope_names, size_lists):
            lmax_prob, lmaxgroup = _best_group(lprob_dict[ep_name], n_groups[ep_name], l_test)
            rmax_prob, rmaxgroup = _best_group(rprob_dict[ep_name], n_groups[ep_name], r_test)
            
            lclustersize = len(lalg_dict[size_key][ep_name][lmaxgroup])
            rclustersize = len(ralg_dict[size_key][ep_name][rmaxgroup])
            lsizes.append(lclustersize)
            rsizes.append(rclustersize)
            
            print(f"{ep_name:.3}: (left) max prob = {lmax_prob:.3} from {lmaxgroup} ({lclustersize} seqs), \t(right) max prob = {rmax_prob:.3} from {rmaxgroup} ({rclustersize} seqs)")
            
            lmax_probs.append(lmax_prob)
            rmax_probs.append(rmax_prob)
        
        lpred_epitope = _pick_epitope(lmax_probs[0], lmax_probs[1], epitope_names)
        rpred_epitope = _pick_epitope(rmax_probs[0], rmax_probs[1], epitope_names)
        
        print("(L) Predicted:", lpred_epitope, "\t(R) Predicted:", rpred_epitope)
        print()
        
        if lpred_epitope == true_ep: lcorrect_count+=1
        if rpred_epitope == true_ep: rcorrect_count+=1
    
    left_accuracy = lcorrect_count/n_test
    right_accuracy = rcorrect_count/n_test
    
    print(f"L_acc = {left_accuracy}, R_acc = {right_accuracy}")   
    print()
    
    return (lclustersizes1,rclustersizes1,lclustersizes2,rclustersizes2, left_accuracy, right_accuracy)
