## Classification

In [11]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn.objects as so
from sklearn.model_selection import train_test_split
from tcrpeg.classification import classification
from sklearn.utils import shuffle
from collections import Counter

In [2]:
# Add autoreload for easier development
%load_ext autoreload
%autoreload 2
from tcrpeg_toolkit.p_infer_model_evaluation import PinferCalculation
from tcrpeg.evaluate import evaluation
from tcrpeg.TCRpeg import TCRpeg

  from .autonotebook import tqdm as notebook_tqdm


In [None]:

# Per-sample signature classification for the "-36" run: for every sample, load its
# saved TCRpeg model, fit a classifier on 80% of the sample's sequences and score it
# on the held-out 20%.

BASE_DIR = '/Users/vanessamhanna/Nextcloud/TCRpeg'
RUN_TAG = '-36'    # substring selecting this run's model and embedding files
SPLIT_SEED = 42    # fixes the train/test partition so the split is repeatable

models_dir = os.path.join(BASE_DIR, 'models')
embeddings_dir = os.path.join(BASE_DIR, 'embeddings', 'txt')

# Candidate model / embedding files: only names containing the run tag are kept
models = [name for name in os.listdir(models_dir) if RUN_TAG in name]
embeddings = [name for name in os.listdir(embeddings_dir) if RUN_TAG in name]

# Labelled sequences; one classifier is fitted per distinct sample_id
df_f = pd.read_csv(os.path.join(BASE_DIR, 'analysis', 'structured_data_signature_bis_36.csv'))
samples = df_f['sample_id'].unique()

results = {
    'sample': [],
    'auc': [],
    'aup': []
}
for sample in samples:
    print(sample)

    # .copy() yields an independent frame, so adding 'label' neither triggers
    # SettingWithCopyWarning nor depends on pandas view/copy semantics.
    filtered_df = df_f[df_f['sample_id'] == sample].copy()
    # Binary target: 1 where result == 'signature', 0 otherwise
    filtered_df['label'] = (filtered_df['result'] == 'signature').astype(int)

    # First file whose name contains the sample id; fail loudly when none exists
    # instead of hitting a TypeError on `str + None` further down.
    model = next((name for name in models if sample in name), None)
    embedding = next((name for name in embeddings if sample in name), None)
    if model is None or embedding is None:
        raise FileNotFoundError(
            f"No '{RUN_TAG}' model/embedding file found for sample {sample!r} "
            f"(model={model!r}, embedding={embedding!r})"
        )
    model_file = os.path.join(models_dir, model)
    embedding_file = os.path.join(embeddings_dir, embedding)

    tcrp_model = TCRpeg(embedding_path=embedding_file, device='cpu', hidden_size=128, num_layers=5)
    tcrp_model.create_model(load=True, path=model_file)

    tcrpeg_c = classification(tcrpeg=tcrp_model, embedding_size=64*10, device='cpu')

    # Stratifying keeps the signature / non-signature ratio identical in both
    # partitions, so the test fold cannot end up with a single class.
    train, test = train_test_split(
        filtered_df,
        test_size=0.2,
        stratify=filtered_df['label'],
        random_state=SPLIT_SEED,
    )
    # Hand plain NumPy arrays to the classifier. The original converted into
    # differently-cased names (X_train / X_test) that were never used, so the
    # sequences were still passed as pandas Series.
    x_train = train['sequence'].to_numpy()
    x_test = test['sequence'].to_numpy()
    y_train = train['label'].to_numpy()
    y_test = test['label'].to_numpy()

    tcrpeg_c.train(x_train=x_train, y_train=y_train, epochs=3, batch_size=8, lr=1e-3, val_split=0.2)
    # evaluate() returns four values; only the two scores are recorded
    auc, aup, y_pres, y_trues = tcrpeg_c.evaluate(x_test=x_test, y_test=y_test, batch_size=100)

    # Store results in the dictionary
    results['sample'].append(sample)
    results['auc'].append(auc)
    results['aup'].append(aup)

# Convert the results dictionary to a DataFrame for easier analysis
results_df = pd.DataFrame(results)

In [7]:
# Persist the per-sample scores first, then end the cell on the frame itself:
# a bare expression is only rendered when it is the last statement of a cell.
results_df.to_csv('/Users/vanessamhanna/Nextcloud/TCRpeg/analysis/classification_36.csv', index=False)
results_df


In [None]:
# Random permutation test: same per-sample pipeline as the classification run, but
# the labels are shuffled before splitting, so the resulting scores describe what
# the classifier reaches when sequence and label are unrelated.
#
# NOTE(review): this cell reads `df_f` as left behind by an earlier cell. In the
# notebook as saved that is the *_36 table, while the models and embeddings below are
# filtered on '-35' -- confirm which dataset is intended and load it explicitly.

BASE_DIR = '/Users/vanessamhanna/Nextcloud/TCRpeg'
RUN_TAG = '-35'    # substring selecting this run's model and embedding files
SEED = 42          # shared by the label shuffle and the train/test split

models_dir = os.path.join(BASE_DIR, 'models')
embeddings_dir = os.path.join(BASE_DIR, 'embeddings', 'txt')

# Candidate model / embedding files: only names containing the run tag are kept
models = [name for name in os.listdir(models_dir) if RUN_TAG in name]
embeddings = [name for name in os.listdir(embeddings_dir) if RUN_TAG in name]

# One permuted classifier per distinct sample_id
samples = df_f['sample_id'].unique()

results = {
    'sample': [],
    'auc': [],
    'aup': []
}
for sample in samples:
    print(sample)

    # .copy() yields an independent frame, so the column assignments below neither
    # trigger SettingWithCopyWarning nor depend on pandas view/copy semantics.
    filtered_df = df_f[df_f['sample_id'] == sample].copy()
    # Binary target: 1 where result == 'signature', 0 otherwise
    filtered_df['label'] = (filtered_df['result'] == 'signature').astype(int)

    # First file whose name contains the sample id; fail loudly when none exists
    # instead of hitting a TypeError on `str + None` further down.
    model = next((name for name in models if sample in name), None)
    embedding = next((name for name in embeddings if sample in name), None)
    if model is None or embedding is None:
        raise FileNotFoundError(
            f"No '{RUN_TAG}' model/embedding file found for sample {sample!r} "
            f"(model={model!r}, embedding={embedding!r})"
        )
    model_file = os.path.join(models_dir, model)
    embedding_file = os.path.join(embeddings_dir, embedding)

    tcrp_model = TCRpeg(embedding_path=embedding_file, device='cpu', hidden_size=128, num_layers=5)
    tcrp_model.create_model(load=True, path=model_file)

    tcrpeg_c = classification(tcrpeg=tcrp_model, embedding_size=64*10, device='cpu')

    # Permute the labels only (rows and sequences stay in place); class counts are
    # unchanged, but the pairing between sequence and label is destroyed.
    filtered_df['label'] = shuffle(filtered_df['label'].values, random_state=SEED)

    train, test = train_test_split(
        filtered_df,
        test_size=0.2,
        stratify=filtered_df['label'],
        random_state=SEED,
    )
    # Hand plain NumPy arrays to the classifier. The original converted into
    # differently-cased names (X_train / X_test) that were never used, so the
    # sequences were still passed as pandas Series.
    x_train = train['sequence'].to_numpy()
    x_test = test['sequence'].to_numpy()
    y_train = train['label'].to_numpy()
    y_test = test['label'].to_numpy()

    tcrpeg_c.train(x_train=x_train, y_train=y_train, epochs=3, batch_size=8, lr=1e-3, val_split=0.2)
    # evaluate() returns four values; only the two scores are recorded
    auc, aup, y_pres, y_trues = tcrpeg_c.evaluate(x_test=x_test, y_test=y_test, batch_size=100)

    # Store results in the dictionary
    results['sample'].append(sample)
    results['auc'].append(auc)
    results['aup'].append(aup)

# Convert the results dictionary to a DataFrame for easier analysis
results_df_permutation = pd.DataFrame(results)

In [None]:
# Persist the permutation scores first, then end the cell on the frame itself:
# a bare expression is only rendered when it is the last statement of a cell.
results_df_permutation.to_csv('/Users/vanessamhanna/Nextcloud/TCRpeg/analysis/classification_permutation_35.csv', index=False)
results_df_permutation

In [18]:
def match_proportions(source, target):
    """
    Subsample ``source`` so that its share of label-1 rows matches that of ``target``.

    All label-1 rows of ``source`` are kept and label-0 rows are randomly drawn
    (fixed seed) until the label-1 fraction equals the one observed in ``target``.
    If ``source`` holds too few label-0 rows for that, all label-0 rows are kept
    and the label-1 rows are downsampled instead. The selected rows are then
    shuffled and re-indexed from 0.

    Parameters:
    -----------
    source : pandas.DataFrame
        Frame with a ``'label'`` column (0 / 1) and a ``'sequence'`` column.
    target : sequence of int (list, numpy.ndarray or pandas.Series)
        Labels whose fraction of 1s is the proportion to reproduce.

    Returns:
    --------
    tuple (pandas.Series, numpy.ndarray)
        The ``'sequence'`` column and the ``'label'`` values of the subsample,
        in the same (shuffled) row order.

    Raises:
    -------
    ValueError
        If ``target`` contains no label equal to 1, so no proportion can be derived.
    """
    target_counts = Counter(target)
    n_target_pos = target_counts[1]
    if n_target_pos == 0:
        raise ValueError("target contains no label equal to 1; cannot derive a proportion to match")
    # Fraction of label-1 entries in the target
    signature_size = n_target_pos / len(target)

    positives = source[source['label'] == 1]
    negatives = source[source['label'] == 0]

    # Total size at which the available positives make up `signature_size` of the rows
    total_source = int(len(positives) / signature_size)
    non_signature_size = total_source - len(positives)

    if non_signature_size <= len(negatives):
        kept_positives = positives
        kept_negatives = negatives.sample(n=non_signature_size, random_state=42)
    else:
        # Not enough label-0 rows to dilute every positive: keep all of them and thin
        # the positives to the same ratio. Integer arithmetic avoids float truncation;
        # the divisor is > 0 here because an all-positive target needs 0 negatives.
        n_target_neg = len(target) - n_target_pos
        n_keep_pos = (len(negatives) * n_target_pos) // n_target_neg
        kept_positives = positives.sample(n=n_keep_pos, random_state=42)
        kept_negatives = negatives

    matched_source = pd.concat([kept_positives, kept_negatives])
    # sample()/reset_index() return new frames -- the result has to be assigned,
    # otherwise the rows stay grouped by label with their original index.
    matched_source = matched_source.sample(frac=1, random_state=42).reset_index(drop=True)

    x_test = matched_source['sequence']
    y_test = matched_source['label'].to_numpy()

    return x_test, y_test

In [None]:
# Classification with a separate test set: a classifier is trained on all sequences
# of one sample and evaluated on every other sample whose id contains the same
# third '-'-separated field, after resampling that test sample to the training
# class ratio.

BASE_DIR = '/Users/vanessamhanna/Nextcloud/TCRpeg'
RUN_TAG = '-26'    # substring selecting this run's model files

models_dir = os.path.join(BASE_DIR, 'models')
# Candidate model files: only names containing the run tag are kept
models = [name for name in os.listdir(models_dir) if RUN_TAG in name]

# Every sample uses the same general embedding file in this run
embedding_file = os.path.join(BASE_DIR, 'embeddings', 'all_unique_aa.txt')

# Labelled sequences; binary target is 1 where result == 'signature', 0 otherwise
df_f = pd.read_csv(os.path.join(BASE_DIR, 'analysis', 'structured_data_signature_bis_26.csv'))
df_f['label'] = (df_f['result'] == 'signature').astype(int)
samples = df_f['sample_id'].unique()

results = {
    'sample': [],        # sample the classifier was trained on
    'test_sample': [],   # sample the scores were computed on
    'auc': [],
    'aup': []
}
for sample in samples:
    print(sample)
    filtered_df = df_f[df_f['sample_id'] == sample]

    # First model file whose name contains the sample id; fail loudly when none
    # exists instead of hitting a TypeError on `str + None`.
    model = next((name for name in models if sample in name), None)
    if model is None:
        raise FileNotFoundError(f"No '{RUN_TAG}' model file found for sample {sample!r}")
    model_file = os.path.join(models_dir, model)

    tcrp_model = TCRpeg(embedding_path=embedding_file, device='cpu', hidden_size=128, num_layers=5)
    tcrp_model.create_model(load=True, path=model_file)

    tcrpeg_c = classification(tcrpeg=tcrp_model, embedding_size=64*10, device='cpu')

    # Train set: every sequence of the current sample
    x_train = filtered_df['sequence']
    y_train = filtered_df['label']

    tcrpeg_c.train(x_train=x_train, y_train=y_train, epochs=3, batch_size=8, lr=1e-3, val_split=0.2)

    # Test sets: all other samples whose id contains the same third '-'-separated
    # field as the training sample (substring match, as in the original).
    pop_data = os.path.basename(sample).split('-')[2]
    keep = [other for other in samples if pop_data in other and other != sample]

    for pop in keep:
        test_set = df_f[df_f['sample_id'] == pop]

        # Resample the test sample so its label-1 fraction equals that of y_train
        x_test, y_test = match_proportions(test_set, y_train)

        # evaluate() returns four values; only the two scores are recorded
        auc, aup, y_pres, y_trues = tcrpeg_c.evaluate(x_test=x_test, y_test=y_test, batch_size=100)

        # Record which pair produced the scores; with the training sample alone,
        # rows for different test samples could not be told apart.
        results['sample'].append(sample)
        results['test_sample'].append(pop)
        results['auc'].append(auc)
        results['aup'].append(aup)

# Convert the results dictionary to a DataFrame for easier analysis
results_df = pd.DataFrame(results)