# What if we evaluated the datasets with some very naïve predictors?
- based on sequence length
- based on %aa
- combination of both

In [37]:
import pandas as pd
from sklearn.metrics import auc, roc_curve

train_datasets = [
    'processed_data/2023_fc1_train.csv',
    'processed_data/oldPp_fc1_train.csv',
    'processed_data/old_naive_train.csv',
    'processed_data_v1.1_notransfer/2023_fc1_train.csv',
    'processed_data_v1.2_nominlen/2023_fc1_train.csv',
    'processed_data_v1.2_nominlen/oldPp_fc1_train.csv',
    'processed_data_v1.2_nominlen/old_naive_train.csv',
]

test_datasets = [
    'processed_data/2023_fc1_test.csv',
    'processed_data/oldPp_fc1_test.csv',
    'processed_data/old_naive_test.csv',
    'processed_data_v1.1_notransfer/2023_fc1_test.csv',
    'processed_data_v1.2_nominlen/2023_fc1_test.csv',
    'processed_data_v1.2_nominlen/oldPp_fc1_test.csv',
    'processed_data_v1.2_nominlen/old_naive_test.csv',
]



In [34]:
# fixed cut-off
for dataset in test_datasets:
    best_cutoff = -1
    best_auroc = -1
    auroc_at_75 = -1
    df = pd.read_csv(dataset)
    df['seqlen'] = df['SEQ'].apply(len)
    for seqlen_cutoff in range(60,250):
        df['PRED'] = df.seqlen.map(lambda x: 1 if int(x) < seqlen_cutoff else 0)
        df['PRED'] = df.seqlen.map(lambda x: 1 if int(x) < seqlen_cutoff else 0)
        fpr, tpr, _ = roc_curve(df['LABEL'], df['PRED'])
        auroc = auc(fpr, tpr)
        if auroc > best_auroc:
            best_auroc = auroc
            best_cutoff = seqlen_cutoff
        if seqlen_cutoff == 75:
            auroc_at_75 = auroc
    print(f'{dataset}, best cutoff: {best_cutoff}, best auroc: {best_auroc:.3f}, auroc at 75: {auroc_at_75:.3f}')
# predicted percentage based on seq len
for dataset in test_datasets:
    df = pd.read_csv(dataset)
    df['seqlen'] = df['SEQ'].apply(len)
    df['PRED'] = df.seqlen.map(lambda x: (200-x)/200)
    fpr, tpr, _ = roc_curve(df['LABEL'], df['PRED'])
    auroc = auc(fpr, tpr)
    print(f'{dataset}, auroc: {auroc:.3f}')


../processed_data/2023_fc1_test.csv, best cutoff: 72, best auroc: 0.605, auroc at 75: 0.595
../processed_data/oldPp_fc1_test.csv, best cutoff: 74, best auroc: 0.615, auroc at 75: 0.612
../processed_data/old_naive_test.csv, best cutoff: 72, best auroc: 0.657, auroc at 75: 0.653
../processed_data_v1.1_notransfer/2023_fc1_test.csv, best cutoff: 72, best auroc: 0.606, auroc at 75: 0.603
../processed_data_v1.2_nominlen/2023_fc1_test.csv, best cutoff: 71, best auroc: 0.624, auroc at 75: 0.617
../processed_data_v1.2_nominlen/oldPp_fc1_test.csv, best cutoff: 76, best auroc: 0.588, auroc at 75: 0.586
../processed_data_v1.2_nominlen/old_naive_test.csv, best cutoff: 72, best auroc: 0.620, auroc at 75: 0.615
../processed_data/2023_fc1_test.csv, auroc: 0.620
../processed_data/oldPp_fc1_test.csv, auroc: 0.638
../processed_data/old_naive_test.csv, auroc: 0.686
../processed_data_v1.1_notransfer/2023_fc1_test.csv, auroc: 0.622
../processed_data_v1.2_nominlen/2023_fc1_test.csv, auroc: 0.645
../processed

In [43]:
from sklearn.linear_model import LogisticRegression
import warnings 
warnings.filterwarnings('ignore')
# frequency per amino acid into a logistic regression
for train_set, test_set in zip(train_datasets, test_datasets):
    train_df = pd.read_csv(train_set)
    test_df = pd.read_csv(test_set)
    train_df['seqlen'] = train_df['SEQ'].apply(len)
    test_df['seqlen'] = test_df['SEQ'].apply(len)
    for aa in 'ACDEFGHIKLMNPQRSTVWY':
        train_df[f'freq_{aa}'] = train_df['SEQ'].apply(lambda x: x.count(aa)/len(x))
        test_df[f'freq_{aa}'] = test_df['SEQ'].apply(lambda x: x.count(aa)/len(x))
    
    train_df['combined_freq_FLMWYI'] = train_df['freq_F'] + train_df['freq_L'] + train_df['freq_M'] + train_df['freq_W'] + train_df['freq_Y'] + train_df['freq_I']
    train_df['combined_freq_KRH'] = train_df['freq_K'] + train_df['freq_R'] + train_df['freq_H']
    train_df['combined_freq_DEST'] = train_df['freq_D'] + train_df['freq_E'] + train_df['freq_S'] + train_df['freq_T']
    train_df['seqlen_pct'] = train_df['seqlen'].map(lambda x: (200-x)/200)
    test_df['combined_freq_FLMWYI'] = test_df['freq_F'] + test_df['freq_L'] + test_df['freq_M'] + test_df['freq_W'] + test_df['freq_Y'] + test_df['freq_I']
    test_df['combined_freq_KRH'] = test_df['freq_K'] + test_df['freq_R'] + test_df['freq_H']
    test_df['combined_freq_DEST'] = test_df['freq_D'] + test_df['freq_E'] + test_df['freq_S'] + test_df['freq_T']
    test_df['seqlen_pct'] = test_df['seqlen'].map(lambda x: (200-x)/200)

    best_auroc, best_auroc_grouped, best_auroc_full = -1, -1, -1
    for C in [0.01, 0.1, 1, 10, 100]:
        clf = LogisticRegression(C=C, class_weight="balanced")
        clf.fit(train_df[[f'freq_{aa}' for aa in 'ACDEFGHIKLMNPQRSTVWY']], train_df['LABEL'])
        test_df['PRED'] = clf.predict(test_df[[f'freq_{aa}' for aa in 'ACDEFGHIKLMNPQRSTVWY']])
        fpr, tpr, _ = roc_curve(test_df['LABEL'], test_df['PRED'])
        auroc = auc(fpr, tpr)
        if auroc > best_auroc:
            best_auroc = auroc

        clf = LogisticRegression(C=C, class_weight="balanced")
        clf.fit(train_df[['combined_freq_FLMWYI', 'combined_freq_KRH', 'combined_freq_DEST']], train_df['LABEL'])
        test_df['PRED_GROUPED'] = clf.predict(test_df[[f'combined_freq_FLMWYI', 'combined_freq_KRH', 'combined_freq_DEST']])
        fpr, tpr, _ = roc_curve(test_df['LABEL'], test_df['PRED_GROUPED'])
        auroc_grouped = auc(fpr, tpr)
        if auroc_grouped > best_auroc_grouped:
            best_auroc_grouped = auroc_grouped

        clf = LogisticRegression(C=C, class_weight="balanced")
        clf.fit(train_df[[f'freq_{aa}' for aa in 'ACDEFGHIKLMNPQRSTVWY'] + ['seqlen_pct']], train_df['LABEL'])
        test_df['PRED_FULL'] = clf.predict(test_df[[f'freq_{aa}' for aa in 'ACDEFGHIKLMNPQRSTVWY'] + ['seqlen_pct']])
        fpr, tpr, _ = roc_curve(test_df['LABEL'], test_df['PRED_FULL'])
        auroc_full = auc(fpr, tpr)
        if auroc_full > best_auroc_full:
            best_auroc_full = auroc_full
    
    print(f'{test_set}, all_frequencies, auroc: {best_auroc:.3f}, grouped_frequencies: {best_auroc_grouped:.3f}, all_with_seqlen: {best_auroc_full:.3f}')

../processed_data/2023_fc1_test.csv, all_frequencies, auroc: 0.636, grouped_frequencies: 0.630, all_with_seqlen: 0.651
../processed_data/oldPp_fc1_test.csv, all_frequencies, auroc: 0.631, grouped_frequencies: 0.614, all_with_seqlen: 0.660
../processed_data/old_naive_test.csv, all_frequencies, auroc: 0.667, grouped_frequencies: 0.654, all_with_seqlen: 0.703
../processed_data_v1.1_notransfer/2023_fc1_test.csv, all_frequencies, auroc: 0.628, grouped_frequencies: 0.625, all_with_seqlen: 0.652
../processed_data_v1.2_nominlen/2023_fc1_test.csv, all_frequencies, auroc: 0.606, grouped_frequencies: 0.606, all_with_seqlen: 0.651
../processed_data_v1.2_nominlen/oldPp_fc1_test.csv, all_frequencies, auroc: 0.615, grouped_frequencies: 0.615, all_with_seqlen: 0.627
../processed_data_v1.2_nominlen/old_naive_test.csv, all_frequencies, auroc: 0.645, grouped_frequencies: 0.641, all_with_seqlen: 0.670
