In [19]:
import numpy as np
import pandas as pd
import sklearn
#### SVM
from sklearn.svm import SVC
from sklearn.metrics import classification_report, matthews_corrcoef, precision_score,recall_score, accuracy_score, f1_score
#
from feature_extraction import feature_extraction
from functions import get_model_results

### Data preparation

In [20]:
# Load the cleaned metadata tables for the training and benchmarking sets.
training = pd.read_table('../clean_metadata_training')
benchmarking = pd.read_table('../clean_metadata_benchmarking')

# Shuffle the training rows once, with a fixed seed, so the folds are reproducible.
training = training.sample(frac=1, random_state=42)

# Positives carry a value in 'Signal peptide'; negatives have it missing.
has_signal_peptide = training['Signal peptide'].notna()
positives_train = training[has_signal_peptide]
negatives_train = training[~has_signal_peptide]


def _split_rows(frame, n_parts):
    """Split `frame` row-wise into `n_parts` consecutive, nearly equal DataFrames.

    Row positions are split with np.array_split and then selected with .iloc,
    which yields the same partition as np.array_split(frame, n_parts) without
    going through the deprecated DataFrame.swapaxes path.
    """
    return [frame.iloc[rows] for rows in np.array_split(np.arange(len(frame)), n_parts)]


# K-fold splitting: positives and negatives are split separately so that every
# fold receives its share of both classes.
pos_cv = _split_rows(positives_train, 5)
neg_cv = _split_rows(negatives_train, 5)



### SVM training 

In [21]:
### MODEL ###
# Label of the model; choose it according to the feature combination enabled below.
model = 'SVM-comp-transmem_tendency-global_comp-hp_global'

### Features combination ###
# Features switched on for this model.
global_comp = hp_global = transmem_tendency = True
# Features switched off for this model.
hp = charge = h_tendency = glob_transmem_tendency = False

##### 5-fold cross-validation

In [22]:
# 5-fold cross-validation with a held-out validation fold.
# In every run one fold is the test set, the next fold is the validation set used
# to pick the hyperparameters, and the remaining folds form the training set.
# The outcome is `crossvalidation_results`, one row per run.

cv_columns = ['training','validation','testing', 'best C','best \u03B3','best K','MCC val','ACC test','MCC test','precision','recall','F1-score']

# Hyperparameter grid (identical for every run, so it is defined once).
C = [1, 2, 4, 8]
gamma = [1, 2, 'scale']
K = [18, 19, 20, 21, 22, 23, 24, 25, 26]

# Feature switches from the configuration cell, forwarded unchanged to every
# feature_extraction call.
feature_flags = dict(
    global_comp=global_comp,
    hp=hp,
    hp_global=hp_global,
    charge=charge,
    h_tendency=h_tendency,
    transmem_tendency=transmem_tendency,
    glob_transmem_tendency=glob_transmem_tendency,
)

n_folds = len(pos_cv)
# Rows are collected here and turned into a DataFrame once, after the loop,
# instead of concatenating onto an empty frame at every run.
run_results = []

for run in range(n_folds):
    # subsets definition
    test_fold = run % n_folds
    val_fold = (run + 1) % n_folds
    train_folds = [(run + shift) % n_folds for shift in range(2, n_folds)]

    training_set = pd.concat(
        [part for fold in train_folds for part in (pos_cv[fold], neg_cv[fold])],
        ignore_index=True,
    )
    validation_set = pd.concat((pos_cv[val_fold], neg_cv[val_fold]), ignore_index=True)
    testing_set = pd.concat((pos_cv[test_fold], neg_cv[test_fold]), ignore_index=True)

    # grid-search (hyperparameter tuning)
    tuning_records = []
    for k in K:
        # Features depend on k only, so they are extracted once per k.
        X_train, y_train = feature_extraction(training_set, k, **feature_flags)
        X_val, y_val = feature_extraction(validation_set, k, **feature_flags)
        for c in C:
            for g in gamma:
                # Training
                # NOTE(review): X[1:] drops the first row returned by feature_extraction
                # (presumably a placeholder row) - confirm against feature_extraction.
                clf = SVC(C=c, kernel='rbf', gamma=g)
                clf.fit(X_train[1:], y_train)
                # Validation
                pred_val = clf.predict(X_val[1:])
                mcc_val = matthews_corrcoef(y_val, pred_val)
                tuning_records.append({'hyperparameters': (k, c, g), 'MCC': mcc_val})
    tuning_hyp = pd.DataFrame(tuning_records, columns=['hyperparameters', 'MCC'])

    # select the hyperparameters combination that maximizes the MCC
    # (idxmax returns the first maximum, i.e. the first combination tried on ties)
    max_mcc = tuning_hyp['MCC'].max()
    best_hyp = tuning_hyp.loc[tuning_hyp['MCC'].idxmax(), 'hyperparameters']
    best_k, best_c, best_gamma = best_hyp

    # Testing: retrain on the training set with the best hyperparameters and
    # score the model on the untouched test fold.
    X_train_best, y_train_best = feature_extraction(training_set, best_k, **feature_flags)
    X_test, y_test = feature_extraction(testing_set, best_k, **feature_flags)
    clf_best = SVC(C=best_c, kernel='rbf', gamma=best_gamma)
    clf_best.fit(X_train_best[1:], y_train_best)
    pred_test = clf_best.predict(X_test[1:])

    acc_test = accuracy_score(y_test, pred_test)
    mcc_test = matthews_corrcoef(y_test, pred_test)
    precision = precision_score(y_test, pred_test)
    recall = recall_score(y_test, pred_test)
    f1 = f1_score(y_test, pred_test)

    run_results.append({
        'training': ','.join(str(fold) for fold in train_folds),
        'validation': str(val_fold),
        'testing': str(test_fold),
        'best C': best_c,
        'best \u03B3': best_gamma,
        'best K': best_k,
        'MCC val': max_mcc,
        'ACC test': acc_test,
        'MCC test': mcc_test,
        'precision': precision,
        'recall': recall,
        'F1-score': f1,
    })

crossvalidation_results = pd.DataFrame(run_results, columns=cv_columns)
    




#### Results

In [23]:
# Per-run results of the 5 runs: written as a tab-separated file named after
# the model, then displayed as the cell output.
cv_results_path = f"results/cv-{model}.tsv"
crossvalidation_results.to_csv(cv_results_path, sep="\t")
crossvalidation_results

Unnamed: 0,training,validation,testing,best C,best γ,best K,MCC val,ACC test,MCC test,precision,recall,F1-score
0,234,1,0,1,2,26,0.90011,0.9794,0.890019,0.888889,0.914286,0.901408
1,340,2,1,8,2,21,0.903141,0.977045,0.871954,0.925,0.845714,0.883582
2,401,3,2,4,2,25,0.897566,0.979988,0.89117,0.907514,0.897143,0.902299
3,12,4,3,8,2,22,0.891706,0.979976,0.890662,0.912281,0.891429,0.901734
4,123,0,4,8,2,22,0.910814,0.981732,0.900494,0.913295,0.908046,0.910663


In [24]:
# Summarise the runs into a single row for this model. According to the original
# note, get_model_results picks the most frequent parameter and averages the scores.
model_results = get_model_results(crossvalidation_results, model)

# Append the row (no header) to the shared summary file.
# Re-running this cell appends the same row again.
model_results.to_csv('results/cv_results.tsv', sep='\t', header=False, mode='a')
model_results

Unnamed: 0,model,best C,best γ,best K,ACC,MCC,precision,recall,F1-score
0,SVM-comp-transmem_tendency-global_comp-hp_global,8,2,22,0.98 ± 0.00,0.89 ± 0.00,0.91 ± 0.01,0.89 ± 0.01,0.90 ± 0.00
