In [None]:
from doc2vec import read_fasta_to_kmers, encode_and_labels
from model_tools import evalution_metrics
import numpy as np
import pandas as pd
import joblib
import os

In [None]:
# Load the positive/negative training and independent-test FASTA files.
# Each path is passed through read_fasta_to_kmers in the order listed, and the
# results are bound to the four dataset names below.
train_pos, train_neg, test_pos, test_neg = (
    read_fasta_to_kmers(fasta_path)
    for fasta_path in (
        '../data/afp_pos_seq_len50train_2710.fasta',
        '../data/afp_neg_seq_len50train_2710.fasta',
        '../data/afp_pos_seq_len50independant_test_301.fasta',
        '../data/afp_neg_seq_len50independant_test_301.fasta',
    )
)

In [None]:
# Encode both splits with the pretrained Doc2Vec model stored at this path.
doc2vec_model = '../Doc2Vec_model/AFP_doc2vec.model'
# encode_and_labels is called once per split (train first, then test); each
# call yields a (data, labels) pair.
(train_data, train_labels), (test_data, test_labels) = (
    encode_and_labels(pos_kmers, neg_kmers, doc2vec_model)
    for pos_kmers, neg_kmers in ((train_pos, train_neg), (test_pos, test_neg))
)

## Random forest

In [None]:
from sklearn import ensemble

# Fixed seed: a random forest draws bootstrap samples and candidate features at
# random, so without it the saved model (and every metric computed from it)
# changes on each Restart & Run All.
RF_SEED = 42
forest = ensemble.RandomForestClassifier(n_estimators=100, random_state=RF_SEED)
# forest_fit keeps fit()'s return value, as in the original cell.
forest_fit = forest.fit(train_data, train_labels)
# Persist the fitted forest; the evaluation cell reloads it from this path.
joblib.dump(forest, '../Doc2Vec_model/forest_doc2vec.pkl')

In [None]:
# Reload the forest saved by the training cell.
# NOTE: joblib.load unpickles the file — only load model files you trust.
forest = joblib.load('../Doc2Vec_model/forest_doc2vec.pkl')
# Predict on the independent test encodings (the result of predict() is stored
# under the name labels_score).
labels_score = forest.predict(test_data)
# Compare predictions with the test labels; as the last expression of the
# cell, the return value is displayed.
evalution_metrics(test_labels, labels_score)

## Support vector machine

In [None]:
from sklearn import svm
# SVC constructed with no arguments, i.e. the library's default settings.
svc = svm.SVC()
# Fit on the Doc2Vec-encoded training set; svc_fit keeps fit()'s return value.
svc_fit = svc.fit(train_data, train_labels)
# Persist the fitted classifier; the evaluation cell reloads it from this path.
joblib.dump(svc, '../Doc2Vec_model/svm_doc2vec.pkl')

In [None]:
# Reload the SVC saved by the training cell.
# NOTE: joblib.load unpickles the file — only load model files you trust.
svc = joblib.load('../Doc2Vec_model/svm_doc2vec.pkl')
# Predict on the independent test encodings (this overwrites labels_score from
# the random forest evaluation cell).
labels_score = svc.predict(test_data)
# Compare predictions with the test labels; as the last expression of the
# cell, the return value is displayed.
evalution_metrics(test_labels, labels_score)

## 10-fold cross-validation

In [None]:
from sklearn.model_selection import KFold

def fold_cv(train_data, labels, mode='svm', output_dir='.', n_splits=10,
            random_state=None):
    """Run shuffled K-fold cross-validation with an SVM or a random forest.

    For every fold a fresh classifier is fitted on the training indices,
    dumped with joblib into ``output_dir`` and scored on the validation
    indices with ``evalution_metrics``. The per-fold metrics plus a final
    ``'Mean'`` row are written to ``<output_dir>/<mode>_cv.csv``.

    Relies on ``svm``, ``ensemble``, ``KFold``, ``joblib`` and
    ``evalution_metrics`` having been imported by earlier cells.

    Parameters
    ----------
    train_data : array-like
        Feature matrix; must support integer-array row indexing
        (``train_data[idx]``).
    labels : array-like
        One label per row of ``train_data``; converted with ``np.asarray``.
    mode : {'svm', 'rf'}
        ``'svm'`` fits ``svm.SVC()``; ``'rf'`` fits a 100-tree
        ``RandomForestClassifier``.
    output_dir : str
        Directory for the per-fold model files and the CSV summary; created
        if it does not exist.
    n_splits : int, default 10
        Number of folds.
    random_state : int or None, default None
        Seed for the fold shuffling and for the random forest. ``None`` keeps
        the previous unseeded behaviour; pass an int for reproducible runs.

    Returns
    -------
    pandas.DataFrame
        One row per fold (index 1..n_splits) and a ``'Mean'`` row, with the
        columns accuracy, precision, sensitivity, specificity, f1 and mcc.

    Raises
    ------
    ValueError
        If ``mode`` is neither ``'svm'`` nor ``'rf'``.
    """
    # Validate before touching the filesystem so a bad mode leaves no
    # stray output directory behind.
    if mode not in ('rf', 'svm'):
        raise ValueError('mode should be svm or rf')
    os.makedirs(output_dir, exist_ok=True)

    # Positional indexing below needs an ndarray: a list would raise and a
    # Series with a non-default index would be indexed by label instead.
    labels = np.asarray(labels)

    metric_names = ['accuracy', 'precision', 'sensitivity', 'specificity', 'f1', 'mcc']
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    fold_rows = []
    for fold_no, (train, val) in enumerate(kfold.split(train_data, labels), start=1):
        print('------------------------------------------------------------------------')
        print(f'Training for fold {fold_no} ...')

        if mode == 'svm':
            model = svm.SVC()
            model_file = 'svm_%s.pkl' % fold_no
        else:
            model = ensemble.RandomForestClassifier(n_estimators=100,
                                                    random_state=random_state)
            model_file = 'forest_%s.pkl' % fold_no

        model.fit(train_data[train], labels[train])
        labels_score = model.predict(train_data[val])
        joblib.dump(model, os.path.join(output_dir, model_file))

        metrics_dict = evalution_metrics(labels[val], labels_score, save=False)
        print(metrics_dict)
        # Values are taken positionally, as before.
        # NOTE(review): assumes evalution_metrics returns its six values in
        # the same order as metric_names — confirm against model_tools.
        fold_rows.append(list(metrics_dict.values()))

    # Build the table once instead of enlarging an empty frame row by row.
    df = pd.DataFrame(fold_rows, columns=metric_names,
                      index=range(1, len(fold_rows) + 1))
    df.loc['Mean'] = df.mean()
    df.to_csv(os.path.join(output_dir, '%s_cv.csv' % mode))
    return df

In [None]:
# Cross-validate the SVM on the training set; per-fold models and svm_cv.csv
# are written under the given output directory, and the returned metrics table
# is displayed as the cell output.
fold_cv(train_data, train_labels, mode='svm', output_dir = '../Doc2Vec_model/10_fold_svm')

In [None]:
# Cross-validate the random forest on the training set; per-fold models and
# rf_cv.csv are written under the given output directory, and the returned
# metrics table is displayed as the cell output.
fold_cv(train_data, train_labels, mode='rf', output_dir = '../Doc2Vec_model/10_fold_rf')