In [1]:
import pandas as pd
from datetime import datetime

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import f1_score

import general_param as gparams

In [2]:
# Load the labelled k-mer table from the path configured in general_param.
# NOTE(review): unpickling can execute arbitrary code — only load this file
# from a trusted source.
kmers_labeled = pd.read_pickle(gparams.kmers_data)

In [3]:
# Show the frame as the cell output (pandas abbreviates the middle rows).
kmers_labeled

Unnamed: 0,kmers,labels
0,ctgaa tgaag gaagc aagcc agcct gcctt ccttt cttt...,1
1,gagcc agccc gcccc cccca cccac ccacc cacct acct...,1
2,tccag ccagc cagct agctt gcttt ctttc tttcg ttcg...,1
3,tgcat gcatt cattc attcg ttcgc tcgca cgcag gcag...,1
4,aggcg ggcgg gcggg cgggt gggtt ggttc gttcg ttcg...,1
...,...,...
5995,cgaat gaatg aatgc atgct tgctc gctcc ctccg tccg...,0
5996,ctcca tccat ccatg catgg atggg tgggg ggggg gggg...,0
5997,cggca ggcag gcagg caggg agggc gggcc ggccg gccg...,0
5998,acgac cgacg gacgg acggt cggta ggtag gtagg tagg...,0


We do the last feature engineering step directly here, so as to keep the bag-of-words encoding in sparse form.

In [4]:
# Bag-of-words over the space-separated k-mers, counting word n-grams of
# length 1 up to gparams.ngram_nbr. The result of fit_transform is used
# directly as the feature matrix.
vectorizer = CountVectorizer(
    analyzer='word',
    ngram_range=(1, gparams.ngram_nbr),
)
X = vectorizer.fit_transform(kmers_labeled['kmers'])

# Model Selection

We can go with any classification method here, so we will set up a procedure in which we can simply substitute the method.

We will consider:

- [support vector classification](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html)
- [logistic regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)
- [random forests](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html?highlight=random%20forest#sklearn-ensemble-randomforestclassifier)
- [decision trees](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html)
- [gradient boosting classifiers](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html)
- [~~linear discriminant analysis~~](https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.LinearDiscriminantAnalysis.html)
- [~~neural networks~~](https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html)


In [5]:
def fit_model(model, x_train, x_test, y_train, y_test):
    '''
    Fit a fresh copy of ``model`` and return its macro-averaged f1_score.

    The estimator is cloned before fitting, so the ``model`` object that
    is passed in is not fitted or otherwise modified by this function.

    Parameters
    ----------
    model : estimator to evaluate; it is copied with ``clone``.
    x_train, y_train : features and labels the copy is fitted on.
    x_test, y_test : features and labels the fitted copy is scored on.

    Returns
    -------
    ``f1_score(y_test, y_pred, average='macro')`` for the predictions
    on ``x_test``.

    Besides returning the score, the function prints the estimator, the
    confusion matrix, the classification report and the time spent in
    fit + predict.
    '''
    _model = clone(model)
    start = datetime.now()
    # fit() updates the clone itself, so its return value is not needed.
    _model.fit(x_train, y_train)
    y_pred = _model.predict(x_test)
    # Stop the clock here so the duration covers fitting and prediction
    # only, not the report generation below.
    duration = datetime.now() - start
    print('\n=======\n')
    print(_model)
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    print('Classification Report')
    print(classification_report(y_test, y_pred))
    print(f'Duration {duration}')
    return f1_score(y_test, y_pred, average='macro')

In [6]:
# Hold out 20% of the samples for evaluation; the shuffle is seeded from
# gparams.random_state.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    kmers_labeled['labels'],
    test_size=0.2,
    random_state=gparams.random_state,
)

In [7]:
# Candidate classifiers; every one is seeded from gparams.random_state.
svc = SVC(
    kernel='linear',
    random_state=gparams.random_state,
)
logres = LogisticRegression(
    random_state=gparams.random_state,
)
# The two tree-based models below additionally re-weight the classes.
dtc = DecisionTreeClassifier(
    class_weight='balanced',
    random_state=gparams.random_state,
)
frc = RandomForestClassifier(
    max_depth=2,
    class_weight='balanced_subsample',
    random_state=gparams.random_state,
)
gbc = GradientBoostingClassifier(
    random_state=gparams.random_state,
)

# lda does not fly with this input dim on my laptop (requires dense input)
# lda = LinearDiscriminantAnalysis()
# lda_pipe = make_pipeline(FunctionTransformer(lambda x: x.todense(), accept_sparse=True), lda)

# this will take forever I fear:
#mlpc = MLPClassifier(hidden_layer_sizes=(10,), random_state=gparams.random_state, max_iter=100)

In [None]:
# Evaluate each candidate on the same train/test split. fit_model prints
# its own report; the returned f1 score is not kept here.
# NOTE(review): gbc is defined but not evaluated in this loop — confirm
# that leaving it out is intentional.
for model in (svc, logres, dtc, frc):
    fit_model(
        model,
        x_train=X_train,
        x_test=X_test,
        y_train=Y_train,
        y_test=Y_test,
    )

### Decision

For the sake of time, we will continue with the support vector classifier only.