In [1]:
import pandas as pd
from datetime import datetime

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# from sklearn.pipeline import make_pipeline
# from sklearn.preprocessing import FunctionTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import f1_score

import general_param as gparams

In [2]:
# Load the labeled k-mer table from the path configured in general_param.
# Later cells read its `.kmers` and `.labels` attributes.
# NOTE(review): read_pickle unpickles the file, which can execute arbitrary
# code -- only load pickles produced by a trusted pipeline.
kmers_labeled = pd.read_pickle(gparams.kmers_data)

We do the last feature engineering step directly here, so as to keep the bag-of-words encoding in sparse form.

In [4]:
# Bag-of-words counts over word n-grams of length 1..gparams.ngram_nbr.
vectorizer = CountVectorizer(
    analyzer='word',
    ngram_range=(1, gparams.ngram_nbr),
)
# fit_transform learns the vocabulary and encodes the k-mer texts in one pass.
X = vectorizer.fit_transform(kmers_labeled.kmers)

# Model Selection

We can go with any classification method here, so we will set up a procedure in which we can simply substitute the method.

We will consider:

- [support vector classification](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html)
- [logistic regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)
- [random forests](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html?highlight=random%20forest#sklearn-ensemble-randomforestclassifier)
- [decision trees](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html)
- [gradient boosting classifiers](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html)
- [~~linear discriminant analysis~~](https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.LinearDiscriminantAnalysis.html)
- [~~neural networks~~](https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html)


In [5]:
def fit_model(model, x_train, x_test, y_train, y_test):
    '''
    Fit a fresh copy of ``model`` and return its macro-averaged F1 score.

    The estimator is cloned before fitting, so the ``model`` object passed
    in is not fitted or otherwise modified by this function.

    Parameters
    ----------
    model
        Estimator handed to ``clone``; the clone must provide ``fit`` and
        ``predict``.
    x_train, y_train
        Training features and labels, passed to ``fit``.
    x_test, y_test
        Held-out features (passed to ``predict``) and the true labels the
        predictions are compared against.

    Returns
    -------
    The value of ``f1_score(y_test, y_pred, average='macro')``.

    Side effects
    ------------
    Prints the fitted estimator, the confusion matrix, the classification
    report and the elapsed time (fit, predict and reporting) to stdout.
    '''
    _model = clone(model)
    start = datetime.now()
    # fit() is called for its effect on the clone; its return value is not needed.
    _model.fit(x_train, y_train)
    y_pred = _model.predict(x_test)
    print('\n=======\n')
    print(_model)
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    print('Classification Report')
    print(classification_report(y_test, y_pred))
    print(f'Duration {datetime.now() - start}')
    return f1_score(y_test, y_pred, average='macro')

In [6]:
# Hold out 20% of the samples for evaluation; the split is seeded so it can be repeated.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    kmers_labeled.labels,
    test_size=0.2,
    random_state=gparams.random_state,
)

In [9]:
# Candidate classifiers. Each one takes its seed from gparams.random_state.
svc = SVC(
    kernel='rbf',
    random_state=gparams.random_state,
)
lsvc = SVC(
    kernel='linear',
    random_state=gparams.random_state,
)
logres = LogisticRegression(
    random_state=gparams.random_state,
)
dtc = DecisionTreeClassifier(
    class_weight='balanced',
    random_state=gparams.random_state,
)
frc = RandomForestClassifier(
    max_depth=2,
    class_weight='balanced_subsample',
    random_state=gparams.random_state,
)
gbc = GradientBoostingClassifier(
    random_state=gparams.random_state,
)

# LDA is left out: it requires dense input, which does not work at this
# input dimension on my laptop.
# lda = LinearDiscriminantAnalysis()
# lda_pipe = make_pipeline(FunctionTransformer(lambda x: x.todense(), accept_sparse=True), lda)

# The MLP is left out as well; I fear this will take forever:
#mlpc = MLPClassifier(hidden_layer_sizes=(10,), random_state=gparams.random_state, max_iter=100)

In [10]:
# Fit every candidate on the same split. fit_model returns the macro F1 score,
# so keep it (keyed by the estimator's printed form) for later comparison
# instead of discarding it.
# NOTE(review): gbc is defined in this notebook but is not part of this list --
# confirm whether leaving it out is intentional.
f1_scores = {}
for model in [svc, lsvc, logres, dtc, frc]:
    f1_scores[str(model)] = fit_model(model, X_train, X_test, Y_train, Y_test)



SVC(random_state=12)
Confusion Matrix:
[[1002    0]
 [ 179   19]]
Classification Report
              precision    recall  f1-score   support

           0       0.85      1.00      0.92      1002
           1       1.00      0.10      0.18       198

    accuracy                           0.85      1200
   macro avg       0.92      0.55      0.55      1200
weighted avg       0.87      0.85      0.80      1200

Duration 0:03:07.525904


SVC(kernel='linear', random_state=12)
Confusion Matrix:
[[1000    2]
 [  60  138]]
Classification Report
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      1002
           1       0.99      0.70      0.82       198

    accuracy                           0.95      1200
   macro avg       0.96      0.85      0.89      1200
weighted avg       0.95      0.95      0.94      1200

Duration 0:02:44.289560


LogisticRegression(random_state=12)
Confusion Matrix:
[[1001    1]
 [  73  125]]
Classification Rep

## Note
To properly evaluate the models we would need to do some hyperparameter tuning for all of them.
But this takes some time, especially if each fit lasts for several minutes.

### Decision

For the sake of time, we will **continue only with the support vector classifier**.