In [1]:
import os
os.chdir(os.path.join(os.getcwd(),'..'))

In [2]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist, squareform

In [3]:
stage_in = lambda x: x[6:7]
age_in = lambda x: x[-2:]

In [4]:
from services import DataReader

X, y = DataReader().read_fva_solutions('fva_without.transports.txt')
_, y = DataReader().read_data('BC', True)

In [5]:
from preprocessing import DynamicPreprocessing

# pre = DynamicPreprocessing(['flux-diff', 'feature-selection','pathway-scoring', 'transport-elimination'])
pre = DynamicPreprocessing(['flux-diff', 'pathway-scoring', 'transport-elimination'])
X_pre = pre.fit_transform(X, y)

In [77]:
X_not_h, y_not_h = list(zip(*filter(lambda x: x[1] != 'h', zip(X_pre, y))))

X_not_h = DictVectorizer(sparse=False).fit_transform(X_not_h)

In [88]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.multiclass import OneVsOneClassifier

pipe = Pipeline([
            ('pca', PCA()),
            ('clf', OneVsOneClassifier(LogisticRegression(C=0.3e-6, random_state=43)))
            # ('clf', OneVsOneClassifier(AdaBoostClassifier(random_state=43))),
            # ('clf', OneVsOneClassifier(SVC(C=1e-6, kernel='rbf', random_state=43)))
            #('clf', MLPClassifier(activation="logistic",
            #                       random_state=43,
            #                       hidden_layer_sizes=(300, 100),
                                   #   alpha=1e-2,
            #                       max_iter=1000))

        ])

In [94]:
from sklearn.model_selection import cross_val_score, StratifiedKFold, GroupKFold

kf = StratifiedKFold(n_splits=5, random_state=43)

score = cross_val_score(pipe, X_not_h, y_not_h, cv=kf, n_jobs=-1, scoring='f1_micro')
print('kfold test %s: %s' % (scoring, score))
print('mean: %s' % score.mean())
print('std: %s' % score.std())

kfold test f1_micro: [ 0.39285714  0.35714286  0.53571429  0.25        0.26923077]
mean: 0.360989010989
std: 0.102271626414


In [95]:
from sklearn.metrics import classification_report

kf = StratifiedKFold(n_splits=5, random_state=43)

for train_index, test_index in kf.split(X_not_h, y_not_h):
        X_train, y_train= X_not_h[train_index], np.array(y_not_h)[train_index]
        X_test, y_test= X_not_h[test_index], np.array(y_not_h)[test_index]
        
        clf = pipe.fit(X_train, y_train)
        print(classification_report(clf.predict(X_test), y_test))

             precision    recall  f1-score   support

          1       0.75      0.60      0.67         5
          2       0.20      0.40      0.27         5
          3       0.60      0.40      0.48        15
          4       0.00      0.00      0.00         3

avg / total       0.49      0.39      0.42        28

             precision    recall  f1-score   support

          1       0.75      0.30      0.43        10
          2       0.40      0.67      0.50         6
          3       0.30      0.50      0.37         6
          4       0.00      0.00      0.00         6

avg / total       0.42      0.36      0.34        28

             precision    recall  f1-score   support

          1       1.00      0.40      0.57        10
          2       0.40      1.00      0.57         4
          3       0.70      0.64      0.67        11
          4       0.00      0.00      0.00         3

avg / total       0.69      0.54      0.55        28

             precision    recall  f1-