# Using notes' topic mixtures as features to develop models for predicting length of MV

In [111]:
import pandas as pd
import numpy as np

## Read data

In [112]:
train_mixture_filename = r'/home/mcb/li_lab/zwen8/data/mimic/d2_phy_nurse_48/infer/train_topics.csv'
train_duration_filename = r'/home/mcb/li_lab/zwen8/data/mimic/d2_phy_nurse_48/train_vent.txt'
valid_mixture_filename = r'/home/mcb/li_lab/zwen8/data/mimic/d2_phy_nurse_48/infer/validation_topics.csv'
valid_duration_filename = r'/home/mcb/li_lab/zwen8/data/mimic/d2_phy_nurse_48/validation_vent.txt'
test_mixture_filename = r'/home/mcb/li_lab/zwen8/data/mimic/d2_phy_nurse_48/infer/test_topics.csv'
test_duration_filename = r'/home/mcb/li_lab/zwen8/data/mimic/d2_phy_nurse_48/test_vent.txt'

In [113]:
train_mixtures = pd.read_csv(train_mixture_filename, header=None)
train_duration = pd.read_csv(train_duration_filename, header=None, sep=' ')
train_duration = train_duration[1].astype(np.float64)
valid_mixtures = pd.read_csv(valid_mixture_filename, header=None)
valid_duration = pd.read_csv(valid_duration_filename, header=None, sep=' ')
valid_duration = valid_duration[1].astype(np.float64)
test_mixtures = pd.read_csv(test_mixture_filename, header=None)
test_duration = pd.read_csv(test_duration_filename, header=None, sep=' ')
test_duration = test_duration[1].astype(np.float64)

In [114]:
train_mixtures.shape

(5831, 50)

In [115]:
train_duration.shape

(5831,)

## Convert to binarized duration

In [116]:
bi_train_duration = (train_duration > 7 * 24).astype(int)
bi_valid_duration = (valid_duration > 7 * 24).astype(int)
bi_test_duration = (test_duration > 7 * 24).astype(int)

## Logistic regression

In [117]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report

In [118]:
lr = LogisticRegression(solver='liblinear',  max_iter=200)

In [119]:
lr.fit(train_mixtures, bi_train_duration)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=200,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [120]:
valid_predictions_lr = lr.predict(valid_mixtures)

In [121]:
print(roc_auc_score(bi_valid_duration, valid_predictions_lr))
print(classification_report(bi_valid_duration, valid_predictions_lr))

0.6956092916984006
              precision    recall  f1-score   support

           0       0.70      0.86      0.77       404
           1       0.75      0.53      0.62       325

    accuracy                           0.71       729
   macro avg       0.72      0.70      0.70       729
weighted avg       0.72      0.71      0.70       729



## Naive Bayes

In [122]:
from sklearn.naive_bayes import GaussianNB

In [123]:
GNB = GaussianNB()

In [124]:
GNB.fit(train_mixtures, bi_train_duration)

GaussianNB(priors=None, var_smoothing=1e-09)

In [125]:
valid_predictions_GNB = GNB.predict(valid_mixtures)

In [126]:
print(roc_auc_score(bi_valid_duration, valid_predictions_GNB))
print(classification_report(bi_valid_duration, valid_predictions_GNB))

0.5877532368621479
              precision    recall  f1-score   support

           0       0.61      0.91      0.73       404
           1       0.70      0.26      0.38       325

    accuracy                           0.62       729
   macro avg       0.66      0.59      0.56       729
weighted avg       0.65      0.62      0.57       729



## SVM

### Linear SVM

In [127]:
from sklearn.svm import LinearSVC

In [128]:
lsvm = LinearSVC()

In [129]:
lsvm.fit(train_mixtures, bi_train_duration)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [130]:
valid_predictions_lsvm = GNB.predict(valid_mixtures)

In [131]:
print(roc_auc_score(bi_valid_duration, valid_predictions_lsvm))
print(classification_report(bi_valid_duration, valid_predictions_lsvm))

0.5877532368621479
              precision    recall  f1-score   support

           0       0.61      0.91      0.73       404
           1       0.70      0.26      0.38       325

    accuracy                           0.62       729
   macro avg       0.66      0.59      0.56       729
weighted avg       0.65      0.62      0.57       729



### Non-linear SVM

In [132]:
from sklearn.svm import SVC

In [133]:
svm = SVC()

In [134]:
svm.fit(train_mixtures, bi_train_duration)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [135]:
valid_predictions_svm = GNB.predict(valid_mixtures)

In [136]:
print(roc_auc_score(bi_valid_duration, valid_predictions_lsvm))
print(classification_report(bi_valid_duration, valid_predictions_lsvm))

0.5877532368621479
              precision    recall  f1-score   support

           0       0.61      0.91      0.73       404
           1       0.70      0.26      0.38       325

    accuracy                           0.62       729
   macro avg       0.66      0.59      0.56       729
weighted avg       0.65      0.62      0.57       729



## Do Lasso regression

In [137]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error

In [138]:
lasso = Lasso()

In [139]:
lasso.fit(train_mixtures, train_duration)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [140]:
valid_predictions_lasso = lasso.predict(valid_mixtures)

In [141]:
print(mean_squared_error(valid_duration, valid_predictions_lasso))

41810.96397345644


In [142]:
valid_predictions_lasso[0:5]

array([191.45907561, 198.95308671, 141.98132046, 154.46365004,
       222.04591674])

In [143]:
valid_duration.head()

0     68.016667
1    113.550000
2     80.450000
3    281.000000
4    161.000000
Name: 1, dtype: float64

## Grid Search LR
Logistic regression seems to be the best classifier. See how good it can be.

In [144]:
from sklearn.model_selection import GridSearchCV

In [145]:
estimator = LogisticRegression(solver='liblinear')

In [169]:
hyperparams = {
    'C': np.arange(0.2, 6, step=0.2),
    'penalty': ['l1', 'l2'],
    'max_iter': [100, 150, 200, 250, 300]
              }

In [170]:
searcher = GridSearchCV(estimator=estimator, n_jobs=12, param_grid=hyperparams, scoring='roc_auc', cv=5)

In [171]:
searcher.fit(train_mixtures, bi_train_duration)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='liblinear',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=12,
             param_grid={'C': array([0.2, 0.4, 0.6, 0.8, 1. , 1.2, 1.4, 1.6, 1.8, 2. , 2.2, 2.4, 2.6,
       2.8, 3. , 3.2, 3.4, 3.6, 3.8, 4. , 4.2, 4.4, 4.6, 4.8, 5. , 5.2,
       5.4, 5.6, 5.8]),
                         'max_iter': [100, 150, 200, 250, 300],
                         'penalty': ['l1', 'l2']},
             pre_dispatch='

In [172]:
print(searcher.best_params_)

{'C': 0.8, 'max_iter': 100, 'penalty': 'l2'}


In [173]:
print(searcher.best_score_)

0.7509745916907533
