In [2]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

In [12]:
# baseline/tfidf_lr_baseline.py

class TFIDF_LR_Classifier:
    """Multi-label text classifier: TF-IDF features + one-vs-rest logistic regression.

    Each sample may carry several labels. Label collections are turned into a
    binary indicator matrix by a ``MultiLabelBinarizer`` and one
    ``LogisticRegression`` (``liblinear`` solver) is trained per label through
    ``OneVsRestClassifier``.

    Attributes:
        vectorizer: the ``TfidfVectorizer`` that turns raw texts into features.
        model: the ``OneVsRestClassifier`` wrapping the logistic regressions.
        mlb: the ``MultiLabelBinarizer`` mapping label collections <-> indicator rows.
    """

    # (attribute name, file suffix) pairs persisted by save() and read by load().
    _ARTIFACTS = (
        ('vectorizer', '_vectorizer.pkl'),
        ('model', '_model.pkl'),
        ('mlb', '_mlb.pkl'),
    )

    def __init__(self, max_features=50000, ngram_range=(1,2), C=1.0, n_jobs=None):
        """Build the (unfitted) vectorizer, classifier and label binarizer.

        Args:
            max_features: vocabulary size cap passed to ``TfidfVectorizer``.
            ngram_range: ``(min_n, max_n)`` n-gram range passed to ``TfidfVectorizer``.
            C: inverse regularization strength passed to ``LogisticRegression``.
            n_jobs: number of parallel jobs used by ``OneVsRestClassifier`` to fit
                the per-label models. ``None`` (the default) keeps the original
                sequential behavior; ``-1`` uses all cores, which matters when
                the label space is large.
        """
        self.vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range)
        self.model = OneVsRestClassifier(
            LogisticRegression(C=C, solver='liblinear'),
            n_jobs=n_jobs,
        )
        self.mlb = MultiLabelBinarizer()

    def fit(self, texts, labels):
        """Fit the vectorizer, the label binarizer and the classifier.

        Args:
            texts: iterable of raw documents.
            labels: iterable with one collection of labels per document.

        Returns:
            ``self``, so calls can be chained.
        """
        print("Fitting TF-IDF vectorizer...")
        X = self.vectorizer.fit_transform(texts)
        Y = self.mlb.fit_transform(labels)
        print("Training Logistic Regression...")
        self.model.fit(X, Y)
        return self

    def predict(self, texts):
        """Predict the label set of each document.

        Returns:
            The result of ``MultiLabelBinarizer.inverse_transform`` on the
            predicted indicator matrix: one collection of labels per document.
        """
        X = self.vectorizer.transform(texts)
        Y_pred = self.model.predict(X)
        return self.mlb.inverse_transform(Y_pred)

    def predict_proba(self, texts):
        """Return the per-label scores from ``OneVsRestClassifier.predict_proba``.

        The result is returned as-is (not mapped back to label names).
        """
        X = self.vectorizer.transform(texts)
        return self.model.predict_proba(X)

    def evaluate(self, texts, labels):
        """Score predictions against ``labels`` with micro- and macro-averaged F1.

        ``labels`` is binarized with the already-fitted ``MultiLabelBinarizer``
        (``transform``, not ``fit_transform``), so the columns match training.

        Returns:
            dict with keys ``'micro_f1'`` and ``'macro_f1'``.
        """
        X = self.vectorizer.transform(texts)
        Y_true = self.mlb.transform(labels)
        Y_pred = self.model.predict(X)
        micro = f1_score(Y_true, Y_pred, average='micro')
        macro = f1_score(Y_true, Y_pred, average='macro')
        return {'micro_f1': micro, 'macro_f1': macro}

    def save(self, path_prefix):
        """Pickle the vectorizer, model and label binarizer.

        Three files are written: ``<path_prefix>_vectorizer.pkl``,
        ``<path_prefix>_model.pkl`` and ``<path_prefix>_mlb.pkl``.

        Args:
            path_prefix: ``str`` or path-like prefix. Missing parent directories
                are created so a finished training run is not lost to a
                ``FileNotFoundError`` at save time.
        """
        prefix = os.fspath(path_prefix)
        parent = os.path.dirname(prefix)
        if parent:  # a bare file-name prefix has no directory to create
            os.makedirs(parent, exist_ok=True)
        for attr, suffix in self._ARTIFACTS:
            with open(prefix + suffix, 'wb') as f:
                pickle.dump(getattr(self, attr), f)

    def load(self, path_prefix):
        """Restore the three components written by :meth:`save`.

        Args:
            path_prefix: the same ``str`` or path-like prefix given to ``save``.

        Returns:
            ``self``, with ``vectorizer``, ``model`` and ``mlb`` replaced.
        """
        # SECURITY: pickle.load can execute arbitrary code embedded in the file.
        # Only load artifacts that were produced by a trusted save() call.
        prefix = os.fspath(path_prefix)
        for attr, suffix in self._ARTIFACTS:
            with open(prefix + suffix, 'rb') as f:
                setattr(self, attr, pickle.load(f))
        return self


In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the data: one row per document with its text and its ICD-9 codes.
df = pd.read_pickle('../data/mimic3_data_test.pkl')
texts = df['TEXT'].tolist()
# NOTE(review): the classifier binarizes labels with MultiLabelBinarizer, so each
# ICD9_CODE entry is assumed to be a collection of codes (e.g. a list), not a
# single delimited string — TODO confirm against the pickle's schema.
labels = df['ICD9_CODE'].tolist()

# Split into training / validation sets (80/20, fixed seed for reproducibility).
X_train, X_val, y_train, y_val = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Initialize and train the model.
clf = TFIDF_LR_Classifier(max_features=10000, ngram_range=(1,2), C=1.0)
clf.fit(X_train, y_train)

# Evaluate on the held-out validation split.
results = clf.evaluate(X_val, y_val)
print("Validation Micro-F1:", results['micro_f1'])
print("Validation Macro-F1:", results['macro_f1'])

# Optional: save the model. Create the output directory first so the pickles
# can be written even on a fresh checkout where 'models/' does not exist yet.
os.makedirs('models', exist_ok=True)
clf.save('models/tfidf_lr_baseline')


Fitting TF-IDF vectorizer...
Training Logistic Regression...


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x1044a0220>>
Traceback (most recent call last):
  File "/opt/anaconda3/envs/ribonn/lib/python3.9/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


KeyboardInterrupt: 