In [26]:
import os
import pickle
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

In [16]:
# baseline/tfidf_lr_baseline.py

class TFIDF_LR_Classifier:
    """Multi-label text classifier: TF-IDF features + one-vs-rest logistic regression.

    Three fitted components are kept as attributes:

    * ``vectorizer`` -- ``TfidfVectorizer`` turning raw texts into sparse features.
    * ``model``      -- ``OneVsRestClassifier`` wrapping one ``LogisticRegression``
      per label.
    * ``mlb``        -- ``MultiLabelBinarizer`` mapping label lists to/from the
      binary indicator matrix the model is trained on.
    """

    # (attribute name, file suffix) for every component persisted by save()/load().
    _ARTIFACTS = (
        ('vectorizer', '_vectorizer.pkl'),
        ('model', '_model.pkl'),
        ('mlb', '_mlb.pkl'),
    )

    def __init__(self, max_features=50000, ngram_range=(1,2), C=1.0, random_state=None):
        """Build the (unfitted) pipeline components.

        Args:
            max_features: Vocabulary size cap passed to ``TfidfVectorizer``.
            ngram_range: ``(min_n, max_n)`` n-gram range passed to ``TfidfVectorizer``.
            C: Inverse regularisation strength passed to ``LogisticRegression``.
            random_state: Seed passed to ``LogisticRegression``. The 'saga'
                solver shuffles the data, so pass an int for reproducible
                results. The default ``None`` keeps the previous unseeded
                behaviour.
        """
        self.vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range)
        # solver='liblinear' was tried first and was too slow, hence 'saga';
        # n_jobs=-1 fits the per-label classifiers in parallel.
        self.model = OneVsRestClassifier(
            LogisticRegression(C=C, solver='saga', max_iter=1000, random_state=random_state),
            n_jobs=-1,
        )
        self.mlb = MultiLabelBinarizer()

    def fit(self, texts, labels):
        """Fit the vectorizer, the label binarizer and the classifier.

        Args:
            texts: Iterable of raw documents.
            labels: Iterable of label collections, one per document.

        Returns:
            ``self``, so calls can be chained.
        """
        print("Fitting TF-IDF vectorizer...")
        X = self.vectorizer.fit_transform(texts)
        Y = self.mlb.fit_transform(labels)
        print("Training Logistic Regression...")
        self.model.fit(X, Y)
        return self

    def predict(self, texts):
        """Return the predicted labels for each document.

        The binary predictions are mapped back through
        ``mlb.inverse_transform``, so each element holds label values
        rather than indicator rows.
        """
        X = self.vectorizer.transform(texts)
        Y_pred = self.model.predict(X)
        return self.mlb.inverse_transform(Y_pred)

    def predict_proba(self, texts):
        """Return the raw ``model.predict_proba`` output for the given texts."""
        X = self.vectorizer.transform(texts)
        return self.model.predict_proba(X)

    def evaluate(self, texts, labels):
        """Compute micro- and macro-averaged F1 on a labelled set.

        Args:
            texts: Iterable of raw documents.
            labels: Iterable of true label collections, one per document.

        Returns:
            Dict with keys ``'micro_f1'`` and ``'macro_f1'``.
        """
        X = self.vectorizer.transform(texts)
        Y_true = self.mlb.transform(labels)
        Y_pred = self.model.predict(X)
        # zero_division=0 yields the same score as the default ("warn") but
        # without a warning for every label that has no predictions/positives.
        micro = f1_score(Y_true, Y_pred, average='micro', zero_division=0)
        macro = f1_score(Y_true, Y_pred, average='macro', zero_division=0)
        return {'micro_f1': micro, 'macro_f1': macro}

    def save(self, path_prefix):
        """Pickle the three components to ``<path_prefix>_{vectorizer,model,mlb}.pkl``.

        The parent directory of ``path_prefix`` is created if it is missing,
        so saving into a fresh output folder does not fail.
        """
        parent = os.path.dirname(path_prefix)
        if parent:
            os.makedirs(parent, exist_ok=True)
        for attr, suffix in self._ARTIFACTS:
            with open(path_prefix + suffix, 'wb') as f:
                pickle.dump(getattr(self, attr), f)

    def load(self, path_prefix):
        """Restore the three components previously written by :meth:`save`.

        SECURITY: ``pickle.load`` can execute arbitrary code; only load files
        that you created yourself or otherwise trust.

        Returns:
            ``self``, so ``clf = TFIDF_LR_Classifier().load(prefix)`` works.
        """
        for attr, suffix in self._ARTIFACTS:
            with open(path_prefix + suffix, 'rb') as f:
                setattr(self, attr, pickle.load(f))
        return self


In [31]:
# Load the pickled dataset and pull out the note texts and their ICD-9 code lists
df = pd.read_pickle('../data/mimic3_data_test.pkl')
texts, labels = df['TEXT'].tolist(), df['ICD9_CODE'].tolist()

# Hold out 20% of the samples for validation (fixed seed for a repeatable split)
X_train, X_val, y_train, y_val = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Initialise and train the model
clf = TFIDF_LR_Classifier(max_features=10000, ngram_range=(1,2), C=1.0)
clf.fit(X_train, y_train)


# Evaluate on the held-out validation split
results = clf.evaluate(X_val, y_val)
print("Validation Micro-F1:", results['micro_f1'])
print("Validation Macro-F1:", results['macro_f1'])

# Optional: persist the fitted model under ../models (sibling of ../data).
# Create the output directory first: writing a pickle into a missing
# directory raises FileNotFoundError.
os.makedirs('../models', exist_ok=True)
clf.save('../models/tfidf_lr_baseline')

Fitting TF-IDF vectorizer...
Training Logistic Regression...




## 快速调试版本 baseline 脚本，它会：

✅ 限制标签为最常见的前 100 个
✅ 使用 saga solver + 多线程加速训练
✅ 限制训练集规模（可选）
✅ 输出模型性能评估

In [28]:

# Tunable knobs for the quick-debug run
TOP_N_LABELS = 100        # keep only the N most frequent ICD-9 codes
MAX_TRAIN_SAMPLES = 5000  # cap on training rows for fast iteration
SEED = 42                 # shared seed for the split and the 'saga' solver

# === Step 1: load the data ===
df = pd.read_pickle('../data/mimic3_data_test.pkl')

# === Step 2: keep only the top-N most frequent labels ===
all_labels = [code for label_list in df['ICD9_CODE'] for code in label_list]
top_labels = {label for label, _ in Counter(all_labels).most_common(TOP_N_LABELS)}

df['filtered_labels'] = df['ICD9_CODE'].apply(lambda codes: [c for c in codes if c in top_labels])
df = df[df['filtered_labels'].map(len) > 0]  # drop rows left without any label

# === Step 3: build the train / validation split ===
texts = df['TEXT'].tolist()
labels = df['filtered_labels'].tolist()
X_train, X_val, y_train, y_val = train_test_split(texts, labels, test_size=0.2, random_state=SEED)

# Optional: cap the training set size while debugging
X_train = X_train[:MAX_TRAIN_SAMPLES]
y_train = y_train[:MAX_TRAIN_SAMPLES]

# === Step 4: vectorise + train ===
print("Fitting TF-IDF vectorizer...")
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1,2))
X_train_vec = vectorizer.fit_transform(X_train)

print("Binarizing labels...")
mlb = MultiLabelBinarizer()
Y_train_bin = mlb.fit_transform(y_train)

print(f"TF-IDF shape: {X_train_vec.shape}, Label shape: {Y_train_bin.shape}")

print("Training Logistic Regression...")
# 'saga' shuffles the data, so seed it to make the reported scores repeatable;
# n_jobs=-1 trains the per-label classifiers in parallel.
clf = OneVsRestClassifier(
    LogisticRegression(C=1.0, solver='saga', max_iter=1000, random_state=SEED),
    n_jobs=-1
)
clf.fit(X_train_vec, Y_train_bin)

# === Step 5: evaluate on the validation split ===
X_val_vec = vectorizer.transform(X_val)
Y_val_bin = mlb.transform(y_val)
Y_val_pred = clf.predict(X_val_vec)

# zero_division=0 gives the same score as the default ("warn") without a
# warning for every label that has no predictions/positives.
micro = f1_score(Y_val_bin, Y_val_pred, average='micro', zero_division=0)
macro = f1_score(Y_val_bin, Y_val_pred, average='macro', zero_division=0)

print(f"Validation Micro-F1: {micro:.4f}")
print(f"Validation Macro-F1: {macro:.4f}")


Fitting TF-IDF vectorizer...
Binarizing labels...
TF-IDF shape: (5000, 10000), Label shape: (5000, 100)
Training Logistic Regression...
Validation Micro-F1: 0.2177
Validation Macro-F1: 0.0680
