In [1]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
import torch

# 1. 数据预处理

import csv

with open('text.csv', 'r') as f:
    reader = csv.reader(f)
    next(reader)
    text_data = [row for row in reader]

with open('iclr.csv', 'r') as f:
    reader = csv.reader(f)
    next(reader)
    data = [[float(col) for col in row] for row in reader]  # read all columns
    feature_data = [row[:-1] for row in data]
    y = [int(row[-1]) for row in data]  # read the last column as the label

text_df = pd.DataFrame(text_data)
feature_df = pd.DataFrame(feature_data)



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 2. 标准化人工特征
X_feature = feature_df.values

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_feature)
X_scaled = torch.from_numpy(X_scaled)

In [3]:
# 3. BERT 编码
import torch

# 检查是否有 GPU 可用
if torch.cuda.is_available():
    device = torch.device('cuda')
    print(f'Using GPU: {torch.cuda.get_device_name(device)}')
else:
    device = torch.device('cpu')
    print('Using CPU')

tokenizer = BertTokenizer.from_pretrained('allenai/specter2_base')
model = BertModel.from_pretrained('allenai/specter2_base')

# 将模型移动到 GPU 上
model.to(device)

from tqdm import tqdm

X_text = text_df.iloc[:, 0].tolist()
X_bert = []
for text in tqdm(X_text, desc='Processing BERT encoding', unit='text'):
    if isinstance(text, str):
        input_ids = tokenizer.encode(text, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
        # 将输入张量移动到 GPU 上
        input_ids = input_ids.to(device)
        with torch.no_grad():
            output = model(input_ids)[0][:, 0, :]  # 取[CLS]向量
        # 将输出张量移动到 CPU 上
        X_bert.append(output.squeeze().cpu().numpy())
    else:
        X_bert.append(np.zeros(model.config.hidden_size))

# 将 X_bert 转换为 torch.Tensor
X_bert = torch.from_numpy(np.array(X_bert))

Using GPU: NVIDIA GeForce RTX 3060


  return self.fget.__get__(instance, owner)()
Processing BERT encoding:   0%|          | 0/3810 [00:00<?, ?text/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
Processing BERT encoding: 100%|██████████| 3810/3810 [01:33<00:00, 40.63text/s]


In [4]:
# 4. 拼接特征
X = torch.cat([X_bert, X_scaled], dim=1)

# 5. 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X.cpu().numpy(), y, test_size=0.2, random_state=42)

In [5]:
print(len(set(y)))

3


In [8]:
# 6. 建模
model = RandomForestClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# 7. 计算性能指标
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted') 
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

Accuracy: 0.5341
Precision: 0.4276
Recall: 0.5341
F1-score: 0.4734


  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
from sklearn.svm import SVC

# 6. 建模
model = SVC()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# 7. 计算性能指标
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

Accuracy: 0.5591
Precision: 0.4527
Recall: 0.5591
F1-score: 0.4988


  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
from sklearn.tree import DecisionTreeClassifier

# 6. 建模
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# 7. 计算性能指标
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

Accuracy: 0.4331
Precision: 0.4330
Recall: 0.4331
F1-score: 0.4323


In [15]:
from sklearn.neighbors import KNeighborsClassifier

# 6. 建模
model = KNeighborsClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# 7. 计算性能指标
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

Accuracy: 0.4646
Precision: 0.4467
Recall: 0.4646
F1-score: 0.4512


In [None]:
pip install xgboost

In [11]:
from xgboost import XGBClassifier

# 6. 建模
model = XGBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# 7. 计算性能指标
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

Accuracy: 0.5446
Precision: 0.4957
Recall: 0.5446
F1-score: 0.5017


In [12]:
from sklearn.ensemble import GradientBoostingClassifier

# 6. 建模
model = GradientBoostingClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# 7. 计算性能指标
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

Accuracy: 0.5564
Precision: 0.5358
Recall: 0.5564
F1-score: 0.5113


In [13]:
from sklearn.ensemble import AdaBoostClassifier

# 6. 建模
model = AdaBoostClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# 7. 计算性能指标
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

Accuracy: 0.5236
Precision: 0.4928
Recall: 0.5236
F1-score: 0.4957
