In [17]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

def load_json(file_path):
    """Read a JSON Lines file and return its records as a list.

    Each non-blank line of the file is parsed as one standalone JSON document.

    Args:
        file_path: Path of the file to read.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. a trailing empty line at EOF) are skipped because
        # json.loads rejects empty input.
        return [json.loads(line) for line in f if line.strip()]

# Load the two training domains.
domain1 = load_json(r"domain1_train_data.json")  # balanced
domain2 = load_json(r"domain2_train_data.json")  # imbalanced

# Build one DataFrame per domain, then stack them into a single frame
# with a fresh 0..n-1 index.
df1, df2 = pd.DataFrame(domain1), pd.DataFrame(domain2)
df = pd.concat([df1, df2], ignore_index=True)

# Join each token sequence (index sequence) into one space-separated string.
df['text'] = df['text'].apply(lambda tokens: ' '.join(str(tok) for tok in tokens))


In [18]:
# Features are the joined token strings; targets are the labels.
X, y = df['text'], df['label']

# Hold out 20% for validation, stratified on the label, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Every whitespace-delimited token (an integer index rendered as text) is one "word".
vectorizer = TfidfVectorizer(
    token_pattern=r"\S+",
    analyzer="word",
)

# Fit on the training split only; the validation split is transformed
# with the already-fitted vectorizer.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)


In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Logistic regression on the TF-IDF features, with class_weight="balanced"
# and a fixed seed.
clf = LogisticRegression(
    class_weight="balanced",
    max_iter=1000,
    random_state=42,
)
clf.fit(X_train_tfidf, y_train)

# Score the held-out validation split.
y_pred = clf.predict(X_val_tfidf)
val_report = classification_report(y_val, y_pred)
val_accuracy = accuracy_score(y_val, y_pred)
print(val_report)
print("Validation Accuracy:", val_accuracy)


              precision    recall  f1-score   support

           0       0.65      0.87      0.74       150
           1       0.98      0.93      0.96      1050

    accuracy                           0.93      1200
   macro avg       0.81      0.90      0.85      1200
weighted avg       0.94      0.93      0.93      1200

Validation Accuracy: 0.925


In [21]:
# Load the test data (same format as the training set).
test_data = load_json(r"test_data.json")
df_test = pd.DataFrame(test_data)

# Join each token sequence into one space-separated string.
df_test['text'] = df_test['text'].apply(
    lambda tokens: ' '.join(str(tok) for tok in tokens)
)

# Vectorize with the already-fitted vectorizer, then predict.
X_test_tfidf = vectorizer.transform(df_test['text'])
y_test_pred = clf.predict(X_test_tfidf)


In [13]:
# One row per prediction: a 0-based positional id and the predicted class.
# NOTE(review): ids are positional, not read from the test file — confirm
# this matches the expected submission format.
submission = pd.DataFrame(
    {
        'id': range(len(y_test_pred)),
        'class': y_test_pred,
    }
)
submission.to_csv("submission.csv", index=False)
