In [1]:
# 外部库引入
import jieba
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [4]:
# 测试中文分词
def chinese_tokenizer(text):
    return jieba.lcut(text)

text = "我爱自然语言处理"
tokens = chinese_tokenizer(text)
print("分词结果:", tokens)


def chinese_word_cut(text):
    """对中文文本进行分词，返回空格分隔的字符串"""
    return " ".join(jieba.lcut(str(text)))

print(chinese_word_cut("我爱自然语言处理"))

分词结果: ['我', '爱', '自然语言', '处理']
我 爱 自然语言 处理


In [5]:
'''
    自定义逻辑回归实现
'''

# 更新sigmoid函数确保数值稳定
def sigmoid(z):
    z = np.clip(z, -100, 100)
    return 1 / (1 + np.exp(-z))

class LogisticRegression:
    def __init__(self, num_classes=6, lr=0.05, epochs=5000, lambda_reg=0.1):
        self.lr = lr
        self.epochs = epochs
        self.num_classes = num_classes
        self.lambda_reg = lambda_reg  # L2正则化系数
        self.models = {}
    
    def train(self, X, y):
        n_samples, n_features = X.shape
        for cls in range(self.num_classes):
            y_binary = np.where(y == cls, 1, 0).astype(float)
            weights = np.zeros(n_features)
            bias = 0.0
            
            # 添加正则化训练
            for epoch in range(self.epochs):
                linear = np.dot(X, weights) + bias
                pred = sigmoid(linear)
                
                # 梯度计算（加入L2正则化）
                dw = (1/n_samples) * np.dot(X.T, (pred - y_binary)) + (self.lambda_reg/n_samples)*weights
                db = (1/n_samples) * np.sum(pred - y_binary)
                
                weights -= self.lr * dw
                bias -= self.lr * db
                
            self.models[cls] = (weights, bias)
    
    def predict(self, X):
        classes = list(self.models.keys())
        probs = np.zeros((X.shape[0], self.num_classes))
        
        for cls, (w, b) in self.models.items():
            linear = np.dot(X, w) + b
            probs[:, cls] = sigmoid(linear)
        
        return np.argmax(probs, axis=1)


In [6]:
'''
    自定义线性判别分析（LDA）实现
'''

# 线性判别分析（LDA）分类器
class LDAClassifier:
    def __init__(self):
        self.means = {}
        self.priors = {}
        self.cov_inv = None
        self.classes = []

    def train(self, X, y):
        self.classes = np.unique(y)
        D = X.shape[1]
        cov = np.zeros((D, D))
        for cls in self.classes:
            Xc = X[y == cls]
            self.means[cls] = np.mean(Xc, axis=0)
            self.priors[cls] = Xc.shape[0] / X.shape[0]
            cov += (Xc - self.means[cls]).T @ (Xc - self.means[cls])
        cov /= X.shape[0]
        self.cov_inv = np.linalg.inv(cov)

    def predict(self, X):
        scores = []
        for cls in self.classes:
            mean = self.means[cls]
            score = X @ self.cov_inv @ mean - 0.5 * mean.T @ self.cov_inv @ mean + np.log(self.priors[cls])
            scores.append(score)
        return self.classes[np.argmax(np.stack(scores, axis=1), axis=1)]


In [13]:
# --- 数据处理与训练 ---
# 读取数据（确保文件路径正确）
df = pd.read_csv("dataset.csv", sep="\t", header=None, names=["text", "label"])

print("数据样本示例：")
print(df.head())

# 配置TF-IDF向量化器
vectorizer = TfidfVectorizer(
    tokenizer=chinese_tokenizer,
    max_features=1000,  # 限制特征数量
    ngram_range=(1, 2), # 包含单字和双字词组
    min_df=2,           # 忽略低频词
    max_df=0.8          # 忽略高频词
)
X = vectorizer.fit_transform(df["text"])
print(f"特征矩阵形状：{X.shape}")

# 标签编码
unique_classes = df["label"].unique()
label_map = {label: idx for idx, label in enumerate(unique_classes)}
y = df["label"].map(label_map).values

# 分层分割数据集
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=42
)

print("\n训练集大小:", X_train.shape[0])
print("测试集大小:", X_test.shape[0])

数据样本示例：
                    text          label
0      还有双鸭山到淮阴的汽车票吗13号的   Travel-Query
1                从这里怎么回家   Travel-Query
2       随便播放一首专辑阁楼里的佛里的歌     Music-Play
3              给看一下墓王之王嘛  FilmTele-Play
4  我想看挑战两把s686打突变团竞的游戏视频     Video-Play
特征矩阵形状：(12100, 1000)

训练集大小: 10890
测试集大小: 1210


In [14]:
# 训练模型
## Logistic Regression Training
model = LogisticRegression(
    num_classes=len(unique_classes),
    lr=0.05,
    epochs=5000,
    lambda_reg=0.2
)
model.train(X_train.toarray(), y_train)
print("模型训练完成!")

# 预测与评估
y_pred = model.predict(X_test.toarray())

print("\n==== 评估结果 ====")
print("准确率:", np.mean(y_pred == y_test))
print("\n分类报告:")
print(classification_report(y_test, y_pred, target_names=unique_classes))

print("\n混淆矩阵:")
print(confusion_matrix(y_test, y_pred))

模型训练完成!

==== 评估结果 ====
准确率: 0.8181818181818182

分类报告:
                       precision    recall  f1-score   support

         Travel-Query       0.87      0.96      0.91       122
           Music-Play       0.78      0.85      0.82       130
        FilmTele-Play       0.68      0.78      0.72       136
           Video-Play       0.72      0.78      0.75       133
         Radio-Listen       0.92      0.78      0.84       129
HomeAppliance-Control       0.83      0.95      0.89       122
        Weather-Query       0.84      0.87      0.85       123
         Alarm-Update       0.89      0.94      0.91       126
       Calendar-Query       0.89      0.92      0.90       121
       TVProgram-Play       0.00      0.00      0.00        24
           Audio-Play       0.00      0.00      0.00        23
                Other       0.00      0.00      0.00        21

             accuracy                           0.82      1210
            macro avg       0.62      0.65      0.63      121

In [None]:
# LDA Classifier Training
# -----------------------------
X_text_cut = df['text'].apply(chinese_tokenizer).apply(lambda x: " ".join(x))
vectorizer = TfidfVectorizer(
    max_features=500,      # 根据数据量调整
    ngram_range=(1, 2),    # 单字词 + 双字词组合
    lowercase=False,       # 中文不需要
    token_pattern=r'\S+'   # 匹配非空白字符（因为我们已经分好词了）
)

X = vectorizer.fit_transform(X_text_cut).toarray()
y = df['label'].values

print(f"\n✅ 特征矩阵形状: {X.shape} (样本数 × 特征数)")

# 检查类别数量
print(f"✅ 类别列表: {np.unique(y)}")

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"✅ 训练集大小: {X_train.shape[0]}, 测试集大小: {X_test.shape[0]}")


lda_clf = LDAClassifier()
lda_clf.train(X_train, y_train)

y_pred = lda_clf.predict(X_test)

print("\n" + "="*50)
print("📊 测试集预测结果")
print("="*50)
print(f"真实标签: {y_test}")
print(f"预测标签: {y_pred}")

print("\n✅ 准确率:", accuracy_score(y_test, y_pred))
print("\n✅ 分类报告:")
print(classification_report(y_test, y_pred))


✅ 特征矩阵形状: (12100, 500) (样本数 × 特征数)
✅ 类别列表: ['Alarm-Update' 'Audio-Play' 'Calendar-Query' 'FilmTele-Play'
 'HomeAppliance-Control' 'Music-Play' 'Other' 'Radio-Listen'
 'TVProgram-Play' 'Travel-Query' 'Video-Play' 'Weather-Query']
✅ 训练集大小: 9680, 测试集大小: 2420

📊 测试集预测结果
真实标签: ['HomeAppliance-Control' 'Video-Play' 'HomeAppliance-Control' ...
 'Calendar-Query' 'Travel-Query' 'Travel-Query']
预测标签: ['HomeAppliance-Control' 'Video-Play' 'HomeAppliance-Control' ...
 'Calendar-Query' 'Travel-Query' 'Travel-Query']

✅ 准确率: 0.8338842975206612

✅ 分类报告:
                       precision    recall  f1-score   support

         Alarm-Update       0.99      0.91      0.94       253
           Audio-Play       0.56      0.67      0.61        45
       Calendar-Query       0.99      0.93      0.96       242
        FilmTele-Play       0.72      0.83      0.77       271
HomeAppliance-Control       0.94      0.89      0.92       243
           Music-Play       0.81      0.79      0.80       261
            