In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Sample dataset (replace with real data as needed).
# Each entry pairs a short movie review with its sentiment label:
# 1 = positive, 0 = negative.
labelled_reviews = [
    ("I love this movie, it's fantastic", 1),
    ("This film is terrible, I hate it", 0),
    ("Great acting and wonderful story", 1),
    ("Awful movie, waste of time", 0),
    ("Amazing cinematography and plot", 1),
    ("Boring and poorly executed", 0),
    ("Excellent performance by actors", 1),
    ("Disappointing and dull film", 0),
    ("Brilliant direction and screenplay", 1),
    ("Horrible and confusing story", 0),
]

# Split the pairs back into the parallel lists used by the rest of the cell.
texts = [review for review, _ in labelled_reviews]
labels = [sentiment for _, sentiment in labelled_reviews]

# Two-column DataFrame: 'text' holds the review, 'label' its sentiment.
data = pd.DataFrame({'text': texts, 'label': labels})

# 1. Preprocessing: pull the feature column (review text) and the target
#    column (sentiment label) out of the DataFrame.
X, y = data['text'], data['label']

# 2. Train/test split: hold out 30% of the samples for evaluation. The fixed
#    random_state makes the shuffle, and therefore the split, reproducible.
split_options = {'test_size': 0.3, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# 3. Text vectorisation (bag-of-words model).
#    The vocabulary is learned from the training split only; the test split
#    is then encoded with that same vocabulary.
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# 4. Train the Naive Bayes classifier on the training word counts.
#    fit() returns the fitted estimator, so construction and training chain.
nb_classifier = MultinomialNB().fit(X_train_vec, y_train)

# 5. Predict labels for the held-out test split.
y_pred = nb_classifier.predict(X_test_vec)

# 6. Evaluate the model on the held-out test split.
accuracy = accuracy_score(y_test, y_pred)
print(f"准确率: {accuracy:.2f}")

# zero_division=0 reports precision/F1 as 0 for a class the model never
# predicted (the same value the default produces) without emitting an
# UndefinedMetricWarning for each affected metric.
print("\n分类报告:")
print(classification_report(y_test, y_pred, zero_division=0))

# Pin the label order so the matrix is always 2x2 (rows = true class,
# columns = predicted class; 0 = negative, 1 = positive), even when one
# class is absent from both y_test and y_pred.
print("\n混淆矩阵:")
print(confusion_matrix(y_test, y_pred, labels=[0, 1]))

# 7. Classify previously unseen text.
#    New text is encoded with the already-fitted vectorizer, then the
#    classifier returns both hard labels and per-class probabilities.
new_texts = ["This movie is really good", "I don't like this film"]
new_texts_vec = vectorizer.transform(new_texts)
predictions = nb_classifier.predict(new_texts_vec)
probabilities = nb_classifier.predict_proba(new_texts_vec)

for i, text in enumerate(new_texts):
    # Label 1 is reported as positive; anything else as negative.
    sentiment = '正面' if predictions[i] == 1 else '负面'
    # One probability row per input; index 0 is printed as negative and
    # index 1 as positive.
    class_probs = probabilities[i]
    print(f"\n文本: {text}")
    print(f"预测类别: {sentiment}")
    print(f"概率: 负面={class_probs[0]:.2f}, 正面={class_probs[1]:.2f}")

准确率: 0.33

分类报告:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.33      1.00      0.50         1

    accuracy                           0.33         3
   macro avg       0.17      0.50      0.25         3
weighted avg       0.11      0.33      0.17         3


混淆矩阵:
[[0 2]
 [0 1]]

文本: This movie is really good
预测类别: 正面
概率: 负面=0.32, 正面=0.68

文本: I don't like this film
预测类别: 正面
概率: 负面=0.49, 正面=0.51


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
