In [9]:
import os
# Working directory against which the relative data paths used later in this
# cell are resolved. The original hard-coded location remains the default;
# set the SENTIMENT_WORK_DIR environment variable to run the notebook on
# another machine without editing this cell.
os.chdir(os.environ.get("SENTIMENT_WORK_DIR", r"D:\qq\WORK1\WORK"))
print("cwd =", os.getcwd())
import re
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import time


# Matches one <review id="N" [label="L"]>...</review> element.
#   id    - run of digits in the mandatory id attribute
#   label - run of digits in the optional label attribute (None when absent)
#   text  - everything up to the first closing tag; DOTALL lets it span lines
REVIEW_PATTERN = re.compile(
    r'<review id="(?P<id>\d+)"'
    r'(?:\s+label="(?P<label>\d+)")?>'
    r'(?P<text>.*?)</review>',
    flags=re.DOTALL,
)

def load_data(file_path, default_label=None):
    """Read a review file and return its texts with their integer labels.

    The file is decoded as UTF-8; if that raises ``UnicodeDecodeError`` it is
    re-read as GB18030. Every match of ``REVIEW_PATTERN`` in the content
    yields one sample whose text is the stripped element body.

    Args:
        file_path: Path of the review file to read.
        default_label: Label assigned to reviews that carry no ``label``
            attribute. When it is ``None`` such reviews are skipped.

    Returns:
        A ``(texts, labels)`` tuple of two equally long lists.
    """
    print(f"Reading {file_path}...")

    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            raw = handle.read()
    except UnicodeDecodeError:
        # Not valid UTF-8: retry with the GB18030 codec.
        with open(file_path, 'r', encoding='gb18030') as handle:
            raw = handle.read()

    samples = []
    for found in REVIEW_PATTERN.finditer(raw):
        declared = found.group('label')
        if declared is not None:
            # A label written in the file takes precedence over the default.
            label = int(declared)
        elif default_label is not None:
            label = default_label
        else:
            # No label available from either source: drop the review.
            continue
        samples.append((found.group('text').strip(), label))

    texts = [body for body, _ in samples]
    labels = [label for _, label in samples]

    print(f"Loaded {len(texts)} samples from {file_path}")
    return texts, labels

def tokenize(text):
    """Segment ``text`` into tokens with jieba.

    Thin wrapper around ``jieba.lcut`` so it can be handed to
    ``TfidfVectorizer`` as its ``tokenizer`` callback.

    Args:
        text: The string to segment.

    Returns:
        Whatever ``jieba.lcut(text)`` returns (a list of token strings
        according to jieba's documentation).
    """
    segments = jieba.lcut(text)
    return segments

def main(
    train_pos_path=r'train/evaltask2_sample_data/cn_sample_data/sample.positive.txt',
    train_neg_path=r'train/evaltask2_sample_data/cn_sample_data/sample.negative.txt',
    test_label_path=r'test mark/Sentiment Classification with Deep Learning/test.label.cn.txt',
    n_neighbors=10,
    max_features=5000,
):
    """Train a TF-IDF + cosine KNN sentiment classifier and print test metrics.

    Positive and negative training reviews are loaded with labels 1 and 0
    respectively, the labelled test reviews are loaded from their own file,
    all texts are vectorized with a jieba-tokenized TF-IDF model fitted on the
    training set, and a ``KNeighborsClassifier`` is fitted and evaluated.
    Accuracy, a classification report, the confusion matrix and the elapsed
    time are printed to stdout.

    Args:
        train_pos_path: Review file whose unlabelled entries count as positive.
        train_neg_path: Review file whose unlabelled entries count as negative.
        test_label_path: Review file whose entries carry a ``label`` attribute.
        n_neighbors: Number of neighbours used by the KNN classifier.
        max_features: Vocabulary size cap passed to ``TfidfVectorizer``.

    Raises:
        ValueError: If the training or test set is empty, or if a label other
            than 0 or 1 shows up in the test labels or the predictions.
    """
    # perf_counter is a monotonic clock intended for measuring elapsed time.
    start_time = time.perf_counter()

    # Fixed class order so the report rows and matrix axes never depend on
    # which classes happen to occur in the test labels or predictions.
    class_ids = [0, 1]
    class_names = ['Negative', 'Positive']

    # Load the training data
    pos_texts, pos_labels = load_data(train_pos_path, default_label=1)
    neg_texts, neg_labels = load_data(train_neg_path, default_label=0)

    train_texts = pos_texts + neg_texts
    train_labels = pos_labels + neg_labels

    # Load the test data (only reviews that carry a label are returned)
    test_texts, test_labels = load_data(test_label_path)

    print(f"\nTotal Training Samples: {len(train_texts)}")
    print(f"Total Test Samples: {len(test_texts)}")

    # Fail early with a clear message instead of an opaque error from deep
    # inside scikit-learn when a file yielded no usable reviews.
    if not train_texts:
        raise ValueError("No training samples were loaded; check the training file paths and format.")
    if not test_texts:
        raise ValueError("No labelled test samples were loaded; check the test file path and format.")

    # Vectorize the texts
    print("\nVectorizing text...")
    # token_pattern=None tells scikit-learn the default regex is intentionally
    # unused, which avoids its warning about combining it with a tokenizer.
    vectorizer = TfidfVectorizer(tokenizer=tokenize, token_pattern=None, max_features=max_features)
    X_train = vectorizer.fit_transform(train_texts)
    X_test = vectorizer.transform(test_texts)

    print(f"Feature matrix shape: {X_train.shape}")

    # KNN
    # NOTE(review): an even n_neighbors allows tied votes between the two
    # classes; confirm how scikit-learn breaks ties before relying on k=10.
    print(f"\nTraining KNN Classifier (k={n_neighbors}, metric='cosine')...")
    knn = KNeighborsClassifier(n_neighbors=n_neighbors, metric='cosine')
    knn.fit(X_train, train_labels)

    print("Predicting on test set...")
    y_pred = knn.predict(X_test)

    # With explicit labels= below, unknown classes would be dropped silently
    # from the report, so reject them loudly here.
    unexpected = (set(test_labels) | set(y_pred.tolist())) - set(class_ids)
    if unexpected:
        raise ValueError(
            f"Unexpected label values {sorted(unexpected)}; only 0 (negative) and 1 (positive) are supported."
        )

    print("\n" + "="*40)
    print("SENTIMENT ANALYSIS REPORT")
    print("="*40)

    acc = accuracy_score(test_labels, y_pred)
    print(f"Accuracy: {acc:.4f}")

    print("\nClassification Report:")
    print(classification_report(test_labels, y_pred, labels=class_ids, target_names=class_names))

    print("\nConfusion Matrix:")
    print(confusion_matrix(test_labels, y_pred, labels=class_ids))

    print(f"\nTotal execution time: {time.perf_counter() - start_time:.2f} seconds")

# Run the pipeline when this code is executed directly (as a script or as a
# notebook cell, where __name__ is "__main__"); importing it does not run it.
if __name__ == "__main__":
    main()


cwd = D:\qq\WORK1\WORK
Reading train/evaltask2_sample_data/cn_sample_data/sample.positive.txt...
Loaded 5000 samples from train/evaltask2_sample_data/cn_sample_data/sample.positive.txt
Reading train/evaltask2_sample_data/cn_sample_data/sample.negative.txt...
Loaded 5000 samples from train/evaltask2_sample_data/cn_sample_data/sample.negative.txt
Reading test mark/Sentiment Classification with Deep Learning/test.label.cn.txt...
Loaded 2500 samples from test mark/Sentiment Classification with Deep Learning/test.label.cn.txt

Total Training Samples: 10000
Total Test Samples: 2500

Vectorizing text...




Feature matrix shape: (10000, 5000)

Training KNN Classifier (k=10, metric='cosine')...
Predicting on test set...

SENTIMENT ANALYSIS REPORT
Accuracy: 0.6832

Classification Report:
              precision    recall  f1-score   support

    Negative       0.67      0.72      0.70      1250
    Positive       0.70      0.64      0.67      1250

    accuracy                           0.68      2500
   macro avg       0.68      0.68      0.68      2500
weighted avg       0.68      0.68      0.68      2500


Confusion Matrix:
[[906 344]
 [448 802]]

Total execution time: 6.61 seconds


In [2]:
!pip install jieba pandas numpy scikit-learn matplotlib seaborn

Collecting jieba
  Downloading jieba-0.42.1.tar.gz (19.2 MB)
     ---------------------------------------- 0.0/19.2 MB ? eta -:--:--
     ---------------------------------------- 0.0/19.2 MB ? eta -:--:--
     ---------------------------------------- 0.0/19.2 MB ? eta -:--:--
     --------------------------------------- 0.0/19.2 MB 245.8 kB/s eta 0:01:19
     --------------------------------------- 0.1/19.2 MB 357.2 kB/s eta 0:00:54
     --------------------------------------- 0.1/19.2 MB 420.8 kB/s eta 0:00:46
     --------------------------------------- 0.2/19.2 MB 654.4 kB/s eta 0:00:30
      -------------------------------------- 0.3/19.2 MB 999.0 kB/s eta 0:00:19
      --------------------------------------- 0.4/19.2 MB 1.2 MB/s eta 0:00:17
     - -------------------------------------- 0.8/19.2 MB 2.0 MB/s eta 0:00:10
     - -------------------------------------- 0.9/19.2 MB 2.1 MB/s eta 0:00:09
     -- ------------------------------------- 1.4/19.2 MB 2.7 MB/s eta 0:00:07
     --