In [3]:
from datasets import load_dataset
dataset = load_dataset("ag_news")
# Combine the dataset and save it to a CSV file
import pandas as pd
df_1 = pd.DataFrame(dataset['train'])
df_2 = pd.DataFrame(dataset['test'])
df = pd.concat([df_1, df_2], ignore_index=True)
df.to_csv('ag_news.csv', index=False)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})


In [2]:
for split in dataset.keys():
    dataset[split].to_csv(f"{split}.csv")

Creating CSV from Arrow format:   0%|          | 0/120 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

In [3]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Load the datasets
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)
print(train_df.head())

Train shape: (120000, 2)
Test shape: (7600, 2)
                                                text  label
0  Wall St. Bears Claw Back Into the Black (Reute...      2
1  Carlyle Looks Toward Commercial Aerospace (Reu...      2
2  Oil and Economy Cloud Stocks' Outlook (Reuters...      2
3  Iraq Halts Oil Exports from Main Southern Pipe...      2
4  Oil prices soar to all-time record, posing new...      2


In [4]:
# Initialize CountVectorizer
vectorizer = CountVectorizer(stop_words='english', max_features=5000)

# Fit and transform the training data
X_train_bow = vectorizer.fit_transform(train_df['text'])

# Transform the test data
X_test_bow = vectorizer.transform(test_df['text'])

print("Bag of Words Matrix (Train):", X_train_bow.shape)
print("Bag of Words Matrix (Test):", X_test_bow.shape)

# Show some feature names (words)
print("Some feature names:", vectorizer.get_feature_names_out()[:20])

Bag of Words Matrix (Train): (120000, 5000)
Bag of Words Matrix (Test): (7600, 5000)
Some feature names: ['00' '000' '04' '05' '10' '100' '101' '10th' '11' '11th' '12' '120'
 '12th' '13' '14' '146' '15' '150' '151' '16']


In [5]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Initialize KNN classifier
knn = KNeighborsClassifier(n_neighbors=5)

# Train the classifier
print("Training KNN classifier...")
knn.fit(X_train_bow, train_df['label'])

# Predict on test data
print("Predicting on test data...")
y_pred = knn.predict(X_test_bow)

# Calculate accuracy
accuracy = accuracy_score(test_df['label'], y_pred)
print(f"KNN Accuracy: {accuracy:.4f}")

Training KNN classifier...
Predicting on test data...
KNN Accuracy: 0.7249


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# Fit and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['text'])

# Transform the test data
X_test_tfidf = tfidf_vectorizer.transform(test_df['text'])

print("TF-IDF Matrix (Train):", X_train_tfidf.shape)
print("TF-IDF Matrix (Test):", X_test_tfidf.shape)

# Train the KNN classifier on TF-IDF data
print("Training KNN classifier on TF-IDF data...")
knn_tfidf = KNeighborsClassifier(n_neighbors=5)
knn_tfidf.fit(X_train_tfidf, train_df['label'])

# Predict on test data
print("Predicting on test data (TF-IDF)...")
y_pred_tfidf = knn_tfidf.predict(X_test_tfidf)

# Calculate accuracy
accuracy_tfidf = accuracy_score(test_df['label'], y_pred_tfidf)
print(f"KNN Accuracy (TF-IDF): {accuracy_tfidf:.4f}")

TF-IDF Matrix (Train): (120000, 5000)
TF-IDF Matrix (Test): (7600, 5000)
Training KNN classifier on TF-IDF data...
Predicting on test data (TF-IDF)...
KNN Accuracy (TF-IDF): 0.8901


## Chinese Text Classification (TNEWS)

We will now perform the same exercise on Chinese news titles from the CLUE benchmark (TNEWS dataset).

**Key differences:**
1. We need to use **jieba** to segment Chinese characters into words (tokens) because Chinese text doesn't use spaces.
2. We will use the `validation` set as our test set, because the official `test` set labels are often hidden for fair benchmarking.

In [4]:
import pandas as pd
dataset = load_dataset("clue", "tnews")

In [10]:
dataset

DatasetDict({
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 53360
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 10000
    })
})

In [11]:
df = pd.DataFrame(dataset["validation"])

In [7]:
import jieba
from datasets import load_dataset

# Load TNEWS dataset
# We use the CLUE benchmark version of TNEWS
print("Loading TNEWS dataset...")
c_dataset = load_dataset("clue", "tnews")

# Display dataset structure
print(c_dataset)
print("First example:", c_dataset['train'][0])

Loading TNEWS dataset...


  import pkg_resources


DatasetDict({
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 53360
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 10000
    })
})
First example: {'sentence': '上课时学生手机响个不停，老师一怒之下把手机摔了，家长拿发票让老师赔，大家怎么看待这种事？', 'label': 7, 'idx': 0}


In [8]:
# Function to segment Chinese text using jieba
def segment_text(examples):
    # 'sentence' is the column name in TNEWS containing the text
    return {"text_cut": [" ".join(jieba.cut(text)) for text in examples["sentence"]]}

# Apply segmentation to all splits
print("Segmenting text (this may take a few seconds)...")
c_dataset = c_dataset.map(segment_text, batched=True)

print("Original:", c_dataset['train'][0]['sentence'])
print("Segmented:", c_dataset['train'][0]['text_cut'])

Segmenting text (this may take a few seconds)...
Original: 上课时学生手机响个不停，老师一怒之下把手机摔了，家长拿发票让老师赔，大家怎么看待这种事？
Segmented: 上课时 学生 手机 响个 不停 ， 老师 一怒之下 把 手机 摔 了 ， 家长 拿 发票 让 老师 赔 ， 大家 怎么 看待 这种 事 ？


In [9]:
# Convert to pandas for easier machine learning workflow
train_df_c = c_dataset['train'].to_pandas()
test_df_c = c_dataset['validation'].to_pandas() # Using validation as test

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer
# Limit max_features to 5000 as well
count_vectorizer_c = CountVectorizer(max_features=5000)

# Fit and transform
print("Vectorizing Chinese text (Bag of Words)...")
X_train_bow_c = count_vectorizer_c.fit_transform(train_df_c['text_cut'])
X_test_bow_c = count_vectorizer_c.transform(test_df_c['text_cut'])

# Train KNN
print("Training KNN on Chinese data (BoW)...")
knn_bow_c = KNeighborsClassifier(n_neighbors=5)
knn_bow_c.fit(X_train_bow_c, train_df_c['label'])

# Evaluate
print("Predicting (BoW)...")
y_pred_bow_c = knn_bow_c.predict(X_test_bow_c)
accuracy_bow_c = accuracy_score(test_df_c['label'], y_pred_bow_c)

print(f"Chinese News Classification Accuracy (KNN+BoW): {accuracy_bow_c:.4f}")

Vectorizing Chinese text (Bag of Words)...
Training KNN on Chinese data (BoW)...
Predicting (BoW)...
Chinese News Classification Accuracy (KNN+BoW): 0.2599


In [11]:
# Initialize TF-IDF Vectorizer
# Note: We don't need stop_words='english' obviously.
# We can provide a chinese stop_words list, but we'll skip it for simplicity here.
tfidf_vectorizer_c = TfidfVectorizer(max_features=5000)

# Fit and transform
print("Vectorizing Chinese text (TF-IDF)...")
X_train_c = tfidf_vectorizer_c.fit_transform(train_df_c['text_cut'])
X_test_c = tfidf_vectorizer_c.transform(test_df_c['text_cut'])

# Train KNN
print("Training KNN on Chinese data (TF-IDF)...")
knn_c = KNeighborsClassifier(n_neighbors=5)
knn_c.fit(X_train_c, train_df_c['label'])

# Evaluate
print("Predicting (TF-IDF)...")
y_pred_c = knn_c.predict(X_test_c)
accuracy_c = accuracy_score(test_df_c['label'], y_pred_c)

print(f"Chinese News Classification Accuracy (KNN+TFIDF): {accuracy_c:.4f}")

Vectorizing Chinese text (TF-IDF)...
Training KNN on Chinese data (TF-IDF)...
Predicting (TF-IDF)...
Chinese News Classification Accuracy (KNN+TFIDF): 0.2518
