In [45]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn import metrics
from collections import Counter

In [3]:
df = pd.read_csv("D:\\Github\\NLP\\Artificial_Intelligence_for_NLP\\Week_08_0824_svm_naive_bayes_decision_tree\\Assignment\\cleaned_data.csv", 
                 index_col=0)
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)

In [4]:
df.tail()

Unnamed: 0,content,label,content_cut
86858,新华社照片，多伦多，2017年6月7日\n（体育）（2）冰球——国家女子冰球队海外选秀在多伦...,1,新华社 照片 多伦多 2017年 6月 7日 体育 2 冰球 国家 女子 冰球队 海外 选秀...
86859,新华社兰州6月3日电（王衡、徐丹）记者从甘肃省交通运输厅获悉，甘肃近日集中开建高速公路、普通...,1,新华社 兰州 6月 3日 电 王衡 徐丹 记者 从 甘肃省 交通 运输厅 获悉 甘肃 近日 ...
86860,\n\n2017年5月29日，在法国巴黎郊外的凡尔赛宫，法国总统马克龙出席新闻发布会。（新华...,1,n2017 年 5月 29日 在 法国 巴黎 郊外 的 凡尔赛宫 法国 总统 马克龙 出席 ...
86861,\n\n2017年5月25日，在美国马萨诸塞州剑桥市，哈佛大学毕业生在毕业典礼上欢呼。（新华...,1,n2017 年 5月 25日 在 美国 马萨诸塞州 剑桥市 哈佛 大学 毕业生 在 毕业 典...
86862,新华社德国杜塞尔多夫６月６日电题：乒乓女球迷　\n 新华社记者王子江、张寒\n 熊老...,1,新华社 德国 杜塞尔多夫 ６月 ６日 电 题 乒乓 女 球迷 新华社 记者 王子江 张寒 熊...


In [6]:
df.label.value_counts() / len(df.label)

1    0.9034
0    0.0966
Name: label, dtype: float64

In [7]:
vectorized = TfidfVectorizer(max_features=1000)

In [8]:
X = vectorized.fit_transform(df.content_cut)

In [9]:
X = X.toarray()
y = df.label.to_numpy()

In [10]:
X.shape, y.shape

((86863, 1000), (86863,))

In [11]:
def corpus_split(X, y):
    train_dataset, test_dataset, train_labels, test_labels = train_test_split(X, y, test_size=.2)
    train_dataset, valid_dataset, train_labels, valid_labels = train_test_split(train_dataset, train_labels, test_size=.25)
    
    return train_dataset, train_labels, valid_dataset, valid_labels, test_dataset, test_labels

In [12]:
train_dataset, train_labels, valid_dataset, valid_labels, test_dataset, test_labels = corpus_split(X, y)

In [13]:
train_dataset.shape, train_labels.shape, valid_dataset.shape, valid_labels.shape, test_dataset.shape, test_labels.shape

((52117, 1000), (52117,), (17373, 1000), (17373,), (17373, 1000), (17373,))

分离的训练集正负样本占比和整体相等。

In [22]:
print(round(train_labels[train_labels == 1].shape[0] / train_labels.shape[0], 4))
print(round(train_labels[train_labels == 0].shape[0] / train_labels.shape[0], 4))

0.9034
0.0966


In [27]:
km_cluster = KMeans(2)

In [28]:
km_cluster.fit(train_dataset)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=2, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [31]:
km_cluster.cluster_centers_

array([[0.0049451 , 0.00284407, 0.01047298, ..., 0.00121321, 0.00089456,
        0.00030944],
       [0.00537977, 0.00295522, 0.00827494, ..., 0.00809344, 0.00820927,
        0.00337306]])

In [47]:
result = (km_cluster.labels_ == train_labels).astype(int)
print(km_cluster.labels_)
print(train_labels)
Counter(train_labels), Counter(km_cluster.labels_)

[0 1 0 ... 1 1 1]
[1 1 1 ... 1 1 1]


(Counter({1: 47085, 0: 5032}), Counter({0: 11477, 1: 40640}))

准确率

In [35]:
np.mean(result)

0.6839227123587314

In [37]:
print("Homogeneity: %0.3f" % metrics.homogeneity_score(train_labels, km_cluster.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(train_labels, km_cluster.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(train_labels, km_cluster.labels_))
print("Adjusted Rand-Index: %.3f"% metrics.adjusted_rand_score(train_labels, km_cluster.labels_))
print("Silhouette Coefficient: %0.3f"% metrics.silhouette_score(train_dataset, km_cluster.labels_, sample_size=1000))

Homogeneity: 0.074
Completeness: 0.045
V-measure: 0.056
Adjusted Rand-Index: -0.086
Silhouette Coefficient: 0.023


In [38]:
print(metrics.classification_report(train_labels, km_cluster.labels_))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      5032
           1       0.88      0.76      0.81     47085

    accuracy                           0.68     52117
   macro avg       0.44      0.38      0.41     52117
weighted avg       0.79      0.68      0.73     52117



In [40]:
y_pred = km_cluster.predict(test_dataset)
print(metrics.classification_report(test_labels, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1645
           1       0.88      0.76      0.82     15728

    accuracy                           0.69     17373
   macro avg       0.44      0.38      0.41     17373
weighted avg       0.80      0.69      0.74     17373



效果不如瞎猜，毕竟正数据占数据的90%