In [1]:
import pandas as pd
import numpy as np
from foundation import load_category_mapper, sample, load_stop_words
import jieba
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Category Mapper & Stop Words

Required Files

- category_mapper.json
- stopwords.txt

In [2]:
# load category mapper from the json file. The key is the category string, and the value is corresponding int value.
cat_mapper = load_category_mapper()
# load stop words from json
stop_words = load_stop_words()

In [3]:
print(f"key: ent, val: {cat_mapper['ent']}")
print(f"stop words len: {len(stop_words)}")

key: ent, val: 10
stop words len: 1266


# Preprocessing

Corpus generation:

- `__format_str(string)` removes any characters in the string that not belongs to chinese.
- Spliting the word by jieba.

In [5]:
def __is_chinese(uchar):
    if u'\u4e00' <= uchar <= u'\u9fa5':
        return True
    else:
        return False


def __format_str(string):
    content_str = ''
    for i in string:
        if __is_chinese(i):
            content_str = content_str + i
        else:
            content_str += ","
    return content_str

def generate_corpus(dataset, stopwords):
    corpus = list()
    for idx, row in dataset.iterrows():
        if idx == 1:
            print(f"The progress of generating corpus is {idx * 100 / len(dataset)}% ({idx}/{len(dataset)}).")
        elif idx % 100 == 0:
            print(f"The progress of generating corpus is {idx * 100 / len(dataset)}% ({idx}/{len(dataset)}).")
        text = __format_str(row["title"] + row["content"])
        words = jieba.cut(text)
        pwords = list()
        for word in words:
            if word not in stopwords:
                pwords.append(word)
        text = " ".join(pwords)
        corpus.append(text)
    print(f"The progress of generating corpus is 100% ({len(corpus)}/{len(corpus)}).")
    return corpus

Enrich your stopwords by `corpus_count()`, which will list the top 100, specified by `size`, most frequent words. If the word has no meaning, you could append it into `stopwords.txt`.

In [6]:
def corpus_count(corpus, size=100):
    corpus_split = [item for sublist in [i.split(" ") for i in corpus] for item in sublist]
    count_words = dict(Counter(corpus_split))
    outputwords_sorted = sorted(count_words.items(), key=lambda x: x[1], reverse=True)[:size]
    print(outputwords_sorted)

# Vectorization

In [7]:
# get vector representing `y` from database, which must have a column named category.
def generate_y(dataset):
    y = np.zeros((len(dataset),))
    for i in range(len(dataset)):
        y[i] = cat_mapper[dataset.iloc[i, :]["category"]]
    return y

# Example: Multinomial Naive Bayes with Sampled Data and sklearn

In [None]:
# sample 10,000 data from the dataset.
sample("news.csv", 10000, "news-sample.csv", "news-sample-train.csv", "news-sample-test.csv", 0.3)

## Load Sampled Dataset

In [8]:
# load sampled train dataset and sampled test dataset
sampled_train_dataset = pd.read_csv("news-sample-train.csv")
sampled_test_dataset = pd.read_csv("news-sample-test.csv")

In [4]:
print(sampled_train_dataset.head())
print("-------------------------")
print(sampled_test_dataset.head())
print("-------------------------")

NameError: name 'sampled_train_dataset' is not defined

## Corpus

In [10]:
sampled_train_corpus = generate_corpus(sampled_train_dataset, stop_words)
sampled_test_corpus = generate_corpus(sampled_test_dataset, stop_words)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/5l/6k3fjmx956sb52cw3f7w5x700000gn/T/jieba.cache


The progress of generating corpus is 0.0% (0/7000).


Loading model cost 0.837 seconds.
Prefix dict has been built successfully.


The progress of generating corpus is 0.014285714285714285% (1/7000).
The progress of generating corpus is 1.4285714285714286% (100/7000).
The progress of generating corpus is 2.857142857142857% (200/7000).
The progress of generating corpus is 4.285714285714286% (300/7000).
The progress of generating corpus is 5.714285714285714% (400/7000).
The progress of generating corpus is 7.142857142857143% (500/7000).
The progress of generating corpus is 8.571428571428571% (600/7000).
The progress of generating corpus is 10.0% (700/7000).
The progress of generating corpus is 11.428571428571429% (800/7000).
The progress of generating corpus is 12.857142857142858% (900/7000).
The progress of generating corpus is 14.285714285714286% (1000/7000).
The progress of generating corpus is 15.714285714285714% (1100/7000).
The progress of generating corpus is 17.142857142857142% (1200/7000).
The progress of generating corpus is 18.571428571428573% (1300/7000).
The progress of generating corpus is 20.0% (1400/

In [11]:
print(sampled_train_corpus[0])

送 拉杆箱 富士通 新低 售 富士通 外观设计 采用 简约 风格 以轻 无边 镁合金 顶盖 线条 唯美 内饰 依然 十分 干净 尽显 华丽 风格 搭配 寸 流明 超炫丽 宽屏 效果 清晰 自然 整机 功能 全面 摄像头 功能 一应俱全 不足 重量 便携性 十分 突出 适合 外出 携带 点评 富士通 轻薄 系列 款 富士通 整机 配置 均衡 稳定 采用 处理器 满足用户 需求 整机 突出 轻薄 化 特点 外观 做工 精致 适合 外出 携带 整机 功能 全面 续航力 算 不错 价位 依然 偏高 喜欢 用户 联系 以下 商家 产品型号 富士通 星 钻 黑 产品价格 送 拉杆箱 商家 名称 北京 惠泽 八方 科技 商家 地址 中关村 科贸 电子城 数字 物流 港 层 室 商家 电话 商家 联系人 宫 松岗 富士通 屏幕 大小 寸 重量 公斤 类型 高端 轻薄 已有 位 用户 评论 点击 查看 评论 推荐 买 观望


In [12]:
corpus_count(sampled_train_corpus)
print()
corpus_count(sampled_test_corpus)

[('公司', 8324), ('中国', 7530), ('市场', 6854), ('美国', 3741), ('时间', 3731), ('企业', 3663), ('比赛', 3573), ('发展', 3499), ('投资', 2967), ('亿元', 2920), ('经济', 2891), ('基金', 2838), ('出现', 2806), ('情况', 2748), ('工作', 2628), ('技术', 2619), ('产品', 2548), ('北京', 2509), ('增长', 2460), ('影响', 2440), ('数据', 2426), ('用户', 2347), ('信息', 2345), ('手机', 2246), ('行业', 2238), ('相关', 2237), ('国家', 2154), ('科技', 2153), ('提供', 2114), ('希望', 2077), ('国际', 2056), ('业务', 2042), ('资金', 2014), ('未来', 1949), ('学生', 1942), ('银行', 1926), ('全球', 1922), ('股', 1919), ('表现', 1913), ('价格', 1902), ('服务', 1901), ('平台', 1862), ('进入', 1856), ('计划', 1847), ('选择', 1836), ('重要', 1814), ('项目', 1807), ('使用', 1783), ('只', 1770), ('发布', 1769), ('发现', 1763), ('超过', 1757), ('网络', 1755), ('专业', 1751), ('支持', 1745), ('球队', 1736), ('球员', 1732), ('孩子', 1696), ('一定', 1696), ('关注', 1683), ('政策', 1677), ('能力', 1658), ('学校', 1623), ('万元', 1623), ('达到', 1617), ('微博', 1615), ('去年', 1593), ('美元', 1588), ('报道', 1587), ('管理', 1585), ('实现', 1583), ('媒体', 

## Vectorization

In [13]:
# vectorizer & transformer
counter_vectorizer = CountVectorizer()
tfidf_transformer = TfidfTransformer()
# train_X & test_X
cvectors_train_X = counter_vectorizer.fit_transform(sampled_train_corpus)
train_X = tfidf_transformer.fit_transform(cvectors_train_X)
test_X = counter_vectorizer.transform(sampled_test_corpus)

In [14]:
print(cvectors_train_X)

  (0, 528)	1
  (0, 3697)	1
  (0, 3750)	1
  (0, 5143)	1
  (0, 8978)	1
  (0, 8985)	1
  (0, 10767)	1
  (0, 11037)	1
  (0, 11112)	1
  (0, 13432)	2
  (0, 13599)	1
  (0, 14615)	1
  (0, 14663)	1
  (0, 16257)	2
  (0, 16353)	1
  (0, 16576)	1
  (0, 17761)	1
  (0, 22067)	3
  (0, 22954)	1
  (0, 23495)	2
  (0, 23998)	1
  (0, 29235)	1
  (0, 31359)	1
  (0, 31512)	5
  (0, 31721)	1
  :	:
  (6999, 103639)	2
  (6999, 104081)	4
  (6999, 104163)	1
  (6999, 104920)	1
  (6999, 105333)	1
  (6999, 105855)	1
  (6999, 106413)	1
  (6999, 106873)	1
  (6999, 107429)	1
  (6999, 107996)	1
  (6999, 108419)	1
  (6999, 108603)	2
  (6999, 110106)	1
  (6999, 110568)	2
  (6999, 110598)	1
  (6999, 110634)	2
  (6999, 111047)	1
  (6999, 111298)	1
  (6999, 111677)	2
  (6999, 114137)	1
  (6999, 116594)	1
  (6999, 116802)	1
  (6999, 116803)	1
  (6999, 117574)	1
  (6999, 120207)	1


In [15]:
train_y = generate_y(sampled_train_dataset)
test_y = generate_y(sampled_test_dataset)

In [16]:
print(train_y.shape)
print(test_y.shape)
print(train_y)
print(test_y)

(7000,)
(3000,)
[28. 22. 28. ... 28. 27. 12.]
[28. 12. 27. ... 26. 27. 26.]


## Model

In [17]:
model = MultinomialNB()
model.fit(train_X, train_y)

MultinomialNB()

## Predict & Assessment

In [18]:
pred_y = model.predict(test_X)
print(f"accuracy score: {accuracy_score(test_y, pred_y)}")

accuracy score: 0.746
