In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from foundation import *

# Example: XGBoost and LightGBM with Sampled Data and TF-IDF Features

In [2]:
def sample(dspath, n, sampledspath, traindspath, testdspath, traincorpuspath, testcorpuspath, testsize, random_state=None):
    """Sample a CSV dataset, split it into train/test, and persist datasets and corpora.

    Steps, in order:
      1. Read ``dspath`` and draw ``n`` random rows.
      2. Write the sampled rows to ``sampledspath``.
      3. Split the sample with ``train_test_split`` and write both parts to
         ``traindspath`` / ``testdspath`` (without the index column).
      4. Build a corpus for each part with ``generate_corpus`` and write them to
         ``traincorpuspath`` / ``testcorpuspath`` (without the index column).

    ``train_test_split``, ``generate_corpus`` and ``stop_words`` are not defined in
    this function; they are presumably provided by ``from foundation import *``.

    Args:
        dspath: Path of the source CSV file.
        n: Number of rows to sample from the source dataset.
        sampledspath: Output path for the sampled rows.
        traindspath: Output path for the train split.
        testdspath: Output path for the test split.
        traincorpuspath: Output path for the train corpus.
        testcorpuspath: Output path for the test corpus.
        testsize: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Optional seed for both the row sampling and the split.
            The default ``None`` keeps the original unseeded behaviour; pass an
            integer to make the generated files reproducible between runs.

    Returns:
        None. Results are written to the given paths and progress is printed.
    """
    print(f">>> Loading dataset from \"{dspath}\".")
    data = pd.read_csv(dspath).sample(n, random_state=random_state)
    print(">>> Saving samples.")
    # Unlike the train/test files below, this file is written with the index column
    # (pandas default), which here holds the row labels of the source dataset.
    data.to_csv(sampledspath)
    print(">>> Spliting dataset.")
    # Forward the seed only when one was requested, so the default call to
    # train_test_split is exactly the same as before.
    split_kwargs = {} if random_state is None else {"random_state": random_state}
    train, test = train_test_split(data, test_size=testsize, **split_kwargs)
    print(f"train.shape: {train.shape}")
    print(f"test.shape: {test.shape}")
    print(">>> Saving the train and the test dataset.")
    print(">>> Task 1/2: Train dataset")
    train.to_csv(traindspath, index=False)
    print(">>> Task 2/2: Test dataset")
    test.to_csv(testdspath, index=False)
    print(">>> Generating corpus.")
    print(">>> Task 1/2: Train corpus")
    train_corpus = generate_corpus(train, stop_words)
    print(">>> Task 2/2: Test corpus")
    test_corpus = generate_corpus(test, stop_words)
    print(f"train_corpus len: {len(train_corpus)}")
    print(f"test_corpus len: {len(test_corpus)}")
    print(">>> Saving the train and the test corpus.")
    print(">>> Task 1/2: Train corpus")
    pd.DataFrame(train_corpus).to_csv(traincorpuspath, index=False)
    print(">>> Task 2/2: Test corpus")
    pd.DataFrame(test_corpus).to_csv(testcorpuspath, index=False)

In [3]:
# Draw 50,000 rows from news.csv, split them 50/50 into train and test,
# and write the sampled datasets and their corpora next to the notebook.
sample(
    dspath="news.csv",
    n=50000,
    sampledspath="news-sample.csv",
    traindspath="news-sample-train.csv",
    testdspath="news-sample-test.csv",
    traincorpuspath="news-corpus-sample-train.csv",
    testcorpuspath="news-corpus-sample-test.csv",
    testsize=0.5,
)

>>> Loading dataset from "news.csv".


  if (await self.run_code(code, result,  async_=asy)):


>>> Saving samples.
>>> Spliting dataset.
train.shape: (25000, 7)
test.shape: (25000, 7)
>>> Saving the train and the test dataset.
>>> Task 1/2: Train dataset
>>> Task 2/2: Test dataset


Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/5l/6k3fjmx956sb52cw3f7w5x700000gn/T/jieba.cache


>>> Generating corpus.
>>> Task 1/2: Train corpus
The progress of generating corpus is 0.0% (0/25000).


Loading model cost 0.643 seconds.
Prefix dict has been built successfully.


The progress of generating corpus is 4.0% (1000/25000).
The progress of generating corpus is 8.0% (2000/25000).
The progress of generating corpus is 12.0% (3000/25000).
The progress of generating corpus is 16.0% (4000/25000).
The progress of generating corpus is 20.0% (5000/25000).
The progress of generating corpus is 24.0% (6000/25000).
The progress of generating corpus is 28.0% (7000/25000).
The progress of generating corpus is 32.0% (8000/25000).
The progress of generating corpus is 36.0% (9000/25000).
The progress of generating corpus is 40.0% (10000/25000).
The progress of generating corpus is 44.0% (11000/25000).
The progress of generating corpus is 48.0% (12000/25000).
The progress of generating corpus is 52.0% (13000/25000).
The progress of generating corpus is 56.0% (14000/25000).
The progress of generating corpus is 60.0% (15000/25000).
The progress of generating corpus is 64.0% (16000/25000).
The progress of generating corpus is 68.0% (17000/25000).
The progress of generatin

## Load Corpus

In [4]:
# Load the saved corpora as plain lists of documents (one string per row).
# The first column is read positionally instead of calling `.stack()`: stack drops
# NaN cells, and an empty document is read back from CSV as NaN, so rows could
# silently disappear and the corpus would no longer line up with the labels.
# Empty documents are kept as "" so every row keeps its position.
sampled_train_corpus = pd.read_csv("news-corpus-sample-train.csv").iloc[:, 0].fillna("").astype(str).tolist()
sampled_test_corpus = pd.read_csv("news-corpus-sample-test.csv").iloc[:, 0].fillna("").astype(str).tolist()

In [5]:
# Show how many documents each corpus holds, followed by the first train document.
print(
    f"sampled_train_corpus shape: {len(sampled_train_corpus)}",
    f"sampled_test_corpus shape: {len(sampled_test_corpus)}",
    sampled_train_corpus[0],
    sep="\n",
)

sampled_train_corpus shape: 25000
sampled_test_corpus shape: 25000
男子 偷 银行 窃走 万元 假币 心存侥幸 消费 被捕 凌晨 时许 占 攀爬 脚手架 进入 正在 装修 银行 二楼 会议室 办公桌 抽屉 盗走 假 人民币 万元 占 发现自己 盗得 假币 抱 侥幸心理 县城 餐馆 假币 付账 店主 报警 抓获


In [17]:
# Report token statistics for each sampled corpus: train first, then test.
# NOTE(review): corpus_count is not defined in this notebook (presumably it comes
# from `from foundation import *`); judging by the recorded output it emits
# (token, count) pairs in descending count order — confirm against its definition.
print(">>> sampled_train_corpus")
corpus_count(sampled_train_corpus)
print(">>> sampled_test_corpus")
corpus_count(sampled_test_corpus)

>>> sampled_train_corpus
[('公司', 5450), ('中国', 5190), ('市场', 5026), ('美国', 2614), ('时间', 2566), ('发展', 2551), ('比赛', 2358), ('基金', 2266), ('企业', 2223), ('经济', 2149), ('投资', 2142), ('亿元', 2112), ('技术', 2052), ('工作', 1947), ('产品', 1925), ('银行', 1849), ('情况', 1838), ('数据', 1804), ('出现', 1792), ('增长', 1783), ('影响', 1720), ('信息', 1713), ('手机', 1613), ('行业', 1593), ('平台', 1557), ('用户', 1552), ('提供', 1515), ('希望', 1461), ('相关', 1459), ('北京', 1444), ('科技', 1436), ('业务', 1434), ('服务', 1416), ('未来', 1415), ('球员', 1408), ('价格', 1372), ('国家', 1341), ('表现', 1313), ('重要', 1306), ('发现', 1292), ('计划', 1290), ('选择', 1283), ('实现', 1276), ('管理', 1265), ('股', 1262), ('超过', 1261), ('进入', 1257), ('支持', 1252), ('专业', 1241), ('国际', 1232), ('球队', 1226), ('全球', 1222), ('资金', 1189), ('使用', 1179), ('岁', 1176), ('只', 1164), ('孩子', 1152), ('美元', 1140), ('发布', 1138), ('学校', 1131), ('政策', 1127), ('机会', 1124), ('达到', 1123), ('一定', 1116), ('方式', 1114), ('能力', 1112), ('关注', 1110), ('报道', 1108), ('学生', 1103), ('去年', 1101

## Model

In [6]:
# Labels: derived from the saved train/test datasets via generate_y
# (not defined in this notebook; presumably from `from foundation import *`).
train_y, test_y = (
    generate_y(pd.read_csv(dataset_path))
    for dataset_path in ("news-sample-train.csv", "news-sample-test.csv")
)

# Features: TF-IDF restricted to the 5000 highest-frequency terms. The vocabulary
# and IDF weights are learned from the train corpus only and reused for the test corpus.
# NOTE(review): the default token_pattern of TfidfVectorizer ignores
# single-character tokens — confirm that is intended for this corpus.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
train_X = tfidf_vectorizer.fit_transform(sampled_train_corpus)
test_X = tfidf_vectorizer.transform(sampled_test_corpus)

In [7]:
# Sanity-check that features and labels have matching row counts,
# then show the test labels.
print(
    f"train_X shape: {train_X.shape}",
    f"test_X shape: {test_X.shape}",
    f"train_y shape: {train_y.shape}",
    f"test_y shape: {test_y.shape}",
    sep="\n",
)

print(test_y)

train_X shape: (25000, 5000)
test_X shape: (25000, 5000)
train_y shape: (25000,)
test_y shape: (25000,)
[ 9. 11.  1. ...  9.  1.  1.]


In [21]:
# XGBoost multi-class classifier trained on the TF-IDF features.
# `silent` is no longer an XGBoost parameter (it only triggers a
# "Parameters: { silent } might not be used" warning); `verbosity=0` is the
# supported way to keep training quiet.
model = xgb.XGBClassifier(max_depth=6, learning_rate=0.1, n_estimators=500, verbosity=0, objective='multi:softmax')
model.fit(train_X, train_y)
# Predictions on the held-out test features; scored in the Assessment section.
pred_y = model.predict(test_X)

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [None]:
# lightgbm
# Train on the train features/labels. The original paired test_X with train_y,
# which fits the model on features that do not belong to the labels.
lgb_train = lgb.Dataset(train_X, train_y)
# The test split is used as the validation set and shares the train set's binning.
lgb_test = lgb.Dataset(test_X, test_y, reference=lgb_train)
# assumes labels are integer class ids in [0, 12) — TODO confirm against generate_y
params = {'max_depth': 5, 'min_data_in_leaf': 20, 'num_leaves': 35, 
          'learning_rate': 0.1, 'lambda_l1': 0.1, 'lambda_l2': 0.2,
          'objective': 'multiclass', 'num_class': 12, 'verbose': -1}
num_boost_round = 1000

# NOTE(review): `verbose_eval` was removed from `lgb.train` in LightGBM 4.0; on newer
# versions pass `callbacks=[lgb.log_evaluation(100)]` instead.
gbm = lgb.train(params, lgb_train, num_boost_round, verbose_eval=100, valid_sets=lgb_test)
# Predict on the test features so pred_y lines up with test_y in the Assessment
# section (the original predicted on train_X).
pred_y_possbabilty = gbm.predict(test_X, num_iteration=gbm.best_iteration)
# Pick the most probable class per row from the per-class probabilities.
pred_y = np.argmax(pred_y_possbabilty, axis=1)

print(pred_y.shape)

## Assessment

In [22]:
# Score the latest predictions against the test labels. Precision, recall and F1
# are averaged across classes, weighted by each class's support.
print(
    f"accuracy_score: {accuracy_score(test_y, pred_y)}",
    f"precision_score: {precision_score(test_y, pred_y, average='weighted')}",
    f"recall_score: {recall_score(test_y, pred_y, average='weighted')}",
    f"f1_score: {f1_score(test_y, pred_y, average='weighted')}",
    sep="\n",
)

accuracy_score: 0.90104
precision_score: 0.9011257557233033
recall_score: 0.90104
f1_score: 0.8999520112952446
