In [1]:
# Data preparation. Read all articles from csv.
import pandas as pd
tags = pd.read_csv("tag_id_name.csv")
tags_list = [str(tag) for tag in tags.tag_name]
articles = pd.read_csv('article_tag_full_prod.csv', usecols=['a_content', 'tag_names'])
del tags

In [None]:
#EDA
import re

tags_set = set()
tags_for_articles = []
for i, row in articles.iterrows():
    tag_names = re.split('\|', str(row['tag_names']))
    tags_set.update(tag_names)
    tags_for_articles.extend(tag_names)

#Part 1 - find tags in the sample articles
print(len(tags_set)) # 12,549 different tags
total_num_tags = len(tags_for_articles)
print(total_num_tags) # 1,583,746 tags used for all articles

#Part 2 - tags distribution
distribution = [(tag, tags_for_articles.count(tag)) for tag in tags_set]
distribution.sort(key = lambda x: x[1], reverse=True)
#Top 3 tags: 疫情, 新冠肺炎疫情, 新冠肺炎
print('Top 3: ', distribution[0], ', ', distribution[1], ', ', distribution[2])
#Accounting percentage of top 3 tags
print('Acounting percentage: ', (distribution[0][1] + distribution[1][1] + distribution[2][1])/total_num_tags * 100, '%')
#Amount of tags that have more than 3 articles instances: 9876 through out 12,549 tags
print('Amount of tags that have more than 3 articles instances: ', len(list(filter(lambda tup: tup[1] >= 3, distribution))))
#Amount of tags that have 2 articles instances: 962 through out 12,549 tags
print('Amount of tags that have 2 articles instances: ', len(list(filter(lambda tup: tup[1] == 2, distribution))))
#Amount of tags that have 1 articles instances: 1711 through out 12,549 tags
print('Amount of tags that have 2 articles instances: ', len(list(filter(lambda tup: tup[1] == 1, distribution))))

In [None]:
# Store the processed articles and corresponding tags in txt files
import string
import re
from ckiptagger import data_utils, construct_dictionary, WS

# Data cleaning (remove punctuation, special characters, spaces)
def clean(article):
    removed_parenthese = re.sub('[()]', '', article)
    removed_num = re.sub(r'[0-9]+', '', removed_parenthese)
    removed_w = re.sub(r'[^\w]', '', removed_num)
    removed_english = re.sub(r'[a-zA-Z]', '', removed_w)
    return removed_english

ws = WS('./data')

article_tags = []
pa = open('preprocessed_articles.txt', 'w', encoding='utf8')
pat = open('preprocessed_articles_tags.txt', 'w', encoding='utf8')
for i, row in articles.iterrows():
    tag_names = re.split('\|', str(row['tag_names']))
    article_tags.append(tag_names)
    sentence_list = re.split('。|！|？| ', clean(str(row['a_content'])))
    word_sentence_list = ws(sentence_list)
    not_null_sentence_list = filter(lambda sentence: len(sentence) > 0, word_sentence_list)
    row['a_content'] = ' '.join(sum(not_null_sentence_list, []))
    pa.write(row['a_content'] + '\n')
    pat.write('[' + ', '.join(map(str, tag_names)) + ']\n')
pa.close()
pat.close()
del ws

In [2]:
# clean the articles directly and get ready for tag score calculation
import string
import re
from ckiptagger import data_utils, construct_dictionary, WS
import pandas as pd

def clean(article):
    removed_parenthese = re.sub('[()]', '', article)
    removed_num = re.sub(r'[0-9]+', '', removed_parenthese)
    removed_w = re.sub(r'[^\w]', '', removed_num)
    removed_english = re.sub(r'[a-zA-Z]', '', removed_w)
    return removed_english

ws = WS('./data')

pa = open('new_preprocessed_articles.txt', 'w', encoding='utf8')
pat = open('new_preprocessed_articles_tags.txt', 'w', encoding='utf8')

for i, row in articles.iterrows():
    if pd.isnull(row['a_content']):
        continue
    pat.write('[' + ', '.join(map(str, re.split('\|', str(row['tag_names'])))) + ']\n')
    sentence_list = re.split('。|！|？| ', clean(str(row['a_content'])))
    word_sentence_list = ws(sentence_list)
    not_null_sentence_list = filter(lambda sentence: len(sentence) > 0, word_sentence_list)
    pa.write(' '.join(sum(not_null_sentence_list, [])) + '\n')
del ws
pa.close()
pat.close()


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.



In [3]:
# some articles doesn't have article content but have tags. filter out those articles and tags. construct df_articles dataframe
import pandas as pd
import re

fa = open('new_preprocessed_articles.txt', 'r', encoding='utf8')
ft = open('new_preprocessed_articles_tags.txt', 'r', encoding='utf8')

tags_set = set()
tags_for_articles = []

processed_articles = {'article_content': [re.sub('\\n', '', str(article)) for article in fa], 'article_tags': []}

for tag in ft:
    corresponding_tags = re.split(', ', str(tag).replace('[', '').replace(']', '').replace('\n', ''))
    processed_articles['article_tags'].append(corresponding_tags)
    tags_set.update(corresponding_tags)
    tags_for_articles.extend(corresponding_tags)

fa.close()
ft.close()

In [4]:
df_articles = pd.DataFrame(data=processed_articles)

In [5]:
# calculate tag score by its frequency distribution. score = occurrance / total number of article tags
tag_scores = {}
for tag in tags_set:
    tag_scores[tag] = tags_for_articles.count(tag)/len(tags_for_articles)

del tags_for_articles
del tags_set

In [6]:
# calculate article score according to the sum of its tag score and sort the articles by score
score = []
for i, row in df_articles.iterrows():
    score.append(sum([tag_scores[tag] for tag in row['article_tags']]))
df_articles['score'] = score
df_articles.sort_values(by='score', ascending=False)

Unnamed: 0,article_content,article_tags,score
53179,口罩 搶 消毒 用品 搶 新冠 肺炎 被 世界 衛生 組織 正式 定性 為 全球 大 流行 ...,"[武漢, 歐洲, 疫情, 沙士, 口罩, 日本, 疫情, 全球大流行, 澳洲, 中國, 肺炎...",2.075211e-01
356045,經濟日報 專訊 本 港 在 新冠 肺炎 疫情 下 不少 餐廳 停業 或 結業 有 日本 餐廳...,"[肺炎, 黃大仙, 新冠肺炎疫情, 疫情, 新冠肺炎, 服務, 新冠肺炎疫情, Facebo...",1.905318e-01
131165,新型 冠狀 病毒 持續 蔓延至 全球 各 地 新冠肺炎 武漢 肺炎 在 中國 內地 的 確診...,"[新冠肺炎, 冠狀病毒, 武漢, 香港, 零售, 中國, 經濟, 肺炎, 冠狀病毒, 經濟,...",1.898835e-01
358723,新冠肺炎 肆虐 不少 行業 都 受 重挫 餐飲業 旅遊業 因為 疫情 關係 不斷 出現 倒閉...,"[香港, 運輸, 裁員, 林鄭月娥, 創業, PayMe, 工程, 荃灣, 新冠肺炎疫情, ...",1.764115e-01
355194,自 上 月號 起 從 韓國 抵 港 的 港人 須 接受 日 醫學 監察 而 由 南韓 抵 港...,"[疫情, 無薪假, 社企, 口罩, 巴士, Apple, 肺炎, 首爾, 疫情, 無薪假, ...",1.729946e-01
...,...,...,...
291905,小五小六 的 學生 需要 面對 影響 升中 派位 的 小學 呈 分試 丹拿山 循道 學校 校...,[丹拿山循道學校],6.355889e-07
272044,行政 長官 林鄭月娥 已 接納 終審 法院 首席 法官 的 建議 將 兩 位 終審 法院 非...,[陳兆愷],6.355889e-07
112889,恒指 失守 會 下試 昇捷 控股 現價元 買入價 現價 目標 價元 止蝕 價元 買入 原因 ...,[昇捷控股（02340）],6.355889e-07
315990,綽號 波波 嘅前 藝人 唐麗 球年 參加 香港 小姐 競選 奪得 季軍 卸任 之後 就 為 ...,[吳毅將],6.355889e-07


In [8]:
# LinearSVC with reduced dimension so more data can be trained
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA, TruncatedSVD
import re

tags = df_articles['article_tags']
mlb = MultiLabelBinarizer()
binarized_labels = mlb.fit_transform(tags[0:100])

articles = df_articles['article_content']
X_train, X_test, y_train, y_test = train_test_split(articles[0:100], binarized_labels, test_size=0.33, random_state=42)

classifier = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ("scaler", StandardScaler(with_mean=False)),
    ("lsa", TruncatedSVD(n_components=1000)),
    # ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=-1))
    ('clf', OneVsRestClassifier(SGDClassifier(), n_jobs=-1))
])
classifier.fit(X_train, y_train)
predicted = classifier.predict(X_test)

# f1 score (LinearSVC) for 50000 articles with standard scaler and truncated svd: 0.0504148319332105
# f1 score (LinearSVC) for 40000 articles without standard scaler and truncated svd: 0.39575328375678326
# f1 score (SGDClassifier) for 100000 articles with standard scaler and truncated svd: 0.11678808708061852

In [None]:
# LinearSVC
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
import re

f = open('stop_words.txt', 'r', encoding='utf8')
stop_words = [re.sub('\\n', '', str(word)) for word in f]
fa = open('preprocessed_articles.txt', 'r', encoding='utf8')
preprocessed_articles = [re.sub('\\n', '', str(article)) for article in fa]
ft = open('preprocessed_articles_tags.txt', 'r', encoding='utf8')
article_tags = [re.split(', ', str(tag).replace('[', '').replace(']', '').replace('\n', '')) for tag in ft]

all_articles = np.array(preprocessed_articles[0:20000])
all_articles_tags = article_tags[0:20000]
X_train, X_test, y_train, y_test = train_test_split(all_articles, all_articles_tags, test_size=0.33, random_state=42)

mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(y_train)
classifier = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC()))
])
classifier.fit(X_train, Y)
predicted = classifier.predict(X_test)
all_labels = mlb.inverse_transform(predicted)

with open('training_result.txt', 'w', encoding='utf8') as writer:
    for yLabels, labels in zip(y_test, all_labels):
        writer.write('Tags by editor: ['+ ', '.join(yLabels) + ']; ' + 'Tags by model: ['+ ', '.join(labels) + ']\n')

In [None]:
# LinearSVC - Test for 40000 articles
# MultinomialNB - Test for 40000 articles
# LogisticRegression - Test for 40000 articles
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
import re
from sklearn.metrics import f1_score

# f = open('stop_words.txt', 'r', encoding='utf8')
# stop_words = [re.sub('\\n', '', str(word)).replace(' ', '') for word in f]
fa = open('preprocessed_articles.txt', 'r', encoding='utf8')
preprocessed_articles = [re.sub('\\n', '', str(article)) for article in fa]
ft = open('preprocessed_articles_tags.txt', 'r', encoding='utf8')
article_tags = [re.split(', ', str(tag).replace('[', '').replace(']', '').replace('\n', '')) for tag in ft]

mlb = MultiLabelBinarizer()
binarized_labels = mlb.fit_transform(article_tags[0:40000])

X_train, X_test, y_train, y_test = train_test_split(preprocessed_articles[0:40000], binarized_labels, test_size=0.33, random_state=42)

classifier = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=-1)) # f1_score: 0.39575 (40000 articles)
    # ('clf', OneVsRestClassifier(MultinomialNB(fit_prior=True, class_prior=None))) # f1_score: 0.15998 (10000 articles)
    # ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=-1)) # f1_score: 0.24809 (40000 articles)
])
classifier.fit(X_train, y_train)
predicted = classifier.predict(X_test)

In [9]:
from sklearn.metrics import f1_score

# Print score
print ('F1 score for 40000 articles: ', str(f1_score(y_test, predicted, average='micro')))

F1 score for 40000 articles:  0.34763948497854075


In [None]:
# Saving model
import pickle

with open('./models/svc_classifier.pkl', 'wb') as f:
    pickle.dump(classifier, f)

In [None]:
# Load model

with open('./models/svc_classifier.pkl', 'rb') as f:
    svc_clf = pickle.load(f)

In [None]:
# Binary Relevance
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
import re
import numpy as np
from sklearn.metrics import f1_score

f = open('stop_words.txt', 'r', encoding='utf8')
stop_words = [re.sub('\\n', '', str(word)) for word in f]
fa = open('preprocessed_articles.txt', 'r', encoding='utf8')
preprocessed_articles = [re.sub('\\n', '', str(article)) for article in fa]
ft = open('preprocessed_articles_tags.txt', 'r', encoding='utf8')
article_tags = [re.split(', ', str(tag).replace('[', '').replace(']', '').replace('\n', '')) for tag in ft]

# notNullArticleIndex =  list()
# for i in range(len(preprocessed_articles[0:105797])):
#     if preprocessed_articles[i] != '':
#         notNullArticleIndex.append(i)

# removed_null_articles = [preprocessed_articles[i] for i in notNullArticleIndex]
# removed_null_articles_tags = [article_tags[i] for i in notNullArticleIndex]

# fa.close()
# ft.close()
# del preprocessed_articles
# del article_tags
# del notNullArticleIndex

mlb = MultiLabelBinarizer()
binarized_labels = mlb.fit_transform(article_tags[0:40000])

X_train, X_test, y_train, y_test = train_test_split(preprocessed_articles[0:40000], binarized_labels, test_size=0.33, random_state=42)

# initialize binary relevance multi-label classifier with a gaussian naive bayes base classifier
classifier = BinaryRelevance(GaussianNB())

vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word')
vectorizer.fit(X_train)
vectorizer.fit(X_test)
X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

#train
classifier.fit(X_train, y_train)

# predict
predictions = classifier.predict(X_test)

# accuracy
print("f1 score for BR: ", f1_score(y_test, predictions, average='micro'))

In [None]:
# Classifier chains
from skmultilearn.problem_transform import ClassifierChain
from sklearn.linear_model import LogisticRegression

# initialize classifier chains multi-label classifier
classifier = ClassifierChain(LogisticRegression())

# training logistic regression model on train data
classifier.fit(x_train, y_train)

# predict
predictions = classifier.predict(x_test)

# accuracy
print("Accuracy = ", accuracy_score(y_test, predictions))

In [None]:
# Label powerset
from skmultilearn.problem_transform import LabelPowerset

# initialize label powerset multi-label classifier
classifier = LabelPowerset(LogisticRegression())

# train
classifier.fit(x_train, y_train)

# predict
predictions = classifier.predict(x_test)

# accuracy
print("Accuracy = ", accuracy_score(y_test, predictions))

In [None]:
# Adapted algorithm
from skmultilearn.adapt import MLkNN
from scipy.sparse import csr_matrix, lil_matrix
classifier_new = MLkNN(k=10)
# Note that this classifier can throw up errors when handling sparse matrices.
x_train = lil_matrix(x_train).toarray()
y_train = lil_matrix(y_train).toarray()
x_test = lil_matrix(x_test).toarray()
# train
classifier_new.fit(x_train, y_train)
# predict
predictions_new = classifier_new.predict(x_test)
# accuracy
print("Accuracy = ",accuracy_score(y_test,predictions_new))