In [None]:
# Data preparation. Read all articles from csv.
import pandas as pd
# Tag vocabulary: only the tag_name column is kept, with every entry coerced to str.
tag_table = pd.read_csv("tag_id_name.csv")
tags_list = list(map(str, tag_table.tag_name))

# Articles: read only the columns listed in usecols.
articles = pd.read_csv(
    'article_tag_full_prod.csv',
    usecols=[
        'a_content_id',
        'a_content',
        'tag_names',
        'tag_id_paths',
        'a_channel_id',
    ],
)

# The tag table itself is not needed once tags_list has been built.
del tag_table

In [None]:
#EDA
import re

from collections import Counter

# Gather every tag occurrence across all articles. Each tag_names cell holds the
# tags of one article joined with '|'. str() keeps the original handling of
# missing values (a NaN cell becomes the literal tag 'nan').
tags_for_articles = []
for tag_field in articles['tag_names']:
    tags_for_articles.extend(str(tag_field).split('|'))

# Count every tag in a single pass. Calling list.count() once per distinct tag
# rescans the whole occurrence list each time
# (distinct tags x total occurrences comparisons).
tag_counts = Counter(tags_for_articles)
tags_set = set(tag_counts)

#Part 1 - find tags in the sample articles
print(len(tags_set)) # 12549 different tags
total_num_tags = len(tags_for_articles)
print(total_num_tags) # 1583746 tags used for all articles

#Part 2 - tags distribution
# most_common() returns (tag, count) pairs sorted by descending count.
distribution = tag_counts.most_common()
#Top 3 tags: 疫情 (epidemic), 新冠肺炎疫情 (COVID-19 epidemic), 新冠肺炎 (COVID-19)
print('Top 3: ', distribution[0], ', ', distribution[1], ', ', distribution[2])
#Accounting percentage of top 3 tags
top_three_total = sum(count for _tag, count in distribution[:3])
print('Acounting percentage: ', top_three_total/total_num_tags * 100, '%')

In [None]:
import string
import re
from ckiptagger import data_utils, construct_dictionary, WS

# Data cleaning (remove punctuation, special characters, spaces)
def clean(article):
    """Strip everything except non-ASCII word characters from ``article``.

    Removes, in a single pass, every non-word character (punctuation,
    whitespace, parentheses, ...), every ASCII digit and every ASCII letter.
    Word characters outside those ASCII ranges (e.g. CJK text, the underscore,
    full-width digits) are kept.

    Args:
        article: the raw text as a ``str``.

    Returns:
        The filtered text as a ``str``.
    """
    # Deleting characters set by set is order-independent, so the separate
    # passes can be expressed as one character class.
    return re.sub(r'[\W0-9a-zA-Z]', '', article)

from itertools import chain

# Word segmenter loaded from the model files under ./data.
ws = WS('./data')

article_tags = []
# Context managers guarantee both output files are flushed and closed even if
# segmentation raises part-way through the loop.
with open('preprocessed_articles.txt', 'w', encoding='utf8') as pa, \
        open('preprocessed_articles_tags.txt', 'w', encoding='utf8') as pat:
    for i, row in articles.iterrows():
        # tag_names holds the tags of one article joined with '|'.
        tag_names = str(row['tag_names']).split('|')
        article_tags.append(tag_names)

        # Split into sentences BEFORE cleaning: clean() removes every non-word
        # character, including the delimiters used here, so splitting the
        # cleaned text would never find a sentence boundary.
        raw_sentences = re.split('。|！|？| ', str(row['a_content']))
        sentence_list = [sentence for sentence in map(clean, raw_sentences) if sentence]

        # Skip the segmenter entirely when nothing is left after cleaning.
        word_sentence_list = ws(sentence_list) if sentence_list else []

        # Exactly one line per article in each file, so line k of the tags file
        # always belongs to line k of the articles file.
        pa.write(' '.join(chain.from_iterable(word_sentence_list)) + '\n')
        pat.write('[' + ', '.join(map(str, tag_names)) + ']\n')
del ws

In [None]:
# LinearSVC
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
import re

# Load the stop-word list, the segmented articles and their tags. Each file is
# opened in a `with` block so its handle is closed as soon as it has been read.
# NOTE(review): stop_words is loaded but not passed to the vectorizer in this
# cell -- confirm whether it should be.
with open('stop_words.txt', 'r', encoding='utf8') as f:
    stop_words = [word.replace('\n', '') for word in f]
with open('preprocessed_articles.txt', 'r', encoding='utf8') as fa:
    preprocessed_articles = [article.replace('\n', '') for article in fa]
with open('preprocessed_articles_tags.txt', 'r', encoding='utf8') as ft:
    # Every line has the form "[tag1, tag2, ...]": drop the brackets and the
    # newline, then split on ", ".
    article_tags = [
        tag.replace('[', '').replace(']', '').replace('\n', '').split(', ')
        for tag in ft
    ]

# Restrict the experiment to the first 20000 articles and their tags.
all_articles = np.array(preprocessed_articles[:20000])
all_articles_tags = article_tags[:20000]
X_train, X_test, y_train, y_test = train_test_split(
    all_articles,
    all_articles_tags,
    test_size=0.33,
    random_state=42,
)

# Turn the per-article tag lists of the training split into a binary
# indicator matrix (one column per tag seen in y_train).
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(y_train)

# Bag-of-words counts -> TF-IDF weighting -> one LinearSVC per tag.
pipeline_steps = [
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC())),
]
classifier = Pipeline(pipeline_steps)
classifier.fit(X_train, Y)

# Predict on the held-out articles and map indicator rows back to tag names.
predicted = classifier.predict(X_test)
all_labels = mlb.inverse_transform(predicted)

# One line per test article: the editor-assigned tags next to the predicted ones.
with open('training_result.txt', 'w', encoding='utf8') as writer:
    for editor_tags, model_tags in zip(y_test, all_labels):
        editor_part = 'Tags by editor: [' + ', '.join(editor_tags) + ']; '
        model_part = 'Tags by model: [' + ', '.join(model_tags) + ']\n'
        writer.write(editor_part + model_part)

In [None]:
# LinearSVC - Test for 40000 articles
# MultinomialNB - Test for 40000 articles
# LogisticRegression - Test for 40000 articles
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
import re
from sklearn.metrics import f1_score

# f = open('stop_words.txt', 'r', encoding='utf8')
# stop_words = [re.sub('\\n', '', str(word)).replace(' ', '') for word in f]
# Load the segmented articles and their tags. Each file is opened in a `with`
# block so its handle is closed as soon as it has been read.
with open('preprocessed_articles.txt', 'r', encoding='utf8') as fa:
    preprocessed_articles = [article.replace('\n', '') for article in fa]
with open('preprocessed_articles_tags.txt', 'r', encoding='utf8') as ft:
    # Every line has the form "[tag1, tag2, ...]": drop the brackets and the
    # newline, then split on ", ".
    article_tags = [
        tag.replace('[', '').replace(']', '').replace('\n', '').split(', ')
        for tag in ft
    ]

# notNullArticleIndex =  list()
# for i in range(len(preprocessed_articles[0:105797])):
#     if preprocessed_articles[i] != '':
#         notNullArticleIndex.append(i)

# removed_null_articles = [preprocessed_articles[i] for i in notNullArticleIndex]
# removed_null_articles_tags = [article_tags[i] for i in notNullArticleIndex]

# fa.close()
# ft.close()
# del preprocessed_articles
# del article_tags
# del notNullArticleIndex

# Binarize the tag lists of the first 40000 articles (one column per tag).
mlb = MultiLabelBinarizer()
binarized_labels = mlb.fit_transform(article_tags[:40000])

# Hold out 33% of those articles for evaluation; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    preprocessed_articles[:40000],
    binarized_labels,
    test_size=0.33,
    random_state=42,
)

# TF-IDF features followed by a one-vs-rest classifier. The commented-out
# lines are the alternative estimators that were tried, with their scores.
classifier = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    # ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=-1)) # f1_score: 0.39575 (40000 articles)
    # ('clf', OneVsRestClassifier(MultinomialNB(fit_prior=True, class_prior=None))) # f1_score: 0.15998 (10000 articles)
    ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=-1)) # f1_score: 0.24809 (40000 articles)
])
predicted = classifier.fit(X_train, y_train).predict(X_test)

In [None]:
# Report the micro-averaged F1 of `predicted` against `y_test`.
micro_f1 = f1_score(y_test, predicted, average='micro')
print('F1 score for 40000 articles: ', str(micro_f1))

In [None]:
# Saving model
import os
import pickle

# open() does not create missing parent directories, so make sure ./models
# exists before writing into it.
os.makedirs('./models', exist_ok=True)

# NOTE(review): the file name says "svc", but this pickles whatever `classifier`
# currently holds (the pipeline cell above has LogisticRegression active) --
# confirm the intended estimator before relying on the name.
with open('./models/svc_classifier.pkl', 'wb') as f:
    pickle.dump(classifier, f)

In [None]:
# Load model
import pickle  # imported here so this cell also runs when the saving cell was skipped

# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load model files that were produced by this notebook or another
# trusted source.
with open('./models/svc_classifier.pkl', 'rb') as f:
    svc_clf = pickle.load(f)

In [None]:
# Binary Relevance
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
import re
import numpy as np
from sklearn.metrics import f1_score

# Load the stop-word list, the segmented articles and their tags. Each file is
# opened in a `with` block so its handle is closed as soon as it has been read.
# NOTE(review): stop_words is loaded but not passed to the vectorizer in this
# cell -- confirm whether it should be.
with open('stop_words.txt', 'r', encoding='utf8') as f:
    stop_words = [word.replace('\n', '') for word in f]
with open('preprocessed_articles.txt', 'r', encoding='utf8') as fa:
    preprocessed_articles = [article.replace('\n', '') for article in fa]
with open('preprocessed_articles_tags.txt', 'r', encoding='utf8') as ft:
    # Every line has the form "[tag1, tag2, ...]": drop the brackets and the
    # newline, then split on ", ".
    article_tags = [
        tag.replace('[', '').replace(']', '').replace('\n', '').split(', ')
        for tag in ft
    ]

# notNullArticleIndex =  list()
# for i in range(len(preprocessed_articles[0:105797])):
#     if preprocessed_articles[i] != '':
#         notNullArticleIndex.append(i)

# removed_null_articles = [preprocessed_articles[i] for i in notNullArticleIndex]
# removed_null_articles_tags = [article_tags[i] for i in notNullArticleIndex]

# fa.close()
# ft.close()
# del preprocessed_articles
# del article_tags
# del notNullArticleIndex

# Binarize the tag lists of the first 40000 articles (one column per tag).
mlb = MultiLabelBinarizer()
binarized_labels = mlb.fit_transform(article_tags[0:40000])

# Hold out 33% of those articles for evaluation; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(preprocessed_articles[0:40000], binarized_labels, test_size=0.33, random_state=42)

# initialize binary relevance multi-label classifier with a gaussian naive bayes base classifier
classifier = BinaryRelevance(GaussianNB())

# Learn the vocabulary and IDF weights from the TRAINING split only, then apply
# the same mapping to the test split. A second fit() on X_test would discard
# what was learned from X_train and leak test data into the features.
vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word')
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

#train
classifier.fit(X_train, y_train)

# predict
predictions = classifier.predict(X_test)

# micro-averaged F1 score
print("f1 score for BR: ", f1_score(y_test, predictions, average='micro'))

In [None]:
# Classifier chains
from skmultilearn.problem_transform import ClassifierChain
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# initialize classifier chains multi-label classifier
classifier = ClassifierChain(LogisticRegression())

# training logistic regression model on train data
# X_train / X_test are the TF-IDF matrices produced in the Binary Relevance
# cell; the lowercase x_train / x_test used here before were never assigned.
classifier.fit(X_train, y_train)

# predict
predictions = classifier.predict(X_test)

# accuracy
print("Accuracy = ", accuracy_score(y_test, predictions))
print("\n")

In [None]:
# Label powerset
from skmultilearn.problem_transform import LabelPowerset
# Imported in this cell so it does not depend on names another cell brought in.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# initialize label powerset multi-label classifier
classifier = LabelPowerset(LogisticRegression())

# train
# X_train / X_test are the TF-IDF matrices produced in the Binary Relevance
# cell; the lowercase x_train / x_test used here before were never assigned.
classifier.fit(X_train, y_train)

# predict
predictions = classifier.predict(X_test)

# accuracy
print("Accuracy = ", accuracy_score(y_test, predictions))
print("\n")

In [None]:
# Adapted algorithm
from skmultilearn.adapt import MLkNN
from scipy.sparse import csr_matrix, lil_matrix
from sklearn.metrics import accuracy_score

classifier_new = MLkNN(k=10)
# Note that this classifier can throw up errors when handling sparse matrices.
# Build dense copies from the TF-IDF matrices produced in the Binary Relevance
# cell (X_train / X_test). Reading x_train / x_test here, as before, failed
# because those lowercase names had never been assigned.
x_train = lil_matrix(X_train).toarray()
y_train = lil_matrix(y_train).toarray()
x_test = lil_matrix(X_test).toarray()
# train
classifier_new.fit(x_train, y_train)
# predict
predictions_new = classifier_new.predict(x_test)
# accuracy
print("Accuracy = ",accuracy_score(y_test,predictions_new))