In [None]:
# Data preparation. Read all articles from csv.
import pandas as pd
# Load the tag table and turn every entry of its `tag_name` column into a str.
tags = pd.read_csv("tag_id_name.csv")
tags_list = list(map(str, tags["tag_name"]))

# Read the article dump, restricted to the columns listed below.
article_columns = [
    'a_content_id',
    'a_content',
    'tag_names',
    'tag_id_paths',
    'a_channel_id',
]
articles = pd.read_csv('article_tag_full_prod.csv', usecols=article_columns)

In [None]:
import string
import re
from ckiptagger import data_utils, construct_dictionary, WS

# Data cleaning (remove punctuation, special characters, spaces)
def clean(article):
    """Return ``article`` with unwanted characters deleted.

    The removal patterns are applied one after another, each deleting its
    matches: parentheses, runs of ASCII digits, every non-word character
    (punctuation, whitespace, ...), and finally ASCII letters. Underscores
    and non-ASCII word characters are kept.

    Args:
        article: The text to clean (str).

    Returns:
        The cleaned text (str).
    """
    removal_patterns = ('[()]', r'[0-9]+', r'[^\w]', r'[a-zA-Z]')
    text = article
    for pattern in removal_patterns:
        text = re.sub(pattern, '', text)
    return text

# Word segmenter backed by the model files under ./data.
ws = WS('./data')

# Segment every article into space-separated words and write one article per
# line to preprocessed_articles.txt; the matching tag list for each article is
# written to the same line number of preprocessed_articles_tags.txt and also
# collected in `article_tags`.
article_tags = []
# `with` guarantees both files are closed even if segmentation raises midway.
with open('preprocessed_articles.txt', 'w', encoding='utf8') as pa, \
        open('preprocessed_articles_tags.txt', 'w', encoding='utf8') as pat:
    for _, row in articles.iterrows():
        # NOTE(review): a missing tag_names value becomes the single tag 'nan'
        # through str(); confirm whether such rows should be skipped instead.
        tag_names = str(row['tag_names']).split('|')
        article_tags.append(tag_names)

        # Split into sentences BEFORE cleaning: clean() deletes every non-word
        # character, including the delimiters below, so splitting its output
        # would always yield a single "sentence".
        raw_sentences = re.split('。|！|？| ', str(row['a_content']))
        sentence_list = [sentence for sentence in map(clean, raw_sentences) if sentence]

        # Skip the segmenter entirely when nothing is left after cleaning.
        word_sentence_list = ws(sentence_list) if sentence_list else []
        words = [word for sentence in word_sentence_list for word in sentence]

        # Always write a line (possibly empty) so that line numbers stay
        # aligned with `article_tags`.
        pa.write(' '.join(words) + '\n')
        pat.write('[' + ', '.join(map(str, tag_names)) + ']\n')
del ws

In [9]:
# Close both preprocessing output files; close() on an already closed file
# has no effect.
for handle in (pa, pat):
    handle.close()

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
import re

# Stop words, one per line (trailing newline removed).
with open('stop_words.txt', 'r', encoding='utf8') as f:
    stop_words = [line.rstrip('\n') for line in f]

# Segmented articles, one per line. This reads the file written by the
# preprocessing cell so that line i lines up with article_tags[i].
# Blank lines are kept on purpose to preserve that alignment.
with open('preprocessed_articles.txt', 'r', encoding='utf8') as fa:
    preprocessed_articles = [line.rstrip('\n') for line in fa]

all_articles = np.array(preprocessed_articles)
# NOTE: `article_tags` comes from the preprocessing cell, which must have been
# run in this kernel session.
all_articles_tags = article_tags
X_train, X_test, y_train, y_test = train_test_split(all_articles, all_articles_tags, test_size=0.33, random_state=42)

# Turn the per-article tag lists into a binary indicator matrix.
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(y_train)

# Stop words are filtered while counting tokens: `stop_words` is a
# CountVectorizer option, TfidfTransformer does not accept it.
# NOTE(review): CountVectorizer's default token_pattern only keeps tokens of
# two or more word characters, so single-character words are dropped;
# confirm this is intended for the segmented text.
classifier = Pipeline([
    ('vectorizer', CountVectorizer(stop_words=stop_words)),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC()))
])
classifier.fit(X_train, Y)
predicted = classifier.predict(X_test)
all_labels = mlb.inverse_transform(predicted)

# One "<article> => <labels>" line per test article. write() with an explicit
# newline is used because writelines() on a str adds no line breaks.
with open('training_result.txt', 'w', encoding='utf8') as writer:
    for item, labels in zip(X_test, all_labels):
        writer.write('{0} => {1}\n'.format(item, ', '.join(labels)))

In [None]:
# Split test data and train data
import re
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import seaborn as sns
from sklearn.model_selection import train_test_split


# Hold out 33% of the rows for testing; the fixed random_state makes the
# split reproducible.
# NOTE(review): `simple_articles` is not defined in any cell shown here; it
# must exist in the kernel before this cell runs.
train, test = train_test_split(simple_articles, test_size=0.33, random_state=42)

# Features are the `content` column; targets are every column except `id` and
# `content`. `columns=` is required: drop(labels=[...]) defaults to axis=0 and
# would try to remove ROWS labelled 'id' / 'content'.
x_train = train.content
y_train = train.drop(columns=['id', 'content'])
x_test = test.content
y_test = test.drop(columns=['id', 'content'])

In [None]:
import re
# Stop words, one per line (trailing newline removed); the file is closed as
# soon as it has been read.
with open('stop_words.txt', 'r', encoding='utf8') as f:
    stop_words = [line.rstrip('\n') for line in f]

# Define pipelines combining a TF-IDF text feature extractor with a
# one-vs-rest classifier.
# Naive Bayes
NB_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
    ('clf', OneVsRestClassifier(MultinomialNB(fit_prior=True, class_prior=None))),
])

# LinearSVC
SVC_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
    ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1)),
])

# Logistic Regression
LogReg_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
    ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=1)),
])

# Fit each pipeline once per tag, in the same model order as before, and keep
# every prediction instead of overwriting it:
# tag_predictions[model_name][tag] -> predictions for x_test.
# `prediction` still ends up holding the most recent result.
# NOTE(review): the original author flagged "some problem on train[tag]";
# presumably not every name in tags_list is a column of `train` - verify.
tag_predictions = {}
model_pipelines = (
    ('NB', NB_pipeline),
    ('LinearSVC', SVC_pipeline),
    ('LogReg', LogReg_pipeline),
)
for model_name, pipeline in model_pipelines:
    tag_predictions[model_name] = {}
    for tag in tags_list:
        pipeline.fit(x_train, train[tag])
        prediction = pipeline.predict(x_test)
        tag_predictions[model_name][tag] = prediction

In [None]:
# Binary Relevance
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB

# initialize binary relevance multi-label classifier with a gaussian naive bayes base classifier
classifier = BinaryRelevance(GaussianNB())

# train
# NOTE(review): x_train is assigned from the `content` column in the split
# cell; presumably it has to be converted to numeric features before it can be
# fitted here - confirm.
classifier.fit(x_train, y_train)

# predict
predictions = classifier.predict(x_test)

# accuracy
print("Accuracy = ", accuracy_score(y_test, predictions))
# The original line held two print calls with no separator, which is a
# syntax error; a single call is kept, matching the sibling cells.
print("\n")

In [None]:
# Classifier chains
from skmultilearn.problem_transform import ClassifierChain
from sklearn.linear_model import LogisticRegression

# Classifier chain wrapping a logistic regression base estimator.
chain_base_estimator = LogisticRegression()
classifier = ClassifierChain(chain_base_estimator)

# Fit on the training split, then predict for the held-out split.
classifier.fit(x_train, y_train)
predictions = classifier.predict(x_test)

# Report accuracy_score of the predictions against y_test.
chain_accuracy = accuracy_score(y_test, predictions)
print("Accuracy = ", chain_accuracy)
print("\n")

In [None]:
# Label powerset
from skmultilearn.problem_transform import LabelPowerset

# Label powerset classifier wrapping a logistic regression base estimator.
powerset_base_estimator = LogisticRegression()
classifier = LabelPowerset(powerset_base_estimator)

# Fit on the training split, then predict for the held-out split.
classifier.fit(x_train, y_train)
predictions = classifier.predict(x_test)

# Report accuracy_score of the predictions against y_test.
powerset_accuracy = accuracy_score(y_test, predictions)
print("Accuracy = ", powerset_accuracy)
print("\n")

In [None]:
# Adapted algorithm
from skmultilearn.adapt import MLkNN
from scipy.sparse import csr_matrix, lil_matrix
classifier_new = MLkNN(k=10)

# Note that this classifier can throw up errors when handling sparse matrices,
# so the three inputs are replaced by dense arrays (in the order x_train,
# y_train, x_test). This overwrites the existing variables.
x_train, y_train, x_test = (
    lil_matrix(split).toarray() for split in (x_train, y_train, x_test)
)

# Fit on the dense training data, then predict for the dense test features.
classifier_new.fit(x_train, y_train)
predictions_new = classifier_new.predict(x_test)

# Report accuracy_score of the predictions against y_test.
mlknn_accuracy = accuracy_score(y_test, predictions_new)
print("Accuracy = ", mlknn_accuracy)