In [1]:
import re
import unicodedata
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import underthesea

from sklearn import metrics

from pandarallel import pandarallel

# Initialise pandarallel (with progress bars) so that the `parallel_apply`
# calls used later in this notebook are available on pandas objects.
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


### Helper functions

In [2]:
def evaluate(y_true, y_pred):
    """Compute the headline classification scores for a set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        dict with the keys ``'Accuracy'``, ``'Precision'``, ``'Recall'`` and
        ``'F1-score'``. Precision, recall and F1 are requested with
        ``average='weighted'``.
    """
    # Shared keyword arguments for the per-class metrics.
    weighted = {'average': 'weighted'}

    return {
        'Accuracy': metrics.accuracy_score(y_true, y_pred),
        'Precision': metrics.precision_score(y_true, y_pred, **weighted),
        'Recall': metrics.recall_score(y_true, y_pred, **weighted),
        "F1-score": metrics.f1_score(y_true, y_pred, **weighted),
    }

def plot_confusion_matrix(y_true, y_pred, label):
    """Print a classification report and draw a confusion-matrix heatmap.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        label: Class labels; used for the report, for the ordering of the
            confusion matrix and as tick labels on both heatmap axes.

    Returns:
        None. The report goes to stdout and the figure is shown with
        ``plt.show()``.
    """
    print()
    print("Classification report")
    report = metrics.classification_report(y_true, y_pred, labels=label)
    print(report)
    print()

    # The matrix is requested with normalize='pred' (see the scikit-learn
    # documentation for the exact normalisation this selects).
    matrix = metrics.confusion_matrix(
        y_true=y_true,
        y_pred=y_pred,
        labels=label,
        normalize='pred',
    )

    plt.figure(figsize=(20, 20))
    # NOTE: sns.set changes seaborn's global settings, not only this figure.
    sns.set(font_scale=1.4)
    sns.heatmap(
        matrix,
        annot=True,
        fmt='.3f',
        annot_kws={"size": 16},
        xticklabels=label,
        yticklabels=label,
    )
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()
    

def train_test(pipe, X_train, X_test, y_train, y_test):
    """Fit ``pipe`` on the training split and report its test performance.

    Args:
        pipe: Estimator/pipeline exposing ``fit``, ``predict`` and ``classes_``.
        X_train, X_test: pandas objects holding the input texts; they are
            flattened to plain Python lists before being passed to ``pipe``.
        y_train, y_test: pandas objects holding the labels; flattened to 1-D
            arrays.

    Returns:
        None. A one-row score table is printed, followed by the
        classification report and confusion-matrix plot.
    """
    train_docs = list(X_train.values.ravel())
    pipe.fit(train_docs, y_train.values.ravel())

    test_docs = list(X_test.values.ravel())
    predictions = pipe.predict(test_docs)

    expected = y_test.values.ravel()
    scores = pd.DataFrame([evaluate(expected, predictions)], index=['value'])
    print(scores)

    plot_confusion_matrix(expected, predictions, label=pipe.classes_)

In [5]:
def get_number_of_captial_word(word: str) -> int:
    """Return how many characters of ``word`` are uppercase."""
    uppercase_chars = [ch for ch in word if ch.isupper()]
    return len(uppercase_chars)


def check_invalid_desc(txt, max_num=2):
    """Tell whether the first space-separated token looks suspicious.

    Args:
        txt: Description text.
        max_num: Number of uppercase characters in the first token at or
            above which the description is flagged.

    Returns:
        True when the first token contains at least ``max_num`` uppercase
        characters, otherwise False.
    """
    leading_token = txt.partition(" ")[0]
    return get_number_of_captial_word(leading_token) >= max_num


def check_invalid_desc2(sent):
    """Second-stage check on a description based on its first two tokens.

    Args:
        sent: Description text; it is split on single spaces.

    Returns:
        False when the first token has at most 3 uppercase characters and the
        second token has at most 2, or when the second token is entirely
        lowercase. True otherwise.

    A description with only one token is handled as if its second token were
    the empty string (previously this raised ``IndexError``).
    """
    tokens = sent.split(' ')
    first_token = tokens[0]
    # Single-token input has no second token; fall back to '' instead of crashing.
    second_token = tokens[1] if len(tokens) > 1 else ''

    if get_number_of_captial_word(first_token) <= 3 and get_number_of_captial_word(second_token) <= 2:
        return False
    if second_token.islower():
        return False
    return True


def split_cap(txt):
    """Insert one space at the first point where a prefix looks glued to the text.

    The text is returned unchanged when its first token is uppercase and its
    second token is lowercase. Otherwise the characters are scanned left to
    right and the first matching position wins:

    * an uppercase character followed by a lowercase one: a space is inserted
      *before* the uppercase character;
    * an uppercase character followed by a numeric one: a space is inserted
      *between* the two.

    Args:
        txt: Description text.

    Returns:
        The text with at most one space inserted, or the original text when
        nothing matches. Text with fewer than two tokens is handled as if the
        second token were empty (previously this raised ``ValueError``).
    """
    tokens = txt.split(" ")
    first_token = tokens[0]
    # No second token (no space in txt): use '' so the guard below is simply False.
    second_token = tokens[1] if len(tokens) > 1 else ""

    if first_token.isupper() and second_token.islower():
        return txt

    for i in range(1, len(txt)):
        previous, current = txt[i - 1], txt[i]
        if not previous.isupper():
            continue
        if current.islower():
            # Split before the uppercase character that starts the new word.
            return txt[:i - 1] + ' ' + txt[i - 1:]
        if current.isnumeric():
            # Split between the uppercase character and the digit.
            return txt[:i] + ' ' + txt[i:]
    return txt


def processDescription(df, true_columns):
    '''
    Repair descriptions whose leading word was merged with the text.

    Rows are first screened with ``check_invalid_desc``. Rows that pass are
    kept as they are. The remaining rows are screened again with
    ``check_invalid_desc2`` and, in both outcomes, their ``Description`` is
    passed through ``split_cap``. The three groups are concatenated in that
    order, descriptions are stripped of surrounding whitespace and the index
    is reset.

    Args:
        df: Input frame containing a ``Description`` column; it is not modified.
        true_columns: Columns to keep in the returned frame.

    Returns:
        A new DataFrame restricted to ``true_columns`` with a fresh
        0..n-1 index.
    '''
    screened = df.copy()
    screened['cap_first'] = screened['Description'].apply(check_invalid_desc)

    # Group 1: rows whose first token was not flagged.
    pieces = [screened[screened['cap_first'].eq(False)][true_columns]]

    flagged = screened[screened['cap_first'].eq(True)].copy()
    flagged['cap_first'] = flagged['Description'].apply(check_invalid_desc2)

    # Groups 2 and 3: second check False, then True. Both get split_cap.
    for outcome in (False, True):
        group = flagged[flagged['cap_first'].eq(outcome)].copy()
        group['Description'] = group['Description'].apply(split_cap)
        pieces.append(group[true_columns])

    combined = pd.concat(pieces, axis=0)
    combined['Description'] = combined['Description'].apply(lambda x: x.strip())
    return combined.reset_index(drop=True)

In [6]:
def tokenizer(txt):
    """Word-segment ``txt`` with underthesea, requesting the ``'text'`` output format."""
    segmented = underthesea.word_tokenize(txt, format='text')
    return segmented


def process_txt(txt, lower=True):
    """Normalise a piece of text and word-segment it.

    Steps, in order: coerce to ``str`` and apply Unicode NFKC normalisation,
    drop every character that is neither a word character nor whitespace,
    drop digits, collapse whitespace runs into a single space, strip the ends,
    optionally lowercase, then pass the result to ``tokenizer``.

    Args:
        txt: Value to clean. Non-string values are converted with ``str()``,
            so a missing value such as ``float('nan')`` becomes the text
            ``'nan'``.
        lower: Lowercase the text before tokenising (default True).

    Returns:
        Whatever ``tokenizer`` returns for the cleaned text.
    """
    txt = unicodedata.normalize('NFKC', str(txt))
    # \w also matches '_', so underscores survive this step.
    txt = re.sub(r'[^\w\s]', '', txt)
    txt = re.sub(r'\d', '', txt)
    # Raw string: a plain '\s+' is an invalid escape sequence and triggers a
    # warning on recent Python versions.
    txt = re.sub(r'\s+', ' ', txt).strip()
    if lower:
        txt = txt.lower()
    return tokenizer(txt)

### Read Data

In [16]:
# WARNING: for new data

df_dtype = {
    'Category': 'category',
    'Sub Category': 'category',
    'Title': 'object',
    'Description': 'object',
    'Content': 'object',
}
df = pd.read_csv("./vnexpress_crawled.csv", dtype=df_dtype)
true_columns = df.columns.tolist()
print(f"Dataframe shape: {df.shape}")

# Repair merged descriptions first, then clean and tokenise every text column.
df = processDescription(df, true_columns)
for text_column in ('Title', 'Description', 'Content'):
    df[text_column] = df[text_column].parallel_apply(process_txt)

In [20]:
# Persist the processed frame to 'processed.feather'.
# The CSV export below is a disabled alternative.
# df.to_csv('processed.csv', index=False)
df.to_feather('processed.feather')