In [None]:
#################################################################
#################################################################
#### 5 Folds dataset v2 -- Baseline                          ####
#### Random state = 0                                        ####
####           Fold 1 Accuracy: 60.54567022538553            ####
####           Fold 2 Accuracy: 61.05113299323763            ####
####           Fold 3 Accuracy: 61.72736979475621            ####
####           Fold 4 Accuracy: 60.32744097757741            ####
####           Fold 5 Accuracy: 60.94435876141892            ####
####           Average: 60.91919455047514                    ####
#################################################################
#################################################################

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import normalize
from sklearn.model_selection import KFold
import os
import stringdist as sd
import argparse

In [2]:
# Seed handed to KFold(shuffle=True, ...) in the evaluation functions so the
# fold assignment is the same on every run.
random_state = 0

In [3]:
# Resolve the cleaned v2 dataset relative to the current working directory and
# load it from Excel. The evaluation functions in this notebook read its
# 'Keyword' and 'Topic' columns.
root = os.getcwd()
dataset_path = os.path.join(root, 'datasets', 'botnoi-api', 'botnoi_api_cleaned_v2.xlsx')
dataset = pd.read_excel(dataset_path)

In [4]:
############ change the embedding .npy filename below ############
np_array_fname = 'thai-ckpt-30000.npy'
################################################################
# Load the pre-computed embeddings stored next to the dataset and attach them
# as a 'bert' column, converted to nested Python lists via .tolist().
# NOTE(review): assumes the .npy rows are in the same order as the dataset
# rows -- confirm against the script that produced the file.
pre_embed_np_array_path = os.path.join(root, 'datasets', 'botnoi-api', np_array_fname)
dataset['bert'] = np.load(pre_embed_np_array_path).tolist()

In [5]:
def compute_accuracy(y_pred, y_true):
    """Return the fraction of predictions that equal their true label.

    Args:
        y_pred: Sized sequence of predicted labels.
        y_true: Sized sequence of reference labels, aligned with ``y_pred``.

    Returns:
        Number of positions where ``y_pred[i] == y_true[i]`` divided by the
        number of predictions, as a float in [0, 1].

    Raises:
        ValueError: If the two sequences differ in length (``zip`` would
            otherwise drop the surplus items silently and skew the score),
            or if they are empty (accuracy is undefined).
    """
    total = len(y_pred)
    if total != len(y_true):
        raise ValueError(
            'y_pred and y_true must have the same length, got {} and {}'.format(
                total, len(y_true)))
    if total == 0:
        raise ValueError('cannot compute accuracy of an empty prediction list')

    correct = sum(1 for pred_topic, true_topic in zip(y_pred, y_true)
                  if pred_topic == true_topic)
    return correct / total

In [7]:
def eval_bert(k):
    """Run k-fold nearest-neighbour topic classification on the 'bert' embeddings.

    For every fold, each test embedding is assigned the topic of the training
    embedding with the largest dot product after both sets have been passed
    through ``normalize``. Per-fold accuracy and the mean over all folds are
    printed; nothing is returned.

    Reads the module-level ``dataset`` (columns 'bert' and 'Topic'),
    ``random_state`` and ``np_array_fname``.

    Args:
        k: Number of folds, passed to ``KFold(n_splits=k)``.
    """
    print('Evaluating bert {}!'.format(np_array_fname))
    kf = KFold(n_splits=k, shuffle=True, random_state=random_state)
    accs = []
    for fold_num, (train_indices, test_indices) in enumerate(kf.split(dataset), start=1):
        print('Fold #{}'.format(fold_num))
        train_df = dataset.iloc[train_indices, :]
        train_topics = train_df['Topic'].values.tolist()
        # Stack the per-row lists into one 2-D matrix, then rescale each row.
        train_l2_embeds = normalize(np.vstack(train_df['bert'].values))

        test_df = dataset.iloc[test_indices, :]
        test_topics = test_df['Topic'].values.tolist()
        test_l2_embeds = normalize(np.vstack(test_df['bert'].values))

        pred_topics = []
        for test_l2_embed in tqdm(test_l2_embeds):
            # Matrix-vector product: one similarity score per training row.
            # argmax returns the first index on ties.
            closest_idx = np.argmax(np.dot(train_l2_embeds, test_l2_embed))
            pred_topics.append(train_topics[closest_idx])

        acc = compute_accuracy(pred_topics, test_topics)
        print('Fold {} accuracy: {:.2%}'.format(fold_num, acc))
        accs.append(acc)
    avg_kfold_acc = np.mean(accs)
    # len(accs) is the number of folds actually evaluated.
    print('{} fold accuracy: {:.2%}'.format(len(accs), avg_kfold_acc))


# Evaluate the embeddings named by np_array_fname with 5-fold cross-validation.
eval_bert(k=5)

Evaluating bert thai-ckpt-30000.npy!
Fold #1


100%|██████████| 8430/8430 [01:10<00:00, 119.97it/s]


Fold 1 accuracy: 59.56%
Fold #2


100%|██████████| 8429/8429 [01:09<00:00, 120.85it/s]


Fold 2 accuracy: 59.67%
Fold #3


100%|██████████| 8429/8429 [01:10<00:00, 119.41it/s]


Fold 3 accuracy: 61.02%
Fold #4


100%|██████████| 8429/8429 [01:12<00:00, 116.29it/s]


Fold 4 accuracy: 59.60%
Fold #5


100%|██████████| 8429/8429 [01:09<00:00, 121.52it/s]

Fold 5 accuracy: 59.98%
thai-ckpt-30000.npy
5 fold accuracy: 59.97%





In [None]:
def eval_kfold_editdist(k):
    """Run k-fold nearest-neighbour topic classification using edit distance.

    For every fold, each test keyword is assigned the topic of the training
    keyword with the smallest Levenshtein distance (first match on ties).
    Per-fold accuracy and the mean over all folds are printed; nothing is
    returned.

    Reads the module-level ``dataset`` (columns 'Keyword' and 'Topic') and
    ``random_state``.

    Args:
        k: Number of folds, passed to ``KFold(n_splits=k)``.
    """
    print('Evaluating Edit Distance!')
    kf = KFold(n_splits=k, shuffle=True, random_state=random_state)
    accs = []
    for fold_num, (train_indices, test_indices) in enumerate(kf.split(dataset), start=1):
        train_df = dataset.iloc[train_indices, :]
        # Convert to str once per fold instead of once per (train, test) pair.
        train_keywords = [str(keyword) for keyword in train_df['Keyword'].values.tolist()]
        train_topics = train_df['Topic'].values.tolist()

        test_df = dataset.iloc[test_indices, :]
        test_keywords = test_df['Keyword'].values.tolist()
        test_topics = test_df['Topic'].values.tolist()

        pred_topics = []
        for test_keyword in test_keywords:
            test_keyword = str(test_keyword)
            distances = [sd.levenshtein(train_keyword, test_keyword)
                         for train_keyword in train_keywords]
            min_distance_idx = np.argmin(distances)
            pred_topics.append(train_topics[min_distance_idx])

        acc = compute_accuracy(pred_topics, test_topics)
        print('Fold {} accuracy: {}'.format(fold_num, acc))
        accs.append(acc)
    avg_kfold_acc = np.mean(accs)
    # Report the number of folds evaluated; the previous manual counter was
    # already incremented past the last fold here and printed k + 1.
    print('Average {} fold accuracy: {}'.format(len(accs), avg_kfold_acc))