In [1]:
import fasttext
import pandas as pd
import numpy as np
import re
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    classification_report,
)

In [2]:
# Load the train and test splits. Every column from the fourth one onward is
# treated as a label column by the rest of this notebook.
df_train = pd.read_csv('../../../dataset/v1/train.csv')
df_test = pd.read_csv('../../../dataset/v1/test.csv')

# Label column names exactly as they appear in the CSV header.
labels = df_train.columns[3:].to_list()
# The same names with 'label_' rewritten to the '__label__' prefix used in the
# fastText text files.
labels_fasttext = [name.replace('label_', '__label__') for name in labels]

# Ground-truth label matrix of the test split (one row per review, one column
# per entry of `labels`).
y_test = df_test[labels].to_numpy()

# Random seed for the stratified K-fold shuffle, and the number of folds.
SEED = 42
K = 5

In [3]:
def preprocess_review(s: str, add_newline: bool = True) -> str:
    """Normalize a raw review into lowercase, space-separated ASCII words.

    Steps, in order: lowercase the text; turn newlines into spaces; delete any
    single non-letter, non-whitespace character that sits directly between two
    letters (so "don't" becomes "dont"); replace every remaining non-letter,
    non-whitespace character with a space; collapse whitespace runs of two or
    more characters into one space; strip leading and trailing spaces.

    Args:
        s: Raw review text.
        add_newline: When True, a trailing newline is appended so the result
            can be written directly as one line of a text file.

    Returns:
        The cleaned text (with a trailing newline if ``add_newline`` is True).
    """
    s = s.lower()
    s = re.sub(r'\n', ' ', s)
    # Lookbehind/lookahead instead of capturing the neighbouring letters: a
    # consuming pattern swallows the right-hand letter, so in chained input
    # such as "a.b.c" the second separator was never matched and the word was
    # split ("ab c") instead of joined ("abc").
    s = re.sub(r'(?<=[a-z])[^\sA-Za-z](?=[A-Za-z])', '', s)
    s = re.sub(r'[^\sA-Za-z]', ' ', s)
    s = re.sub(r'\s{2,}', ' ', s)
    s = s.strip(' ')
    if add_newline:
        s += '\n'
    return s

def convert_data_final():
    """Dump the full train and test splits as fastText-style text files.

    For each row, the fastText label names of all label columns whose value
    equals 1 are written space-separated, followed by the preprocessed text
    taken from the third column. Rows without any positive label are skipped.

    Output files: './dataset/train_final.txt' and './dataset/test_final.txt'.
    Reads the module-level ``df_train``, ``df_test`` and ``labels_fasttext``.
    """
    targets = (
        ('./dataset/train_final.txt', df_train),
        ('./dataset/test_final.txt', df_test),
    )
    for path, frame in targets:
        with open(path, 'w') as out:
            for row in range(frame.shape[0]):
                # One "<label> " chunk per label column that is set to 1.
                prefix = ''.join(
                    labels_fasttext[col] + ' '
                    for col, flag in enumerate(frame.iloc[row, 3:].to_list())
                    if flag == 1
                )
                if not prefix:
                    # No positive label: nothing to write for this row.
                    continue
                out.write(prefix + preprocess_review(frame.iat[row, 2]))

def convert_data_kfold(k=5):
    """Split the training data into stratified folds and dump them as text files.

    Uses ``MultilabelStratifiedKFold`` (shuffled, seeded with ``SEED``) over the
    label matrix of ``df_train``. For fold ``i`` it writes
    './dataset/train_fold_{i}.txt' and './dataset/val_fold_{i}.txt', each line
    holding the fastText label names of the positive labels followed by the
    preprocessed text from the third column.

    Rows without any positive label are skipped, matching
    ``convert_data_final``. Previously the skip check was nested inside the
    per-label loop, where it had no effect, so such rows were written as
    label-less lines.

    Args:
        k: Number of folds.
    """
    def _write_rows(path, row_indices):
        # Write one "<labels> <text>" line per row index, skipping rows that
        # have no label column equal to 1.
        with open(path, 'w') as f:
            for i in row_indices:
                label_string = ''.join(
                    labels_fasttext[j] + ' '
                    for j, flag in enumerate(df_train.iloc[i, 3:].to_list())
                    if flag == 1
                )
                if label_string == '':
                    continue
                f.write(label_string + preprocess_review(df_train.iat[i, 2]))

    # The splitter only needs row positions for X; stratification uses y.
    X = list(range(0, df_train.shape[0]))
    y = df_train[labels].to_numpy()
    mskf = MultilabelStratifiedKFold(n_splits=k, shuffle=True, random_state=SEED)

    for k_idx, (train_list, val_list) in enumerate(mskf.split(X, y)):
        _write_rows(f'./dataset/train_fold_{k_idx}.txt', train_list)
        _write_rows(f'./dataset/val_fold_{k_idx}.txt', val_list)

def evaluate(model, y_test, labels):
    """Score a trained fastText model on the test split and print the metrics.

    The reviews in ``df_test['cleaned_review']`` are preprocessed and passed to
    ``model.predict``; the returned per-label probabilities are placed into a
    matrix whose columns follow ``labels_fasttext`` and thresholded at 0.5.
    Prints overall accuracy, per-label accuracy, macro F1 and scikit-learn's
    classification report.

    Args:
        model: Object exposing ``predict(list_of_texts, k=...)`` that returns
            a (labels, probabilities) pair, as used below.
        y_test: Ground-truth label matrix, indexed as ``y_test[:, idx]``.
        labels: Display names of the label columns, in column order.

    Returns:
        None. Results are printed.
    """
    cleaned_reviews = [
        preprocess_review(text, add_newline=False)
        for text in df_test['cleaned_review'].to_list()
    ]
    # NOTE(review): k=-1 is expected to return every label with its score
    # (see the fastText predict documentation).
    pred_labels, pred_probs = model.predict(cleaned_reviews, k=-1)

    # Scatter the returned (label, probability) pairs into a dense matrix.
    # Labels not returned for a row keep a score of 0.
    scores = np.zeros(shape=(df_test.shape[0], len(labels_fasttext)))
    for row, (names, probs) in enumerate(zip(pred_labels, pred_probs)):
        for name, prob in zip(names, probs):
            scores[row, labels_fasttext.index(name)] = prob

    # Binarize: strictly above 0.5 counts as a positive prediction.
    y_pred = np.where(scores > 0.5, 1.0, 0.0)

    overall = accuracy_score(y_test, y_pred)
    print(f'Overall accuracy: {overall}')
    for col, label in enumerate(labels):
        per_label = accuracy_score(y_test[:, col], y_pred[:, col])
        print(f'Accuracy {label}: {per_label}')

    macro_f1 = f1_score(y_test, y_pred, average='macro')
    print(f'F1 macro: {macro_f1}')
    report = classification_report(
        y_test, y_pred, target_names=labels, digits=4, zero_division=0
    )
    print(report)

In [4]:
# Uncomment and run once to generate the fastText input files under ./dataset/
# convert_data_kfold(K)
# convert_data_final()

## From scratch

In [5]:
# for k in range(K):
#     print(f'K={k}')
#     model = fasttext.train_supervised(
#         input=f'./dataset/train_fold_{k}.txt',
#         loss='ova',
#         autotuneValidationFile=f'./dataset/val_fold_{k}.txt',
#         verbose=2,
#         autotuneDuration=60*10
#     )
#     # get chosen training parameter
#     model_args = model.f.getArgs()
#     for hparam in dir(model_args):
#         if not hparam.startswith('__'):
#             print(f"{hparam} -> {getattr(model_args, hparam)}")
#     print('='*50)

# Result of K=5 fold autotune

'''
K=0
Warning : loss is manually set to a specific value. It will not be automatically optimized.
Progress: 100.0% Trials:  166 Best score:  0.421875 ETA:   0h 0m 0s
Training again with best arguments
Read 0M words
Number of words:  13458
Number of labels: 8
Progress: 100.0% words/sec/thread: 2041808 lr:  0.000000 avg.loss:  2.121504 ETA:   0h 0m 0s
Warning : loss is manually set to a specific value. It will not be automatically optimized.
autotuneDuration -> 600
autotuneMetric -> f1
autotuneModelSize -> 
autotunePredictions -> 1
autotuneValidationFile -> ./dataset/val_fold_0.txt
bucket -> 0
cutoff -> 0
dim -> 41
dsub -> 8
epoch -> 100
input -> ./dataset/train_fold_0.txt
label -> __label__
loss -> loss_name.ova
lr -> 0.12638453969335783
lrUpdateRate -> 100
maxn -> 0
minCount -> 1
minCountLabel -> 0
minn -> 0
model -> model_name.supervised
neg -> 5
output -> 
pretrainedVectors -> 
qnorm -> False
qout -> False
retrain -> False
saveOutput -> False
seed -> 0
setManual -> <bound method PyCapsule.setManual of <fasttext_pybind.args object at 0x7182649704f0>>
t -> 0.0001
thread -> 11
verbose -> 2
wordNgrams -> 1
ws -> 5
==================================================
K=1
Progress: 100.0% Trials:   85 Best score:  0.417098 ETA:   0h 0m 0s
Training again with best arguments
Read 0M words
Number of words:  13387
Number of labels: 8
Progress: 100.0% words/sec/thread:  789435 lr:  0.000000 avg.loss:  3.257068 ETA:   0h 0m 0s
Warning : loss is manually set to a specific value. It will not be automatically optimized.
autotuneDuration -> 600
autotuneMetric -> f1
autotuneModelSize -> 
autotunePredictions -> 1
autotuneValidationFile -> ./dataset/val_fold_1.txt
bucket -> 1379641
cutoff -> 0
dim -> 100
dsub -> 2
epoch -> 100
input -> ./dataset/train_fold_1.txt
label -> __label__
loss -> loss_name.ova
lr -> 0.11763146242992435
lrUpdateRate -> 100
maxn -> 0
minCount -> 1
minCountLabel -> 0
minn -> 0
model -> model_name.supervised
neg -> 5
output -> 
pretrainedVectors -> 
qnorm -> False
qout -> False
retrain -> False
saveOutput -> False
seed -> 0
setManual -> <bound method PyCapsule.setManual of <fasttext_pybind.args object at 0x7181ff78bdf0>>
t -> 0.0001
thread -> 11
verbose -> 2
wordNgrams -> 2
ws -> 5
==================================================
K=2
Progress: 100.0% Trials:  181 Best score:  0.415686 ETA:   0h 0m 0s
Training again with best arguments
Read 0M words
Number of words:  13139
Number of labels: 8
Progress: 100.0% words/sec/thread: 2276804 lr:  0.000000 avg.loss:  2.327070 ETA:   0h 0m 0s
Warning : loss is manually set to a specific value. It will not be automatically optimized.
autotuneDuration -> 600
autotuneMetric -> f1
autotuneModelSize -> 
autotunePredictions -> 1
autotuneValidationFile -> ./dataset/val_fold_2.txt
bucket -> 0
cutoff -> 0
dim -> 34
dsub -> 4
epoch -> 100
input -> ./dataset/train_fold_2.txt
label -> __label__
loss -> loss_name.ova
lr -> 0.1196188275238499
lrUpdateRate -> 100
maxn -> 0
minCount -> 1
minCountLabel -> 0
minn -> 0
model -> model_name.supervised
neg -> 5
output -> 
pretrainedVectors -> 
qnorm -> False
qout -> False
retrain -> False
saveOutput -> False
seed -> 0
setManual -> <bound method PyCapsule.setManual of <fasttext_pybind.args object at 0x7181ff7cde70>>
t -> 0.0001
thread -> 11
verbose -> 2
wordNgrams -> 1
ws -> 5
==================================================
K=3
Progress: 100.0% Trials:  185 Best score:  0.418848 ETA:   0h 0m 0s
Training again with best arguments
Read 0M words
Number of words:  13175
Number of labels: 8
Progress: 100.0% words/sec/thread: 1211246 lr:  0.000000 avg.loss:  2.286595 ETA:   0h 0m 0s
Warning : loss is manually set to a specific value. It will not be automatically optimized.
autotuneDuration -> 600
autotuneMetric -> f1
autotuneModelSize -> 
autotunePredictions -> 1
autotuneValidationFile -> ./dataset/val_fold_3.txt
bucket -> 0
cutoff -> 0
dim -> 46
dsub -> 2
epoch -> 15
input -> ./dataset/train_fold_3.txt
label -> __label__
loss -> loss_name.ova
lr -> 1.151082828763079
lrUpdateRate -> 100
maxn -> 0
minCount -> 1
minCountLabel -> 0
minn -> 0
model -> model_name.supervised
neg -> 5
output -> 
pretrainedVectors -> 
qnorm -> False
qout -> False
retrain -> False
saveOutput -> False
seed -> 0
setManual -> <bound method PyCapsule.setManual of <fasttext_pybind.args object at 0x7181ff7c7130>>
t -> 0.0001
thread -> 11
verbose -> 2
wordNgrams -> 1
ws -> 5
==================================================
K=4
Progress: 100.0% Trials:   95 Best score:  0.440154 ETA:   0h 0m 0s
Training again with best arguments
Read 0M words
Number of words:  13074
Number of labels: 8
Progress: 100.0% words/sec/thread:  542336 lr:  0.000000 avg.loss:  2.123288 ETA:   0h 0m 0s 0m 0s
autotuneDuration -> 600
autotuneMetric -> f1
autotuneModelSize -> 
autotunePredictions -> 1
autotuneValidationFile -> ./dataset/val_fold_4.txt
bucket -> 87038
cutoff -> 0
dim -> 84
dsub -> 4
epoch -> 86
input -> ./dataset/train_fold_4.txt
label -> __label__
loss -> loss_name.ova
lr -> 0.28655576114935943
lrUpdateRate -> 100
maxn -> 0
minCount -> 1
minCountLabel -> 0
minn -> 0
model -> model_name.supervised
neg -> 5
output -> 
pretrainedVectors -> 
qnorm -> False
qout -> False
retrain -> False
saveOutput -> False
seed -> 0
setManual -> <bound method PyCapsule.setManual of <fasttext_pybind.args object at 0x7181ff7dd130>>
t -> 0.0001
thread -> 11
verbose -> 2
wordNgrams -> 3
ws -> 5
==================================================
'''



In [6]:
# Baseline trained from scratch: only the loss is set explicitly, every other
# hyperparameter is left at the library default.
model = fasttext.train_supervised(
    input='./dataset/train_final.txt',
    verbose=3,
    loss='ova',
)
evaluate(model, y_test, labels)

Overall accuracy: 0.085
Accuracy label_recommended: 0.74
Accuracy label_story: 0.555
Accuracy label_gameplay: 0.77
Accuracy label_visual: 0.565
Accuracy label_audio: 0.745
Accuracy label_technical: 0.715
Accuracy label_price: 0.765
Accuracy label_suggestion: 0.895
F1 macro: 0.2150789012273524
                   precision    recall  f1-score   support

label_recommended     0.7400    1.0000    0.8506       148
      label_story     0.0000    0.0000    0.0000        89
   label_gameplay     0.7700    1.0000    0.8701       154
     label_visual     0.0000    0.0000    0.0000        87
      label_audio     0.0000    0.0000    0.0000        51
  label_technical     0.0000    0.0000    0.0000        57
      label_price     0.0000    0.0000    0.0000        47
 label_suggestion     0.0000    0.0000    0.0000        21

        micro avg     0.7550    0.4618    0.5731       654
        macro avg     0.1888    0.2500    0.2151       654
     weighted avg     0.3488    0.4618    0.3974       654

Read 0M words
Number of words:  14958
Number of labels: 8
Progress: 100.0% words/sec/thread: 1013860 lr:  0.000000 avg.loss:  4.792876 ETA:   0h 0m 0s


> Hyperparameters selected by autotune on each of the K=5 folds

| Param      | 0      | 1       | 2      | 3      | 4      |
| ---------- | ------ | ------- | ------ | ------ | ------ |
| epoch      | 100    | 100     | 100    | 15     | 86     |
| lr         | 0.1264 | 0.1176  | 0.1196 | 1.1511 | 0.2866 |
| dim        | 41     | 100     | 34     | 46     | 84     |
| minCount   | 1      | 1       | 1      | 1      | 1      |
| wordNgrams | 1      | 2       | 1      | 1      | 3      |
| minn       | 0      | 0       | 0      | 0      | 0      |
| maxn       | 0      | 0       | 0      | 0      | 0      |
| bucket     | 0      | 1379641 | 0      | 0      | 87038  |
| ws         | 5      | 5       | 5      | 5      | 5      |

In [7]:
# Trained from scratch with hand-picked hyperparameters.
# NOTE(review): the values appear to be taken from the per-fold autotune
# results recorded in this notebook; confirm the selection rule.
model = fasttext.train_supervised(
    input='./dataset/train_final.txt',
    verbose=3,
    loss='ova',
    lr=0.12,
    epoch=100,
    dim=46,
    ws=5,
    bucket=0,
)
evaluate(model, y_test, labels)

Read 0M words
Number of words:  14958
Number of labels: 8
Progress: 100.0% words/sec/thread: 2025406 lr:  0.000000 avg.loss:  2.157004 ETA:   0h 0m 0s


Overall accuracy: 0.165
Accuracy label_recommended: 0.83
Accuracy label_story: 0.775
Accuracy label_gameplay: 0.815
Accuracy label_visual: 0.7
Accuracy label_audio: 0.785
Accuracy label_technical: 0.785
Accuracy label_price: 0.81
Accuracy label_suggestion: 0.895
F1 macro: 0.6026631868861596
                   precision    recall  f1-score   support

label_recommended     0.8314    0.9662    0.8938       148
      label_story     0.7444    0.7528    0.7486        89
   label_gameplay     0.8545    0.9156    0.8840       154
     label_visual     0.6452    0.6897    0.6667        87
      label_audio     0.6000    0.4706    0.5275        51
  label_technical     0.6750    0.4737    0.5567        57
      label_price     0.6957    0.3404    0.4571        47
 label_suggestion     0.5000    0.0476    0.0870        21

        micro avg     0.7664    0.7324    0.7490       654
        macro avg     0.6933    0.5821    0.6027       654
     weighted avg     0.7482    0.7324    0.7263       654

## From pretrained vector

In [8]:
# for k in range(K):
#     print(f'K={k}')
#     model = fasttext.train_supervised(
#         input=f'./dataset/train_fold_{k}.txt',
#         loss='ova',
#         autotuneValidationFile=f'./dataset/val_fold_{k}.txt',
#         verbose=2,
#         autotuneDuration=60*30,
#         dim=300,
#         pretrainedVectors='./crawl-300d-2M.vec'
#     )
#     # get chosen training parameter
#     model_args = model.f.getArgs()
#     for hparam in dir(model_args):
#         if not hparam.startswith('__'):
#             print(f"{hparam} -> {getattr(model_args, hparam)}")
#     print('='*50)

# previous run result

'''
K=0
Warning : loss is manually set to a specific value. It will not be automatically optimized.
Warning : dim is manually set to a specific value. It will not be automatically optimized.
Progress: 100.0% Trials:   11 Best score:  0.434896 ETA:   0h 0m 0s
Training again with best arguments
Read 0M words
Number of words:  13458
Number of labels: 8
Progress: 100.0% words/sec/thread:  656360 lr: -0.000100 avg.loss:  2.906421 ETA:   0h 0m 0s 656050 lr:  0.000000 avg.loss:  2.906421 ETA:   0h 0m 0s
Warning : loss is manually set to a specific value. It will not be automatically optimized.
Warning : dim is manually set to a specific value. It will not be automatically optimized.
autotuneDuration -> 1800
autotuneMetric -> f1
autotuneModelSize -> 
autotunePredictions -> 1
autotuneValidationFile -> ./dataset/val_fold_0.txt
bucket -> 0
cutoff -> 0
dim -> 300
dsub -> 2
epoch -> 2
input -> ./dataset/train_fold_0.txt
label -> __label__
loss -> loss_name.ova
lr -> 0.13552228709214267
lrUpdateRate -> 100
maxn -> 0
minCount -> 1
minCountLabel -> 0
minn -> 0
model -> model_name.supervised
neg -> 5
output -> 
pretrainedVectors -> ./crawl-300d-2M.vec
qnorm -> False
qout -> False
retrain -> False
saveOutput -> False
seed -> 0
setManual -> <bound method PyCapsule.setManual of <fasttext_pybind.args object at 0x7257715811b0>>
t -> 0.0001
thread -> 11
verbose -> 2
wordNgrams -> 1
ws -> 5
==================================================
K=1
Progress: 100.0% Trials:   11 Best score:  0.427461 ETA:   0h 0m 0s
Training again with best arguments
Read 0M words
Number of words:  13387
Number of labels: 8
Progress: 100.0% words/sec/thread:  654557 lr:  0.000000 avg.loss:  2.771937 ETA:   0h 0m 0s
autotuneDuration -> 1800
autotuneMetric -> f1
autotuneModelSize -> 
autotunePredictions -> 1
autotuneValidationFile -> ./dataset/val_fold_1.txt
bucket -> 0
cutoff -> 0
dim -> 300
dsub -> 16
epoch -> 2
input -> ./dataset/train_fold_1.txt
label -> __label__
loss -> loss_name.ova
lr -> 0.1692979210449485
lrUpdateRate -> 100
maxn -> 0
minCount -> 1
minCountLabel -> 0
minn -> 0
model -> model_name.supervised
neg -> 5
output -> 
pretrainedVectors -> ./crawl-300d-2M.vec
qnorm -> False
qout -> False
retrain -> False
saveOutput -> False
seed -> 0
setManual -> <bound method PyCapsule.setManual of <fasttext_pybind.args object at 0x72576eb3b5b0>>
t -> 0.0001
thread -> 11
verbose -> 2
wordNgrams -> 1
ws -> 5
==================================================
K=2
Warning : loss is manually set to a specific value. It will not be automatically optimized.
Warning : dim is manually set to a specific value. It will not be automatically optimized.
Progress: 100.0% Trials:   11 Best score:  0.428758 ETA:   0h 0m 0s
Training again with best arguments
Read 0M words
Number of words:  13139
Number of labels: 8
Progress: 100.0% words/sec/thread:  701340 lr:  0.000000 avg.loss:  2.120648 ETA:   0h 0m 0s
autotuneDuration -> 1800
autotuneMetric -> f1
autotuneModelSize -> 
autotunePredictions -> 1
autotuneValidationFile -> ./dataset/val_fold_2.txt
bucket -> 0
cutoff -> 0
dim -> 300
dsub -> 2
epoch -> 5
input -> ./dataset/train_fold_2.txt
label -> __label__
loss -> loss_name.ova
lr -> 0.1
lrUpdateRate -> 100
maxn -> 0
minCount -> 1
minCountLabel -> 0
minn -> 0
model -> model_name.supervised
neg -> 5
output -> 
pretrainedVectors -> ./crawl-300d-2M.vec
qnorm -> False
qout -> False
retrain -> False
saveOutput -> False
seed -> 0
setManual -> <bound method PyCapsule.setManual of <fasttext_pybind.args object at 0x72576eb5dab0>>
t -> 0.0001
thread -> 11
verbose -> 2
wordNgrams -> 1
ws -> 5
==================================================
K=3
Warning : loss is manually set to a specific value. It will not be automatically optimized.
Warning : dim is manually set to a specific value. It will not be automatically optimized.
Progress: 100.0% Trials:   12 Best score:  0.424084 ETA:   0h 0m 0s
Training again with best arguments
Read 0M words
Number of words:  13175
Number of labels: 8
Progress: 100.0% words/sec/thread:  651943 lr:  0.000000 avg.loss:  2.673541 ETA:   0h 0m 0s
autotuneDuration -> 1800
autotuneMetric -> f1
autotuneModelSize -> 
autotunePredictions -> 1
autotuneValidationFile -> ./dataset/val_fold_3.txt
bucket -> 0
cutoff -> 0
dim -> 300
dsub -> 16
epoch -> 2
input -> ./dataset/train_fold_3.txt
label -> __label__
loss -> loss_name.ova
lr -> 0.1692979210449485
lrUpdateRate -> 100
maxn -> 0
minCount -> 1
minCountLabel -> 0
minn -> 0
model -> model_name.supervised
neg -> 5
output -> 
pretrainedVectors -> ./crawl-300d-2M.vec
qnorm -> False
qout -> False
retrain -> False
saveOutput -> False
seed -> 0
setManual -> <bound method PyCapsule.setManual of <fasttext_pybind.args object at 0x72576eb5ea30>>
t -> 0.0001
thread -> 11
verbose -> 2
wordNgrams -> 1
ws -> 5
==================================================
K=4
Warning : loss is manually set to a specific value. It will not be automatically optimized.
Warning : dim is manually set to a specific value. It will not be automatically optimized.
Progress: 100.0% Trials:   11 Best score:  0.455598 ETA:   0h 0m 0s
Training again with best arguments
Read 0M words
Number of words:  13074
Number of labels: 8
Progress: 100.0% words/sec/thread:  556130 lr:  0.000000 avg.loss:  2.714420 ETA:   0h 0m 0s
autotuneDuration -> 1800
autotuneMetric -> f1
autotuneModelSize -> 
autotunePredictions -> 1
autotuneValidationFile -> ./dataset/val_fold_4.txt
bucket -> 0
cutoff -> 0
dim -> 300
dsub -> 16
epoch -> 2
input -> ./dataset/train_fold_4.txt
label -> __label__
loss -> loss_name.ova
lr -> 0.1692979210449485
lrUpdateRate -> 100
maxn -> 0
minCount -> 1
minCountLabel -> 0
minn -> 0
model -> model_name.supervised
neg -> 5
output -> 
pretrainedVectors -> ./crawl-300d-2M.vec
qnorm -> False
qout -> False
retrain -> False
saveOutput -> False
seed -> 0
setManual -> <bound method PyCapsule.setManual of <fasttext_pybind.args object at 0x725771578070>>
t -> 0.0001
thread -> 11
verbose -> 2
wordNgrams -> 1
ws -> 5
==================================================
'''



In [9]:
# Baseline initialised from pretrained vectors: dim is set to 300 alongside
# the 300d vector file, and the remaining hyperparameters stay at the
# library defaults.
model = fasttext.train_supervised(
    input='./dataset/train_final.txt',
    verbose=3,
    loss='ova',
    pretrainedVectors='./pretrained_vector/crawl-300d-2M.vec',
    dim=300,
)
evaluate(model, y_test, labels)

Read 0M words
Number of words:  14958
Number of labels: 8
Progress:  93.1% words/sec/thread:  667571 lr:  0.006859 avg.loss:  2.286446 ETA:   0h 0m 0s

Overall accuracy: 0.17
Accuracy label_recommended: 0.84
Accuracy label_story: 0.79
Accuracy label_gameplay: 0.82
Accuracy label_visual: 0.745
Accuracy label_audio: 0.79
Accuracy label_technical: 0.81
Accuracy label_price: 0.765
Accuracy label_suggestion: 0.89
F1 macro: 0.5871326672275503
                   precision    recall  f1-score   support

label_recommended     0.8372    0.9730    0.9000       148
      label_story     0.7582    0.7753    0.7667        89
   label_gameplay     0.8642    0.9091    0.8861       154
     label_visual     0.6875    0.7586    0.7213        87
      label_audio     0.6364    0.4118    0.5000        51
  label_technical     0.7436    0.5088    0.6042        57
      label_price     0.5000    0.2340    0.3188        47
 label_suggestion     0.0000    0.0000    0.0000        21

        micro avg     0.7792    0.7339    0.7559       654
        macro avg     0.6284    0.5713    0.5871       654
     weighted avg     0.7380    0.7339    0.7272       654
 

Progress: 100.0% words/sec/thread:  627994 lr:  0.000000 avg.loss:  2.230373 ETA:   0h 0m 0s


| Param      | 0      | 1       | 2      | 3      | 4      |
| ---------- | ------ | ------- | ------ | ------ | ------ |
| epoch      | 2      | 2       | 5      | 2      | 2      |
| lr         | 0.1355 | 0.1693  | 0.1    | 0.1693 | 0.1693 |
| minCount   | 1      | 1       | 1      | 1      | 1      |
| wordNgrams | 1      | 1       | 1      | 1      | 1      |
| minn       | 0      | 0       | 0      | 0      | 0      |
| maxn       | 0      | 0       | 0      | 0      | 0      |
| bucket     | 0      | 0       | 0      | 0      | 0      |
| dsub       | 2      | 16      | 2      | 16     | 16     |
| ws         | 5      | 5       | 5      | 5      | 5      |

In [10]:
# Initialised from pretrained vectors, with hand-picked hyperparameters.
# NOTE(review): the values appear to be taken from the per-fold autotune
# results recorded in this notebook; confirm the selection rule.
model = fasttext.train_supervised(
    input='./dataset/train_final.txt',
    verbose=3,
    loss='ova',
    pretrainedVectors='./pretrained_vector/crawl-300d-2M.vec',
    dim=300,
    lr=0.1693,
    epoch=2,
    wordNgrams=1,
    minCount=1,
    minn=0,
    maxn=0,
    ws=5,
    bucket=0,
)
evaluate(model, y_test, labels)

Read 0M words
Number of words:  14958
Number of labels: 8
Progress: 100.0% words/sec/thread:  574372 lr:  0.000000 avg.loss:  2.801757 ETA:   0h 0m 0s


Overall accuracy: 0.175
Accuracy label_recommended: 0.835
Accuracy label_story: 0.775
Accuracy label_gameplay: 0.795
Accuracy label_visual: 0.755
Accuracy label_audio: 0.77
Accuracy label_technical: 0.795
Accuracy label_price: 0.775
Accuracy label_suggestion: 0.895
F1 macro: 0.5646531270349565
                   precision    recall  f1-score   support

label_recommended     0.8324    0.9730    0.8972       148
      label_story     0.7340    0.7753    0.7541        89
   label_gameplay     0.8383    0.9091    0.8723       154
     label_visual     0.7065    0.7471    0.7263        87
      label_audio     0.5926    0.3137    0.4103        51
  label_technical     0.7353    0.4386    0.5495        57
      label_price     0.5556    0.2128    0.3077        47
 label_suggestion     0.0000    0.0000    0.0000        21

        micro avg     0.7752    0.7171    0.7450       654
        macro avg     0.6243    0.5462    0.5647       654
     weighted avg     0.7299    0.7171    0.7097       654