In [1]:
#File: Performances.ipynb
#Purpose: find best parameters to train fastText model
#Author: Quan Gan
import fasttext
import csv 

In [2]:
#Method: sortHelper
#Purpose: sort-key helper; extracts the 'precision' value from one result record
#Parameters: e -> a result dict that contains a 'precision' key
#Returns: the value stored under 'precision' (KeyError if the key is absent)
def sortHelper(e):
    precision = e['precision']
    return precision

In [3]:
#Method: getPerformance
#Purpose: train one fastText supervised model per (learning rate, epoch) pair, evaluate each
#         model at several k values, and return every score sorted by precision (best first)
#Parameters: input_train -> the trainset file path
#            input_word_vector -> pretrained word vector file path
#            input_test -> the testset file path
#            lrs -> learning rates to try (keyword-only; default 0.1, 0.5, 1)
#            epochs -> epoch counts to try (keyword-only; default 5, 10, 15, 20, 25)
#            ks -> k values passed to model.test (keyword-only; default 1 to 5)
#            dim -> vector dimension passed to train_supervised (keyword-only; default 300)
#Returns: list of dicts with keys 'precision', 'recall', 'k', 'learning rate', 'epoch',
#         sorted by 'precision' in descending order
def getPerformance(input_train, input_word_vector, input_test, *,
                   lrs=(0.1, 0.5, 1), epochs=(5, 10, 15, 20, 25), ks=range(1, 6), dim=300):
    # Materialize the grids once so that one-shot iterables (e.g. generators)
    # are not exhausted after the first pass of the nested loops.
    lrs, epochs, ks = tuple(lrs), tuple(epochs), tuple(ks)

    result = []
    for lr in lrs:
        for epoch in epochs:
            # NOTE(review): dim presumably has to match the dimension of the
            # pretrained vectors file -- confirm against the fastText docs.
            model = fasttext.train_supervised(input = input_train,
                                              dim = dim,
                                              lr = lr,
                                              epoch = epoch,
                                              loss ='ova',
                                              pretrainedVectors = input_word_vector)
            # The same trained model is scored once per k; no retraining is needed.
            for k in ks:
                # Index 1 of the test result is recorded as precision, index 2 as recall.
                performance = model.test(input_test, k=k)
                result.append({'precision' : performance[1],
                               'recall' : performance[2],
                               'k': k,
                               'learning rate' : lr,
                               'epoch': epoch})
    # Highest precision first; list.sort is stable, so ties keep their grid order.
    result.sort(key=sortHelper, reverse=True)
    return result

In [4]:
#Method: output_to_csv
#Purpose: write the performance records to '<fileName>.csv', one row per record
#Parameters: fileName -> output path without the '.csv' extension
#            result -> list of dicts keyed by 'precision', 'recall', 'k', 'learning rate', 'epoch'
def output_to_csv(fileName, result):
    columns = ['precision', 'recall', 'k', 'learning rate','epoch']
    csv_path = fileName + '.csv'
    # newline='' lets the csv module control line endings itself.
    with open(csv_path, 'w', newline='') as handle:
        table = csv.DictWriter(handle, fieldnames=columns)
        table.writeheader()
        for record in result:
            table.writerow(record)

In [5]:
#simple model: evaluate the parameter combinations on the simple train/valid files,
#then save the sorted scores as a csv file
simple_result = getPerformance(
    input_train="../data/DwC_simple.train",
    input_word_vector="../data/crawl-300d-2M-subword.vec",
    input_test="../data/DwC_simple.valid",
)
output_to_csv(
    fileName="../data/Performance result/simple_performance",
    result=simple_result,
)

Read 0M words
Number of words:  625
Number of labels: 9
Progress: 100.0% words/sec/thread: 1147550 lr:  0.000000 avg.loss:  0.011676 ETA:   0h 0m 0s
Read 0M words
Number of words:  625
Number of labels: 9
Progress: 100.0% words/sec/thread: 1251509 lr:  0.000000 avg.loss:  0.009273 ETA:   0h 0m 0s
Read 0M words
Number of words:  625
Number of labels: 9
Progress: 100.0% words/sec/thread: 1253095 lr:  0.000000 avg.loss:  0.005162 ETA:   0h 0m 0s
Read 0M words
Number of words:  625
Number of labels: 9
Progress: 100.0% words/sec/thread: 1190258 lr:  0.000000 avg.loss:  0.004346 ETA:   0h 0m 0s
Read 0M words
Number of words:  625
Number of labels: 9
Progress: 100.0% words/sec/thread: 1254249 lr:  0.000000 avg.loss:  0.004171 ETA:   0h 0m 0s
Read 0M words
Number of words:  625
Number of labels: 9
Progress: 100.0% words/sec/thread: 1242129 lr:  0.000000 avg.loss:  0.007863 ETA:   0h 0m 0s
Read 0M words
Number of words:  625
Number of labels: 9
Progress: 100.0% words/sec/thread: 1256980 lr:  0.

In [6]:
#difficult model: evaluate the parameter combinations for the model trained on the
#difficult trainset, then save the sorted scores as a csv file
#NOTE(review): the model is trained on DwC_difficult.train but evaluated on
#DwC_simple.valid -- confirm this is intended and not a copy-paste of the simple run
difficult_result = getPerformance(
    input_train="../data/DwC_difficult.train",
    input_word_vector="../data/crawl-300d-2M-subword.vec",
    input_test="../data/DwC_simple.valid",
)
output_to_csv(
    fileName="../data/Performance result/difficult_performance",
    result=difficult_result,
)

Read 0M words
Number of words:  692
Number of labels: 7
Progress: 100.0% words/sec/thread: 1367123 lr:  0.000000 avg.loss:  0.012161 ETA:   0h 0m 0s
Read 0M words
Number of words:  692
Number of labels: 7
Progress: 100.0% words/sec/thread: 1376383 lr:  0.000000 avg.loss:  0.013347 ETA:   0h 0m 0s
Read 0M words
Number of words:  692
Number of labels: 7
Progress: 100.0% words/sec/thread: 1429335 lr:  0.000000 avg.loss:  0.004101 ETA:   0h 0m 0s
Read 0M words
Number of words:  692
Number of labels: 7
Progress: 100.0% words/sec/thread: 1381290 lr:  0.000000 avg.loss:  0.003167 ETA:   0h 0m 0s
Read 0M words
Number of words:  692
Number of labels: 7
Progress: 100.0% words/sec/thread: 1320033 lr:  0.000000 avg.loss:  0.004449 ETA:   0h 0m 0s
Read 0M words
Number of words:  692
Number of labels: 7
Progress: 100.0% words/sec/thread: 1246574 lr:  0.000000 avg.loss:  0.002366 ETA:   0h 0m 0s
Read 0M words
Number of words:  692
Number of labels: 7
Progress: 100.0% words/sec/thread: 1371552 lr:  0.

In [7]:
#model trained by steve mapping: evaluate the parameter combinations on the steve
#train/valid files, then save the sorted scores as a csv file
steve_result = getPerformance(
    input_train="../data/steve_696.train",
    input_word_vector="../data/crawl-300d-2M-subword.vec",
    input_test="../data/steve_299.valid",
)
output_to_csv(
    fileName="../data/Performance result/steve_performance",
    result=steve_result,
)

Read 0M words
Number of words:  2717
Number of labels: 6
Progress: 100.0% words/sec/thread: 1540619 lr:  0.000000 avg.loss:  0.026534 ETA:   0h 0m 0s
Read 0M words
Number of words:  2717
Number of labels: 6
Progress: 100.0% words/sec/thread: 1631473 lr:  0.000000 avg.loss:  0.018439 ETA:   0h 0m 0s
Read 0M words
Number of words:  2717
Number of labels: 6
Progress: 100.0% words/sec/thread: 1657169 lr:  0.000000 avg.loss:  0.009325 ETA:   0h 0m 0s
Read 0M words
Number of words:  2717
Number of labels: 6
Progress: 100.0% words/sec/thread: 1738231 lr:  0.000000 avg.loss:  0.010632 ETA:   0h 0m 0s
Read 0M words
Number of words:  2717
Number of labels: 6
Progress: 100.0% words/sec/thread: 1701359 lr:  0.000000 avg.loss:  0.006012 ETA:   0h 0m 0s
Read 0M words
Number of words:  2717
Number of labels: 6
Progress: 100.0% words/sec/thread: 1544349 lr:  0.000000 avg.loss:  0.008524 ETA:   0h 0m 0s
Read 0M words
Number of words:  2717
Number of labels: 6
Progress: 100.0% words/sec/thread: 1633818 