In [2]:
#File: Performances.ipynb
#Purpose: find best parameters to train fastText model
#Author: Quan Gan
import fasttext
import csv 

In [3]:
#Method: sortHelper
#Purpose: sort key for the result list -> picks the precision out of one performance record
#Parameters: e -> one performance record (a dict that has a 'precision' entry)
def sortHelper(e):
    """Return the value stored under 'precision' in the record `e`."""
    precision = e['precision']
    return precision

In [4]:
#Method: getPerformance
#Purpose: train one fastText model per (learning rate, epoch) combination, evaluate each
#         model at several k values and return all performances, best precision first
#Parameters: input_train -> the trainset file path
#            input_word_vector -> pretrained word vector file path
#            input_test -> the testset file path
#            learning_rates -> learning rates to try (default: 0.1, 0.5, 1)
#            epochs -> epoch counts to try (default: 5, 10, 15, 20, 25)
#            k_values -> k values passed to model.test for every trained model (default: 1..5)
#Returns: list of dicts with keys 'precision', 'recall', 'k', 'learning rate', 'epoch',
#         sorted by precision in descending order
def getPerformance(input_train, input_word_vector, input_test,
                   learning_rates=(0.1, 0.5, 1),
                   epochs=(5, 10, 15, 20, 25),
                   k_values=(1, 2, 3, 4, 5)):
    """Grid-search fastText hyper-parameters and return the ranked performances."""
    result = []
    for lr in learning_rates:
        for epoch in epochs:
            # dim is fixed at 300 -- presumably to match the 300-d pretrained vectors
            # the callers pass in; confirm before using other word vectors.
            model = fasttext.train_supervised(input=input_train,
                                              dim=300,
                                              lr=lr,
                                              epoch=epoch,
                                              loss='ova',
                                              pretrainedVectors=input_word_vector)
            # One training run is reused for every k: only the evaluation changes.
            for k in k_values:
                performance = model.test(input_test, k=k)
                # Index 1 is the precision and index 2 the recall of the test result.
                precision, recall = performance[1], performance[2]
                result.append({'precision': precision,
                               'recall': recall,
                               'k': k,
                               'learning rate': lr,
                               'epoch': epoch})
    # list.sort is stable, so records with equal precision keep their grid order.
    result.sort(key=sortHelper, reverse=True)
    return result

In [5]:
#Method: output_to_csv
#Purpose: write the performance records to a csv file (header row first)
#Parameters: fileName -> the output path without file extension; '.csv' is appended
#                        (a relative path resolves against the current working directory)
#            result -> the performance records, one dict per row
def output_to_csv(fileName, result):
    """Write `result` to `fileName + '.csv'`, one row per performance record."""
    columns = ['precision', 'recall', 'k', 'learning rate','epoch']
    csv_path = fileName + '.csv'
    # newline='' leaves the line endings to the csv module.
    with open(csv_path, 'w', newline='') as handle:
        table = csv.DictWriter(handle, fieldnames=columns)
        table.writeheader()
        for row in result:
            table.writerow(row)

In [7]:
#models trained on the steve mapping data: evaluate on the steve validation set
#and save the performances as csv
steve_result = getPerformance(
    input_train="../data/steve_696.train",
    input_word_vector="../data/crawl-300d-2M-subword.vec",
    input_test="../data/steve_299.valid",
)
output_to_csv(
    fileName="../data/Performance_result/steve_performance",
    result=steve_result,
)

Read 0M words
Number of words:  2717
Number of labels: 6
Progress: 100.0% words/sec/thread: 1540619 lr:  0.000000 avg.loss:  0.026534 ETA:   0h 0m 0s
Read 0M words
Number of words:  2717
Number of labels: 6
Progress: 100.0% words/sec/thread: 1631473 lr:  0.000000 avg.loss:  0.018439 ETA:   0h 0m 0s
Read 0M words
Number of words:  2717
Number of labels: 6
Progress: 100.0% words/sec/thread: 1657169 lr:  0.000000 avg.loss:  0.009325 ETA:   0h 0m 0s
Read 0M words
Number of words:  2717
Number of labels: 6
Progress: 100.0% words/sec/thread: 1738231 lr:  0.000000 avg.loss:  0.010632 ETA:   0h 0m 0s
Read 0M words
Number of words:  2717
Number of labels: 6
Progress: 100.0% words/sec/thread: 1701359 lr:  0.000000 avg.loss:  0.006012 ETA:   0h 0m 0s
Read 0M words
Number of words:  2717
Number of labels: 6
Progress: 100.0% words/sec/thread: 1544349 lr:  0.000000 avg.loss:  0.008524 ETA:   0h 0m 0s
Read 0M words
Number of words:  2717
Number of labels: 6
Progress: 100.0% words/sec/thread: 1633818 

In [11]:
#models trained on the SESAR specimen type data: evaluate on the matching
#validation set and save the performances as csv
SESAR_result = getPerformance(
    input_train="../Collections_data/SESARtrain_specimenType.train",
    input_word_vector="../data/crawl-300d-2M-subword.vec",
    input_test="../Collections_data/SESARValid_specimenType.valid",
)
output_to_csv(
    fileName="../data/Performance_result/SESAR_performance_specimenType",
    result=SESAR_result,
)

In [12]:
#models trained on the SESAR material type data: evaluate on the matching
#validation set and save the performances as csv
SESAR_result = getPerformance(
    input_train="../Collections_data/SESARtrain_materialType.train",
    input_word_vector="../data/crawl-300d-2M-subword.vec",
    input_test="../Collections_data/SESARValid_materialType.valid",
)
output_to_csv(
    fileName="../data/Performance_result/SESAR_performance_materialType",
    result=SESAR_result,
)

In [13]:
#models trained on the SESAR sampled feature data: evaluate on the matching
#validation set and save the performances as csv
SESAR_result = getPerformance(
    input_train="../Collections_data/SESARtrain_sampeldFeature.train",
    input_word_vector="../data/crawl-300d-2M-subword.vec",
    input_test="../Collections_data/SESARValid_sampeldFeature.valid",
)
output_to_csv(
    fileName="../data/Performance_result/SESAR_performance_sampledFeature",
    result=SESAR_result,
)

In [9]:
#models trained on the openContext specimen type data: evaluate on the matching
#validation set and save the performances as csv
openContext_result = getPerformance(
    input_train="../Collections_data/openContexttrain_specimenType.train",
    input_word_vector="../data/crawl-300d-2M-subword.vec",
    input_test="../Collections_data/openContextValid_specimenType.valid",
)
output_to_csv(
    fileName="../data/Performance_result/openContext_performance_specimenType",
    result=openContext_result,
)

In [None]:
#models trained on the openContext specimen type data (only item category):
#evaluate on the matching validation set and save the performances as csv
openContext_result = getPerformance(
    input_train="../Collections_data/openContexttrain_specimenType_onlyOne.train",
    input_word_vector="../data/crawl-300d-2M-subword.vec",
    input_test="../Collections_data/openContextValid_specimenType_onlyOne.valid",
)
output_to_csv(
    fileName="../data/Performance_result/openContext_performance_specimenType_onlyOne",
    result=openContext_result,
)

In [10]:
#models trained on the openContext material type data (only item category):
#evaluate on the matching validation set and save the performances as csv
openContext_result = getPerformance(
    input_train="../Collections_data/openContexttrain_materialType_onlyOne.train",
    input_word_vector="../data/crawl-300d-2M-subword.vec",
    input_test="../Collections_data/openContextValid_materialType_onlyOne.valid",
)
output_to_csv(
    fileName="../data/Performance_result/openContext_performance_materialType_onlyOne",
    result=openContext_result,
)