In [16]:
#File: CollectionPredict.ipynb
#Purpose: use fastText model to predict DwC different collections
#Author: Quan Gan
import fasttext
import csv

In [17]:
# Short collection names handled by this notebook.
# Two of them abbreviate longer collection names:
#   "Amphibians" -> "Amphibians %26 Reptiles" ("%26" is a URL-encoded "&")
#   "zoology"    -> "Invertebrate Zoology"
collections = [
    "Amphibians",
    "Birds",
    "Botany",
    "Entomology",
    "Fishes",
    "Mammals",
    "zoology",
]

In [18]:
#Method: trainModel
#Purpose: train a supervised fastText model on top of pretrained word vectors
#Parameters: trainSet -> the trainSet file path
#            input_word_vector -> the pretrained word vector file path
#            dim, lr, epoch, loss -> training settings forwarded unchanged to
#                                    fasttext.train_supervised; the defaults are
#                                    the values this function used to hard-code
#Returns: the model object returned by fasttext.train_supervised
#NOTE(review): dim presumably has to match the dimension of the pretrained
#              vectors (this notebook passes a "300d" .vec file) - confirm
#              against the fastText documentation before overriding it.
def trainModel(trainSet, input_word_vector, dim = 300, lr = 0.5, epoch = 20, loss = 'ova'):
    model = fasttext.train_supervised(input = trainSet,
                                      dim = dim,
                                      lr = lr,
                                      epoch = epoch,
                                      loss = loss,
                                      pretrainedVectors = input_word_vector)
    return model

In [19]:
#Method: getPredictList
#Purpose: read the predict file and return its lines as a list
#Parameter: filePath -> the predict file path
#Returns: a list with one string per line of the file, each stripped of leading
#         and trailing whitespace (blank lines are kept as empty strings)
def getPredictList(filePath):
    # The context manager closes the file even if reading fails; the handle
    # used to be opened and never closed.
    with open(filePath, 'r') as predict_file:
        return [line.strip() for line in predict_file]

In [20]:
#Method: calulateProb
#Purpose: calculate the average, maximum and minimum probability
#Parameter: prob -> the probability list; only the first element of each entry
#                   is used
#Returns: (average, maximum, minimum), in that order
#Raises: ValueError when prob is empty (there is nothing to average)
def calulateProb(prob):
    if len(prob) == 0:
        raise ValueError("calulateProb needs at least one probability entry")
    # Locals are named so that they do not shadow the built-in max()/min().
    total = 0
    highest = -float("inf")
    lowest = float("inf")
    for entry in prob:
        first = entry[0]
        total += first
        if highest < first:
            highest = first
        if lowest > first:
            lowest = first
    return total / len(prob), highest, lowest

In [21]:
#Method: csvResult
#Purpose: store the per-entry prediction result in the text file
#         "<outputDir>/<fileName>_result.txt", one "Label: ... - Probility: ..."
#         line per entry
#Parameters: fileName -> prefix of the output file name
#            result -> indexable pair: result[0] holds the label entries and
#                      result[1] the matching probability entries. Each label
#                      entry is written as-is; only the first element of each
#                      probability entry is written.
#            outputDir -> directory the file is written to; defaults to the
#                         location this function used to hard-code
def csvResult(fileName, result, outputDir = "../data/Collection_result"):
    with open("{}/{}_result.txt".format(outputDir, fileName), 'w', newline='') as output_file:
        for i, label in enumerate(result[0]):
            output_file.write("Label: {} - Probility: {}\n".format(label, result[1][i][0]))

In [22]:
# Train the model on the simple training set, using the pretrained word vectors.
# NOTE(review): the stored training log for this cell reports 6 labels while
# `collections` lists 7 names - confirm the training set covers every collection.
model = trainModel(trainSet = "../data/steve_696.train",
                   input_word_vector = "../data/crawl-300d-2M-subword.vec")

Read 0M words
Number of words:  2883
Number of labels: 6
Progress: 100.0% words/sec/thread: 1497396 lr:  0.000000 avg.loss:  0.002422 ETA:   0h 0m 0s


In [24]:
# For every collection: predict the top label (k=1) for each line of its
# predict file, write the per-line result, and collect summary statistics.
probPredict = []
for collection in collections:
    predictList = getPredictList("../data/Collection_predict/{}_predict.txt".format(collection))
    result = model.predict(predictList, k=1)
    csvResult(collection, result)
    # Unpack into distinct names: binding the results to `max`/`min` here would
    # replace the built-in functions for the rest of the kernel session.
    avg_prob, max_prob, min_prob = calulateProb(result[1])
    # The "Collcetion" spelling is kept on purpose: output_to_csv uses the same
    # key as its CSV column name, so the two must match exactly.
    probPredict.append({"Collcetion": collection, "Average_prob": avg_prob, "Max_prob": max_prob, "Min_prob": min_prob})

In [25]:
#Method: output_to_csv
#Purpose: write the per-collection summary rows to "<fileName>.csv"
#Parameters: fileName -> the output path without the ".csv" extension
#            result -> iterable of dicts keyed by the column names listed below
def output_to_csv(fileName, result):
    # Column order of the CSV; each row dict must use exactly these keys.
    column_names = ['Collcetion', 'Average_prob', 'Max_prob', 'Min_prob']
    csv_path = fileName + '.csv'
    with open(csv_path, 'w', newline='') as csv_file:
        summary_writer = csv.DictWriter(csv_file, fieldnames=column_names)
        summary_writer.writeheader()
        for row in result:
            summary_writer.writerow(row)

In [26]:
# Write the per-collection summary statistics to Sum_Result.csv.
output_to_csv(fileName = "../data/Collection_result/Sum_Result", result = probPredict)