In [2]:
#File: crossValidation.ipynb
#Purpose: use cross validation to evaluate the model
#Author: Quan Gan
import fasttext
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [3]:
from sklearn.model_selection import KFold

In [77]:
#Method: trainModel
#Purpose: train fastText model
#Paremater: trainSet -> the trainSet file path
#           input_word_vector -> the pretrained word vector file path
def trainModel(trainSet, input_word_vector, LR, Epoch):
    model = fasttext.train_supervised(input = trainSet,
                                      dim = 300,
                                      lr = LR,
                                      epoch = Epoch,
                                      loss ='ova',
                                      pretrainedVectors = input_word_vector)
    return model

In [78]:
#Method: cross_Validation
#Purpose: use cross validation to evaluate the result
#Parameter: df -> the input data
#           NFold -> number of fold
def cross_Validation(df, NFold, lr, epoch):
    kfold = KFold(n_splits=NFold, shuffle=True, random_state=1)
    precision = 0
    recall = 0
    for train, test in kfold.split(df):
        df.iloc[train].to_csv('train.train', header=False, index=False)
        df.iloc[test].to_csv('test.valid', header=False, index=False)
   
        model = trainModel('train.train', '../data/crawl-300d-2M-subword.vec', lr, epoch)
        result = model.test('test.valid', k=1)

        precision += result[1]
        recall += result[2]
        os.remove('test.valid')
        os.remove('train.train')
    print("{} fold Cross Validation".format(NFold))
    print("Precision:", round(precision/NFold, 4))
    print("Recall:", round(recall/NFold,4))
    return round(precision/NFold, 4), round(recall/NFold,4)

In [82]:
def diffSizeCV(dataFrame, lr, epoch, step, kfold, seed):
    result = pd.DataFrame(columns=['amount', 'Precision@1', 'Recall@1'])

    count = step
    temp = dataFrame
    train = pd.DataFrame()
    while len(temp) != 0:
        current = temp.sample(n = count, random_state=seed)
        train = pd.concat([train, current])
        precision, recall = cross_Validation(train, kfold, lr, epoch)
        result = pd.concat([result, pd.DataFrame([[len(train), precision, recall]], columns=['amount', 'Precision@1', 'Recall@1'])], ignore_index=True)
        cond = temp[0].index.isin(current[0].index)
        temp = temp.loc[~cond]   
        count = min(step, len(temp))
    return result

In [88]:
def drawGraph(df, title, amount,ylim=[0, 100]):
    fig = plt.figure(figsize=[10,5])
    ax = fig.add_subplot()

    p = ax.plot(df['amount'], df['Precision@1']*100, label='Precision')
    r = ax.plot(df['amount'], df['Recall@1']*100, label='Recall')

    for i, rect in enumerate(df.index):
        ax.text(df['amount'][i]-5, df['Precision@1'][rect]*100+1,  round(df['Precision@1'][rect]*100, 4), color='blue')

    for j, rect1 in enumerate(df.index):
        ax.text(df['amount'][j]-5, df['Recall@1'][rect1]*100-1.5,  round(df['Recall@1'][rect1]*100, 4), color='orange')

    ax.set_ylim(ylim)
    plt.legend(loc='upper left')
    plt.xlabel('Record counts')
    plt.ylabel('Scores')
    plt.title(f'{title} performance {amount} records')

SyntaxError: non-default argument follows default argument (Temp/ipykernel_27432/2825217133.py, line 1)

In [83]:
df_DwC = pd.read_csv('../data/cleanedData.txt', header= None)
DwC_result = diffSizeCV(df_DwC, 0.5, 20, 100, 5, 2)

5 fold Cross Validation
Precision: 0.79
Recall: 0.7671
5 fold Cross Validation
Precision: 0.87
Recall: 0.841
5 fold Cross Validation
Precision: 0.8867
Recall: 0.8611
5 fold Cross Validation
Precision: 0.915
Recall: 0.8934
5 fold Cross Validation
Precision: 0.926
Recall: 0.9031
5 fold Cross Validation
Precision: 0.9233
Recall: 0.9023
5 fold Cross Validation
Precision: 0.9243
Recall: 0.9024
5 fold Cross Validation
Precision: 0.9337
Recall: 0.9134
5 fold Cross Validation
Precision: 0.9211
Recall: 0.9012
5 fold Cross Validation
Precision: 0.9307
Recall: 0.9115


In [87]:
drawGraph(DwC_result, 'Smithsonian/fastText', [60, 100], len(df_DwC))

TypeError: drawGraph() takes from 2 to 3 positional arguments but 4 were given

In [86]:
SESAR_specimenType = pd.read_csv('../Collections_data/cleanedSESAR_specimenType.txt', header= None)
SESAR_specimenType_result = diffSizeCV(SESAR_specimenType, 0.1, 20, 100, 5, 2)

5 fold Cross Validation
Precision: 0.99
Recall: 0.99
5 fold Cross Validation
Precision: 0.995
Recall: 0.995
5 fold Cross Validation
Precision: 0.9966
Recall: 0.9966
5 fold Cross Validation
Precision: 0.9975
Recall: 0.9975
5 fold Cross Validation
Precision: 0.99
Recall: 0.99
5 fold Cross Validation
Precision: 0.9917
Recall: 0.9917
5 fold Cross Validation
Precision: 0.9943
Recall: 0.9943
5 fold Cross Validation
Precision: 0.9938
Recall: 0.9938
5 fold Cross Validation
Precision: 0.9933
Recall: 0.9933
5 fold Cross Validation
Precision: 0.9918
Recall: 0.9918
