In [14]:
#File: crossValidation.ipynb
#Purpose: use cross validation to evaluate the model
#Author: Quan Gan
import os
import tempfile

import fasttext
import numpy as np
import pandas as pd

In [15]:
#Method: trainModel
#Purpose: train fastText model
#Parameters: trainSet -> path to the training set file
#            input_word_vector -> path to the pretrained word vector file
def trainModel(trainSet, input_word_vector, *, dim=300, lr=0.5, epoch=20, loss='ova'):
    """Train a supervised fastText model initialised from pretrained vectors.

    Parameters
    ----------
    trainSet : str
        Path of the training file, forwarded to fastText as ``input``.
    input_word_vector : str
        Path of the pretrained word-vector file, forwarded as
        ``pretrainedVectors``.
    dim : int, optional
        Size of the word vectors (default 300). fastText expects this to
        equal the dimension of the pretrained vectors, so change it only
        together with ``input_word_vector``.
    lr : float, optional
        Learning rate (default 0.5).
    epoch : int, optional
        Number of training epochs (default 20).
    loss : str, optional
        Loss function name (default ``'ova'``, fastText's one-vs-all loss).

    Returns
    -------
    The model object returned by ``fasttext.train_supervised``.
    """
    # The defaults reproduce the configuration this notebook was evaluated
    # with; the keyword-only arguments allow tuning without editing the code.
    return fasttext.train_supervised(
        input=trainSet,
        dim=dim,
        lr=lr,
        epoch=epoch,
        loss=loss,
        pretrainedVectors=input_word_vector,
    )

In [16]:
from sklearn.model_selection import KFold

In [17]:
#Method: cross_Validation
#Purpose: use cross validation to evaluate the result
#Parameter: df -> the input data
#           NFold -> number of folds
def cross_Validation(df, NFold, input_word_vector='../data/crawl-300d-2M-subword.vec'):
    """Evaluate the fastText model with K-fold cross validation.

    For every fold the rows of ``df`` are split into a training part and a
    validation part, each part is written to a file (no header, no index),
    a model is trained with ``trainModel`` and scored with
    ``model.test(..., k=1)``. The per-fold precision and recall are averaged
    over ``NFold`` and printed.

    Parameters
    ----------
    df : pandas.DataFrame
        The input data; rows are selected positionally with ``iloc``.
    NFold : int
        Number of folds.
    input_word_vector : str, optional
        Path of the pretrained word-vector file handed to ``trainModel``.
        Defaults to the vectors this notebook was evaluated with.

    Returns
    -------
    None
        The averaged precision and recall are printed, not returned.
    """
    # Fixed random_state keeps the shuffled fold assignment reproducible.
    kfold = KFold(n_splits=NFold, shuffle=True, random_state=1)
    precision = 0
    recall = 0

    # The fold files live in a private scratch directory so that they never
    # clobber files in the working directory and are removed even when
    # training or scoring raises.
    with tempfile.TemporaryDirectory() as workdir:
        train_path = os.path.join(workdir, 'train.train')
        valid_path = os.path.join(workdir, 'test.valid')

        for train, test in kfold.split(df):
            df.iloc[train].to_csv(train_path, header=False, index=False)
            df.iloc[test].to_csv(valid_path, header=False, index=False)

            model = trainModel(train_path, input_word_vector)
            # result[1] is used as the precision and result[2] as the recall.
            result = model.test(valid_path, k=1)

            precision += result[1]
            recall += result[2]

    print("{} fold Cross Validation".format(NFold))
    print("Precision:", precision/NFold)
    print("Recall:", recall/NFold)

In [18]:
# Load the cleaned dataset; header=None because the file has no header row.
df = pd.read_csv('../data/cleanedData.txt', header=None)

In [19]:
# Evaluate the model with 5-fold cross validation.
cross_Validation(df, NFold=5)

5 fold Cross Validation
Precision: 0.936683417085427
Recall: 0.9175026664716576


In [20]:
# Evaluate the model with 10-fold cross validation.
cross_Validation(df, NFold=10)

10 fold Cross Validation
Precision: 0.9356464646464646
Recall: 0.9165532503305824
