In [1]:
#File: crossValidation.ipynb
#Purpose: use cross validation to evaluate the model
#Author: Quan Gan, Yuxuan Zhou
import fasttext
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [2]:
from sklearn.model_selection import KFold

In [3]:
#Method: trainModel
#Purpose: train a supervised fastText model starting from pretrained word vectors
#Parameter: trainSet -> the trainSet file path
#           input_word_vector -> the pretrained word vector file path
#           lr -> learning rate (default 0.5)
#           epoch -> number of training epochs (default 20)
#           dim -> word vector dimension (default 300)
#           loss -> fastText loss name (default 'ova')
#Return: the model object returned by fasttext.train_supervised
def trainModel(trainSet, input_word_vector, lr=0.5, epoch=20, dim=300, loss='ova'):
    """Train and return a supervised fastText classifier.

    The defaults reproduce the values this function previously hard-coded,
    so existing two-argument calls behave exactly as before.
    """
    # NOTE(review): dim presumably has to match the dimension of the vectors
    # in input_word_vector (the notebook passes a "300d" file) - confirm
    # against the fastText documentation before changing it.
    model = fasttext.train_supervised(input=trainSet,
                                      dim=dim,
                                      lr=lr,
                                      epoch=epoch,
                                      loss=loss,
                                      pretrainedVectors=input_word_vector)
    return model

In [4]:
#Method: cross_Validation
#Purpose: use cross validation to evaluate the result
#Parameter: df -> the input data
#           NFold -> number of fold
#           lr -> learning rate (accepted, but not forwarded to trainModel)
#           epoch -> number of epochs (accepted, but not forwarded to trainModel)
#           filename -> label stored in the "filename" column of every result row
#Return: DataFrame with one row per fold and columns filename / precision / recall
def cross_Validation(df,NFold,lr,epoch, filename):
    """Run NFold-fold cross validation of the fastText model on ``df``.

    For every fold the train/test rows are written (no header, no index) to
    the scratch files 'train.train' and 'test.valid' in the current working
    directory, a model is trained with ``trainModel`` and evaluated with
    ``model.test(..., k=1)``. Items 1 and 2 of the test result are printed and
    recorded as precision and recall.

    ``lr`` and ``epoch`` are part of the signature but are not passed on to
    ``trainModel``; training uses whatever settings ``trainModel`` applies.
    """
    train_path = 'train.train'
    valid_path = 'test.valid'

    # Fixed random_state keeps the shuffled fold assignment reproducible.
    kfold = KFold(n_splits=NFold,shuffle=True,random_state=1)
    rows = []
    for n, (train, test) in enumerate(kfold.split(df), start=1):
        try:
            df.iloc[train].to_csv(train_path, header=False, index=False)
            df.iloc[test].to_csv(valid_path, header=False, index=False)

            model = trainModel(train_path,'wiki-news-300d-1M.vec')
            result = model.test(valid_path,k=1)
        finally:
            # Always remove the scratch files, even when training or testing
            # raises, so a failed fold does not leave stale data behind.
            for path in (valid_path, train_path):
                if os.path.exists(path):
                    os.remove(path)

        print(f"{n} number of cross validation")
        print("Precision:",result[1])
        print("Recall:",result[2])
        rows.append([filename, result[1], result[2]])

    # Build the frame once instead of growing it row by row.
    return pd.DataFrame(rows, columns=["filename", 'precision', 'recall'])

In [5]:
# Read the dataset list: one CSV path per line in new_title.txt.
# The context manager closes the file; strip() removes the line ending
# without chopping a real character when the last line has no newline;
# blank lines are skipped; dict.fromkeys drops duplicate paths while keeping
# the file order, so runs are processed in a deterministic order.
with open('new_title.txt', 'r') as p:
    line = list(dict.fromkeys(entry.strip() for entry in p if entry.strip()))

save = pd.DataFrame(columns=["filename", 'precision', 'recall'])
result = []  # one per-fold metrics DataFrame per dataset, consumed by the next cell

for i in line:
    df = pd.read_csv(i, header=None)
    # 5 folds; lr=0.5 and epoch=20 are passed through to cross_Validation.
    save = cross_Validation(df, 5, 0.5, 20, i)
    result.append(save)

1 number of cross validation
Precision: 0.9296482412060302
Recall: 0.893719806763285
2 number of cross validation
Precision: 0.9597989949748744
Recall: 0.9408866995073891
3 number of cross validation
Precision: 0.949748743718593
Recall: 0.9402985074626866
4 number of cross validation
Precision: 0.9195979899497487
Recall: 0.905940594059406
5 number of cross validation
Precision: 0.914572864321608
Recall: 0.896551724137931
1 number of cross validation
Precision: 0.914572864321608
Recall: 0.8792270531400966
2 number of cross validation
Precision: 0.9597989949748744
Recall: 0.9408866995073891
3 number of cross validation
Precision: 0.9396984924623115
Recall: 0.9303482587064676
4 number of cross validation
Precision: 0.9396984924623115
Recall: 0.9257425742574258
5 number of cross validation
Precision: 0.9045226130653267
Recall: 0.8866995073891626
1 number of cross validation
Precision: 0.9396984924623115
Recall: 0.9033816425120773
2 number of cross validation
Precision: 0.9597989949748744
R

In [6]:
# Collect the precision and recall data of every dataset and append it to
# test.csv. All frames are concatenated and written in one call so the column
# header is not repeated between datasets; the header is only emitted when the
# file does not exist yet (or is empty), keeping the append-only behaviour.
results_path = "test.csv"
write_header = (not os.path.exists(results_path)
                or os.path.getsize(results_path) == 0)
if result:  # pd.concat raises on an empty list; nothing to write in that case
    pd.concat(result).to_csv(results_path, mode='a', index=True,
                             header=write_header)