In [2]:
import pandas as pd
import numpy as np
import scipy
import nltk
import spacy
import gensim
import glob
import csv
import matplotlib
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import cross_val_score
import sklearn.model_selection
import sklearn.pipeline
import re
from sklearn import svm
from sklearn import *
from sklearn.feature_selection import SelectKBest, VarianceThreshold
from sklearn.feature_selection import chi2
from sklearn.base import BaseEstimator, TransformerMixin
import gensim.models.wrappers.fasttext
from scipy import sparse
import tensorflow_datasets as tfds
import tensorflow as tf
import collections
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import LeaveOneOut,KFold,train_test_split
import simplejson
import pprint


# Custom imports
from mr_generic_scripts import *
from mr_cls_BILSTM import *
from mr_generic_scripts import load_combined_data

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Paths to the folders holding the xlsx files.
# Every path is built by string concatenation on `f_path`.
# NOTE(review): `f_path` is not assigned in any visible cell; presumably it is
# provided by one of the star imports in the import cell. Confirm before
# relying on a fresh-kernel run.

# Original MIND-CA corpus
path_to_raw_files = f_path + 'Data/raw_relabeled/'
# MIND-CA + human augment
path_to_plus_files = f_path + 'Data/raw_plus/'
# UK-MIND-20
path_to_wp1 = f_path + 'Data/wp1/'

# Augmented data
# augmentations, 125 examples per QA pair
path_to_aug = f_path + 'Data/aug_data/all/'
# augmentations, 500 examples per QA pair
path_to_aug_hq_os = f_path + 'Data/aug_data_os/all/'
# augmentation, no sampling - 1500 total examples per question
path_to_aug_joint = f_path + 'Data/aug_data_joint/all/'

# Merged xlsx files with multiple augmentations
path_to_set_files = f_path + 'Data/aug_data/sets/'

In [None]:
# Names of the available augmentations, grouped by category.

# "hq" group
hq_data = [
    'reord',
    'phrase',
    'dict',
]

# "lq" group
lq_data = [
    'wordnet',
    'ppdb',
    'glove',
    'fasttext',
]

# Pre-merged sets combining several augmentations
set_data = [
    'ab_lq',
    'ab_hq',
    'all_lq',
    'all_hq',
    'all_aug',
]

In [None]:
# Run configuration (!): the knobs a reader is expected to tune.

# Training sets used in this run.
# Every name that may appear here:
#   ['orig','plus','reord','phrase','dict','wordnet','ppdb','glove','fasttext','ab_lq','ab_hq','all_lq','all_hq']
# Alternative selections:
#   train_sets = ['reord','phrase','dict']
#   train_sets = ['wp1']
train_sets = [
    'wordnet',
    'ppdb',
    'glove',
    'fasttext',
]

# Folder the augmented sets are read from
# (one of the 125-example, 500-example or 1500 joint variants)
aug_path = path_to_aug_joint

# k used for the k-fold cross validation
n_k_fold = 10

# True: questions + answers; False: answers only
mind_qa = True

In [None]:
# Load every corpus needed for this run into `datasets`, keyed by set name.
# The dict must exist before the first assignment below.
datasets = {}

# Combined dataset used by the train/evaluate cell further down.
# NOTE(review): the previous text `load_combined_data{}` was a syntax error; a
# plain call is assumed here. The function's signature is not visible in this
# notebook - confirm whether it needs arguments.
combined_data = load_combined_data()

# Pick the loader once: questions + answers (True) or only answers (False).
# Both loaders are called with a folder path, so the rest of the cell is shared.
load_corpus = mr_get_qa_data if mind_qa else mr_get_data

# Always load MIND-CA + human aug, this is the base set
datasets['plus'] = load_corpus(path_to_plus_files)

# Always load UK-MIND-20, we need it for testing
datasets['wp1'] = load_corpus(path_to_wp1)

# If comparison is needed, load MIND-CA without any aug
if 'orig' in train_sets:
    datasets['orig'] = load_corpus(path_to_raw_files)

# Load augmented data; each augmentation lives in its own sub-folder of aug_path.
# The folder is passed directly so the config variable `path_to_aug` is not overwritten.
for at_set in train_sets:
    if at_set in ['orig','plus','wp1']:
        continue
    datasets[at_set] = load_corpus(aug_path + at_set + "/")

In [None]:
# Sanity check: print the size of every training set and remove the
# augmentation-only column from the augmented ones.
for d_id in train_sets:
    # The object at [-1][1] is used as a DataFrame below (len / drop)
    set_df = datasets[d_id][-1][1]
    print(len(set_df))

    # Test the set being processed (d_id), not the loop variable left over
    # from the loading cell.
    if d_id in ['orig','plus','wp1']:
        continue

    # Augmented datasets have additional column that needs to be dropped.
    # The membership guard keeps the cell re-runnable: the drop is in place,
    # so a second execution would otherwise raise KeyError.
    if "Aug_ID" in set_df.columns:
        set_df.drop(["Aug_ID"],axis=1,inplace=True)

In [None]:
def mr_proc_results(raw_results, n_questions=11):
    """Summarize the per-run results and pretty-print them with their mean.

    Each element of ``raw_results`` must unpack into four triples:
    ``[acc, acc per q, acc per age]``, ``[f1, f1 per q, f1 per age]`` and the
    same two triples again for wp1. Per-age values and per-question accuracies
    are ignored; for every run the summary keeps the global accuracy, the
    global f1 and the average of the per-question f1 values, once for the
    first pair of triples and once for the wp1 pair.

    Args:
        raw_results: iterable of runs in the format described above.
        n_questions: divisor used to average the per-question f1 values.
            Defaults to 11, the value that was previously hard-coded.

    Returns:
        None. The per-run summary and its mean over runs (axis 0) are printed.
    """
    pr_results = []
    # Strict nested unpacking: a run with an unexpected shape fails loudly here
    for ([acc_score, _qa_s, _aa_s], [f1_score, qf_s, _af_s],
         [acc_score_wp1, _qa_s_wp1, _aa_s_wp1], [f1_score_wp1, qf_s_wp1, _af_s_wp1]) in raw_results:
        pr_results.append([
            [acc_score, f1_score, round(sum(qf_s) / n_questions, 2)],
            [acc_score_wp1, f1_score_wp1, round(sum(qf_s_wp1) / n_questions, 2)],
        ])

    # Throw the list in an np array so the mean over runs is a single call
    pr_arr = np.array(pr_results)

    # Print the results
    pp = pprint.PrettyPrinter(indent=4)

    pp.pprint(pr_results)
    pp.pprint(np.mean(pr_arr, axis=0))

In [None]:
# Build the BiLSTM classifier.
# Per the original notes the positional arguments after `text_cols` are the
# accepted ages, a vocabulary of 1000 and the max len: 35 when questions are
# included (mind_qa), 20 for answers only.
# NOTE(review): the original notes say "ages 8 to 13, removing outliers" but the
# list passed covers 7 through 14 - confirm which is intended.
bl_cls = MR_bilstm(
    text_cols,
    list(range(7, 15)),
    1000,
    35 if mind_qa else 20,
)

# Eval parameters - eval by age and questions, do not return examples with
# errors (not fully implemented in current version)
bl_cls.mr_set_eval_vars(True, True, False)

In [None]:
# Empty container for the evaluation results, filled in by the training cell
results = dict()

In [None]:
# Use the combined dataset for training and evaluation.
# The split is seeded so that the train/test partition is identical on every
# Restart & Run All; without a random_state each run shuffles differently and
# the saved results cannot be reproduced.
split_seed = 42
train_df, test_df = train_test_split(combined_data, test_size=0.2, random_state=split_seed)

# Train and evaluate using k-fold cross-validation
# NOTE(review): 0.25 is passed through unchanged; its meaning is defined by
# mr_kfold_pre_split, which is not visible here.
results['combined'] = bl_cls.mr_kfold_pre_split(train_df, test_df, 0.25, n_k_fold)

# Save the results
rs_path = 'Results/split_eval_joint/bl_qa_combined.txt'
with open(rs_path, 'w') as op:
    simplejson.dump(results['combined'], op)


# Disabled: run all train-test combos on the individual sets in `datasets`
# for at_set in train_sets:
#     print("Current train: " + str(at_set) + "\n")

#     if at_set in ['orig','plus','wp1']:
#         # For orig and plus we directly train and test using kfold validation
#         results[at_set] = bl_cls.mr_kfold_pre_split(datasets[at_set][-1][1],datasets['wp1'][-1][1],0.25,n_k_fold)
#     else:
#         # For augmented data we need to also provide the "plus" set for evaluation and organizing the split
#         results[at_set] = bl_cls.mr_kfold_aug_pre_split(datasets['plus'][-1][1],datasets[at_set][-1][1],datasets['wp1'][-1][1],0.25,n_k_fold)

#     # Save the results in a file
#     rs_path = 'Results/split_eval_joint/bl_qa_os_'
#     s_path = rs_path + at_set + '.txt'
#     with open(s_path,'w') as op:
#         simplejson.dump(results[at_set],op)

In [None]:
# Visualize the results for the combined dataset.
# mr_proc_results prints the per-run summary followed by its mean over runs.
print("Combined dataset results:")
mr_proc_results(results['combined'])


# Disabled: per-train-set reporting, matching the disabled per-set training loop
# for at_set in train_sets:
#     print(at_set)
#     mr_proc_results(results[at_set])