# Dataset Generation

In [None]:
import os
import pickle
import numpy as np
import matplotlib.pyplot as plt
# Fix NumPy's global RNG state so the random permutations drawn further down
# come out the same on every run of the notebook.
np.random.seed(42)

## Extract Claims
* The `.ann` annotation files are read from the folders `data/p` and `data/n`.

In [None]:
def _parse_ann_file(path):
    """Parse one tab-separated annotation file.

    Lines with exactly two tab-separated fields have their second field split
    on single spaces into three tokens; the second token is used as an
    identifier and the third as the type label stored for it.

    Every other non-blank line is read as ``identifier<TAB>info<TAB>phrase``.
    The phrase is kept only when the first space-separated token of ``info``
    is ``'claim'``.

    Blank lines are skipped.

    Returns:
        A ``(types, claims)`` pair of dicts, both keyed by identifier:
        ``types`` maps to the type label, ``claims`` to the claim text.
    """
    types = {}
    claims = {}
    with open(path) as f:
        # Iterate the file lazily instead of materialising it with readlines().
        for raw_line in f:
            line = raw_line.replace('\n', '')
            if not line.strip():
                # A blank (e.g. trailing) line has no fields to index into.
                continue
            parts = line.split('\t')
            if len(parts) == 2:
                _, identifier, info = parts[1].split(' ')
                types[identifier] = info
            else:
                identifier, info, phrase = parts[0], parts[1], parts[2]
                # Only the leading token matters here; not unpacking the rest
                # keeps lines whose info field has a different number of
                # tokens from raising ValueError.
                phrase_type = info.split(' ')[0]
                if phrase_type == 'claim':
                    claims[identifier] = phrase
    return types, claims


# Claim texts collected over all annotation files, grouped by type label.
phrases = {
    'interpretation': [],
    'evaluation_rational': [],
    'evaluation_emotional': [],
    'agreement': [],
    'disagreement': [],
}

for folder in ('data/p', 'data/n'):
    # os.listdir returns names in arbitrary order; sorting keeps the order of
    # the collected phrases (and with it the seeded split computed from
    # them) identical across machines and runs.
    for fname in sorted(os.listdir(folder)):
        cur_types, cur_phrases = _parse_ann_file(f'{folder}/{fname}')
        for identifier, phrase in cur_phrases.items():
            # Raises KeyError if a claim has no type entry in its file or its
            # type label is not one of the keys of `phrases`.
            phrases[cur_types[identifier]].append(phrase)
                

## Generate Indices for different datasets

In [None]:
def train_val_test_idcs(n_samples, split_train=0.7, split_val=0.2, split_test=0.1):
    """Randomly partition the indices ``0 .. n_samples - 1`` into three arrays.

    A random permutation of the indices is drawn from NumPy's global RNG and
    cut into consecutive train / validation / test pieces.

    Args:
        n_samples: Number of indices to distribute.
        split_train: Fraction of samples for the training piece.
        split_val: Fraction of samples for the validation piece.
        split_test: Fraction for the test piece. It is only used to check
            that the fractions sum to 1; the test piece receives whatever
            remains after the other two have been cut off.

    Returns:
        A tuple ``(train_idcs, val_idcs, test_idcs)`` of index arrays. The
        train and validation sizes are truncated to whole numbers.

    Raises:
        AssertionError: If the three fractions do not add up to 1.
    """
    fractions_total = split_train + split_val + split_test
    assert np.isclose(fractions_total, 1), 'Splits must add up to 1.'

    shuffled = np.random.permutation(n_samples)
    n_train = int(n_samples * split_train)
    n_val = int(n_samples * split_val)

    first_val = n_train
    first_test = n_train + n_val
    return shuffled[:first_val], shuffled[first_val:first_test], shuffled[first_test:]

In [None]:
# One (train, val, test) index triple per class, in the key order of `phrases`.
idcs = [train_val_test_idcs(len(phrases[k])) for k in phrases.keys()]

## Split Data

In [None]:
# Apply each class's index triple to its phrases, producing one dict per
# split that maps the class name to an array of the selected phrases.
train, val, test = {}, {}, {}
for pos, (key, samples) in enumerate(phrases.items()):
    samples_arr = np.array(samples)
    train_idcs, val_idcs, test_idcs = idcs[pos]
    train[key] = samples_arr[train_idcs]
    val[key] = samples_arr[val_idcs]
    test[key] = samples_arr[test_idcs]

## Save Data

In [None]:
# Pickle each split dict to its own file in the working directory.
for path, split in (
    ('data_train.p', train),
    ('data_val.p', val),
    ('data_test.p', test),
):
    with open(path, 'wb') as f:
        pickle.dump(split, f)

### Save in an `sklearn`-compatible format

In [None]:
# Flatten the validation split into parallel arrays: X holds the phrases and
# y the position of each phrase's class in the key order of `val`.
# Both part lists start with an empty float array so the concatenated results
# get the same dtypes as appending onto `np.array([])` one class at a time.
X_parts = [np.array([])]
y_parts = [np.array([])]
for cls, key in enumerate(val.keys()):
    X_parts.append(val[key])
    y_parts.append(np.repeat(cls, len(val[key])))
X = np.concatenate(X_parts)
y = np.concatenate(y_parts)

In [None]:
# Store the phrase array and the label array under the names 'X' and 'y'.
# NOTE(review): the X / y saved here were built from the `val` split, not
# from `train` - confirm that this is what 'dataset.npz' is meant to contain.
np.savez('dataset.npz', X=X, y=y)

In [None]:
# Flatten the test split into parallel arrays: X holds the phrases and y the
# position of each phrase's class in the key order of `test`.
# Both part lists start with an empty float array so the concatenated results
# get the same dtypes as appending onto `np.array([])` one class at a time.
X_parts = [np.array([])]
y_parts = [np.array([])]
for cls, key in enumerate(test.keys()):
    X_parts.append(test[key])
    y_parts.append(np.repeat(cls, len(test[key])))
X = np.concatenate(X_parts)
y = np.concatenate(y_parts)

In [None]:
# Store the current X (phrases) and y (class labels) arrays under the names
# 'X' and 'y' in a single .npz archive.
np.savez('dataset_test.npz', X=X, y=y)

### Generate Dataset statistics

In [None]:
# Number of collected phrases per class (before splitting), as a float array
# in the key order of `phrases`.
cnts = np.array([len(phrases[key]) for key in phrases.keys()], dtype=float)
cnts

In [None]:
# Bar chart of the per-class phrase counts, one bar per class.
# The bar positions are derived from `cnts` rather than hard-coded, so the
# plot stays correct if classes are added to or removed from `phrases`.
positions = range(len(cnts))
plt.figure()
plt.bar(positions, cnts)
plt.xticks(positions, labels=list(phrases.keys()), rotation=15)
# NOTE(review): 'distributuin' looks like a typo for 'distribution'; the file
# name is left unchanged in case something else refers to it - confirm.
plt.savefig('class_distributuin.png', bbox_inches='tight')