# Final Project

## TRAC 2 - Format data for PET

In this notebook we generate the CSV files (training, development, test and unlabeled data) that are used as input for PET.
Here we prepare the data for Sub-task A, which has 3 classes (NAG, CAG and OAG).


## Package imports

In [1]:
import os

import numpy as np
import pandas as pd

## Load training, development and test data

In [2]:
# Load aggressiveness dataset
train_data = pd.read_csv('../../../data/release-files/eng/trac2_eng_train.csv')
dev_data = pd.read_csv('../../../data/release-files/eng/trac2_eng_dev.csv')

# the test texts and their gold labels are stored in separate files
test_data = pd.read_csv('../../../data/release-files/test/trac2_eng_test.csv')
test_labels_a = pd.read_csv('../../../data/release-files/gold/trac2_eng_gold_a.csv')['Sub-task A']

# The labels are attached by row position (index alignment), so both files must
# have the same number of rows; otherwise some test rows would silently end up
# with NaN labels (or surplus gold labels would be dropped).
# NOTE(review): this also assumes both files list the examples in the same
# order - confirm, e.g. by comparing an identifier column if the files have one.
assert len(test_labels_a) == len(test_data), (
    f'test file has {len(test_data)} rows but gold file has {len(test_labels_a)} labels'
)

# join text and labels for test data
test_data['Sub-task A'] = test_labels_a


## Labels

In [3]:
# encode the labels for task A as integers: NAG=0, CAG=1, OAG=2
label_to_id = {'NAG': 0, 'CAG': 1, 'OAG': 2}

for split_name, split_df in (('train', train_data), ('dev', dev_data), ('test', test_data)):
    # fail loudly on missing or unexpected labels instead of silently
    # encoding everything that is not NAG/CAG as OAG (2)
    is_known = split_df['Sub-task A'].isin(list(label_to_id))
    if not is_known.all():
        unexpected = split_df.loc[~is_known, 'Sub-task A'].unique().tolist()
        raise ValueError(f"unexpected 'Sub-task A' values in {split_name} data: {unexpected}")

    # adds the integer 'label' column to the dataframe in place
    split_df['label'] = split_df['Sub-task A'].map(label_to_id)


In [4]:
# keep only the text and the encoded label in every split
train_data, dev_data, test_data = (
    frame[['Text', 'label']] for frame in (train_data, dev_data, test_data)
)

In [5]:
# number of training examples per encoded class
train_data.label.value_counts()

0    3375
1     453
2     435
Name: label, dtype: int64

## Select the number of training, dev, test and unlabeled examples

In [6]:
def training_size(train_data, x, random_state=12345):
    '''Draw a class-balanced labeled sample and return the remaining rows as unlabeled data.

    x // 3 examples are sampled without replacement from each of the classes
    0, 1 and 2; the x % 3 leftover examples are taken from class 0 (NAG).

    Parameters:
    train_data: training dataframe with the columns 'Text' and 'label'
    x: total number of training examples to sample
    random_state: seed used for every sampling and shuffling step (default 12345)

    Returns:
    (train, unlabeled): two dataframes with the columns ['Text', 'label'], both
    shuffled and re-indexed from 0. train holds the x sampled rows, unlabeled
    holds every other row of train_data (its labels are still present).

    Raises:
    ValueError: raised by pandas when a class has fewer rows than requested,
    because sampling is done without replacement.
    '''
    # work on a positional index so that duplicate or non-default index labels
    # in train_data cannot make drop() remove rows that were never sampled
    pool = train_data[['Text', 'label']].reset_index(drop=True)

    # x may not be divisible by 3; the remainder is assigned to class 0 (NAG)
    per_class, remainder = divmod(x, 3)
    sizes = {0: per_class + remainder, 1: per_class, 2: per_class}

    # one sample per class, concatenated into the training dataframe
    sampled = [
        pool[pool['label'] == label].sample(size, replace=False, random_state=random_state)
        for label, size in sizes.items()
    ]
    train = pd.concat(sampled, axis=0)

    # the unlabeled dataframe is every row not used in the training dataframe
    unlabeled = pool.drop(index=train.index)

    # shuffle rows and renumber them from 0
    train = train.sample(frac=1, ignore_index=True, random_state=random_state)
    unlabeled = unlabeled.sample(frac=1, ignore_index=True, random_state=random_state)

    return train, unlabeled

In [7]:
# number of labeled examples to draw for the training file
labeled_examples = 500

# split the full training set into a labeled sample and the remaining rows
data = training_size(train_data, labeled_examples)
train, unlabeled = data

# blank out the labels of the rows that serve as unlabeled data
unlabeled['label'] = np.nan


In [8]:
# development and test datasets: keep the text and label columns only
dev_data, test_data = [frame[['Text', 'label']] for frame in (dev_data, test_data)]

In [9]:
# store the labels of the labeled splits as strings instead of integers
for frame in (train, dev_data, test_data):
    frame['label'] = frame['label'].astype('str')

In [10]:
# save data

# the folder name is derived from labeled_examples so that it always matches
# the size of the sampled training set
path = f'../../../data/pet_files/{labeled_examples}_examples_3c/'
train_name = path + 'train.csv'
unlabeled_name = path + 'unlabeled.csv'
dev_name = path + 'dev.csv'
test_name = path + 'test.csv'

# to_csv does not create missing folders, so make sure the target exists
os.makedirs(path, exist_ok=True)

# every file is written without the index and without a header row
train.to_csv(train_name, index=False, header=False)
unlabeled.to_csv(unlabeled_name, index=False, header=False)
dev_data.to_csv(dev_name, index=False, header=False)
test_data.to_csv(test_name, index=False, header=False)