# Final Project

## TRAC 2 - Format data for PET

In this notebook we generate the CSV files with the training, development, test and unlabeled data for use with PET.


## Package imports

In [1]:
import os

import numpy as np
import pandas as pd

## Load training data

In [2]:
# Read the English aggressiveness dataset; the train and dev files carry their own labels
dev_data = pd.read_csv('../../../data/release-files/eng/trac2_eng_dev.csv')
train_data = pd.read_csv('../../../data/release-files/eng/trac2_eng_train.csv')

# The test texts and their gold labels are stored in separate files
# (one gold file per sub-task)
test_data = pd.read_csv('../../../data/release-files/test/trac2_eng_test.csv')
gold_a = pd.read_csv('../../../data/release-files/gold/trac2_eng_gold_a.csv')
gold_b = pd.read_csv('../../../data/release-files/gold/trac2_eng_gold_b.csv')
test_labels_a = gold_a['Sub-task A']
test_labels_b = gold_b['Sub-task B']

# Attach the gold labels to the test texts. The assignment matches rows by index.
# NOTE(review): assumes the gold files list the rows in the same order as the test file - confirm
for column, gold_labels in (('Sub-task A', test_labels_a), ('Sub-task B', test_labels_b)):
    test_data[column] = gold_labels

## Labels

In [3]:
# Binarise both sub-tasks on every split.
#   label_a: 1 when Sub-task A is OAG or CAG (merged into one AG class), 0 otherwise (NAG)
#   label_b: 1 when Sub-task B is GEN, 0 otherwise
for split in (train_data, dev_data, test_data):
    split['label_a'] = split['Sub-task A'].map(lambda tag: 1 if tag in ('OAG', 'CAG') else 0)
    split['label_b'] = split['Sub-task B'].map(lambda tag: 1 if tag == 'GEN' else 0)

In [4]:
# keep only the text and the two binary label columns on every split
kept_columns = ['Text', 'label_a', 'label_b']
train_data, dev_data, test_data = (
    split[kept_columns] for split in (train_data, dev_data, test_data)
)

In [5]:
# class balance of the training set for task A (0 = NAG, 1 = OAG/CAG merged)
label_a_counts = train_data['label_a'].value_counts()
label_a_counts

0    3375
1     888
Name: label_a, dtype: int64

In [6]:
# class balance of the training set for task B (1 = GEN, 0 = everything else)
label_b_counts = train_data['label_b'].value_counts()
label_b_counts

0    3954
1     309
Name: label_b, dtype: int64

## Select the number of training, dev, test and unlabeled examples

In [None]:
def training_size(x, task, data=None):
    '''Draw a class-balanced labeled sample and return the rest as the unlabeled pool.

    x // 2 rows are drawn from class 0 and x // 2 rows from class 1 of the
    chosen task, so an odd x yields x - 1 training rows. A class is sampled
    without replacement whenever it holds at least x // 2 rows; only a class
    with fewer rows than that is sampled with replacement (oversampled).

    Parameters:
    x: total number of training examples requested
    task: 'A' (label column 'label_a') or 'B' (label column 'label_b')
    data: dataframe holding a 'Text' column and the label column of the task;
          defaults to the notebook-level train_data

    Returns:
    (train, unlabeled): two shuffled dataframes with a fresh 0..n-1 index and
    the columns 'Text' and the task label column. unlabeled contains every row
    of data that was not drawn into train; its label column is still filled in.

    Raises:
    ValueError: if task is neither 'A' nor 'B'
    '''
    label_columns = {'A': 'label_a', 'B': 'label_b'}
    if task not in label_columns:
        # without this check an unknown task would only fail later with an
        # unrelated "variable referenced before assignment" error
        raise ValueError(f"task must be 'A' or 'B', got {task!r}")
    labels = label_columns[task]

    if data is None:
        data = train_data

    per_class = x // 2
    seed = 12345

    # sample each class separately so that the training set is balanced
    samples = []
    for class_value in (0, 1):
        pool = data[data[labels] == class_value]
        # oversample only when the class really is too small for the request
        needs_replacement = per_class > len(pool)
        samples.append(pool.sample(per_class, replace=needs_replacement, random_state=seed))

    # create the training dataframe
    train = pd.concat(samples, axis=0)[['Text', labels]]

    # the unlabeled dataframe is all the data not used in the train dataframe;
    # the index is de-duplicated because oversampled rows appear more than once
    unlabeled = data.drop(index=train.index.unique())[['Text', labels]]

    # shuffle rows
    train = train.sample(frac=1, ignore_index=True, random_state=seed)
    unlabeled = unlabeled.sample(frac=1, ignore_index=True, random_state=seed)

    return train, unlabeled

In [None]:
# Number of labeled training examples to draw (training_size splits them evenly over both classes)
labeled_examples = 10

# training_size returns the pair (train, unlabeled) for the requested task
data = training_size(labeled_examples, 'A')
train, unlabeled = data

# blank out the labels of the unlabeled pool
unlabeled['label_a'] = np.nan

# both frames expose the label under the generic column name 'label'
label_rename = {'label_a': 'label'}
train = train.rename(columns=label_rename)
unlabeled = unlabeled.rename(columns=label_rename)

In [None]:
# Development and test sets: keep the text and the task A label only,
# and expose that label under the generic column name 'label'
task_columns = ['Text', 'label_a']
dev_data = dev_data[task_columns].rename(columns={'label_a': 'label'})
test_data = test_data[task_columns].rename(columns={'label_a': 'label'})

In [None]:
# store the labels of the three labeled splits as strings instead of integers
for labeled_split in (train, dev_data, test_data):
    labeled_split['label'] = labeled_split['label'].astype('str')

In [None]:
# save data: one header-less, index-less CSV file per split

# The folder name is derived from labeled_examples so that it always matches the
# size of the sample that was actually drawn (it used to be a separately
# hard-coded '10_examples/').
path = f'../../../data/pet_files/{labeled_examples}_examples/'
os.makedirs(path, exist_ok=True)  # to_csv does not create missing folders

train_name = path + 'train.csv'
unlabeled_name = path + 'unlabeled.csv'
dev_name = path + 'dev.csv'
test_name = path + 'test.csv'

train.to_csv(train_name, index=False, header=False)
unlabeled.to_csv(unlabeled_name, index=False, header=False)
dev_data.to_csv(dev_name, index=False, header=False)
test_data.to_csv(test_name, index=False, header=False)