# Parameters

In [0]:
# Folder that holds the raw LIAR files (train.tsv, valid.tsv, test.tsv).
raw_data_folder = '/content/drive/My Drive/COMP_4211_Project/Data/LIAR'
# Folder that receives the preprocessed LIAR_train/valid/test CSV files.
destination_folder = '/content/drive/My Drive/COMP_4211_Project/Data/Phase_3'

# Fraction of each class kept for train+valid; the remainder becomes the test set.
train_test_ratio = 0.90
# Fraction of the train+valid portion kept for training; the remainder is validation.
train_valid_ratio = 0.80
# Maximum number of whitespace-separated words kept per text (used by trim_string).
first_n_words = 200

# Libraries

In [0]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Preprocessing

In [0]:
def trim_string(x, n_words=None):
    """Keep only the leading words of a string.

    The text is split on runs of whitespace, the first ``n_words`` words are
    kept and re-joined with single spaces (so whitespace between the kept
    words is normalised to one space).

    Args:
        x: The string to trim.
        n_words: Maximum number of words to keep (expected to be
            non-negative). When ``None`` (the default) the notebook-level
            ``first_n_words`` setting is used, so ``Series.apply(trim_string)``
            behaves exactly as before.

    Returns:
        The trimmed string; an empty string if ``x`` contains no words.
    """
    if n_words is None:
        # Resolved at call time so later changes to the parameter cell apply.
        n_words = first_n_words

    # maxsplit bounds the work on long texts: at most n_words + 1 pieces are
    # produced, and the unsplit tail (if any) is discarded by the slice.
    words = x.split(maxsplit=n_words)
    return ' '.join(words[:n_words])

def numericalize_label(x):
    """Map a raw label string to an integer class code.

    Returns 1 for 'pants-fire' or 'false', 0 for 'mostly-true' or 'true',
    and 2 for any other value.
    """
    code_table = (
        (1, ('pants-fire', 'false')),
        (0, ('mostly-true', 'true')),
    )
    for code, raw_labels in code_table:
        if x in raw_labels:
            return code

    # Anything not listed above falls into the catch-all class.
    return 2

In [0]:
# Sanity check: 'false' is one of the labels mapped to class 1, so this displays 1.
numericalize_label('false')

1

In [0]:
# Read raw data: from each TSV keep only column 1 (label) and column 2 (text).
# NOTE(review): pandas' default quoting treats '"' as a quote character; if the
# statements contain unbalanced quotes, rows may be merged -- confirm row counts.
df_train = pd.read_csv(raw_data_folder + '/train.tsv', sep='\t', names=['label','text'], usecols=[1,2])
df_valid = pd.read_csv(raw_data_folder + '/valid.tsv', sep='\t', names=['label','text'], usecols=[1,2])
df_test = pd.read_csv(raw_data_folder + '/test.tsv', sep='\t', names=['label','text'], usecols=[1,2])

# Pool the three raw splits; they are re-split per class further down.
df_full = pd.concat([df_train, df_valid, df_test], ignore_index=True, sort=False)

# Drop rows with empty text: keep only rows whose text has at least 5 characters.
# A positive ">= 5" mask (rather than dropping "< 5") also removes rows whose
# text is missing: .str.len() yields NaN for them and NaN >= 5 is False, so they
# can no longer reach trim_string() and fail on x.split(). Building a new frame
# instead of dropping in place keeps this step explicit and free of
# SettingWithCopy issues on the assignments below.
has_text = df_full['text'].str.len() >= 5
df_full = df_full[has_text].copy()

# Trim text to first_n_words
df_full['text'] = df_full['text'].apply(trim_string)

# Split according to labels. Rows mapped to the catch-all code 2 match neither
# filter and are therefore left out of every output file.
df_full['label'] = df_full['label'].apply(numericalize_label)
df_fake = df_full[df_full.label == 1]
df_real = df_full[df_full.label == 0]

# Train-test split, done separately per class so both classes are divided in
# the same proportion (train_test_ratio goes to train+valid, the rest to test).
df_real_full_train, df_real_test = train_test_split(df_real, train_size = train_test_ratio, random_state = 4211)
df_fake_full_train, df_fake_test = train_test_split(df_fake, train_size = train_test_ratio, random_state = 4211)

# Train-valid split of the train+valid portion, again per class.
df_real_train, df_real_valid = train_test_split(df_real_full_train, train_size = train_valid_ratio, random_state = 4211)
df_fake_train, df_fake_valid = train_test_split(df_fake_full_train, train_size = train_valid_ratio, random_state = 4211)

# Concatenate splits of different labels (real rows first, then fake rows;
# the result is not shuffled here).
df_train = pd.concat([df_real_train, df_fake_train], ignore_index=True, sort=False)
df_valid = pd.concat([df_real_valid, df_fake_valid], ignore_index=True, sort=False)
df_test = pd.concat([df_real_test, df_fake_test], ignore_index=True, sort=False)

# Write preprocessed data (without the index column)
df_train.to_csv(destination_folder + '/LIAR_train.csv', index=False)
df_valid.to_csv(destination_folder + '/LIAR_valid.csv', index=False)
df_test.to_csv(destination_folder + '/LIAR_test.csv', index=False)

In [0]:
# Re-read the raw LIAR splits (label and text columns only) and pool them again.
df_train, df_valid, df_test = (
    pd.read_csv(
        raw_data_folder + tsv_name,
        sep='\t',
        names=['label','text'],
        usecols=[1,2],
    )
    for tsv_name in ('/train.tsv', '/valid.tsv', '/test.tsv')
)
df_full = pd.concat(
    [df_train, df_valid, df_test],
    ignore_index=True,
    sort=False,
)