# Parameters

In [0]:
# Sampling / truncation knobs.
first_n_words = 200       # word count trim_string works with when cutting each text
train_test_ratio = 0.02   # handed to train_test_split as train_size for each class

# Output folder for the UVictoria_train.csv / UVictoria_test.csv files.
destination_folder = '/content/drive/My Drive/COMP_4211_Project/Data/Phase_3'
# Input folder holding the raw Fake.csv / True.csv files.
raw_data_folder = '/content/drive/My Drive/COMP_4211_Project/Data/UVictoria'

# Libraries

In [0]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Preprocessing

In [0]:
def trim_string(x, n_words=None):
    """Strip a leading source prefix and keep only the first words of a text.

    The first five whitespace-separated tokens are scanned from the fifth
    back to the first. If one of them is exactly ``'-'`` or contains
    ``'(Reuters)'`` (e.g. ``"WASHINGTON (Reuters) - ..."``), everything up
    to and including that token is discarded. When no such token exists the
    text is kept from its very first word.

    Args:
        x: The text to trim.
        n_words: Maximum number of words to keep. Defaults to the
            module-level ``first_n_words`` setting.

    Returns:
        The kept words joined by single spaces; an empty string when the
        input has no words (or none remain after the prefix).
    """
    if n_words is None:
        n_words = first_n_words

    # At most 5 tokens can be consumed by the prefix, so n_words + 5 splits
    # always yield enough real tokens while the unsplit remainder (the last
    # list element) stays outside the slice taken below.
    tokens = x.split(maxsplit=n_words + 5)

    # Default: no prefix found, keep the text from its first word.
    start = 0
    for i in range(min(4, len(tokens) - 1), -1, -1):
        if tokens[i] == '-' or '(Reuters)' in tokens[i]:
            start = i + 1
            break

    return ' '.join(tokens[start:start + n_words])

In [0]:
def _load_and_clean(csv_path, label):
    """Read one raw CSV, clean its ``text`` column and attach ``label``.

    Args:
        csv_path: Path of the CSV file to read; only its ``text`` column
            is loaded.
        label: Value stored in the new ``label`` column for every row.

    Returns:
        A DataFrame with columns ``text`` and ``label`` in which every text
        has been passed through ``trim_string`` and texts shorter than
        5 characters (before or after trimming) have been removed.
    """
    df = pd.read_csv(csv_path, usecols=['text'])

    # Missing text is read as NaN, and ``NaN < 5`` is False, so the length
    # filter alone would let such rows through to trim_string, which needs
    # a str. Drop them explicitly first.
    df = df.dropna(subset=['text'])

    # Drop rows with (near-)empty text, trim, then drop again because
    # trimming can shorten a text below the threshold.
    df = df[df['text'].str.len() >= 5].copy()
    df['text'] = df['text'].apply(trim_string)
    df = df[df['text'].str.len() >= 5].copy()

    df['label'] = label
    return df


# Read raw data, drop rows with empty text, trim text, prepare labels
df_fake = _load_and_clean(raw_data_folder + '/Fake.csv', label=1)
df_real = _load_and_clean(raw_data_folder + '/True.csv', label=0)

# Train-test split, done per class with the same train_size and seed
df_real_train, df_real_test = train_test_split(df_real, train_size=train_test_ratio, random_state=4211)
df_fake_train, df_fake_test = train_test_split(df_fake, train_size=train_test_ratio, random_state=4211)

# Concatenate splits of different labels
df_train = pd.concat([df_real_train, df_fake_train], ignore_index=True, sort=False)
df_test = pd.concat([df_real_test, df_fake_test], ignore_index=True, sort=False)

# Write preprocessed data with the label column first
df_train = df_train.reindex(columns=['label', 'text'])
df_train.to_csv(destination_folder + '/UVictoria_train.csv', index=False)
df_test = df_test.reindex(columns=['label', 'text'])
df_test.to_csv(destination_folder + '/UVictoria_test.csv', index=False)