In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import torch
import seaborn as sns

import torch.nn as nn
import torch.optim as optim

from torchtext.legacy.data import Field, TabularDataset, BucketIterator
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [4]:
# --- Text truncation -------------------------------------------------------
# Each sample is cut down to its leading words: this keeps the model input
# small while still leaving enough content for the prediction.
first_n_word = 500

# --- Split proportions -----------------------------------------------------
# Share of all samples held out as the test set (used as `test_size`).
train_test_ratio = 0.1
# Share of the remaining samples kept for training (used as `train_size`);
# whatever is left over forms the validation set.
train_valid_ratio = 0.8

# --- Locations -------------------------------------------------------------
# Raw input CSV and the folder that receives the generated split files.
raw_data_path = './data/news.csv'
destination_folder = './data'

def trim_string(x, n_words=None):
    """Return `x` reduced to its leading whitespace-separated words.

    Runs of whitespace between the kept words are collapsed to single
    spaces and leading/trailing whitespace is removed, because the text is
    split on whitespace and re-joined with " ".

    Args:
        x: The string to shorten.
        n_words: Maximum number of words to keep. When omitted (None), the
            notebook-level setting `first_n_word` is used, which is the
            behavior this function always had.

    Returns:
        A string made of at most `n_words` words from the start of `x`.
    """
    if n_words is None:
        # Looked up at call time so that editing the config cell takes
        # effect without having to re-define this function.
        n_words = first_n_word

    # `maxsplit` stops the scan once `n_words` words have been separated;
    # the unsplit remainder ends up in one extra trailing element, which the
    # slice below discards.
    words = x.split(maxsplit=n_words)
    return " ".join(words[:n_words])

In [None]:
# Build train / validation / test CSV files from the raw news dataset.
df_raw = pd.read_csv(raw_data_path)

# Column order used for every written CSV. The `fields` list defined later in
# this notebook is positional and starts with 'label', so the files must be
# written in exactly this order (presumably they are read back with
# TabularDataset, which maps a list of fields to columns by position).
csv_columns = ['label', 'title', 'text', 'titletext']

# prepare columns
# Binary target: 1 when the original label is 'FAKE', 0 otherwise.
df_raw['label'] = (df_raw['label'] == 'FAKE').astype('int')
# A missing title would turn the whole concatenation into NaN and make
# trim_string fail further down, so it is treated as an empty title here.
df_raw['titletext'] = df_raw['title'].fillna('') + ". " + df_raw['text']
df_raw = df_raw.reindex(columns=csv_columns)

# drop rows with empty text
# Selecting `len >= 5` (rather than dropping `len < 5`) also removes rows whose
# text is missing: NaN compares False, so such rows used to slip through.
# `.copy()` gives an independent frame so the column assignments below are safe.
df_raw = df_raw[df_raw['text'].str.len() >= 5].copy()

# trim text and titletext to the first `first_n_word` words
df_raw['text'] = df_raw['text'].apply(trim_string)
df_raw['titletext'] = df_raw['titletext'].apply(trim_string)

# train test split
X_raw = df_raw.drop(['label'], axis=1)
y_raw = df_raw['label']

# First hold out the test set, then carve the validation set out of what is
# left; both splits are stratified on the label and use a fixed seed.
X_full_train, X_test, y_full_train, y_test = train_test_split(
    X_raw, y_raw,
    test_size=train_test_ratio, random_state=20, stratify=y_raw,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_full_train, y_full_train,
    train_size=train_valid_ratio, random_state=20, stratify=y_full_train,
)

# Concatenate
# `.assign` returns a new frame (no SettingWithCopyWarning on the split
# results) and aligns the labels on the index; the column selection restores
# the label-first order expected by the readers of these files.
X_train = X_train.assign(label=y_train)[csv_columns]
X_val = X_val.assign(label=y_val)[csv_columns]
X_test = X_test.assign(label=y_test)[csv_columns]

X_train.to_csv(destination_folder+'/train.csv', index=False)
X_val.to_csv(destination_folder+'/valid.csv', index=False)
X_test.to_csv(destination_folder+'/test.csv', index=False)

In [5]:
# Labels are stored as numbers, so no vocabulary is built for them; they are
# loaded directly as float values.
label_field = Field(
    sequential=False,
    use_vocab=False,
    batch_first=True,
    dtype=torch.float,
)

# Text is tokenized with spaCy's small English model and lower-cased; batches
# are batch-first and also carry the length of every sequence.
text_field = Field(
    tokenize='spacy',
    tokenizer_language='en_core_web_sm',
    lower=True,
    include_lengths=True,
    batch_first=True,
)

# Ordered (column name, field) pairs. All three text columns deliberately
# reuse the very same `text_field` object.
fields = [('label', label_field)]
fields += [(column, text_field) for column in ('title', 'text', 'titletext')]

In [6]:
# Show the (name, Field) pairs. In the output, 'title', 'text' and 'titletext'
# report the same object address, i.e. they share one Field instance, while
# 'label' has its own.
fields

[('label', <torchtext.legacy.data.field.Field at 0x2380f34ed00>),
 ('title', <torchtext.legacy.data.field.Field at 0x2380f34eca0>),
 ('text', <torchtext.legacy.data.field.Field at 0x2380f34eca0>),
 ('titletext', <torchtext.legacy.data.field.Field at 0x2380f34eca0>)]