In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
from sklearn.model_selection import train_test_split
import torch
from datasets import load_dataset
from transformers import BertModel, BertTokenizer
import re

In [2]:
# Load the raw dataset from `df_gosscop.csv`; the path is relative, so the
# file must sit in the kernel's current working directory.
df_goss_raw = pd.read_csv("df_gosscop.csv")

In [3]:
def clean_txt(text):
    r"""Remove apostrophes and underscores, then collapse non-word runs.

    Parameters
    ----------
    text : str
        Text to normalise.

    Returns
    -------
    str
        ``text`` with every ``'`` and ``_`` deleted, and every run of
        non-word characters (regex ``\W``) replaced by a single space.
        No stripping is done, so a leading or trailing run of non-word
        characters leaves a single space at that end.
    """
    cleaned = text
    # Characters that are dropped outright (no space is left in their place).
    for removable in ("'", "_"):
        cleaned = re.sub(removable, "", cleaned)
    # Whatever non-word characters remain act as separators.
    return re.sub("(\\W)+", " ", cleaned)

In [4]:
# Regex-based cleaning via `clean_txt`, stored in the `clean_text` column.
# NOTE(review): the `clean_txt` column built below is derived from
# `clean_text_combined`, not from `clean_text`, so nothing in this cell reads
# the column created here -- confirm whether that is intended.
df_goss_raw['clean_text'] = df_goss_raw.clean_text_combined.apply(clean_txt)

#apply additional preprocessing
# Delete the listed punctuation characters, lowercase, and collapse all
# whitespace runs to single spaces (which also trims both ends).
punctuation = r'!"#$%&()*+-/:;<=>?@[\\]^_`{|}~\',.'
# Build the deletion table once instead of constructing set(punctuation)
# again for every character of every row.
punctuation_table = str.maketrans('', '', punctuation)
df_goss_raw['clean_txt'] = (
    df_goss_raw['clean_text_combined']
    .apply(lambda x: x.translate(punctuation_table))
    .str.lower()
    .apply(lambda x: ' '.join(x.split()))
)

In [5]:
# Keep only the columns used downstream and give the CSV's unnamed first
# column (`Unnamed: 0`) the explicit name `idx`.
df = (
    df_goss_raw
    .loc[:, ['Unnamed: 0', 'clean_txt', 'label']]
    .rename(columns={'Unnamed: 0': 'idx'})
)
df

Unnamed: 0,idx,clean_txt,label
0,0,selena gomez goes barefoot on the street after...,0
1,1,jessica simpson stumbles out of hubbys birthda...,0
2,2,kanye west is reportedly opening a restaurant ...,0
3,3,is kanye west heading to rehab it’s that time ...,0
4,4,pregnant kate middleton hit with cocaine bombs...,0
...,...,...,...
13021,13269,chris pratt ‘guardians of the galaxy’ cast cal...,1
13022,13270,robert pattinson and suki waterhouse spark rom...,1
13023,13271,review the marvelous mrs maisel rides emmy win...,1
13024,13272,the trailer for keshas new documentary reveale...,1


In [7]:
# Target proportions of the full dataset for each partition.
# `ratio_train` is informational only: the train size is whatever is left
# after the test and validation splits below.
ratio_train = 0.8
ratio_val = 0.1
ratio_test = 0.1

# Step 1: hold out the test partition from the full frame.
# NOTE(review): neither split passes `stratify=`, so class proportions in
# each partition are left to chance -- confirm this is acceptable.
remaining, test = train_test_split(
    df,
    test_size=ratio_test,
    random_state=214,
)

# Step 2: carve the validation partition out of what is left. `ratio_val`
# is expressed against the full dataset, so rescale it to the remainder.
ratio_remaining = 1 - ratio_test
ratio_val_adjusted = ratio_val / ratio_remaining
train, val = train_test_split(
    remaining,
    test_size=ratio_val_adjusted,
    random_state=214,
)

In [8]:
# Sanity check: (rows, columns) of each partition produced by the split above.
print(train.shape, val.shape, test.shape)

(10420, 3) (1303, 3) (1303, 3)


In [9]:
# Peek at the first rows of the training partition (index order is shuffled by the split).
train.head()

Unnamed: 0,idx,clean_txt,label
1902,1934,kourtney kardashian opens up about restricting...,0
1386,1408,15 spectacular celebrity feuds of 2016 1 kanye...,0
2539,2589,nicole kidman and prince harry named sexiest r...,0
5649,5778,jamielynn sigler on her special chance to reco...,1
2869,2922,dakota johnson and chris martin fasttrack wedd...,0


In [10]:
# Peek at the first rows of the validation partition.
val.head()

Unnamed: 0,idx,clean_txt,label
190,192,angelina jolies honeytrap plot to seize uganda...,0
5266,5378,teen wolf reboot eyed at mtv — yes weeks befor...,1
11875,12103,farrah abrahams daughter sips 150 apple juice ...,1
11325,11547,david beckham tears up after son brooklyn surp...,1
7021,7177,kelly dodd slams meghan king edmonds’ marriage...,1


In [11]:
# Peek at the first rows of the test partition.
test.head()

Unnamed: 0,idx,clean_txt,label
430,436,jessica simpson stumbles nearly falls before b...,0
4881,4980,shannon beador addresses weight gain on ‘rhoc’...,1
11750,11973,emilia clarke on ‘solo’ director shakeup ‘game...,1
1701,1729,will eminem run for president in 2020 after vi...,0
5031,5134,jessica alba expecting baby no 3 with husband ...,1


In [12]:
# Class distribution of `label` in the training partition.
train['label'].value_counts()

1    8090
0    2330
Name: label, dtype: int64

In [13]:
# Class distribution of `label` in the validation partition.
val['label'].value_counts()

1    1011
0     292
Name: label, dtype: int64

In [14]:
# Class distribution of `label` in the test partition.
test['label'].value_counts()

1    1019
0     284
Name: label, dtype: int64

In [15]:
#adapted from: https://colab.research.google.com/gist/beatobongco/98aa1ed3fe0ec1e1922edecbf2af934f/using-roberta-to-generate-features-for-a-simple-neural-network.ipynb
class FeatureExtractor:
    """Vectorise strings with a pretrained Transformer model.

    Only the ``'bert'`` key is currently registered; it pairs ``BertModel``
    and ``BertTokenizer`` with the ``'bert-base-uncased'`` checkpoint.
    """

    def __init__(self, model_type):
        """Load the model and tokenizer registered under ``model_type``.

        Parameters
        ----------
        model_type : str
            Key into the registry below (currently only ``'bert'``).

        Raises
        ------
        KeyError
            If ``model_type`` is not a registered key.
        """
        # key -> (model class, tokenizer class, checkpoint name)
        MODELS = {
            'bert': (BertModel, BertTokenizer, 'bert-base-uncased'),
            # 'xl-net': (XLNetModel,      XLNetTokenizer,      'xlnet-base-cased'),
            # 'roberta': (RobertaModel,    RobertaTokenizer,    'roberta-base')
        }
        model_cls, tokenizer_cls, checkpoint = MODELS[model_type]
        self.model = model_cls.from_pretrained(checkpoint)
        self.tokenizer = tokenizer_cls.from_pretrained(checkpoint)

    def _embed_one(self, text):
        """Return the position-0 vector of the last hidden state for ``text``."""
        token_ids = self.tokenizer.encode(
            text,
            add_special_tokens=True,
            max_length=512,
            padding=True,
            truncation=True,
        )
        # Wrap in a list so the model receives a batch containing one sequence.
        batch = torch.tensor([token_ids])
        # Feature extraction only: no gradients are needed.
        with torch.no_grad():
            hidden = self.model(batch).last_hidden_state
        # Index 0 along the sequence axis; with special tokens added this is
        # presumably BERT's [CLS] position. The batch axis (size 1) is kept.
        return hidden[:, 0].numpy()

    def encode_strings(self, input_strings):
        """Embed each string separately.

        Parameters
        ----------
        input_strings : iterable of str
            Texts to embed; each is tokenised with truncation at 512 tokens.

        Returns
        -------
        list of numpy.ndarray
            One array per input string, in input order. Each array keeps a
            leading batch axis of size 1.
        """
        return [self._embed_one(s) for s in input_strings]

In [16]:
# Build the extractor with the 'bert' registry entry; the constructor loads
# the pretrained model and tokenizer, so this cell may download weights.
fe = FeatureExtractor('bert')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
# Pull the cleaned text of the training partition out as a 2-D array with a
# single column, then flatten that column into a 1-D array of strings.
array_train = train.loc[:, ["clean_txt"]].to_numpy()
X_train_txt = np.array(array_train[:, 0].tolist())

In [19]:
# Embed every training text. `encode_strings` runs the model once per string,
# so this is the slow step of the notebook; the result is a list with one
# array per text, in the same order as `X_train_txt`.
train_hidden = fe.encode_strings(X_train_txt)

In [20]:
# Persist the training embeddings to a .npy file (relative path, so it lands
# in the current working directory).
np.save('train_bert_simple.npy', train_hidden, allow_pickle=True)

In [21]:
# Validation partition: extract the cleaned text column, embed each string
# with `fe`, and save the embeddings to a .npy file (relative path).
array_val = val.loc[:, ["clean_txt"]].to_numpy()
X_val_txt = np.array(array_val[:, 0].tolist())
val_hidden = fe.encode_strings(X_val_txt)
np.save('val_bert_simple.npy', val_hidden, allow_pickle=True)

In [22]:
# Test partition: extract the cleaned text column, embed each string with
# `fe`, and save the embeddings to a .npy file (relative path).
array_test = test.loc[:, ["clean_txt"]].to_numpy()
X_test_txt = np.array(array_test[:, 0].tolist())
test_hidden = fe.encode_strings(X_test_txt)
np.save('test_bert_simple.npy', test_hidden, allow_pickle=True)

In [23]:
# Assemble the training partition into one frame: each row holds a text's
# embedding array, its label, and its original `idx`.
# The zip pairs items by position, so this relies on `train_hidden` being in
# the same row order as `train` (it was produced from `train`'s text column).
train_label = np.array(train['label'])
train_idx = np.array(train['idx'])
list_train = list(zip(train_hidden, train_label, train_idx))
df_train = pd.DataFrame(list_train, columns=['hidden_state','label','idx'])
df_train.head()

Unnamed: 0,hidden_state,label,idx
0,"[[-0.6351673, 0.014559049, 0.29377505, 0.37676...",0,1934
1,"[[-0.3496328, -0.1398266, 0.20144267, 0.455405...",0,1408
2,"[[-0.45250654, 0.11762404, 0.14361618, -0.6483...",0,2589
3,"[[0.11294373, -0.20785874, 0.2868215, -0.27205...",1,5778
4,"[[-0.07389187, 0.09317138, 0.5003885, -0.26259...",0,2922


In [24]:
# Assemble the validation partition into one frame: each row holds a text's
# embedding array, its label, and its original `idx`.
# The zip pairs items by position, so this relies on `val_hidden` being in
# the same row order as `val`.
val_label = np.array(val['label'])
val_idx = np.array(val['idx'])
list_val = list(zip(val_hidden, val_label, val_idx))
df_val = pd.DataFrame(list_val, columns=['hidden_state','label','idx'])
df_val.head()

Unnamed: 0,hidden_state,label,idx
0,"[[-0.36481032, -0.002037473, 0.421112, 0.19184...",0,192
1,"[[-0.087618366, -0.53036445, 0.35774526, 0.315...",1,5378
2,"[[-0.3008975, -0.12119578, 0.4158599, 0.195135...",1,12103
3,"[[-0.5137289, -0.26202345, 0.35133645, 0.53006...",1,11547
4,"[[-0.13322158, 0.04337415, 0.11195108, 0.02458...",1,7177


In [25]:
# Assemble the test partition into one frame: each row holds a text's
# embedding array, its label, and its original `idx`.
# The zip pairs items by position, so this relies on `test_hidden` being in
# the same row order as `test`.
test_label = np.array(test['label'])
test_idx = np.array(test['idx'])
list_test = list(zip(test_hidden, test_label, test_idx))
df_test = pd.DataFrame(list_test, columns=['hidden_state','label','idx'])
df_test.head()

Unnamed: 0,hidden_state,label,idx
0,"[[0.050070293, -0.28424078, 0.29238722, 0.2928...",0,436
1,"[[-0.58470356, -0.18257326, 0.14449677, -0.007...",1,4980
2,"[[0.15742663, -0.1125668, 0.1829456, 0.0956175...",1,11973
3,"[[-0.3247289, -0.27901417, 0.04718214, 0.24377...",0,1729
4,"[[-0.58893657, 0.0008235853, 0.06734834, 0.262...",1,5134


In [26]:
# Destination folder for the pickled embedding frames. It defaults to the
# original absolute location, but can be redirected on another machine by
# setting the EMBEDDINGS_DIR environment variable before running the notebook.
EMBEDDINGS_DIR = Path(os.environ.get(
    'EMBEDDINGS_DIR',
    'C:/Users/labca/Documents/Dissertation - Fake News/Embeddings',
))
# The folder must already exist; to_pickle does not create it.
df_train.to_pickle(EMBEDDINGS_DIR / 'df_train_bert_simple.pkl')

In [27]:
# Destination folder for the pickled embedding frames. It defaults to the
# original absolute location, but can be redirected on another machine by
# setting the EMBEDDINGS_DIR environment variable before running the notebook.
EMBEDDINGS_DIR = Path(os.environ.get(
    'EMBEDDINGS_DIR',
    'C:/Users/labca/Documents/Dissertation - Fake News/Embeddings',
))
# The folder must already exist; to_pickle does not create it.
df_val.to_pickle(EMBEDDINGS_DIR / 'df_val_bert_simple.pkl')

In [28]:
# Destination folder for the pickled embedding frames. It defaults to the
# original absolute location, but can be redirected on another machine by
# setting the EMBEDDINGS_DIR environment variable before running the notebook.
EMBEDDINGS_DIR = Path(os.environ.get(
    'EMBEDDINGS_DIR',
    'C:/Users/labca/Documents/Dissertation - Fake News/Embeddings',
))
# The folder must already exist; to_pickle does not create it.
df_test.to_pickle(EMBEDDINGS_DIR / 'df_test_bert_simple.pkl')