In [None]:
#pip install transformers==4.21.2

In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
from sklearn.model_selection import train_test_split
import torch
from datasets import load_dataset
from transformers import XLNetModel, XLNetTokenizer
import re

In [2]:
df_goss_raw = pd.read_csv("df_gosscop.csv")

In [3]:
def clean_txt(text):
    text = re.sub("'", "",text)
    text = re.sub("_", "",text)
    text=re.sub("(\\W)+"," ",text)    
    return text

In [4]:
df_goss_raw['clean_text'] = df_goss_raw.clean_text_combined.apply(clean_txt)

In [5]:
df = df_goss_raw[['Unnamed: 0','clean_text','label']]
df = df.rename(columns={'Unnamed: 0': 'idx'})
df

Unnamed: 0,idx,clean_text,label
0,0,Selena Gomez Goes Barefoot On The Street After...,0
1,1,Jessica Simpson Stumbles Out of Hubbys Birthda...,0
2,2,Kanye West Is Reportedly Opening a Restaurant ...,0
3,3,Is Kanye West Heading to Rehab It s that time ...,0
4,4,Pregnant Kate Middleton Hit With Cocaine Bombs...,0
...,...,...,...
13021,13269,Chris Pratt Guardians of the Galaxy Cast Call ...,1
13022,13270,Robert Pattinson and Suki Waterhouse spark rom...,1
13023,13271,Review The Marvelous Mrs Maisel rides Emmy win...,1
13024,13272,The Trailer For Keshas New Documentary Reveale...,1


In [6]:
ratio_train = 0.8
ratio_val = 0.1
ratio_test = 0.1

# Produces test split.
remaining, test = train_test_split(df, test_size=ratio_test, random_state=214)

# Adjusts val ratio, w.r.t. remaining dataset.
ratio_remaining = 1 - ratio_test
ratio_val_adjusted = ratio_val / ratio_remaining

# Produces train and val splits.
train, val, = train_test_split(remaining, test_size=ratio_val_adjusted, random_state=214)

In [7]:
print(train.shape, val.shape, test.shape)

(10420, 3) (1303, 3) (1303, 3)


In [8]:
train.head()

Unnamed: 0,idx,clean_text,label
1902,1934,Kourtney Kardashian Opens Up About Restricting...,0
1386,1408,15 Spectacular Celebrity Feuds of 2016 1 Kanye...,0
2539,2589,Nicole Kidman and Prince Harry named sexiest r...,0
5649,5778,Jamie Lynn Sigler on Her Special Chance to Rec...,1
2869,2922,Dakota Johnson And Chris Martin Fast Track Wed...,0


In [9]:
val.head()

Unnamed: 0,idx,clean_text,label
190,192,Angelina Jolies honeytrap plot to seize Uganda...,0
5266,5378,Teen Wolf Reboot Eyed at MTV Yes Weeks Before ...,1
11875,12103,Farrah Abrahams Daughter Sips 150 Apple Juice ...,1
11325,11547,David Beckham Tears Up After Son Brooklyn Surp...,1
7021,7177,Kelly Dodd Slams Meghan King Edmonds Marriage ...,1


In [24]:
test.head()

Unnamed: 0,idx,clean_text,label
430,436,Jessica Simpson Stumbles Nearly Falls Before B...,0
4881,4980,Shannon Beador Addresses Weight Gain on RHOC S...,1
11750,11973,Emilia Clarke on Solo Director Shake Up Game o...,1
1701,1729,Will Eminem Run For President In 2020 After Vi...,0
5031,5134,Jessica Alba expecting baby No 3 with husband ...,1


In [10]:
train['label'].value_counts()

1    8090
0    2330
Name: label, dtype: int64

In [11]:
val['label'].value_counts()

1    1011
0     292
Name: label, dtype: int64

In [12]:
test['label'].value_counts()

1    1019
0     284
Name: label, dtype: int64

In [13]:
#adapted from: https://colab.research.google.com/gist/beatobongco/98aa1ed3fe0ec1e1922edecbf2af934f/using-roberta-to-generate-features-for-a-simple-neural-network.ipynb
class FeatureExtractor:
    """Class that uses a Transformer model to vectorize batches of strings"""
    def __init__(self, model_type):
        MODELS = {
            # 'bert': (BertModel,       BertTokenizer,       'bert-base-uncased'),
             'xl-net': (XLNetModel,      XLNetTokenizer,      'xlnet-base-cased'),
            # 'roberta': (RobertaModel,    RobertaTokenizer,    'roberta-base')
        }
        m = MODELS[model_type]    
        self.model = m[0].from_pretrained(m[2])
        self.tokenizer = m[1].from_pretrained(m[2])

    def encode_strings(self, input_strings):
    # Encode text
        embeddings = []
        for s in input_strings:
          
            input_ids = torch.tensor([self.tokenizer.encode(s, add_special_tokens=True, max_length=512, padding = True,\
                                                            truncation=True)])
            with torch.no_grad():
                last_hidden_states = self.model(input_ids).last_hidden_state  
      
            embeddings.append(last_hidden_states[:,0].numpy())
        return embeddings

In [14]:
fe = FeatureExtractor('xl-net')

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetModel: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
array_train = train[["clean_text"]].to_numpy() 
X_train_txt = np.array(array_train[:,0].tolist())

In [16]:
train_hidden = fe.encode_strings(X_train_txt)

In [18]:
np.save('train_xlnet_simple.npy', train_hidden, allow_pickle=True)

In [19]:
array_val = val[["clean_text"]].to_numpy() 
X_val_txt = np.array(array_val[:,0].tolist())
val_hidden = fe.encode_strings(X_val_txt)
np.save('val_xlnet_simple.npy', val_hidden, allow_pickle=True)

In [20]:
array_test = test[["clean_text"]].to_numpy() 
X_test_txt = np.array(array_test[:,0].tolist())
test_hidden = fe.encode_strings(X_test_txt)
np.save('test_xlnet_simple.npy', test_hidden, allow_pickle=True)

In [21]:
train_label = np.array(train['label'])
train_idx = np.array(train['idx'])
list_train = list(zip(train_hidden, train_label, train_idx))
df_train = pd.DataFrame(list_train, columns=['hidden_state','label','idx'])
df_train.head()

Unnamed: 0,hidden_state,label,idx
0,"[[-0.6905054, 0.10080558, -3.5896642, 0.688696...",0,1934
1,"[[0.71386486, 1.624135, 0.39983767, 1.6060241,...",0,1408
2,"[[-1.4997008, -2.5545237, -2.6850965, 2.128371...",0,2589
3,"[[-0.59582114, 0.6739678, -0.54739684, 0.43669...",1,5778
4,"[[-0.17796452, 1.5538232, -0.1271606, 0.483559...",0,2922


In [22]:
val_label = np.array(val['label'])
val_idx = np.array(val['idx'])
list_val = list(zip(val_hidden, val_label, val_idx))
df_val = pd.DataFrame(list_val, columns=['hidden_state','label','idx'])
df_val.head()

Unnamed: 0,hidden_state,label,idx
0,"[[-0.3297603, -0.4885347, -0.45875156, 1.86647...",0,192
1,"[[0.82688785, -0.03501948, -3.5826552, 0.89857...",1,5378
2,"[[-1.0590429, -0.045592815, -2.6220367, 1.4351...",1,12103
3,"[[0.23462263, 0.6357783, -0.49189222, 0.865996...",1,11547
4,"[[-3.6478138, -0.8344219, 0.6634267, 1.3527999...",1,7177


In [25]:
test_label = np.array(test['label'])
test_idx = np.array(test['idx'])
list_test = list(zip(test_hidden, test_label, test_idx))
df_test = pd.DataFrame(list_test, columns=['hidden_state','label','idx'])
df_test.head()

Unnamed: 0,hidden_state,label,idx
0,"[[-0.39024025, -1.5027034, -1.6497475, -0.5419...",0,436
1,"[[-1.5501236, -3.0166059, -1.8201647, 1.163599...",1,4980
2,"[[-0.02296041, -0.38859874, -0.12139326, -0.57...",1,11973
3,"[[1.6468408, -1.9771976, -4.8161287, 0.5524025...",0,1729
4,"[[0.21094456, -1.9484203, -0.25539798, 0.16063...",1,5134


In [26]:
df_train.to_pickle('C:/Users/labca/Documents/Dissertation - Fake News/Embeddings/df_train_xlnet_simple.pkl')

In [27]:
df_val.to_pickle('C:/Users/labca/Documents/Dissertation - Fake News/Embeddings/df_val_xlnet_simple.pkl')

In [28]:
df_test.to_pickle('C:/Users/labca/Documents/Dissertation - Fake News/Embeddings/df_test_xlnet_simple.pkl')