In [2]:
# Importing the relevant modules
from transformers import BertTokenizer, BertModel
import pandas as pd
import numpy as np
import torch
# Pre-trained BERT encoder
# ------------------------
# Embeddings are read from the hidden states this model returns,
# so it is loaded with hidden-state output switched on.
model = BertModel.from_pretrained(
    'bert-base-uncased',
    output_hidden_states=True,
)

# Matching tokenizer
# ------------------
# Loaded from the same 'bert-base-uncased' checkpoint as the model,
# so tokenization stays consistent with the model that generates
# the embeddings.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
tokenizer

PreTrainedTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [8]:
def bert_text_preparation(text, tokenizer):
    """Turn a raw string into the inputs passed to BERT.

    The text is wrapped in the ``[CLS]`` / ``[SEP]`` markers, tokenized,
    mapped to token ids, and paired with a segment id of 1 for every
    token.

    Args:
        text (str): Text to be converted.
        tokenizer (obj): Tokenizer object exposing ``tokenize`` and
            ``convert_tokens_to_ids``.

    Returns:
        list: Tokens produced by the tokenizer (markers included).
        obj: Torch tensor of token ids, shape [1, n_tokens].
        obj: Torch tensor of segment ids, shape [1, n_tokens], all 1.
    """
    # Surround the sentence with the start / separator markers
    wrapped = " ".join(("[CLS]", text, "[SEP]"))
    tokens = tokenizer.tokenize(wrapped)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The outer list gives each tensor a leading dimension of size 1
    ids_batch = torch.tensor([token_ids])
    segment_batch = torch.tensor([[1] * len(token_ids)])

    return tokens, ids_batch, segment_batch

In [9]:
def get_bert_embeddings(tokens_tensor, segments_tensors, model):
    """Run the model once and return the final layer's token embeddings.

    Args:
        tokens_tensor (obj): Torch tensor of token ids, as returned by
            ``bert_text_preparation`` (shape [1, n_tokens]).
        segments_tensors (obj): Torch tensor of segment ids with the
            same shape as ``tokens_tensor``.
        model (obj): Callable embedding model; element 2 of its output
            must be the sequence of per-layer hidden states.

    Returns:
        list: List of lists of floats, one inner list per token, taken
            from the last hidden state with dimension 0 squeezed out.
    """
    # Forward pass only: gradient tracking is switched off.
    # NOTE(review): both tensors are passed positionally. For Hugging Face
    # BertModel.forward the second positional parameter is attention_mask,
    # not token_type_ids -- confirm which one is intended here.
    with torch.no_grad():
        model_out = model(tokens_tensor, segments_tensors)
        # Element 2 holds the hidden states; the first entry is the
        # input state and is skipped.
        layer_states = model_out[2][1:]

    # Last layer, with the leading dimension squeezed away
    final_layer = layer_states[-1].squeeze(dim=0)

    # Convert each token's tensor into a plain Python list
    embeddings = []
    for token_vec in final_layer:
        embeddings.append(token_vec.tolist())
    return embeddings

In [10]:
# Example sentences
# -----------------
# Every entry uses the word 'bank' in a different form, to show
# the value of contextualized embeddings.
texts = [
    "bank",
    "The river bank was flooded.",
    "The bank vault was robust.",
    "He had to bank on her for support.",
    "The bank was out of money.",
    "The bank teller was a man.",
]

In [11]:
# Collect the embedding of the token 'bank' from each sentence in `texts`.
# After the loop, target_word_embeddings[k] is the embedding taken from texts[k].
target_word_embeddings = []

for text in texts:
    # Tokens plus the token-id / segment-id tensors for this sentence
    tokenized_text, tokens_tensor, segments_tensors = bert_text_preparation(text, tokenizer)
    # One embedding (list of floats) per token
    list_token_embeddings = get_bert_embeddings(tokens_tensor, segments_tensors, model)
    
    # Find the position of 'bank' in the list of tokens
    # (list.index returns the first match and raises ValueError if it is absent)
    word_index = tokenized_text.index('bank')
    # Get the embedding for 'bank'
    word_embedding = list_token_embeddings[word_index]

    target_word_embeddings.append(word_embedding)

In [16]:
len(target_word_embeddings[4])

768

In [71]:
from transformers import BertTokenizer

# Load the tokenizer of the "bert-base-uncased" pretrained model
# See https://huggingface.co/transformers/pretrained_models.html for other models
tz = BertTokenizer.from_pretrained("bert-base-uncased")

# The sentence to be encoded
sent = "radio france devient france radio france devient france tv inter france devient france tv régions france devient france tv juniors france devient france tv culture orange devient france télécom devient france macron république française devient france france"

# Encode the sentence into exactly `max_length` token ids.
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`,
# and `truncation=True` makes explicit the truncation that was previously
# applied implicitly (with a warning) whenever the text exceeded max_length.
encoded = tz.encode_plus(
    text=sent,  # the sentence to be encoded
    add_special_tokens=True,  # Add [CLS] and [SEP]
    max_length=40,  # maximum length of a sentence
    padding='max_length',  # Add [PAD]s up to max_length
    truncation=True,  # Cut sequences longer than max_length
    return_attention_mask=True,  # Generate the attention mask
    return_tensors='pt',  # ask the function to return PyTorch tensors
)

# Get the input IDs and attention mask in tensor format
input_ids = encoded['input_ids']
attn_mask = encoded['attention_mask']

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [72]:
input_ids

tensor([[  101,  2557,  2605, 14386,  4765,  2605,  2557,  2605, 14386,  4765,
          2605,  2694,  6970,  2605, 14386,  4765,  2605,  2694,  4655,  2605,
         14386,  4765,  2605,  2694, 16651,  2605, 14386,  4765,  2605,  2694,
          3226,  4589, 14386,  4765,  2605, 18126, 14386,  4765,  2605,   102]])

In [23]:
attn_mask

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [35]:
# Read train.csv into the DataFrame `x`.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
x=pd.read_csv("/home/bruce/EP-M1/ML-DL/Project/train.csv")

In [79]:
# `y` is the 'text' column of `x`; `y_vect` starts empty and is filled
# with one encoded token-id tensor per row by the encoding loop.
y=x['text']
y_vect=[]

In [80]:
# Encode every text in `y` to a fixed length of 40 token ids.
# The list is rebuilt from scratch here so that re-running this cell
# does not append a second copy of every row.
y_vect = []
for sent in y:
    # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`;
    # `truncation=True` makes the cut at max_length explicit instead of
    # relying on the implicit fallback that emits a warning.
    encoded = tz.encode_plus(
        text=sent,  # the sentence to be encoded
        add_special_tokens=True,  # Add [CLS] and [SEP]
        max_length=40,  # maximum length of a sentence
        padding='max_length',  # Add [PAD]s up to max_length
        truncation=True,  # Cut sequences longer than max_length
        return_attention_mask=True,  # Generate the attention mask
        return_tensors='pt',  # ask the function to return PyTorch tensors
    )
    # Only the token ids are kept; the attention mask is not stored
    input_ids = encoded['input_ids']
    y_vect.append(input_ids)



In [81]:
len(y_vect)

353969

In [75]:
x.drop('text',axis=1)

Unnamed: 0,retweets_count,favorites_count,followers_count,statuses_count,friends_count,mentions,urls,verified,hashtags,timestamp,TweetID
0,3,0,3682,453535,3628,[],[],0,[],1646978048000,832509
1,0,0,86,1016,284,[],[],0,[],1647694288000,1388011
2,3,1,1944,28234,1995,[],[],0,[],1647370048000,63896
3,0,0,1,1072,0,[],['https://t.co/rytlted08g'],0,[],1647256282000,979251
4,0,0,13957,25311,10841,[],[],0,[],1647258374000,1040049
...,...,...,...,...,...,...,...,...,...,...,...
353964,0,0,34,1509,55,[],['https://t.co/pma33zhslx'],0,[],1647438153000,142573
353965,0,0,89,11166,127,[],[],0,[],1647072106000,240866
353966,3,0,1888,712,3086,[],[],0,[],1647607230000,1173763
353967,0,0,139,486,320,[],[],0,[],1646987195000,929182


In [82]:
# Attach the encoded token-id tensors as a new 'tv' column (one list entry per row).
x['tv']=y_vect

In [83]:
x.head()

Unnamed: 0,text,retweets_count,favorites_count,followers_count,statuses_count,friends_count,mentions,urls,verified,hashtags,timestamp,TweetID,tv
0,rt refarcir macron ans nom prépare,3,0,3682,453535,3628,[],[],0,[],1646978048000,832509,"[[tensor(101), tensor(19387), tensor(25416), t..."
1,populaire,0,0,86,1016,284,[],[],0,[],1647694288000,1388011,"[[tensor(101), tensor(3769), tensor(7068), ten..."
2,faut dégager cinglé,3,1,1944,28234,1995,[],[],0,[],1647370048000,63896,"[[tensor(101), tensor(6904), tensor(4904), ten..."
3,enseignants mettre prescriptions président rép...,0,0,1,1072,0,[],['https://t.co/rytlted08g'],0,[],1647256282000,979251,"[[tensor(101), tensor(4372), tensor(20240), te..."
4,mafieuse oppressive macron,0,0,13957,25311,10841,[],[],0,[],1647258374000,1040049,"[[tensor(101), tensor(5003), tensor(8873), ten..."


In [90]:
x.drop('urls',axis=1)

Unnamed: 0,text,retweets_count,favorites_count,followers_count,statuses_count,friends_count,mentions,verified,hashtags,timestamp,TweetID,tv
0,rt refarcir macron ans nom prépare,3,0,3682,453535,3628,[],0,[],1646978048000,832509,"[[tensor(101), tensor(19387), tensor(25416), t..."
1,populaire,0,0,86,1016,284,[],0,[],1647694288000,1388011,"[[tensor(101), tensor(3769), tensor(7068), ten..."
2,faut dégager cinglé,3,1,1944,28234,1995,[],0,[],1647370048000,63896,"[[tensor(101), tensor(6904), tensor(4904), ten..."
3,enseignants mettre prescriptions président rép...,0,0,1,1072,0,[],0,[],1647256282000,979251,"[[tensor(101), tensor(4372), tensor(20240), te..."
4,mafieuse oppressive macron,0,0,13957,25311,10841,[],0,[],1647258374000,1040049,"[[tensor(101), tensor(5003), tensor(8873), ten..."
...,...,...,...,...,...,...,...,...,...,...,...,...
353964,gonflette tour raciste frustré,0,0,34,1509,55,[],0,[],1647438153000,142573,"[[tensor(101), tensor(2175), tensor(2078), ten..."
353965,france caste crapuleuse encadrée gangsters irr...,0,0,89,11166,127,[],0,[],1647072106000,240866,"[[tensor(101), tensor(2605), tensor(14542), te..."
353966,eric zemmour français berbère,3,0,1888,712,3086,[],0,[],1647607230000,1173763,"[[tensor(101), tensor(4388), tensor(27838), te..."
353967,gauchistes dépression pq,0,0,139,486,320,[],0,[],1646987195000,929182,"[[tensor(101), tensor(11721), tensor(15217), t..."


In [93]:
# Drop the listed columns and rebind `x` to the reduced frame.
x = x.drop(columns=['text', 'urls', 'hashtags', 'mentions', 'timestamp'])

In [94]:
x

Unnamed: 0,retweets_count,favorites_count,followers_count,statuses_count,friends_count,verified,TweetID,tv
0,3,0,3682,453535,3628,0,832509,"[[tensor(101), tensor(19387), tensor(25416), t..."
1,0,0,86,1016,284,0,1388011,"[[tensor(101), tensor(3769), tensor(7068), ten..."
2,3,1,1944,28234,1995,0,63896,"[[tensor(101), tensor(6904), tensor(4904), ten..."
3,0,0,1,1072,0,0,979251,"[[tensor(101), tensor(4372), tensor(20240), te..."
4,0,0,13957,25311,10841,0,1040049,"[[tensor(101), tensor(5003), tensor(8873), ten..."
...,...,...,...,...,...,...,...,...
353964,0,0,34,1509,55,0,142573,"[[tensor(101), tensor(2175), tensor(2078), ten..."
353965,0,0,89,11166,127,0,240866,"[[tensor(101), tensor(2605), tensor(14542), te..."
353966,3,0,1888,712,3086,0,1173763,"[[tensor(101), tensor(4388), tensor(27838), te..."
353967,0,0,139,486,320,0,929182,"[[tensor(101), tensor(11721), tensor(15217), t..."


In [96]:
# Read evaluation.csv into the DataFrame `ev`.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
ev = pd.read_csv("/home/bruce/EP-M1/ML-DL/Project/evaluation.csv")

In [97]:
# Rebind `y` to the 'text' column of `ev` (it previously held the 'text' column of `x`).
y=ev['text']

In [98]:
# Start a fresh list before encoding the texts now held in `y`.
y_vect=[]

In [99]:
# Encode every text in `y` to a fixed length of 40 token ids.
# The list is rebuilt from scratch here so that re-running this cell
# does not append a second copy of every row.
y_vect = []
for sent in y:
    # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`;
    # `truncation=True` makes the cut at max_length explicit instead of
    # relying on the implicit fallback that emits a warning.
    encoded = tz.encode_plus(
        text=sent,  # the sentence to be encoded
        add_special_tokens=True,  # Add [CLS] and [SEP]
        max_length=40,  # maximum length of a sentence
        padding='max_length',  # Add [PAD]s up to max_length
        truncation=True,  # Cut sequences longer than max_length
        return_attention_mask=True,  # Generate the attention mask
        return_tensors='pt',  # ask the function to return PyTorch tensors
    )
    # Only the token ids are kept; the attention mask is not stored
    input_ids = encoded['input_ids']
    y_vect.append(input_ids)



In [101]:
# Attach the encoded token-id tensors as a new "tv" column (one list entry per row).
ev["tv"]=y_vect

In [103]:
# Drop the listed columns and rebind `ev` to the reduced frame.
ev = ev.drop(columns=["mentions", "urls", "hashtags", "text", "timestamp"])

In [104]:
ev

Unnamed: 0,favorites_count,followers_count,statuses_count,friends_count,verified,TweetID,tv
0,0,85,4442,327,0,1184643,"[[tensor(101), tensor(18855), tensor(5657), te..."
1,0,427,33282,443,0,1199190,"[[tensor(101), tensor(6302), tensor(5157), ten..."
2,6,1127,13111,1596,0,917372,"[[tensor(101), tensor(9998), tensor(20236), te..."
3,2,1699,25760,2036,0,731754,"[[tensor(101), tensor(8840), tensor(2140), ten..."
4,0,249,20718,369,0,1400049,"[[tensor(101), tensor(27838), tensor(7382), te..."
...,...,...,...,...,...,...,...
117985,0,4,194,64,0,192729,"[[tensor(101), tensor(21451), tensor(2483), te..."
117986,0,744,39489,894,0,445763,"[[tensor(101), tensor(21451), tensor(7054), te..."
117987,12,924,2270,159,0,423814,"[[tensor(101), tensor(13012), tensor(13473), t..."
117988,0,5,4,80,0,1220017,"[[tensor(101), tensor(6581), tensor(2063), ten..."


In [105]:
ev

Unnamed: 0,favorites_count,followers_count,statuses_count,friends_count,verified,TweetID,tv
0,0,85,4442,327,0,1184643,"[[tensor(101), tensor(18855), tensor(5657), te..."
1,0,427,33282,443,0,1199190,"[[tensor(101), tensor(6302), tensor(5157), ten..."
2,6,1127,13111,1596,0,917372,"[[tensor(101), tensor(9998), tensor(20236), te..."
3,2,1699,25760,2036,0,731754,"[[tensor(101), tensor(8840), tensor(2140), ten..."
4,0,249,20718,369,0,1400049,"[[tensor(101), tensor(27838), tensor(7382), te..."
...,...,...,...,...,...,...,...
117985,0,4,194,64,0,192729,"[[tensor(101), tensor(21451), tensor(2483), te..."
117986,0,744,39489,894,0,445763,"[[tensor(101), tensor(21451), tensor(7054), te..."
117987,12,924,2270,159,0,423814,"[[tensor(101), tensor(13012), tensor(13473), t..."
117988,0,5,4,80,0,1220017,"[[tensor(101), tensor(6581), tensor(2063), ten..."


In [112]:
ev["tv"]

0         [[tensor(101), tensor(18855), tensor(5657), te...
1         [[tensor(101), tensor(6302), tensor(5157), ten...
2         [[tensor(101), tensor(9998), tensor(20236), te...
3         [[tensor(101), tensor(8840), tensor(2140), ten...
4         [[tensor(101), tensor(27838), tensor(7382), te...
                                ...                        
117985    [[tensor(101), tensor(21451), tensor(2483), te...
117986    [[tensor(101), tensor(21451), tensor(7054), te...
117987    [[tensor(101), tensor(13012), tensor(13473), t...
117988    [[tensor(101), tensor(6581), tensor(2063), ten...
117989    [[tensor(101), tensor(19387), tensor(17995), t...
Name: tv, Length: 117990, dtype: object

In [114]:
ev['tv'].iloc[0]

tensor([[  101, 18855,  5657, 10364, 13013,  4270,  3789,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]])

In [115]:
# Take the first entry of the 'tv' column of `ev`.
# NOTE(review): this rebinds `x`, which earlier in the notebook held the
# DataFrame loaded from train.csv; that frame is no longer reachable via `x`.
x = ev['tv'].iloc[0]

In [116]:
x.ndim

2

In [117]:
# Flatten the tensor to one dimension (ndim goes from 2 to 1) and rebind `x`.
x = torch.flatten(x)

In [119]:
x.ndim

1