# Install Transformers Library

Source: [Fine_Tuning_BERT_for_Spam_Classification](https://github.com/prateekjoshi565/Fine-Tuning-BERT/blob/master/Fine_Tuning_BERT_for_Spam_Classification.ipynb)

In [1]:
# Mount Google Drive at /content/drive; the CSV inputs, saved weights and
# prediction outputs used by this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.2-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 4.9 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 54.9 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 74.8 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.9.1 tokenizers-0.12.1 transformers-4.21.2


In [3]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import transformers
from transformers import AutoModel, BertTokenizerFast

# Use the GPU when the runtime has one, otherwise fall back to the CPU so the
# notebook still runs (more slowly) on a CPU-only kernel.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
def sentiment2label(x):
  """Collapse a sentiment value into a binary class label.

  Returns 0 when ``x`` equals 'positive' or 'neutral' and 1 for any other
  value (including unknown strings and None).
  """
  non_negative = ('positive', 'neutral')
  return 0 if x in non_negative else 1

In [5]:
pip install tweet-preprocessor

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tweet-preprocessor
  Downloading tweet_preprocessor-0.6.0-py3-none-any.whl (27 kB)
Installing collected packages: tweet-preprocessor
Successfully installed tweet-preprocessor-0.6.0


In [6]:
import preprocessor as p
import re

def cleanTweet(tweets, idx):
  """Clean raw tweet texts and return them as an indexed Series.

  Every '#' and '@' character is removed (the hashtag / mention word itself is
  kept), then the text is passed through ``p.clean`` with only the URL option
  enabled.

  Args:
    tweets: iterable of tweet strings.
    idx: index for the returned Series; must have one label per tweet.

  Returns:
    pd.Series holding the cleaned strings, indexed by ``idx``.
  """
  # The preprocessor options are global and do not depend on the tweet, so
  # configure them once instead of on every loop iteration.
  p.set_options(p.OPT.URL)

  # One regex pass drops both marker characters before the preprocessor runs.
  texts = [p.clean(re.sub('[#@]', '', text)) for text in tweets]

  return pd.Series(data=texts, index=idx)


# Load Datasets

In [30]:
# Read the svq tweets, keep only the tweet body (indexed by tweet id, column
# renamed to 'text'), then replace the raw text with its cleaned version.
df_svq = (
    pd.read_csv("/content/drive/MyDrive/Colab Notebooks/twNLP/data/input/tweets_svq.csv")
    .set_index('id')[['content']]
    .rename(columns={'content': 'text'})
)
df_svq['text'] = cleanTweet(df_svq['text'], df_svq.index)
df_svq.head()

Unnamed: 0_level_0,text
id,Unnamed: 1_level_1
1523089210047344640,Termina la feria de Abril pero chavalería se v...
1523088314697494529,Yo no había visto jamás a tanta gente que cono...
1523086850583580672,"La Feria de Abril ha acabado, que cosas critic..."
1523085987911733248,Sevilla siempre sale victoriosa. Tras dos larg...
1523084505380237315,Illo pa que feria de abril si me lo paso mejor...


In [31]:
# Read the mlg tweets, keep only the tweet body (indexed by tweet id, column
# renamed to 'text'), then replace the raw text with its cleaned version.
df_mlg = (
    pd.read_csv("/content/drive/MyDrive/Colab Notebooks/twNLP/data/input/tweets_mlg.csv")
    .set_index('id')[['content']]
    .rename(columns={'content': 'text'})
)
df_mlg['text'] = cleanTweet(df_mlg['text'], df_mlg.index)
df_mlg.head()

Unnamed: 0_level_0,text
id,Unnamed: 1_level_1
1561140591320260609,Nuevo cierre de una caseta en el Real en el úl...
1561140440426061826,Dos hermanos denuncian la agresión de otro jov...
1561140425376808960,"Hoy en la feria de Málaga. Ánimo Andres, se pu..."
1561139949973405697,no se me ocurren unas mejore vacacione que la ...
1561139807484518402,harryelsocio Fui a la feria de Malaga. Parecía...


# Import BERT Model and BERT Tokenizer

In [33]:
# Checkpoint shared by the encoder and its tokenizer; naming it once keeps the
# two from drifting apart.
BERT_CHECKPOINT = "dccuchile/bert-base-spanish-wwm-uncased"

# import BERT-base pretrained model; return_dict=False makes the model return
# a plain tuple from its forward pass
bert = AutoModel.from_pretrained(BERT_CHECKPOINT, return_dict=False)

# Load the BERT tokenizer (return_dict is a model option, so it is not passed here)
tokenizer = BertTokenizerFast.from_pretrained(BERT_CHECKPOINT)

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['bert.pooler.dens

# Tokenization

In [35]:
# Token length used as max_length by the tokenizer calls in this notebook:
# every tweet is padded or truncated to this many tokens.
# NOTE(review): presumably this must match the value used when the saved
# weights were trained — confirm against the training notebook.
max_seq_len = 30

In [36]:
def _encode_tweets(texts):
    """Tokenize a Series of tweet texts into fixed-length id / mask lists.

    Every sequence is truncated or padded to ``max_seq_len`` tokens and token
    type ids are omitted. ``padding='max_length'`` is the current spelling of
    the deprecated ``pad_to_max_length=True``.
    """
    return tokenizer.batch_encode_plus(
        texts.tolist(),
        max_length=max_seq_len,
        padding='max_length',
        truncation=True,
        return_token_type_ids=False,
    )


# tokenize and encode sequences in the svq set
tokens_svq = _encode_tweets(df_svq.text)

# tokenize and encode sequences in the mlg set
tokens_mlg = _encode_tweets(df_mlg.text)



# Convert Integer Sequences to Tensors

In [37]:
# svq set: token ids and attention mask as tensors
svq_seq, svq_mask = (
    torch.tensor(tokens_svq[field]) for field in ('input_ids', 'attention_mask')
)

# mlg set: token ids and attention mask as tensors
mlg_seq, mlg_mask = (
    torch.tensor(tokens_mlg[field]) for field in ('input_ids', 'attention_mask')
)

# Freeze BERT Parameters

In [38]:
# Turn off gradient tracking for every encoder weight.
for frozen_weight in bert.parameters():
    frozen_weight.requires_grad = False

# Define Model Architecture

In [39]:
class BERT_Arch(nn.Module):
    """Two-class classification head on top of a BERT encoder.

    The encoder's second output (the pooled representation) is passed through
    Linear -> ReLU -> Dropout -> Linear -> LogSoftmax.

    Args:
        bert: encoder module called as ``bert(sent_id, attention_mask=mask)``.
            Element ``[1]`` of its output is used as the classifier input.
            If it exposes ``config.hidden_size`` that width is used for the
            first dense layer; otherwise 768 is assumed.
    """

    def __init__(self, bert):
        super().__init__()

        self.bert = bert

        # Width of the encoder output fed to fc1; fall back to 768 when the
        # encoder carries no config object.
        config = getattr(bert, 'config', None)
        hidden_size = getattr(config, 'hidden_size', 768)

        # dropout layer
        self.dropout = nn.Dropout(0.1)

        # relu activation function
        self.relu = nn.ReLU()

        # dense layer 1
        self.fc1 = nn.Linear(hidden_size, 512)

        # dense layer 2 (Output layer)
        self.fc2 = nn.Linear(512, 2)

        # log-softmax over the class dimension
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        """Return per-class log-probabilities for a batch.

        Args:
            sent_id: token id tensor passed to the encoder.
            mask: attention mask tensor passed to the encoder.

        Returns:
            Tensor with one row per input sequence and 2 columns
            (log-probabilities, normalised along dim 1).
        """
        # Index the output instead of tuple-unpacking it: indexing works for a
        # plain tuple and for dict-style outputs that also support integer
        # indexing, whereas unpacking a dict-style output yields its keys.
        outputs = self.bert(sent_id, attention_mask=mask)
        cls_hs = outputs[1]

        x = self.fc1(cls_hs)
        x = self.relu(x)
        x = self.dropout(x)

        # output layer
        x = self.fc2(x)

        # convert scores to log-probabilities
        return self.softmax(x)

In [40]:
# Wrap the pre-trained encoder in the classifier architecture and move the
# resulting model onto the selected device in one step.
model = BERT_Arch(bert).to(device)

In [41]:
# define the optimizer
# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are set explicitly to the values transformers.AdamW used by
# default, so the update rule is unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, eps=1e-6, weight_decay=0.0)



# Load Saved Model

In [42]:
#load weights of best model
path = '/content/drive/MyDrive/Colab Notebooks/twNLP/data/output/saved_weights.pt'
# map_location moves tensors that were saved on another device (e.g. a GPU)
# onto `device`, so the checkpoint also loads on a CPU-only runtime.
# NOTE: torch.load unpickles the file — only load checkpoints you trust.
model.load_state_dict(torch.load(path, map_location=device))

<All keys matched successfully>

# Get Predictions for the svq and mlg Tweets

In [46]:
def _predict_classes(seq, mask, batch_size=64):
    """Return the predicted class index for every row of ``seq``.

    Args:
        seq: tensor of token ids, one row per tweet.
        mask: attention mask tensor with the same leading dimension as ``seq``.
        batch_size: rows sent through the model per forward pass; bounds the
            memory needed regardless of how many tweets there are.

    Returns:
        1-D numpy array of class indices (argmax over the model output).
    """
    # Switch dropout layers to inference behaviour; torch.no_grad() alone does
    # not do this, and with dropout active the predictions vary between runs.
    model.eval()

    batch_preds = []
    with torch.no_grad():
        for start in range(0, seq.shape[0], batch_size):
            stop = start + batch_size
            log_probs = model(seq[start:stop].to(device), mask[start:stop].to(device))
            batch_preds.append(np.argmax(log_probs.cpu().numpy(), axis=1))

    if not batch_preds:
        return np.empty(0, dtype=np.int64)
    return np.concatenate(batch_preds)


# get predictions for the svq and mlg tweets
preds_svq = _predict_classes(svq_seq, svq_mask)
preds_mlg = _predict_classes(mlg_seq, mlg_mask)

In [50]:
# Save predictions to CSV files: each prediction array becomes a one-column
# frame ('class') keyed by the tweet ids of its source frame.

# svq
preds_svq = pd.DataFrame(preds_svq, index=df_svq.index, columns=['class'])
preds_svq.to_csv('/content/drive/MyDrive/Colab Notebooks/twNLP/data/output/preds_svq.csv')

# mlg
preds_mlg = pd.DataFrame(preds_mlg, index=df_mlg.index, columns=['class'])
preds_mlg.to_csv('/content/drive/MyDrive/Colab Notebooks/twNLP/data/output/preds_mlg.csv')