In [1]:
!pip install transformers
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn.functional as F
from transformers import BertTokenizer, BertConfig,AdamW, BertForSequenceClassification,get_linear_schedule_with_warmup
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import accuracy_score,matthews_corrcoef
from tqdm import tqdm, trange,tnrange,tqdm_notebook
import random
import os
import io
#% matplotlib inline

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)
SEED = 19
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if device == torch.device("cuda"):
    torch.cuda.manual_seed_all(SEED)

In [3]:
from google.colab import drive
drive.mount('drive')

Drive already mounted at drive; to attempt to forcibly remount, call drive.mount("drive", force_remount=True).


In [5]:
df_train = pd.read_csv('/content/drive/MyDrive/ColabNotebooks/dataset/SMSSpamColl.csv')

In [6]:
df_train.isnull().sum()

label    0
text     0
dtype: int64

In [7]:
df_train.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
df_train['label'].unique()

array(['ham', 'spam'], dtype=object)

In [9]:
df_train['label'].value_counts()

ham     4827
spam     747
Name: label, dtype: int64

In [10]:
df_train = df_train[~df_train['label'].isnull()]

In [11]:
def clean_text(text):
    import re
    from string import punctuation
    text=re.sub(r'(http|ftp|https):\/\/([\w\-_]+(?:(?:\.[\w\-_]+)+))([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?', 
                ' ', text)
    text=re.sub(r'['+punctuation+']',' ',text)
    text=re.sub(r'#(\w+)',' ',text)
    text=re.sub(r'@(\w+)',' ',text)
    text = text.lower() # Convert  to lowercase

    token=RegexpTokenizer(r'\w+')
    tokens = token.tokenize(text)

    lemmatizer = WordNetLemmatizer()
    stems = [lemmatizer.lemmatize(t) for t in tokens]
    stemmer = PorterStemmer()
    stems = [stemmer.stem(t) for t in stems]
    
    return ' '.join(stems)

def tokenize(text):
    token=RegexpTokenizer(r'\w+')
    tokens = token.tokenize(text)
    
    return tokens

In [12]:
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
df_train['label_1'] = labelencoder.fit_transform(df_train['label'])

In [13]:
df_train[['label','label_1']].drop_duplicates(keep='first')

Unnamed: 0,label,label_1
0,ham,0
2,spam,1


In [14]:
df_train.rename(columns={'label_1':'sentiment'},inplace=True)

In [15]:
## create label and sentence list
sentences = df_train.text.values
print("Distribution of data based on labels: ",df_train.label.value_counts())
MAX_LEN = 256
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',do_lower_case=True)

Distribution of data based on labels:  ham     4827
spam     747
Name: label, dtype: int64


In [16]:
df_train.head()

Unnamed: 0,label,text,sentiment
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [16]:
input_ids = [tokenizer.encode(sent, add_special_tokens=True,max_length=MAX_LEN,padding='longest',truncation=True) for sent in sentences]

In [25]:
labels = df_train.sentiment.values

print("Actual sentence before tokenization: ",sentences[2])
print("Encoded Input from dataset: ",input_ids[2])

attention_masks = []
## Create a mask of 1 for all input tokens and 0 for all padding tokens
attention_masks = [[float(i>0) for i in seq] for seq in input_ids]
print(attention_masks[2])

Actual sentence before tokenization:  Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
Encoded Input from dataset:  [101, 2489, 4443, 1999, 1016, 1037, 1059, 2243, 2135, 4012, 2361, 2000, 2663, 6904, 2452, 2345, 1056, 25509, 2015, 7398, 2089, 2384, 1012, 3793, 6904, 2000, 6584, 12521, 2487, 2000, 4374, 4443, 3160, 1006, 2358, 2094, 19067, 2102, 3446, 1007, 1056, 1004, 1039, 1005, 1055, 6611, 5511, 19961, 22407, 18613, 23352, 7840, 15136, 1005, 1055, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [26]:
train_inputs,validation_inputs,train_labels,validation_labels = train_test_split(input_ids,labels,random_state=41,test_size=0.1)
train_masks,validation_masks,_,_ = train_test_split(attention_masks,input_ids,random_state=41,test_size=0.1)

In [27]:
from torch.nn.utils.rnn import pad_sequence

input_ids = [tokenizer.encode(sent, add_special_tokens=True, max_length=MAX_LEN, padding='max_length', truncation=True) for sent in sentences]

train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, random_state=41, test_size=0.1)
train_idx, validation_idx, _, _ = train_test_split(range(len(input_ids)), input_ids, random_state=41, test_size=0.1)
train_masks = torch.tensor([attention_masks[i] for i in train_idx]).unsqueeze(1)
validation_masks = torch.tensor([attention_masks[i] for i in validation_idx]).unsqueeze(1)

# Convert the input sequences to PyTorch tensors
train_inputs = [torch.tensor(seq) for seq in train_inputs]
validation_inputs = [torch.tensor(seq) for seq in validation_inputs]

# Pad the input sequences to the same length
train_inputs = pad_sequence(train_inputs, batch_first=True)
validation_inputs = pad_sequence(validation_inputs, batch_first=True)

# Convert all our data into torch tensors, required data type for our model
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)


In [28]:
batch_size = 32

train_data = TensorDataset(train_inputs,train_masks,train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data,sampler=train_sampler,batch_size=batch_size)

validation_data = TensorDataset(validation_inputs,validation_masks,validation_labels)
validation_sampler = RandomSampler(validation_data)
validation_dataloader = DataLoader(validation_data,sampler=validation_sampler,batch_size=batch_size)

In [29]:
# import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [30]:
# Load BertForSequenceClassification
num_labels = 6
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels).to(device)

lr = 2e-5
adam_epsilon = 1e-8
epochs = 3

num_warmup_steps = 0
num_training_steps = len(train_dataloader)*epochs

optimizer = AdamW(model.parameters(), lr=lr,eps=adam_epsilon,correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)  # PyTorch scheduler

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [31]:
train_data[0]

(tensor([  101,  1000,  2562, 24471,  3471,  1999, 24471,  2540,  1010,  1038,
          1005,  2522,  2480,  6343,  2097,  2954,  2005,  1057,  1012,  2069,
          1057,  1004, 23713,  1025,  1057,  2031,  2000,  2954,  2005, 24471,
          2969,  1004, 23713,  1025,  2663,  1996,  2645,  1012,  1011,  6819,
          3726,  9126,  5685,  1011,  1043,  1023,  2102,  1012,  1012, 17371,
          1012,  1012,   102,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

In [32]:
## Store our loss and accuracy for plotting
train_loss_set = []
learning_rate = []

# Gradients gets accumulated by default
model.zero_grad()

# tnrange is a tqdm wrapper around the normal python range
for _ in tnrange(1,epochs+1,desc='Epoch'):
  print("<" + "="*22 + F" Epoch {_} "+ "="*22 + ">")
  # Calculate total loss for this epoch
  batch_loss = 0

  for step, batch in enumerate(train_dataloader):
    # Set our model to training mode (as opposed to evaluation mode)
    model.train()
    
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch

    # Forward pass
    outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
    loss = outputs[0]
    
    # Backward pass
    loss.backward()
    
    # Clip the norm of the gradients to 1.0
    # Gradient clipping is not in AdamW anymore
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    
    # Update parameters and take a step using the computed gradient
    optimizer.step()
    
    # Update learning rate schedule
    scheduler.step()

    # Clear the previous accumulated gradients
    optimizer.zero_grad()
    
    # Update tracking variables
    batch_loss += loss.item()

  # Calculate the average loss over the training data.
  avg_train_loss = batch_loss / len(train_dataloader)

  #store the current learning rate
  for param_group in optimizer.param_groups:
    print("\n\tCurrent Learning rate: ",param_group['lr'])
    learning_rate.append(param_group['lr'])
    
  train_loss_set.append(avg_train_loss)
  print(F'\n\tAverage Training loss: {avg_train_loss}')
    
  # Validation

  # Put model in evaluation mode to evaluate loss on the validation set
  model.eval()

  # Tracking variables 
  eval_accuracy,eval_mcc_accuracy,nb_eval_steps = 0, 0, 0

  # Evaluate data for one epoch
  for batch in validation_dataloader:
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # Telling the model not to compute or store gradients, saving memory and speeding up validation
    with torch.no_grad():
      # Forward pass, calculate logit predictions
      logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    
    # Move logits and labels to CPU
    logits = logits[0].to('cpu').numpy()
    label_ids = b_labels.to('cpu').numpy()

    pred_flat = np.argmax(logits, axis=1).flatten()
    labels_flat = label_ids.flatten()
    
    df_metrics=pd.DataFrame({'Epoch':epochs,'Actual_class':labels_flat,'Predicted_class':pred_flat})
    
    tmp_eval_accuracy = accuracy_score(labels_flat,pred_flat)
    tmp_eval_mcc_accuracy = matthews_corrcoef(labels_flat, pred_flat)
    
    eval_accuracy += tmp_eval_accuracy
    eval_mcc_accuracy += tmp_eval_mcc_accuracy
    nb_eval_steps += 1

  print(F'\n\tValidation Accuracy: {eval_accuracy/nb_eval_steps}')
  print(F'\n\tValidation MCC Accuracy: {eval_mcc_accuracy/nb_eval_steps}')

  for _ in tnrange(1,epochs+1,desc='Epoch'):


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]


	Current Learning rate:  1.3333333333333333e-05

	Average Training loss: 0.09978802257534829

	Validation Accuracy: 0.9774305555555556

	Validation MCC Accuracy: 0.8977642911135971

	Current Learning rate:  6.666666666666667e-06

	Average Training loss: 0.013516170360150919

	Validation Accuracy: 0.9791666666666666

	Validation MCC Accuracy: 0.9151335396790109

	Current Learning rate:  0.0

	Average Training loss: 0.004251219182806149

	Validation Accuracy: 0.9747023809523809

	Validation MCC Accuracy: 0.895914434304506


In [33]:
from sklearn.metrics import confusion_matrix,classification_report
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    import itertools
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

In [34]:
## emotion labels
label2int = {
  "ham": 0,
  "spam": 1
}

In [35]:
print(classification_report(df_metrics['Actual_class'].values, df_metrics['Predicted_class'].values, target_names=label2int.keys(), digits=2))


              precision    recall  f1-score   support

         ham       0.91      0.91      0.91        11
        spam       0.67      0.67      0.67         3

    accuracy                           0.86        14
   macro avg       0.79      0.79      0.79        14
weighted avg       0.86      0.86      0.86        14



In [37]:
model_save_folder = 'model/'
tokenizer_save_folder = 'tokenizer/'

path_model = 'deep_learning/' + model_save_folder
path_tokenizer = 'deep_learning/' + tokenizer_save_folder

##create the dir
os.makedirs(path_model, exist_ok=True)
os.makedirs(path_tokenizer, exist_ok=True)

### Now let's save our model and tokenizer to a directory
model.save_pretrained(path_model)
tokenizer.save_pretrained(path_tokenizer)

model_save_name = 'fineTuneModel.pt'
path = path_model + model_save_name
torch.save(model.state_dict(), path)


In [43]:
# Load the saved model and tokenizer
path_model = 'deep_learning/model/'
path_tokenizer = 'deep_learning/tokenizer/'
model = BertForSequenceClassification.from_pretrained(path_model)
tokenizer = BertTokenizer.from_pretrained(path_tokenizer)

# Load the test data
df_val = pd.read_csv('/content/drive/MyDrive/ColabNotebooks/dataset/spamtest.csv')

# Tokenize the input text
test_inputs = tokenizer(list(df_val['text']), padding=True, truncation=True, max_length=128, return_tensors='pt')

# Make predictions
with torch.no_grad():
    model.eval()
    outputs = model(test_inputs['input_ids'], token_type_ids=None, attention_mask=test_inputs['attention_mask'])
    logits = outputs[0].detach().cpu().numpy()
    predictions = logits.argmax(axis=-1)

# Map the predicted labels back to their original names
int2label = {0: 'ham', 1: 'spam'}
predicted_labels = [int2label[p] for p in predictions]

# Add the predicted labels to the test dataframe
df_val['sentiment'] = predicted_labels

# Save the predictions to a file
df_val.to_csv('/content/drive/MyDrive/ColabNotebooks/dataset/spamtest_predictions.csv', index=False)


In [44]:
df_val.head()

Unnamed: 0,text,sentiment
0,you awarded lucky prize USD 80000,spam
1,Call this number to won the money,ham
2,Congratulation! You got cash USD 90000,ham
