In [26]:
import os
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
    
%matplotlib inline
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [27]:
data_IMDB = pd.read_csv('/content/IMDB Dataset.csv')
data_IMDB['sentiment'][data_IMDB['sentiment'] == 'positive'] = 1
data_IMDB['sentiment'][data_IMDB['sentiment'] == 'negative'] = 0
training_size = 3000
testing_size = 100
train_data_IMDB = data_IMDB.iloc[:training_size, :]
test_data_IMDB = data_IMDB.iloc[training_size : training_size + testing_size, :]
print(train_data_IMDB.shape, test_data_IMDB.shape)

(3000, 2) (100, 2)


In [28]:
from sklearn.model_selection import train_test_split

X_IMDB = train_data_IMDB['review'].values
y_IMDB = train_data_IMDB['sentiment'].values

X_train_IMDB, X_val_IMDB, y_train_IMDB, y_val_IMDB = train_test_split(X_IMDB, y_IMDB, test_size=0.1, random_state=2020)

y_train_IMDB = y_train_IMDB.astype(np.float64)
y_val_IMDB = y_val_IMDB.astype(np.float64)

In [29]:
import torch

if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: Tesla P100-PCIE-16GB


In [30]:
import nltk
# Uncomment to download "stopwords"
# nltk.download("stopwords")
from nltk.corpus import stopwords

def text_preprocessing(s):
    """
    - Lowercase the sentence
    - Change "'t" to "not"
    - Remove "@name"
    - Isolate and remove punctuations except "?"
    - Remove other special characters
    - Remove stop words except "not" and "can"
    - Remove trailing whitespace
    """
    s = s.lower()
    # Change 't to 'not'
    s = re.sub(r"\'t", " not", s)
    # Remove @name
    s = re.sub(r'(@.*?)[\s]', ' ', s)
    # Isolate and remove punctuations except '?'
    s = re.sub(r'([\'\"\.\(\)\!\?\\\/\,])', r' \1 ', s)
    s = re.sub(r'[^\w\s\?]', ' ', s)
    # Remove some special characters
    s = re.sub(r'([\;\:\|•«\n])', ' ', s)
    # Remove stopwords except 'not' and 'can'
    s = " ".join([word for word in s.split()
                  if word not in stopwords.words('english')
                  or word in ['not', 'can']])
    # Remove trailing whitespace
    s = re.sub(r'\s+', ' ', s).strip()
    
    return s

In [31]:
from transformers import BertTokenizer

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Create a function to tokenize a set of texts
def preprocessing_for_bert(data):
    """Perform required preprocessing steps for pretrained BERT.
    @param    data (np.array): Array of texts to be processed.
    @return   input_ids (torch.Tensor): Tensor of token ids to be fed to a model.
    @return   attention_masks (torch.Tensor): Tensor of indices specifying which
                  tokens should be attended to by the model.
    """
    # Create empty lists to store outputs
    input_ids = []
    attention_masks = []

    # For every sentence...
    for sent in data:
        # `encode_plus` will:
        #    (1) Tokenize the sentence
        #    (2) Add the `[CLS]` and `[SEP]` token to the start and end
        #    (3) Truncate/Pad sentence to max length
        #    (4) Map tokens to their IDs
        #    (5) Create attention mask
        #    (6) Return a dictionary of outputs
        encoded_sent = tokenizer.encode_plus(
            text=text_preprocessing(sent),  # Preprocess sentence
            add_special_tokens=True,        # Add `[CLS]` and `[SEP]`
            max_length=MAX_LEN,                  # Max length to truncate/pad
            pad_to_max_length=True,         # Pad sentence to max length
            #return_tensors='pt',           # Return PyTorch tensor
            return_attention_mask=True      # Return attention mask
            )
        
        # Add the outputs to the lists
        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))

    # Convert lists to tensors
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)

    return input_ids, attention_masks

In [32]:
# Encode our concatenated data
encoded_reviews_IMDB = [tokenizer.encode(sent, max_length=512, truncation=True, add_special_tokens=True) for sent in X_IMDB]
# Find the maximum length
max_len = max([len(sent) for sent in encoded_reviews_IMDB])
print('Max length: ', max_len)

Max length:  512


In [33]:
# Specify `MAX_LEN`
MAX_LEN = 512

# Print sentence 0 and its encoded token ids
token_ids_IMDB = list(preprocessing_for_bert([X_IMDB[0]])[0].squeeze().numpy())
print('Original: ', X_IMDB[0])
print('Token IDs: ', token_ids_IMDB)

# Run function `preprocessing_for_bert` on the train set and the validation set
print('Tokenizing data...')
train_inputs_IMDB, train_masks_IMDB = preprocessing_for_bert(X_train_IMDB)
val_inputs_IMDB, val_masks_IMDB = preprocessing_for_bert(X_val_IMDB)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Original:  One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due



In [34]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Convert other data types to torch.Tensor
train_labels_IMDB = torch.tensor(y_train_IMDB)
train_labels_IMDB = train_labels_IMDB.type(torch.LongTensor)
val_labels_IMDB = torch.tensor(y_val_IMDB)
val_labels_IMDB = val_labels_IMDB.type(torch.LongTensor)

# For fine-tuning BERT, the authors recommend a batch size of 16 or 32.
batch_size = 16

# Create the DataLoader for our training set
train_data_IMDB = TensorDataset(train_inputs_IMDB, train_masks_IMDB, train_labels_IMDB)
train_sampler_IMDB = RandomSampler(train_data_IMDB)
train_dataloader_IMDB = DataLoader(train_data_IMDB, sampler=train_sampler_IMDB, batch_size=batch_size)

# Create the DataLoader for our validation set
val_data_IMDB = TensorDataset(val_inputs_IMDB, val_masks_IMDB, val_labels_IMDB)
val_sampler_IMDB = SequentialSampler(val_data_IMDB)
val_dataloader_IMDB = DataLoader(val_data_IMDB, sampler=val_sampler_IMDB, batch_size=batch_size)

In [35]:
%%time
import torch
import torch.nn as nn
from transformers import BertModel

# Create the BertClassfier class
class BertClassifier(nn.Module):
    """Bert Model for Classification Tasks.
    """
    def __init__(self, freeze_bert=False):
        """
        @param    bert: a BertModel object
        @param    classifier: a torch.nn.Module classifier
        @param    freeze_bert (bool): Set `False` to fine-tune the BERT model
        """
        super(BertClassifier, self).__init__()
        # Specify hidden size of BERT, hidden size of our classifier, and number of labels
        D_in, H, D_out = 768, 50, 2

        # Instantiate BERT model
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        # Instantiate an one-layer feed-forward classifier
        self.classifier = nn.Sequential(
            nn.Linear(D_in, H),
            nn.ReLU(),
            #nn.Dropout(0.5),
            nn.Linear(H, D_out)
        )

        # Freeze the BERT model
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
        
    def forward(self, input_ids, attention_mask):
        """
        Feed input to BERT and the classifier to compute logits.
        @param    input_ids (torch.Tensor): an input tensor with shape (batch_size,
                      max_length)
        @param    attention_mask (torch.Tensor): a tensor that hold attention mask
                      information with shape (batch_size, max_length)
        @return   logits (torch.Tensor): an output tensor with shape (batch_size,
                      num_labels)
        """
        # Feed input to BERT
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)
        
        # Extract the last hidden state of the token `[CLS]` for classification task
        last_hidden_state_cls = outputs[0][:, 0, :]

        # Feed input to classifier to compute logits
        logits = self.classifier(last_hidden_state_cls)

        return logits

CPU times: user 44 µs, sys: 0 ns, total: 44 µs
Wall time: 48.9 µs


In [36]:
from transformers import AdamW, get_linear_schedule_with_warmup

def initialize_model(epochs=4):
    """Initialize the Bert Classifier, the optimizer and the learning rate scheduler.
    """
    # Instantiate Bert Classifier
    bert_classifier = BertClassifier(freeze_bert=False)

    # Tell PyTorch to run the model on GPU
    bert_classifier.to(device)

    # Create the optimizer
    optimizer = AdamW(bert_classifier.parameters(),
                      lr=5e-5,    # Default learning rate
                      eps=1e-8    # Default epsilon value
                      )

    # Total number of training steps
    total_steps = len(train_dataloader_IMDB) * epochs

    # Set up the learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0, # Default value
                                                num_training_steps=total_steps)
    return bert_classifier, optimizer, scheduler

In [37]:
import random
import time

# Specify loss function
loss_fn = nn.CrossEntropyLoss()

def set_seed(seed_value=42):
    """Set seed for reproducibility.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

def train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False):
    """Train the BertClassifier model.
    """
    # Start training loop
    print("Start training...\n")
    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================
        # Print the header of the result table
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-"*70)

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts = 0, 0, 0

        # Put the model into the training mode
        model.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            batch_counts +=1
            # Load batch to GPU
            
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            # Zero out any previously calculated gradients
            model.zero_grad()

            # Perform a forward pass. This will return logits.
            logits = model(b_input_ids, b_attn_mask)

            # Compute loss and accumulate the loss values
            loss = loss_fn(logits, b_labels)
            batch_loss += loss.item()
            total_loss += loss.item()

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and the learning rate
            optimizer.step()
            scheduler.step()

            # Print the loss values and time elapsed for every 20 batches
            if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                # Calculate time elapsed for 20 batches
                time_elapsed = time.time() - t0_batch

                # Print training results
                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                # Reset batch tracking variables
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / len(train_dataloader)

        print("-"*70)
        # =======================================
        #               Evaluation
        # =======================================
        if evaluation == True:
            # After the completion of each training epoch, measure the model's performance
            # on our validation set.
            val_loss, val_accuracy = evaluate(model, val_dataloader)

            # Print performance over the entire training data
            time_elapsed = time.time() - t0_epoch
            
            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print("-"*70)
        print("\n")
    
    print("Training complete!")


def evaluate(model, val_dataloader):
    """After the completion of each training epoch, measure the model's performance
    on our validation set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    # Tracking variables
    val_accuracy = []
    val_loss = []

    # For each batch in our validation set...
    for batch in val_dataloader:
        # Load batch to GPU
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)

        # Compute loss
        loss = loss_fn(logits, b_labels)
        val_loss.append(loss.item())

        # Get the predictions
        preds = torch.argmax(logits, dim=1).flatten()

        # Calculate the accuracy rate
        accuracy = (preds == b_labels).cpu().numpy().mean() * 100
        val_accuracy.append(accuracy)

    # Compute the average accuracy and loss over the validation set.
    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_accuracy)

    return val_loss, val_accuracy

In [38]:
set_seed(42)    # Set seed for reproducibility
bert_classifier, optimizer, scheduler = initialize_model(epochs=2)
train(bert_classifier, train_dataloader_IMDB, val_dataloader_IMDB, epochs=2, evaluation=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Start training...

 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------
   1    |   20    |   0.689082   |     -      |     -     |   18.08  
   1    |   40    |   0.536667   |     -      |     -     |   17.20  
   1    |   60    |   0.400937   |     -      |     -     |   17.19  
   1    |   80    |   0.444124   |     -      |     -     |   17.18  
   1    |   100   |   0.333421   |     -      |     -     |   17.20  
   1    |   120   |   0.367048   |     -      |     -     |   17.18  
   1    |   140   |   0.345663   |     -      |     -     |   17.20  
   1    |   160   |   0.296318   |     -      |     -     |   17.21  
   1    |   168   |   0.313264   |     -      |     -     |   6.68   
----------------------------------------------------------------------
   1    |    -    |   0.422843   |  0.311053  |   86.95   |  150.57  
---------------------------------------------------------------------

KeyboardInterrupt: ignored

In [39]:
def predict(model, batch_size, test_data, targets, device='cuda'):
    # Encode our concatenated data
    encoded_data = [tokenizer.encode(sent, max_length=512, truncation=True, add_special_tokens=True) for sent in
                    test_data]
    # Find the maximum length
    max_len = max([len(sent) for sent in encoded_data])
    print('Max length: ', max_len)

    # Specify `MAX_LEN`
    MAX_LEN = 512

    # Print sentence 0 and its encoded token ids
    token_ids_IMDB = list(preprocessing_for_bert([test_data[0]])[0].squeeze().numpy())
    print('Original: ', test_data[0])
    print('Token IDs: ', token_ids_IMDB)

    # Run function `preprocessing_for_bert` on the train set and the validation set
    print('Tokenizing data...')
    test_inputs, test_masks = preprocessing_for_bert(test_data)

    # Convert other data types to torch.Tensor
    targets = targets.astype(np.float64)
    tagets = torch.tensor(targets)
    tagets = tagets.type(torch.LongTensor)

    test_inputs, test_masks, tagets, model = test_inputs.to(device), test_masks.to(device), tagets.to(device), model.to(device)

    with torch.no_grad():
        logits = model(test_inputs, test_masks)
        preds = torch.argmax(logits, dim=1).flatten()
        preds = preds.cpu()

        # Calculate the accuracy rate
        accuracy = np.mean(preds.numpy() == targets) * 100
        print(f"The Accuracy is {accuracy}\n")
        print(f"-------------------------------------------------\n")

    

In [40]:
predict(bert_classifier, 64, test_data_IMDB['review'].values, test_data_IMDB['sentiment'].values, device='cuda')

Max length:  512
Original:  I bought a set of 4 DVDs for 10 bucks at my local Suncoast, which contained this movie and three other trashy horror flicks (including its sequel "Witchcraft XI"). So basically I paid the rock bottom price of $2.50 for this movie, if you do the math. I can't exactly say I was ripped off. I have a thing for trashy horror movies, but this is the kind of trash that gives trash a bad name. The budget couldn't be over $1,000 (though it appears as if they spent a total of $1.50). I know it's a low-budget film, but that's no excuse for totally uninspired camerawork. The film "Blood Cult," though not very good, was made for an extremely low budget and still had fairly good camerawork and acting. The acting in this movie is the definition of "effortless," especially from that muscular guy with the Texas accent. Everyone is pretty much reading their lines off the page. You can take that figuratively or literally. I wouldn't be surprised if the script was off-camera as



tensor([0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1,
        1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1,
        1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0,
        0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0,
        0, 1, 1, 1]) [0. 0. 1. 0. 1. 1. 1. 0. 0. 1. 0. 0. 0. 1. 0. 0. 1. 1. 1. 0. 0. 0. 0. 1.
 1. 0. 0. 1. 1. 1. 1. 1. 0. 0. 1. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0. 1.
 1. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 1. 1. 1. 0. 1. 1. 1. 0. 0. 1. 0. 0.
 1. 1. 1. 1. 1. 0. 0. 1. 0. 1. 1. 1. 0. 1. 1. 1. 0. 1. 1. 1. 1. 0. 1. 1.
 1. 1. 1. 1.]
The Accuracy is 85.0

-------------------------------------------------



In [71]:
data_syncrony = pd.read_excel('/content/Day2.xlsx')
data_syncrony = data_syncrony

In [93]:
sample = [data_syncrony['INPUT'][430]]
sample

["Hi I'm just getting home from work.  We are under flood advisory and all streets are flooded coming home.  What took a 15min drive took 2hrs.  I'm trying to make the payment but now it's considered"]

In [94]:

test_inputs, test_masks = preprocessing_for_bert(sample)
test_inputs, test_masks = test_inputs.to('cuda'), test_masks.to('cuda')
with torch.no_grad():
    logits = bert_classifier(test_inputs, test_masks)
    preds = torch.argmax(logits, dim=1).flatten()
    print(preds)

tensor([0], device='cuda:0')




In [95]:
! git clone https://github.com/haniaa2/Datathon2022.git

Cloning into 'Datathon2022'...
fatal: could not read Username for 'https://github.com': No such device or address
