#Set up Google Drive



In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Helper Functions

In [5]:
from torch.utils.data import DataLoader,TensorDataset
from torch.nn.utils.rnn import pad_sequence

# Pad inputs and NER sequences with 0 and 9 respectively  
def pad_and_collate_sequences(batch):
    inputs, labels = zip(*batch)
    lengths = torch.tensor([len(sequence) for sequence in inputs])
    padded_inputs = pad_sequence([torch.tensor(sequence) for sequence in inputs], batch_first=True, padding_value=0)
    padded_labels = pad_sequence([torch.tensor(label_sequence) for label_sequence in labels], batch_first=True, padding_value=9)
    return padded_inputs, padded_labels, lengths

In [6]:
# Generate the input file for Perl eval script
def generate_eval_input(pred_list):
    input_file = '/content/drive/MyDrive/Dataset/given/dev'
    output_file = '/content/drive/MyDrive/Dataset/perl_input.txt'
    
    with open(input_file, 'r') as in_file, open(output_file, 'w') as out_file:
        sentence_index = 0
        word_index = 0
    
        while True:
            line = in_file.readline()
            # if eof
            if not line:                         
                break
            
            # if not blank line
            if line.strip(): 
                label_num =  pred_list[sentence_index][word_index]                
                out_file.write(line.strip() + " " + REVERSE_NER_LABELS[label_num] + "\n")           
                word_index += 1
         
            # if blank line      
            else:
                out_file.write("\n")
                sentence_index += 1
                word_index = 0

In [23]:
import subprocess
# Run Perl eval script using cmd
def run_perl_script():
    input_file = "/content/drive/MyDrive/Dataset/perl_input.txt"
    # Proxy file
    output_file = "/content/drive/MyDrive/Dataset/eval_buffer.txt"
    perl_script = "/content/drive/MyDrive/Dataset/conll03eval.txt"
    # perl conll03eval.txt < predict_dev.txt
    command = f"perl {perl_script} < {input_file} > {output_file}"
    # This line might not work for Operating Sytem other than Window
    subprocess.run(command, shell=True, check=True)

# Extract the F-Score from the Perl eval script
def get_f_score():
    output_file = "/content/drive/MyDrive/Dataset/eval_buffer.txt"
    with open(output_file, "r") as f:
        lines = f.readlines()
    
        # Find the line with the overall FB1 value
        for line in lines:
            if "FB1" in line and "accuracy" in line:
                fb1_value = float(line.split()[-1])
                print("FB1:", fb1_value)
                return fb1_value

# Use GPU if Avaliable

In [8]:
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")
!nvidia-smi

Using cpu device
NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



# Task 1

### Encode/Decode NER labels

In [9]:
NER_LABELS = {
    'B-ORG': 0,
    'O': 1,
    'B-MISC': 2,
    'B-PER': 3, 
    'I-PER': 4,
    'B-LOC': 5,
    'I-ORG': 6,
    'I-MISC': 7,
    'I-LOC': 8
}

REVERSE_NER_LABELS = {v: k for k, v in NER_LABELS.items()}

### Preprocessing Train Data

In [10]:
vocab = {}
train_sentences = []
train_sentences_NER = []
NER_lables_counter = {}
Total_NER_labels = 0

# Create vocab, represent sentences and NER labels with unique integers, and 
# calculate class weights (train data)
with open('/content/drive/MyDrive/Dataset/given/train', 'r') as train_file:
    vocab_unique_val = 1
    sentence_buffer = []
    NER_seq_buffer = []
    
    while True:
        line = train_file.readline()
        # if eof
        if not line:
            # edge case
            indexed_sentence = [vocab[word] for word in sentence_buffer]
            encoded_NER_seq = [NER_LABELS[label] for label in NER_seq_buffer]
            train_sentences.append(indexed_sentence)
            train_sentences_NER.append(encoded_NER_seq)
            break
        # if not blank line
        if line.strip():
            word = line.split(" ")[1].strip()          
            label = line.split(" ")[2].strip()

            # build the vocab
            if word not in vocab:
                vocab[word] = vocab_unique_val
                vocab_unique_val += 1                           

            if label not in NER_lables_counter:
                NER_lables_counter[label] = 1
            else:
                NER_lables_counter[label] = NER_lables_counter[label] + 1

            sentence_buffer.append(word)
            NER_seq_buffer.append(label)

            Total_NER_labels += 1
        # if blank line      
        else:
            indexed_sentence = [vocab[word] for word in sentence_buffer]
            encoded_NER_seq = [NER_LABELS[label] for label in NER_seq_buffer]
            train_sentences.append(indexed_sentence)
            train_sentences_NER.append(encoded_NER_seq)
            sentence_buffer = []
            NER_seq_buffer = []

# Calculate class frequencies
class_frequencies = {}
for label, count in NER_lables_counter.items():
    class_frequencies[label] = count / Total_NER_labels

# Calculate class weights
class_weights = {}
for label, freq in class_frequencies.items():
    class_weights[label] = 1 / freq

print(train_sentences)
print(train_sentences_NER)
print(class_weights)
print(len(vocab))

[[1, 2, 3, 4, 5, 6, 7, 8, 9], [10, 11], [12, 13], [14, 15, 16, 17, 18, 19, 20, 21, 22, 3, 23, 5, 24, 5, 25, 7, 8, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 5, 36, 9], [37, 38, 39, 5, 40, 15, 41, 38, 42, 43, 44, 45, 17, 18, 46, 24, 47, 48, 49, 50, 51, 52, 53, 54, 26, 40, 55, 23, 56, 57, 9], [58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 60, 61, 68, 63, 69, 70, 20, 71, 58, 40, 16, 38, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 9], [82, 17, 83, 55, 84, 56, 85, 86, 87, 20, 56, 88, 89, 90, 56, 91, 20, 47, 34, 92, 93, 40, 15, 41, 9], [82, 17, 79, 94, 95, 96, 93, 1, 97, 98, 99, 100, 5, 101, 36, 102, 71, 103, 86, 104, 105, 50, 40, 106, 86, 107, 108, 109, 56, 79, 110, 111, 86, 112, 113, 5, 114, 106, 115, 9], [100, 116, 117, 118, 119, 120, 50, 54, 86, 121, 89, 122, 123, 124, 36, 125, 126, 127, 128, 129, 130, 131, 132, 133, 30, 31, 32, 9], [134, 100, 135, 5, 136, 137, 94, 119, 40, 1, 38, 138, 42, 43, 71, 139, 107, 115, 140, 71, 141, 87, 64, 90, 56, 142, 143, 144, 56, 145, 79, 146, 147, 5, 106, 115, 9], [148

### Preprocessing Dev Data & Test Data

In [11]:
import os

# Represent sentences with unique integers
def load_and_transform_input(input_file) : 
    # unneeded NER labels
    BUFFER_NER = -100
    
    sentences = []
    sentences_NER = []  
    
    file_name = ""
    if input_file == "dev":
        file_name = "dev"
    elif input_file == "test":
        file_name = "test"
    
    with open(os.path.join('/content/drive/MyDrive/Dataset/given', file_name), 'r') as f:
        sentence_buffer = []
        NER_seq_buffer = []

        while True:
            line = f.readline()
            # if eof
            if not line:
                # edge case    
                indexed_sentence = [vocab[word] if word in vocab else len(vocab) + 1 for word in sentence_buffer]           
                sentences.append(indexed_sentence)   

                if input_file == "test":
                    sentences_NER.append(NER_seq_buffer)
                elif input_file == "dev":
                    encoded_NER_seq = [NER_LABELS[label] for label in NER_seq_buffer]    
                    sentences_NER.append(encoded_NER_seq)        
                      
                break
            # if not blank line
            if line.strip():
                word = line.split(" ")[1].strip()            
                sentence_buffer.append(word)
                
                if input_file == "test":
                    NER_seq_buffer.append(BUFFER_NER)
                elif input_file == "dev":
                    labels = line.split(" ")[2].strip() 
                    NER_seq_buffer.append(labels)    
            # if blank line      
            else:       
                indexed_sentence = [vocab[word] if word in vocab else len(vocab) + 1 for word in sentence_buffer]           
                sentences.append(indexed_sentence)
                
                if input_file == "test":
                    sentences_NER.append(NER_seq_buffer)
                elif input_file == "dev":
                    encoded_NER_seq = [NER_LABELS[label] for label in NER_seq_buffer]    
                    sentences_NER.append(encoded_NER_seq)

                sentence_buffer = []
                NER_seq_buffer = []
    
    return  sentences, sentences_NER

In [12]:
dev_sentences, dev_sentences_NER = load_and_transform_input("dev")
print(dev_sentences[5])
print(dev_sentences_NER[5])
test_sentences, test_sentences_NER = load_and_transform_input("test")
print(test_sentences[0])
print(test_sentences_NER[0])
print(len(vocab))

[14046, 93, 23625, 71, 2195, 3262, 79, 6175, 1693, 5, 201, 519, 2275, 1083, 2226, 7501, 236, 5, 23625, 1153, 833, 70, 18238, 9]
[1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1892, 677, 23625, 23625, 23625, 3396, 71, 21355, 2338, 23625, 10616, 9]
[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]
23624


### Prepare Train Dataloader

In [13]:
train_data = list(zip(train_sentences, train_sentences_NER))
train_loader = DataLoader(train_data, batch_size=16, shuffle=True, collate_fn=pad_and_collate_sequences)

### Prepare Prediction Dataloader for dev

In [14]:
dev_data = list(zip(dev_sentences, dev_sentences_NER))
dev_loader = DataLoader(dev_data, batch_size=16, shuffle=False, collate_fn=pad_and_collate_sequences)

### Prepare Prediction Dataloader for test

In [15]:
test_data = list(zip(test_sentences, test_sentences_NER))
predict_loader_test = DataLoader(test_data, batch_size=16, shuffle=False, collate_fn=pad_and_collate_sequences)

### Bi-directional LSTM Constructor

In [16]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence



class BiLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, lstm_hidden_dim, lstm_layers, dropout_p, linear_output_dim, num_classes):
        super(BiLSTM, self).__init__()
        # Embedding layer, increment by 2 to include pad and OOV tokens
        self.embedding = nn.Embedding(vocab_size + 2, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, lstm_hidden_dim, num_layers=lstm_layers, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout_p)
        self.linear = nn.Linear(lstm_hidden_dim * 2, linear_output_dim)
        self.activation = nn.ELU()
        self.classifier = nn.Linear(linear_output_dim, num_classes)

    def forward(self, inputs, lengths):
        embeddings = self.embedding(inputs)
        packed_embeddings = pack_padded_sequence(embeddings, lengths, batch_first=True, enforce_sorted=False)
        packed_lstm_output, _ = self.lstm(packed_embeddings)
        lstm_output, _ = pad_packed_sequence(packed_lstm_output, batch_first=True)
        lstm_output = self.dropout(lstm_output)
        linear_output = self.activation(self.linear(lstm_output))
        class_scores = self.classifier(linear_output)
        return class_scores

### Train Bidirectional LSTM

In [14]:
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.nn.utils import clip_grad_norm_

# class weights tensor 
class_weights_list = list(class_weights.values())
class_weights_tensor = torch.tensor(class_weights_list, dtype=torch.float)

# Init training configs
model = BiLSTM(len(vocab), embedding_dim=100, lstm_hidden_dim=256, lstm_layers=1, dropout_p=0.33, linear_output_dim=128, num_classes=9)
criterion = nn.CrossEntropyLoss(ignore_index=9, weight=class_weights_tensor)
optimizer = torch.optim.SGD(model.parameters(), lr= 0.07198707503473897, momentum=0.6853521900306477, nesterov=True,weight_decay=0.00047079031615098084)
lr_scheduler = ReduceLROnPlateau(optimizer, mode='max', verbose=True)

patience = 30
best_f_score = 0.0
epochs_without_improvement = 0

# Training loop
for epoch in range(1000):                 
    model.train()
    running_loss = 0.0
    print("Epoch: " + str(epoch))

    for i, (inputs, labels, lengths) in enumerate(train_loader):       
        optimizer.zero_grad()
        outputs = model(inputs, lengths)
        loss = criterion(outputs.view(-1, 9), labels.view(-1))
        loss.backward()
        # Apply gradient clipping
        clip_grad_norm_(model.parameters(), max_norm=1.2790731132676674)  
        optimizer.step()          
        running_loss += loss.item()
    
    # Calculate average loss per epoch
    train_loss = running_loss / len(train_loader)
    print("Training Loss: {:.4f}".format(train_loss))     
    
    # Validation loss
    model.eval() 
    total_loss = 0
    num_batches = 0 
    
    with torch.no_grad():
        for inputs, labels, lengths in dev_loader:             
            outputs = model(inputs, lengths)        
            # Calculate loss
            loss = criterion(outputs.view(-1, 9), labels.view(-1))
            total_loss += loss.item()
            num_batches += 1        
    # Calculate average loss per epoch
    valid_loss = total_loss / num_batches
    print("Validation Loss: {:.4f}".format(valid_loss))    


    # F-Score    
    input_sequences = []
    predictions_dev = []

    with torch.no_grad():
        for inputs, labels, lengths in dev_loader:           
            outputs = model(inputs, lengths)
            _, predicted = torch.max(outputs, dim=2)
        
            # Remove padding from inputs and predictions
            for seq_idx, seq_length in enumerate(lengths):
                input_sequences.append(inputs[seq_idx, :seq_length].tolist())
                predictions_dev.append(predicted[seq_idx, :seq_length].tolist())

    # Calculate F-score using the Perl eval scrip 
    generate_eval_input(predictions_dev)
    run_perl_script()
    f_score = get_f_score()    
    
    # Learning rate scheduling
    lr_scheduler.step(f_score)

    # Early stopping
    if f_score > best_f_score:
        best_f_score = f_score
        epochs_without_improvement = 0
        torch.save(model.state_dict(), "/content/drive/MyDrive/Dataset/debug_model.pt")
    else:
        epochs_without_improvement += 1
        if epochs_without_improvement >= patience:
            print("Early stopping")
            break   
    print("\n")

Epoch: 0
Training Loss: 1.7005
Validation Loss: 1.3229


Epoch: 1
Training Loss: 1.2238
Validation Loss: 1.0771


Epoch: 2
Training Loss: 0.9967
Validation Loss: 0.9519


Epoch: 3
Training Loss: 0.8419
Validation Loss: 0.8722


Epoch: 4
Training Loss: 0.7492
Validation Loss: 0.7943


Epoch: 5
Training Loss: 0.6803
Validation Loss: 0.7833


Epoch: 6
Training Loss: 0.6383
Validation Loss: 0.8292


Epoch: 7
Training Loss: 0.6174
Validation Loss: 0.7952


Epoch: 8
Training Loss: 0.5860
Validation Loss: 0.7514


Epoch: 9
Training Loss: 0.5659
Validation Loss: 0.7668


Epoch: 10
Training Loss: 0.5479
Validation Loss: 0.7810


Epoch: 11
Training Loss: 0.5296
Validation Loss: 0.7434


Epoch: 12
Training Loss: 0.5084
Validation Loss: 0.7426


Epoch: 13
Training Loss: 0.4729
Validation Loss: 0.6705


Epoch: 14
Training Loss: 0.4302
Validation Loss: 0.6825


Epoch: 15
Training Loss: 0.3822
Validation Loss: 0.6425


Epoch: 16
Training Loss: 0.3309
Validation Loss: 0.7019


Epoch: 17
Training Loss:

In [2]:
print(best_f_score)

NameError: ignored

# Load Model Cache Hypertuning / Debug   (Remove this when deploying?)

In [21]:
loaded_model = BiLSTM(len(vocab), embedding_dim=100, lstm_hidden_dim=256, lstm_layers=1, dropout_p=0.33, linear_output_dim=128, num_classes=9)
# Adjust path to load specific model
saved_state_dict = torch.load("/content/drive/MyDrive/Dataset/model_archives/Task1/models/blstm1_81.31.pt")
loaded_model.load_state_dict(saved_state_dict)
loaded_model.eval()

BiLSTM(
  (embedding): Embedding(23626, 100, padding_idx=0)
  (lstm): LSTM(100, 256, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0.33, inplace=False)
  (linear): Linear(in_features=512, out_features=128, bias=True)
  (activation): ELU(alpha=1.0)
  (classifier): Linear(in_features=128, out_features=9, bias=True)
)

In [24]:
input_sequences = []
predictions_dev = []
    
# Eval on the validation set (dev)
with torch.no_grad():
    for inputs, labels, lengths in dev_loader :
        outputs =loaded_model(inputs, lengths)
        _, predicted = torch.max(outputs, dim=2)
        
        # Remove padding from inputs and predictions
        for seq_idx, seq_length in enumerate(lengths):
            input_sequences.append(inputs[seq_idx, :seq_length].tolist())
            predictions_dev.append(predicted[seq_idx, :seq_length].tolist())

# Calculate F-score using the Perl eval scrip 
generate_eval_input(predictions_dev)
run_perl_script()
f_score = get_f_score() 

FB1: 81.31


# Load and Output Required Files

In [25]:
def Create_required_output_file(loaded_model, pred_loader, req_file):
    input_sequences = []
    predictions = []

    with torch.no_grad():
        for inputs, labels, lengths in pred_loader:
            outputs = loaded_model(inputs, lengths)
            _, predicted = torch.max(outputs, dim=2)
        
            # Remove padding from inputs and predictions
            for seq_idx, seq_length in enumerate(lengths):
                input_sequences.append(inputs[seq_idx, :seq_length].tolist())
                predictions.append(predicted[seq_idx, :seq_length].tolist())
    
    if req_file == "dev":
        input_file = '/content/drive/MyDrive/Dataset/given/dev'
        output_file = '/content/drive/MyDrive/Dataset/dev1.out'
    
    elif req_file == "test":
        input_file = '/content/drive/MyDrive/Dataset/given/test'
        output_file = '/content/drive/MyDrive/Dataset/test1.out'

    with open(input_file, 'r') as in_file, open(output_file, 'w') as out_file:
        sentence_index = 0
        word_index = 0
    
        while True:
            line = in_file.readline()
            # if eof
            if not line:
                # edge case            
                break
            # if not blank line
            if line.strip(): 
                label_num =  predictions[sentence_index][word_index]
                line_buffer = line.strip().split()               
                out_file.write(line_buffer[0] + " " + line_buffer[1] + " " + REVERSE_NER_LABELS[label_num] + "\n")                
                word_index += 1
         
            # if blank line      
            else:
                out_file.write("\n")
                sentence_index += 1
                word_index = 0

# Generate required output files

In [27]:
#Create_required_output_file(loaded_model, dev_loader, "dev")
#Create_required_output_file(loaded_model, predict_loader_test, "test")

# Hyper Tune Playground

In [None]:
!pip install optuna
import optuna

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting optuna
  Downloading optuna-3.1.0-py3-none-any.whl (365 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.3/365.3 KB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
Collecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting cmaes>=0.9.1
  Downloading cmaes-0.9.1-py3-none-any.whl (21 kB)
Collecting alembic>=1.5.0
  Downloading alembic-1.10.2-py3-none-any.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 KB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
Collecting Mako
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 KB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, cmaes, alembic, optuna
Successfully installed Mako-1.2.4 alembic-1.10.2 cmaes-0.9.1 colorlog-6.7.0 optuna-3.1.0


In [None]:
import torch
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.nn.utils import clip_grad_norm_

class_weights_list = list(class_weights.values())
class_weights_tensor = torch.tensor(class_weights_list, dtype=torch.float)

def objective(trial):
    # Generate hyperparameters using Optuna
    lr = trial.suggest_float("lr", 0.001, 0.1)
    momentum = trial.suggest_float("momentum", 0.59, 0.99)
    weight_decay = trial.suggest_float("weight_decay", 0.000001, 0.01)
    max_norm = trial.suggest_float("max_norm", 1.0, 5.0)

    # Use the generated hyperparameters in your model
    model = BiLSTM(len(vocab), embedding_dim=100, lstm_hidden_dim=256, lstm_layers=1, dropout_p=0.33, linear_output_dim=128, num_classes=9)
    criterion = nn.CrossEntropyLoss(ignore_index=9, weight=class_weights_tensor)
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=momentum, nesterov=True, weight_decay=weight_decay)
    lr_scheduler = ReduceLROnPlateau(optimizer, mode='max', verbose=True)

    patience = 30
    best_f_score = 0.0
    epochs_without_improvement = 0

    # Training loop
    for epoch in range(1000):                 
        model.train()
        running_loss = 0.0
        #print("Epoch: " + str(epoch))

        for i, (inputs, labels, lengths) in enumerate(train_loader):       
            optimizer.zero_grad()
            outputs = model(inputs, lengths)
            loss = criterion(outputs.view(-1, 9), labels.view(-1))
            loss.backward()
            # Apply gradient clipping
            clip_grad_norm_(model.parameters(), max_norm=max_norm)  
            optimizer.step()          
            running_loss += loss.item()

        # Calculate average loss per epoch
        train_loss = running_loss / len(train_loader)
       # print("Training Loss: {:.4f}".format(train_loss))     

        # Validation loss
        model.eval() 
        total_loss = 0
        num_batches = 0 
    
        with torch.no_grad():
            for inputs, labels, lengths in dev_loader:             
                outputs = model(inputs, lengths)        
                # Calculate loss
                loss = criterion(outputs.view(-1, 9), labels.view(-1))
                total_loss += loss.item()
                num_batches += 1        
        # Calculate average loss per epoch
        valid_loss = total_loss / num_batches
       # print("Validation Loss: {:.4f}".format(valid_loss))    


        # F-Score    
        input_sequences = []
        predictions_dev = []

        with torch.no_grad():
            for inputs, labels, lengths in dev_loader:           
                outputs = model(inputs, lengths)
                _, predicted = torch.max(outputs, dim=2)
        
                # Remove padding from inputs and predictions
                for seq_idx, seq_length in enumerate(lengths):
                    input_sequences.append(inputs[seq_idx, :seq_length].tolist())
                    predictions_dev.append(predicted[seq_idx, :seq_length].tolist())

        # Calculate F-score using the Perl eval scrip 
        generate_eval_input(predictions_dev)
        run_perl_script()
        f_score = get_f_score()    

        # Learning rate scheduling
        lr_scheduler.step(f_score)

        # Early stopping
        if f_score > best_f_score:
            best_f_score = f_score
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
            if epochs_without_improvement >= patience:
                print("Early stopping")
                break

    # Return the metric you want to optimize (e.g., F-score)
    return f_score





study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)
best_hyperparameters = study.best_trial.params
print("Best hyperparameters:", best_hyperparameters)


[32m[I 2023-03-20 08:39:18,832][0m A new study created in memory with name: no-name-73e407ac-9d8d-4650-a1a8-707ac1204dbb[0m


Epoch 00074: reducing learning rate of group 0 to 4.4916e-03.
Epoch 00096: reducing learning rate of group 0 to 4.4916e-04.
Epoch 00110: reducing learning rate of group 0 to 4.4916e-05.
Epoch 00121: reducing learning rate of group 0 to 4.4916e-06.


[32m[I 2023-03-20 10:23:02,989][0m Trial 0 finished with value: 79.23 and parameters: {'lr': 0.044916170841524106, 'momentum': 0.739150962156391, 'weight_decay': 0.0005259915141631132, 'max_norm': 3.8917770277258894}. Best is trial 0 with value: 79.23.[0m


Early stopping
Epoch 00029: reducing learning rate of group 0 to 2.0349e-03.
Epoch 00056: reducing learning rate of group 0 to 2.0349e-04.
Epoch 00067: reducing learning rate of group 0 to 2.0349e-05.


[32m[I 2023-03-20 11:23:14,123][0m Trial 1 finished with value: 47.2 and parameters: {'lr': 0.020349484243255122, 'momentum': 0.776278079787831, 'weight_decay': 0.00598962853729965, 'max_norm': 3.5623079503384574}. Best is trial 0 with value: 79.23.[0m


Early stopping
Epoch 00026: reducing learning rate of group 0 to 7.8774e-03.
Epoch 00048: reducing learning rate of group 0 to 7.8774e-04.
Epoch 00059: reducing learning rate of group 0 to 7.8774e-05.


[32m[I 2023-03-20 12:18:00,371][0m Trial 2 finished with value: 53.42 and parameters: {'lr': 0.07877388620740604, 'momentum': 0.6230878564198803, 'weight_decay': 0.004671901324055991, 'max_norm': 2.850560669228965}. Best is trial 0 with value: 79.23.[0m


Early stopping
Epoch 00038: reducing learning rate of group 0 to 9.5833e-04.
Epoch 00056: reducing learning rate of group 0 to 9.5833e-05.
Epoch 00067: reducing learning rate of group 0 to 9.5833e-06.


[32m[I 2023-03-20 13:18:38,904][0m Trial 3 finished with value: 26.6 and parameters: {'lr': 0.009583287225956392, 'momentum': 0.6485390631830622, 'weight_decay': 0.0008333430716193432, 'max_norm': 1.5736799964148194}. Best is trial 0 with value: 79.23.[0m


Early stopping
Epoch 00029: reducing learning rate of group 0 to 6.2633e-03.
Epoch 00052: reducing learning rate of group 0 to 6.2633e-04.
Epoch 00063: reducing learning rate of group 0 to 6.2633e-05.


[32m[I 2023-03-20 14:16:04,105][0m Trial 4 finished with value: 66.36 and parameters: {'lr': 0.06263250034128952, 'momentum': 0.5951146280188576, 'weight_decay': 0.00013043985425026226, 'max_norm': 4.369195267003846}. Best is trial 0 with value: 79.23.[0m


Early stopping
Epoch 00026: reducing learning rate of group 0 to 6.3130e-03.
Epoch 00059: reducing learning rate of group 0 to 6.3130e-04.
Epoch 00084: reducing learning rate of group 0 to 6.3130e-05.
Epoch 00095: reducing learning rate of group 0 to 6.3130e-06.


[32m[I 2023-03-20 15:41:39,560][0m Trial 5 finished with value: 48.45 and parameters: {'lr': 0.06313003159113442, 'momentum': 0.989452363178321, 'weight_decay': 0.006143224446521812, 'max_norm': 3.9574094161246585}. Best is trial 0 with value: 79.23.[0m


Early stopping
Epoch 00029: reducing learning rate of group 0 to 7.9423e-03.
Epoch 00074: reducing learning rate of group 0 to 7.9423e-04.
Epoch 00085: reducing learning rate of group 0 to 7.9423e-05.


[32m[I 2023-03-20 16:56:44,019][0m Trial 6 finished with value: 71.55 and parameters: {'lr': 0.0794232080425641, 'momentum': 0.8338182969445536, 'weight_decay': 2.5806001922471358e-05, 'max_norm': 2.5205428930153446}. Best is trial 0 with value: 79.23.[0m


Early stopping
Epoch 00015: reducing learning rate of group 0 to 7.0920e-03.
Epoch 00028: reducing learning rate of group 0 to 7.0920e-04.
Epoch 00039: reducing learning rate of group 0 to 7.0920e-05.


[32m[I 2023-03-20 17:34:29,915][0m Trial 7 finished with value: 35.79 and parameters: {'lr': 0.07092037973084149, 'momentum': 0.8790419949530034, 'weight_decay': 0.007587791983954284, 'max_norm': 1.949039133242536}. Best is trial 0 with value: 79.23.[0m


Early stopping
Epoch 00031: reducing learning rate of group 0 to 4.9138e-03.
Epoch 00051: reducing learning rate of group 0 to 4.9138e-04.
Epoch 00062: reducing learning rate of group 0 to 4.9138e-05.


[32m[I 2023-03-20 18:31:43,405][0m Trial 8 finished with value: 55.01 and parameters: {'lr': 0.049137565866171934, 'momentum': 0.6162608053486164, 'weight_decay': 0.004398623816839849, 'max_norm': 2.475921750358526}. Best is trial 0 with value: 79.23.[0m


Early stopping
Epoch 00031: reducing learning rate of group 0 to 5.1774e-03.
Epoch 00044: reducing learning rate of group 0 to 5.1774e-04.
Epoch 00055: reducing learning rate of group 0 to 5.1774e-05.


[32m[I 2023-03-20 19:22:24,038][0m Trial 9 finished with value: 67.38 and parameters: {'lr': 0.05177356874033556, 'momentum': 0.594177449048631, 'weight_decay': 0.002554444949325595, 'max_norm': 4.584561557033924}. Best is trial 0 with value: 79.23.[0m


Early stopping
Epoch 00019: reducing learning rate of group 0 to 9.9056e-03.
Epoch 00039: reducing learning rate of group 0 to 9.9056e-04.
Epoch 00050: reducing learning rate of group 0 to 9.9056e-05.


[32m[I 2023-03-20 20:09:17,170][0m Trial 10 finished with value: 26.23 and parameters: {'lr': 0.09905578975854262, 'momentum': 0.7204853642935177, 'weight_decay': 0.009017747264867253, 'max_norm': 4.932617597728175}. Best is trial 0 with value: 79.23.[0m


Early stopping
Epoch 00024: reducing learning rate of group 0 to 3.3850e-03.
Epoch 00056: reducing learning rate of group 0 to 3.3850e-04.
Epoch 00067: reducing learning rate of group 0 to 3.3850e-05.


[32m[I 2023-03-20 21:09:36,250][0m Trial 11 finished with value: 70.63 and parameters: {'lr': 0.033849873667793184, 'momentum': 0.8212149165564964, 'weight_decay': 0.0019544500535906956, 'max_norm': 3.344770485518569}. Best is trial 0 with value: 79.23.[0m


Early stopping
Epoch 00060: reducing learning rate of group 0 to 3.8896e-03.
Epoch 00093: reducing learning rate of group 0 to 3.8896e-04.
Epoch 00104: reducing learning rate of group 0 to 3.8896e-05.


[32m[I 2023-03-20 22:40:05,424][0m Trial 12 finished with value: 73.14 and parameters: {'lr': 0.03889629986546357, 'momentum': 0.7128951513201877, 'weight_decay': 7.50074309425852e-05, 'max_norm': 2.7010492511814252}. Best is trial 0 with value: 79.23.[0m


Early stopping
Epoch 00037: reducing learning rate of group 0 to 3.5470e-03.
Epoch 00048: reducing learning rate of group 0 to 3.5470e-04.


[32m[I 2023-03-20 23:25:22,983][0m Trial 13 finished with value: 61.51 and parameters: {'lr': 0.03546958806789123, 'momentum': 0.7075551903587405, 'weight_decay': 0.002031094084903463, 'max_norm': 1.0585611187869026}. Best is trial 0 with value: 79.23.[0m


Early stopping
Epoch 00028: reducing learning rate of group 0 to 3.1176e-03.
Epoch 00043: reducing learning rate of group 0 to 3.1176e-04.
Epoch 00054: reducing learning rate of group 0 to 3.1176e-05.


[32m[I 2023-03-21 00:15:23,104][0m Trial 14 finished with value: 66.73 and parameters: {'lr': 0.0311758378044555, 'momentum': 0.698691266473841, 'weight_decay': 0.0027383874161297866, 'max_norm': 3.6097038657112357}. Best is trial 0 with value: 79.23.[0m


Early stopping
Epoch 00055: reducing learning rate of group 0 to 5.1141e-04.
Epoch 00066: reducing learning rate of group 0 to 5.1141e-05.


[32m[I 2023-03-21 01:15:04,255][0m Trial 15 finished with value: 27.99 and parameters: {'lr': 0.005114082610982847, 'momentum': 0.7606244757422095, 'weight_decay': 0.0010325873248162262, 'max_norm': 3.07886298972595}. Best is trial 0 with value: 79.23.[0m


Early stopping
Epoch 00020: reducing learning rate of group 0 to 4.5004e-03.
Epoch 00047: reducing learning rate of group 0 to 4.5004e-04.
Epoch 00058: reducing learning rate of group 0 to 4.5004e-05.


[32m[I 2023-03-21 02:08:39,484][0m Trial 16 finished with value: 60.63 and parameters: {'lr': 0.04500427370658524, 'momentum': 0.6751153111060524, 'weight_decay': 0.0036082561592212375, 'max_norm': 4.100866851358502}. Best is trial 0 with value: 79.23.[0m


Early stopping
Epoch 00054: reducing learning rate of group 0 to 2.2645e-03.
Epoch 00065: reducing learning rate of group 0 to 2.2645e-04.


[32m[I 2023-03-21 03:07:19,227][0m Trial 17 finished with value: 75.6 and parameters: {'lr': 0.022644763601185933, 'momentum': 0.7558797249297367, 'weight_decay': 0.0012252488451767324, 'max_norm': 3.81156757924116}. Best is trial 0 with value: 79.23.[0m


Early stopping
Epoch 00058: reducing learning rate of group 0 to 2.2095e-03.
Epoch 00099: reducing learning rate of group 0 to 2.2095e-04.
Epoch 00110: reducing learning rate of group 0 to 2.2095e-05.


[32m[I 2023-03-21 04:42:27,898][0m Trial 18 finished with value: 74.6 and parameters: {'lr': 0.022094527911927123, 'momentum': 0.7503808994292024, 'weight_decay': 0.0013281941171752634, 'max_norm': 3.9856626373271147}. Best is trial 0 with value: 79.23.[0m


Early stopping
Epoch 00059: reducing learning rate of group 0 to 2.0943e-03.
Epoch 00070: reducing learning rate of group 0 to 2.0943e-04.


[32m[I 2023-03-21 05:45:46,751][0m Trial 19 finished with value: 63.67 and parameters: {'lr': 0.020943056516897665, 'momentum': 0.6660279562273274, 'weight_decay': 0.003322745330902158, 'max_norm': 3.265176619885054}. Best is trial 0 with value: 79.23.[0m


Early stopping
Epoch 00055: reducing learning rate of group 0 to 1.2635e-03.
Epoch 00076: reducing learning rate of group 0 to 1.2635e-04.
Epoch 00087: reducing learning rate of group 0 to 1.2635e-05.


[32m[I 2023-03-21 07:01:33,787][0m Trial 20 finished with value: 72.25 and parameters: {'lr': 0.01263542917551149, 'momentum': 0.798325154938923, 'weight_decay': 0.0017486633410018129, 'max_norm': 3.7506564437946706}. Best is trial 0 with value: 79.23.[0m


Early stopping
Epoch 00095: reducing learning rate of group 0 to 2.1533e-03.


# Task 2

In [None]:
import gzip

def load_glove_embeddings(file_path):
    embeddings = {}
    with gzip.open(file_path, 'rt', encoding='utf-8') as file:
        for line in file:
            values = line.strip().split()
            word = values[0]
            vector = list(map(float, values[1:]))
            embeddings[word] = vector
    return embeddings

glove_embeddings = load_glove_embeddings('/content/drive/MyDrive/Dataset/given/glove.6B.100d.gz')

In [None]:
import numpy as np

vocab_size = len(vocab)
embedding_dim = 100  # Assuming 100-dimensional GloVe embeddings

embedding_matrix = np.zeros((vocab_size + 2, embedding_dim + 1))  # Add 1 to the embedding_dim

for word, idx in vocab.items():
    glove_vector = glove_embeddings.get(word.lower())
    if glove_vector is not None:
        capitalization_scalar = 0
        if word.isupper():
            capitalization_scalar = 2
        elif word[0].isupper():
            capitalization_scalar = 1
        extended_vector = np.concatenate((glove_vector, [capitalization_scalar]))  # Append the capitalization scalar
        embedding_matrix[idx] = extended_vector
    # else: the vector remains zero if the word is not in GloVe