In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Gather the path of every file below the Kaggle input directory, in the
# order os.walk visits them.
input_paths = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]

# Every file is parsed as CSV and replaces the previous frame, so `df` ends up
# holding the last file visited (and stays None when no file is found).
df = None
for csv_path in input_paths:
    df = pd.read_csv(csv_path).reset_index(drop=True)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
df.shape

In [None]:
df.head()

In [None]:
# Names of the tag columns expected in the raw frame, one per line.
target_list = [
    '*special problem',
    '2-sat',
    'binary search',
    'bitmasks',
    'brute force',
    'chinese remainder theorem',
    'combinatorics',
    'constructive algorithms',
    'data structures',
    'dfs and similar',
    'divide and conquer',
    'dp',
    'dsu',
    'expression parsing',
    'fft',
    'flows',
    'games',
    'geometry',
    'graph matchings',
    'graphs',
    'greedy',
    'hashing',
    'implementation',
    'interactive',
    'math',
    'matrices',
    'meet-in-the-middle',
    'number theory',
    'probabilities',
    'schedules',
    'shortest paths',
    'sortings',
    'string suffix structures',
    'strings',
    'ternary search',
    'trees',
    'two pointers',
]

In [None]:
# Discard the first column, selected by position rather than by name.
# Note: re-running this cell removes one more column each time.
df = df.drop(df.columns[0], axis = 1)

In [None]:
df.shape

In [None]:
# Cast every tag column to int with one frame-level assignment.
df[target_list] = df[target_list].astype(int)

In [None]:
df.head()

In [None]:
df[target_list].sum().sort_values().plot(kind="barh", figsize=(10,10))

In [None]:
# Tags with at most 200 positive rows are collected first and then removed
# with a single drop call.
rare_tags = [tag_name for tag_name in target_list if df[tag_name].sum() <= 200]
df = df.drop(columns=rare_tags)

In [None]:
# "*special problem" may already be gone: the rare-tag filter in the previous
# cell removes it whenever it has 200 or fewer positives. Tolerate a missing
# column instead of raising KeyError on a fresh Run All.
df = df.drop(columns="*special problem", errors="ignore")

In [None]:
# Rebuild the tag list from the columns that survived the filtering above;
# every column after the first one is treated as a tag column.
target_list = list(df.columns[1:])

In [None]:
df.head()

In [None]:
df[target_list].sum().sort_values().plot(kind="barh", figsize=(10,10))

In [None]:
# Index labels of the rows left without any positive tag after the tag
# filtering. A row-wise sum over the tag columns replaces the per-row
# iterrows() loop and yields the same list of labels.
na = df.index[df[target_list].sum(axis=1) == 0].tolist()

In [None]:
# Remove the untagged rows collected in `na` and renumber the index from 0.
df = df.drop(na).reset_index(drop=True)

In [None]:
df.head()

In [None]:
df.shape

In [None]:
!pip install transformers

In [None]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import shutil

In [None]:
from transformers import BertTokenizer, BertModel

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# hyperparameters
MAX_LEN = 256  # max_length handed to the tokenizer; inputs are padded/truncated to it
TRAIN_BATCH_SIZE = 32  # batch size of the training DataLoader
VALID_BATCH_SIZE = 32  # batch size of the validation DataLoader
EPOCHS = 20  # number of epochs passed to train_model
LEARNING_RATE = 1e-04  # learning rate given to the Adam optimizer
n_classes = len(target_list)  # one output logit per remaining tag

In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """Dataset yielding encoded problem text together with its multi-hot tag vector.

    Args:
        df: frame with a 'text' column and one numeric indicator column per tag.
        tokenizer: object exposing ``encode_plus`` (a BERT tokenizer in this notebook).
        max_len: length every encoded sequence is padded or truncated to.
        target_cols: names of the tag columns; when omitted the notebook-level
            ``target_list`` is used, as before.
    """

    def __init__(self, df, tokenizer, max_len, target_cols=None):
        self.tokenizer = tokenizer
        self.df = df
        self.title = df['text']
        label_columns = target_list if target_cols is None else list(target_cols)
        self.targets = self.df[label_columns].values
        self.max_len = max_len

    def __len__(self):
        return len(self.title)

    def __getitem__(self, index):
        """Return the tensors for the sample at position ``index``."""
        # .iloc is positional, so the lookup no longer depends on the frame
        # carrying a clean 0..n-1 index (label lookup raised KeyError or
        # silently returned a different row otherwise).
        title = str(self.title.iloc[index])
        # Collapse runs of whitespace/newlines into single spaces.
        title = " ".join(title.split())

        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # encode_plus returns tensors with a leading batch axis of size 1;
        # flatten() removes it so the DataLoader can stack samples itself.
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [None]:
# File names the train and test partitions are read back from further down.
train_path = "train.csv"
test_path = "test.csv"

In [None]:
# Positional split with no shuffling: the first 6300 rows are written as the
# training file, all remaining rows as the held-out test file.
df[:6300].to_csv("train.csv", index = False)
df[6300:].to_csv("test.csv", index = False)

In [None]:
# Reload both partitions from disk; from here on `df` is rebound to the
# contents of the training file only.
df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [None]:
df.shape

In [None]:
# Random 80/20 train/validation split of the training frame.
train_size = 0.8
# Keep the sampled rows' ORIGINAL index labels until the complement has been
# taken. Resetting the index first (as before) made df.drop() remove rows
# 0..len(train)-1 instead of the sampled rows, so the validation set was just
# the tail of df and overlapped with the training sample.
train_sample = df.sample(frac=train_size, random_state=42)
val_df = df.drop(train_sample.index).reset_index(drop=True)
train_df = train_sample.reset_index(drop=True)

In [None]:
train_df.shape, val_df.shape

In [None]:
train_df[target_list].sum().sort_values().plot(kind="barh", figsize=(10,10))

In [None]:
val_df[target_list].sum().sort_values().plot(kind="barh", figsize=(10,10))

In [None]:
# Wrap both splits so that the DataLoaders can draw tokenised samples from them.
train_dataset = CustomDataset(train_df, tokenizer, MAX_LEN)
valid_dataset = CustomDataset(val_df, tokenizer, MAX_LEN)

In [None]:
# Training batches are reshuffled on every pass over the loader.
train_data_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0
)

# Validation batches keep the dataset order.
val_data_loader = torch.utils.data.DataLoader(
    valid_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0
)

In [None]:
# Use the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

In [None]:
def load_ckp(checkpoint_fpath, model, optimizer, map_location=None):
    """
    Restore model and optimizer state from a checkpoint file.

    checkpoint_fpath: path of the checkpoint file to load; the file must hold
        a dict with 'state_dict', 'optimizer' and 'epoch' entries
    model: model that we want to load checkpoint parameters into
    optimizer: optimizer we defined in previous training
    map_location: forwarded to torch.load; pass e.g. 'cpu' to open a
        checkpoint that was saved on a GPU on a machine without one.
        None (default) keeps torch.load's own default behaviour.

    Returns (model, optimizer, epoch), where epoch is the checkpoint's
    'epoch' entry.
    """
    # load check point
    checkpoint = torch.load(checkpoint_fpath, map_location=map_location)
    # initialize state_dict from checkpoint to model
    model.load_state_dict(checkpoint['state_dict'])
    # initialize optimizer from checkpoint to optimizer
    optimizer.load_state_dict(checkpoint['optimizer'])
    # return model, optimizer and the stored epoch value
    return model, optimizer, checkpoint['epoch']

def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """
    Write a checkpoint and, for a best epoch, duplicate it as the best model.

    state: checkpoint object to serialise with torch.save
    is_best: truthy when this checkpoint should also become the best model
    checkpoint_path: file the checkpoint is always written to
    best_model_path: file the checkpoint is copied to when is_best is truthy
    """
    torch.save(state, checkpoint_path)
    if not is_best:
        return
    # The best model is a plain file copy of the checkpoint just written.
    shutil.copyfile(checkpoint_path, best_model_path)

In [None]:
class BERTClass(torch.nn.Module):
    """Pre-trained BERT encoder followed by dropout and a linear layer with one logit per tag."""

    def __init__(self):
        super().__init__()
        self.bert_model = BertModel.from_pretrained('bert-base-uncased', return_dict=True)
        self.dropout = torch.nn.Dropout(0.3)
        self.linear = torch.nn.Linear(768, n_classes)

    def forward(self, input_ids, attn_mask, token_type_ids):
        """Return the raw logits (no sigmoid applied) for a batch of encoded texts."""
        encoded = self.bert_model(
            input_ids,
            attention_mask=attn_mask,
            token_type_ids=token_type_ids
        )
        # Classify from the pooled output of the encoder.
        pooled = encoded.pooler_output
        return self.linear(self.dropout(pooled))


model = BERTClass()
model.to(device)

In [None]:
def calculate_pos_weights(class_counts, n_samples=None):
    """Return one positive-class loss weight per tag as a float tensor.

    Each weight is n_samples / positives for that tag.

    class_counts: number of positive samples per tag (array-like / Series)
    n_samples: total number of samples the counts were taken from; defaults
        to len(train_df), which is what the notebook used before

    Tags without any positive sample keep weight 1.0 instead of dividing by
    zero (an infinite weight would turn the loss into NaN).

    NOTE(review): the PyTorch BCEWithLogitsLoss documentation suggests
    negatives / positives for pos_weight; this keeps the notebook's
    total / positives ratio - confirm which one is intended.
    """
    total = len(train_df) if n_samples is None else n_samples
    # Work in float from the start: np.ones_like() on integer counts produced
    # an integer array, so every ratio was silently truncated (2.5 -> 2).
    counts = np.asarray(class_counts, dtype=np.float64)
    pos_weights = np.ones_like(counts)
    has_positives = counts > 0
    pos_weights[has_positives] = total / counts[has_positives]

    return torch.as_tensor(pos_weights, dtype=torch.float)

In [None]:
# Number of positive samples per tag in the training split, converted into
# per-tag loss weights and moved to the training device.
class_counts = train_df[target_list].sum()
class_weights = calculate_pos_weights(class_counts).to(device)
class_weights

In [None]:
def loss_fn(outputs, targets):
    """Binary cross-entropy on raw logits, positives weighted per tag by class_weights."""
    criterion = torch.nn.BCEWithLogitsLoss(pos_weight=class_weights)
    return criterion(outputs, targets)


# Adam over every parameter returned by model.parameters().
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
# Module-level placeholders. train_model assigns its own local lists with the
# same names each epoch, so these two are not filled by training.
val_targets = []
val_outputs = []

In [None]:
from sklearn.metrics import f1_score

In [None]:
# Per-epoch history, appended to by train_model and plotted afterwards.
train_loss_plot, valid_loss_plot, valid_acc_plot = [], [], []
def train_model(n_epochs, training_loader, validation_loader, model,
                optimizer, checkpoint_path, best_model_path):
    """Train `model`, validating and checkpointing after every epoch.

    n_epochs: number of passes over training_loader
    training_loader / validation_loader: iterables of batches, each a dict
        with 'input_ids', 'attention_mask', 'token_type_ids' and 'targets'
    model: network called as model(ids, mask, token_type_ids)
    optimizer: optimizer stepped once per training batch
    checkpoint_path: file that receives the checkpoint of every epoch
    best_model_path: file that receives a copy of the checkpoint whenever the
        validation loss is less than or equal to the best seen so far

    Side effects: appends the epoch's mean training loss, mean validation
    loss and mean per-tag F1 to the module-level lists train_loss_plot,
    valid_loss_plot and valid_acc_plot.

    Returns the same model object that was passed in.
    """
    # initialize tracker for minimum validation loss
    valid_loss_min = np.inf  # np.Inf is no longer available in NumPy 2.0

    for epoch in range(1, n_epochs + 1):
        train_loss = 0
        valid_loss = 0

        model.train()
        print(f'############# Epoch {epoch}: Training Start   #############')
        for batch_idx, data in enumerate(training_loader):
            ids = data['input_ids'].to(device, dtype=torch.long)
            mask = data['attention_mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)

            outputs = model(ids, mask, token_type_ids)
            loss = loss_fn(outputs, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Incremental mean: after this line train_loss is the average
            # loss over batches 0..batch_idx of the current epoch.
            train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.item() - train_loss))

        print(f"############# Epoch {epoch}: Training End     #############")

        print(f"############# Epoch {epoch}: Validation Start   #############")
        ######################
        # validate the model #
        ######################

        model.eval()

        val_targets = []
        val_outputs = []

        with torch.no_grad():
            for batch_idx, data in enumerate(validation_loader, 0):
                ids = data['input_ids'].to(device, dtype=torch.long)
                mask = data['attention_mask'].to(device, dtype=torch.long)
                token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
                targets = data['targets'].to(device, dtype=torch.float)
                outputs = model(ids, mask, token_type_ids)

                loss = loss_fn(outputs, targets)
                # Same incremental mean as for the training loss.
                valid_loss = valid_loss + ((1 / (batch_idx + 1)) * (loss.item() - valid_loss))
                val_targets.extend(targets.cpu().numpy().tolist())
                val_outputs.extend(torch.sigmoid(outputs).cpu().numpy().tolist())

        print(f'############# Epoch {epoch}: Validation End     #############')

        # train_loss and valid_loss already are per-batch means, so they are
        # NOT divided by the loader length again; doing so shrank the reported
        # losses by a factor of len(loader).

        # Threshold the sigmoid outputs at 0.5 and average the per-tag F1.
        val_outputs = np.array(val_outputs)
        val_targets = np.array(val_targets)
        pred = (val_outputs > 0.5).astype(float)  # np.float was removed in NumPy 1.24
        n_tags = val_targets.shape[1]
        total_f1 = 0
        for i in range(n_tags):
            total_f1 += f1_score(val_targets[:, i], pred[:, i])
        total_f1 /= n_tags

        train_loss_plot.append(train_loss)
        valid_loss_plot.append(valid_loss)
        valid_acc_plot.append(total_f1)

        print(f'Epoch: {epoch} \tAvgerage Training Loss: {train_loss:.6f} \tAverage Validation Loss: {valid_loss:.6f} \tAverage Accuracy(F1 score): {total_f1:.6f}')

        # create checkpoint variable and add important data
        checkpoint = {
            'epoch': epoch + 1,
            'valid_loss_min': valid_loss,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }

        # The checkpoint is written once per epoch; save_ckp also copies it to
        # best_model_path when the validation loss has not increased.
        is_best = valid_loss <= valid_loss_min
        if is_best:
            print(f'Validation loss decreased ({valid_loss_min:.6f} --> {valid_loss:.6f}).  Saving model ...')
        save_ckp(checkpoint, is_best, checkpoint_path, best_model_path)
        if is_best:
            valid_loss_min = valid_loss

        print(f'############# Epoch {epoch}  Done   #############\n')

    return model

In [None]:
# Output files: the checkpoint rewritten every epoch, and the copy kept for
# the epoch with the lowest validation loss.
ckpt_path = "curr_ckpt"
best_model_path = "best_model.pt"

In [None]:
# train_model returns the model object it was given, so `trained_model` and
# `model` refer to the same network after this call.
trained_model = train_model(EPOCHS, train_data_loader, val_data_loader, model, optimizer, ckpt_path, best_model_path)

In [None]:
from matplotlib import pyplot as plt

In [None]:
# Per-epoch training and validation loss collected by train_model.
plt.plot(train_loss_plot, label="train loss")
plt.plot(valid_loss_plot, label="validation loss")
plt.ylabel("Loss")
plt.xlabel("Epochs")
# The label= arguments above are only drawn once a legend is requested;
# without it the two curves could not be told apart.
plt.legend()
plt.show()

In [None]:
# Mean per-tag validation F1 recorded by train_model after each epoch.
plt.xlabel("Epochs")
plt.ylabel("Validation accuracy (F1 score)")
plt.plot(valid_acc_plot)
plt.show()

In [None]:
# Score every test problem with the model as it stands after the last epoch.
model.eval()  # loop-invariant, so it is set once instead of once per sample
pred = []
with torch.no_grad():
    for test_case in range(len(test_df)):
        # Same clean-up CustomDataset applies to the training text: str()
        # copes with a missing (NaN) statement, split/join collapses
        # whitespace runs.
        example = " ".join(str(test_df['text'].iloc[test_case]).split())
        encodings = tokenizer.encode_plus(
            example,
            None,
            add_special_tokens=True,
            max_length=MAX_LEN,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        input_ids = encodings['input_ids'].to(device, dtype=torch.long)
        attention_mask = encodings['attention_mask'].to(device, dtype=torch.long)
        token_type_ids = encodings['token_type_ids'].to(device, dtype=torch.long)
        output = model(input_ids, attention_mask, token_type_ids)
        # One row of per-tag probabilities for this problem.
        final_output = torch.sigmoid(output).cpu().numpy().tolist()
        pred.append(final_output[0])

In [None]:
# Threshold the probabilities at 0.5. The builtin float replaces np.float,
# an alias that was deprecated in NumPy 1.20 and removed in 1.24 (same
# float64 dtype).
pred = (np.array(pred) > 0.5).astype(float)
answer = np.array(test_df[target_list]).astype(float)

In [None]:
# Per-tag F1 on the test split, followed by their unweighted mean.
per_tag_f1 = []
for column, tag in enumerate(target_list):
    tag_f1_score = f1_score(answer[:, column], pred[:, column])
    per_tag_f1.append(tag_f1_score)
    print(f"F1 score of {tag} tag: {tag_f1_score}")
test_f1 = sum(per_tag_f1) / len(target_list)
print(f"Test accuracy: {test_f1}")

In [None]:
example = """
Given a list of D digits and an integer K, we consider all different ways to permute these digits
into a D-digit decimal number. The target of this problem is to find a permutation X, such
that when X is divided by K, the remainder (between 0 and K − 1) is the largest among
all other permutations. If there are more than one possible permutation, output the largest
permutation.
For instance, suppose that we have D = 3 digits, and they are respectively 1, 2, 3.
1. If K = 1, then we see that every permutation will give a remainder 0 when divided by K,
and 321 is thus the desired answer, as it is the largest permutation among all.
2. If K = 10, then both 123 and 213 will give a remainder 3 when divided by K, and this is
the largest possible remainder in this case. Consequently, the desired output is 213 since
it is a larger permutation.
3. If K = 100, then the largest remainder we can get is 32, when the permutation is 132.
Input Format
The first line of the input contains a positive integer D followed by a positive integer K. Then,
the next line contains D digits, each digit d has value 1 ≤ d ≤ 9 and is separated from the next
one by a space.
Output Format
Output the largest permutation that gives the largest remainder in a single line.
Technical Specification
∙ 1 ≤ D ≤ 16
∙ 1 ≤ K ≤ 200
∙ 1 ≤ d ≤ 9
"""
# Encode the sample statement with the same encode_plus options used for the
# datasets (fixed length, padded/truncated to MAX_LEN).
encodings = tokenizer.encode_plus(
    example, None,
    add_special_tokens=True, max_length=MAX_LEN, padding='max_length',
    truncation=True, return_token_type_ids=True, return_attention_mask=True,
    return_tensors='pt',
)
model.eval()
with torch.no_grad():
    input_ids, attention_mask, token_type_ids = (
        encodings[key].to(device, dtype=torch.long)
        for key in ('input_ids', 'attention_mask', 'token_type_ids')
    )
    output = model(input_ids, attention_mask, token_type_ids)
    final_output = torch.sigmoid(output).cpu().detach().numpy().tolist()
    print(f"Tags for the problem: ")
    # Report every tag whose predicted probability reaches 0.5.
    for tag_name, prob in zip(target_list, final_output[0]):
        if prob < 0.5:
            continue
        print(f"{tag_name}({round(prob*100, 2)}%)", end=' ')
    print()

In [None]:
# Rebuild the network and restore the weights of the best validation epoch.
best_model = BERTClass()
best_model.to(device)
# The optimizer must wrap best_model's own parameters. It was previously
# built over `model.parameters()`, so the restored optimizer state was bound
# to the other network and stepping it would never have updated best_model.
best_optimize = torch.optim.Adam(params=best_model.parameters(), lr=LEARNING_RATE)
best_model, best_optimize, best_epoch = load_ckp("best_model.pt", best_model, best_optimize)

In [None]:
# Score every test problem with the restored best-validation model.
best_model.eval()  # loop-invariant, so it is set once instead of once per sample
pred = []
with torch.no_grad():
    for test_case in range(len(test_df)):
        # Same clean-up CustomDataset applies to the training text: str()
        # copes with a missing (NaN) statement, split/join collapses
        # whitespace runs.
        example = " ".join(str(test_df['text'].iloc[test_case]).split())
        encodings = tokenizer.encode_plus(
            example,
            None,
            add_special_tokens=True,
            max_length=MAX_LEN,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        input_ids = encodings['input_ids'].to(device, dtype=torch.long)
        attention_mask = encodings['attention_mask'].to(device, dtype=torch.long)
        token_type_ids = encodings['token_type_ids'].to(device, dtype=torch.long)
        output = best_model(input_ids, attention_mask, token_type_ids)
        final_output = torch.sigmoid(output).cpu().numpy().tolist()
        pred.append(final_output[0])

# Threshold at 0.5. The builtin float replaces np.float, an alias removed in
# NumPy 1.24 (same float64 dtype).
pred = (np.array(pred) > 0.5).astype(float)
answer = np.array(test_df[target_list]).astype(float)

# Per-tag F1 on the test split and their unweighted mean.
test_f1 = 0
for i, tag in enumerate(target_list):
    tag_f1_score = f1_score(answer[:, i], pred[:, i])
    test_f1 += tag_f1_score
    print(f"F1 score of {tag} tag: {tag_f1_score}")
test_f1 /= len(target_list)
print(f"Test accuracy: {test_f1}")

In [None]:
example = """
Given a list of D digits and an integer K, we consider all different ways to permute these digits
into a D-digit decimal number. The target of this problem is to find a permutation X, such
that when X is divided by K, the remainder (between 0 and K − 1) is the largest among
all other permutations. If there are more than one possible permutation, output the largest
permutation.
For instance, suppose that we have D = 3 digits, and they are respectively 1, 2, 3.
1. If K = 1, then we see that every permutation will give a remainder 0 when divided by K,
and 321 is thus the desired answer, as it is the largest permutation among all.
2. If K = 10, then both 123 and 213 will give a remainder 3 when divided by K, and this is
the largest possible remainder in this case. Consequently, the desired output is 213 since
it is a larger permutation.
3. If K = 100, then the largest remainder we can get is 32, when the permutation is 132.
Input Format
The first line of the input contains a positive integer D followed by a positive integer K. Then,
the next line contains D digits, each digit d has value 1 ≤ d ≤ 9 and is separated from the next
one by a space.
Output Format
Output the largest permutation that gives the largest remainder in a single line.
Technical Specification
∙ 1 ≤ D ≤ 16
∙ 1 ≤ K ≤ 200
∙ 1 ≤ d ≤ 9
"""
# Encode the sample statement with the same encode_plus options used for the
# datasets (fixed length, padded/truncated to MAX_LEN).
encodings = tokenizer.encode_plus(
    example, None,
    add_special_tokens=True, max_length=MAX_LEN, padding='max_length',
    truncation=True, return_token_type_ids=True, return_attention_mask=True,
    return_tensors='pt',
)
best_model.eval()
with torch.no_grad():
    input_ids, attention_mask, token_type_ids = (
        encodings[key].to(device, dtype=torch.long)
        for key in ('input_ids', 'attention_mask', 'token_type_ids')
    )
    output = best_model(input_ids, attention_mask, token_type_ids)
    final_output = torch.sigmoid(output).cpu().detach().numpy().tolist()
    print(f"Tags for the problem: ")
    # Report every tag whose predicted probability reaches 0.5.
    for tag_name, prob in zip(target_list, final_output[0]):
        if prob < 0.5:
            continue
        print(f"{tag_name}({round(prob*100, 2)}%)", end=' ')
    print()

In [None]:
# A deliberately short statement to probe the restored best model.
example = """
find the probability of rolling a die
"""
# Encode it with the same encode_plus options used for the datasets (fixed
# length, padded/truncated to MAX_LEN).
encodings = tokenizer.encode_plus(
    example, None,
    add_special_tokens=True, max_length=MAX_LEN, padding='max_length',
    truncation=True, return_token_type_ids=True, return_attention_mask=True,
    return_tensors='pt',
)
best_model.eval()
with torch.no_grad():
    input_ids, attention_mask, token_type_ids = (
        encodings[key].to(device, dtype=torch.long)
        for key in ('input_ids', 'attention_mask', 'token_type_ids')
    )
    output = best_model(input_ids, attention_mask, token_type_ids)
    final_output = torch.sigmoid(output).cpu().detach().numpy().tolist()
    print(f"Tags for the problem: ")
    # Report every tag whose predicted probability reaches 0.5.
    for tag_name, prob in zip(target_list, final_output[0]):
        if prob < 0.5:
            continue
        print(f"{tag_name}({round(prob*100, 2)}%)", end=' ')
    print()