In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
import pandas as pd
import numpy as np
import spacy
# Load spaCy's 'en_core_web_sm' pipeline into `nlp`.
# NOTE(review): `nlp` is not referenced anywhere else in the visible notebook — confirm it is still needed.
nlp = spacy.load('en_core_web_sm')
import torch
from transformers import BertTokenizer
from transformers import RobertaTokenizer
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertModel,RobertaModel
from transformers import AdamW, get_linear_schedule_with_warmup

In [24]:
# Choose the compute device once; later cells move the model and batches to `device`.
if not torch.cuda.is_available():
    # CPU fallback when CUDA is not usable.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

In [4]:
# Load the cleaned resume corpus; the first CSV column is used as the index.
df = pd.read_csv("/kaggle/input/resumecorpus-cleaned/finale.csv", index_col=0)

In [51]:
# Show (rows, columns) of the loaded frame.
df.shape

In [5]:
# Preview the first rows of the loaded frame.
df.head()

In [6]:
# Label columns: the ten columns at positions 2..11 of `df`, used below as the multi-label targets.
classes = df.columns[2:12]

In [7]:
# Pretrained tokenizer for "bert-base-uncased" with lower-casing enabled; used by preprocessing_for_bert.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased",do_lower_case=True)

In [10]:
def preprocessing_for_bert(data, max_len=None):
    """Tokenize texts with the notebook-level ``tokenizer`` into fixed-length tensors.

    Parameters
    ----------
    data : iterable of str
        Texts to encode, one sequence per element.
    max_len : int, optional
        Length every sequence is padded or truncated to. When omitted, the
        notebook-level ``MAX_LEN`` constant is looked up at call time, so it
        must be defined before the first call.

    Returns
    -------
    tuple of torch.Tensor
        ``(input_ids, attention_masks)``; for non-empty input each has one row
        per text and ``max_len`` columns.
    """
    if max_len is None:
        max_len = MAX_LEN

    input_ids = []
    attention_masks = []

    for sent in data:
        encoded_sent = tokenizer.encode_plus(
            text=sent,                     # text to encode
            add_special_tokens=True,       # add `[CLS]` and `[SEP]`
            max_length=max_len,            # target length for padding/truncation
            padding='max_length',          # replaces the deprecated pad_to_max_length=True
            truncation=True,               # truncate explicitly instead of relying on the implicit default
            return_attention_mask=True     # also return the attention mask
        )
        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))

    # Explicit integer dtype so that empty input does not yield a float tensor.
    input_ids = torch.tensor(input_ids, dtype=torch.long)
    attention_masks = torch.tensor(attention_masks, dtype=torch.long)

    return input_ids, attention_masks

In [12]:
# Collect every entry of the `Ents` column and record the length of each one.
# NOTE(review): if the entries are strings, len() counts characters, not BERT tokens,
# so these values are not directly comparable with MAX_LEN — confirm the intent.
all_text = np.concatenate([df.Ents])
len_sent = [len(sent) for sent in all_text]

In [13]:
# Mean of the per-entry lengths computed in `len_sent`.
avg_len = np.mean(len_sent)
print('Avg length: ',avg_len)

In [14]:
from sklearn.model_selection import train_test_split
# Inputs are the `Ents` column; targets are the label columns selected in `classes`.
X = df.Ents.values
y = df[classes].values
# Hold out 10% of the rows as a test split; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=2022)

In [98]:
# Show the label vector of the first held-out example.
y_test[0]

In [15]:
# Sequence length used for padding/truncation; read as a global by preprocessing_for_bert.
MAX_LEN = 512

# Sanity check: encode the first document and print it next to its token ids.
token_ids = list(preprocessing_for_bert([X[0]])[0].squeeze().numpy())
print('Original: ',X[0])
print('Token IDs: ',token_ids)

In [16]:
# Tokenize both splits into (input_ids, attention_mask) tensors.
train_inputs, train_masks = preprocessing_for_bert(X_train)
test_inputs, test_masks = preprocessing_for_bert(X_test)

In [17]:
# Label arrays as tensors so they can be bundled with the inputs.
train_labels = torch.tensor(y_train)
test_labels = torch.tensor(y_test)

# Number of examples per batch for both loaders below.
batch_size = 8

# Training loader: batches are drawn in random order via RandomSampler.
train_data = TensorDataset(train_inputs,train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Held-out (test) loader: SequentialSampler keeps rows in their original order.
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [18]:
class BertClassifier(nn.Module):
    """Pretrained BERT encoder topped with a two-layer head that emits raw logits."""

    def __init__(self, freeze_bert=False):
        """Build the encoder and the classification head.

        Parameters
        ----------
        freeze_bert : bool
            When true, gradients are disabled for every encoder parameter so
            that only the head is trained.
        """
        super().__init__()

        # Encoder hidden size, head hidden size, number of output labels.
        encoder_width, head_width, n_outputs = 768, 30, 10

        self.bert = BertModel.from_pretrained("bert-base-uncased")

        head_layers = [
            nn.Linear(encoder_width, head_width),
            nn.ReLU(),
            nn.Linear(head_width, n_outputs),
        ]
        self.classifier = nn.Sequential(*head_layers)

        # Kept as an attribute; forward() returns logits and does not apply it.
        self.sigmoid = nn.Sigmoid()

        if freeze_bert:
            for weight in self.bert.parameters():
                weight.requires_grad = False

    def forward(self, input_ids, attention_mask):
        """Return the head's logits computed from the first-token (`[CLS]`) hidden state."""
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # First element of the encoder output, sliced at sequence position 0.
        cls_vector = encoder_out[0][:, 0, :]

        return self.classifier(cls_vector)

In [22]:
def initialize_model(epochs=20, dataloader=None):
    """Create a fresh classifier together with its optimizer and LR scheduler.

    Parameters
    ----------
    epochs : int
        Number of epochs the model will be trained for; together with the
        loader length it fixes the total number of scheduler steps.
    dataloader : sized iterable, optional
        The loader that will actually be used for training. Its length
        determines the schedule. Defaults to the notebook-level
        ``train_dataloader``; pass the real loader when training on a
        different one, otherwise the learning rate decays over the wrong
        number of steps.

    Returns
    -------
    tuple
        ``(bert_classifier, optimizer, scheduler)``.
    """
    if dataloader is None:
        dataloader = train_dataloader

    bert_classifier = BertClassifier(freeze_bert=False)
    bert_classifier.to(device)

    # NOTE(review): `transformers.AdamW` is deprecated upstream in favour of
    # `torch.optim.AdamW`; the two differ in defaults, so it is left unchanged here.
    optimizer = AdamW(bert_classifier.parameters(),
                      lr=5e-5,
                      eps=1e-8
                      )

    # One scheduler step is taken per training batch.
    total_steps = len(dataloader) * epochs

    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=total_steps)
    return bert_classifier, optimizer, scheduler

In [19]:
from tqdm import tqdm
import random
import time

In [27]:
# Loss applied to the model's raw logits in train() and evaluate().
loss_fn = nn.BCEWithLogitsLoss()

def set_seed(seed_value=42):
    """Seed the Python, NumPy and PyTorch (CPU and all CUDA devices) generators."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed_value)

def train(model, train_dataloader, test_dataloader=None, epochs=20, evaluation=False, log_every=50000):
    """Train ``model`` on ``train_dataloader`` and print a progress table.

    The optimizer and LR scheduler are not arguments: the loop steps the
    notebook-level ``optimizer`` and ``scheduler`` and uses the notebook-level
    ``loss_fn`` and ``device``, so those must exist (and belong to ``model``)
    before this function is called.

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(input_ids, attention_mask)``; its output is passed
        to ``loss_fn`` together with the float-cast labels.
    train_dataloader : sized iterable
        Yields ``(input_ids, attention_mask, labels)`` batches.
    test_dataloader : iterable, optional
        Passed to ``evaluate`` after every epoch when ``evaluation`` is true.
    epochs : int
        Number of passes over ``train_dataloader``.
    evaluation : bool
        Whether to run ``evaluate`` at the end of each epoch.
    log_every : int
        A progress row is printed every ``log_every`` steps (step 0 excluded)
        and always on the last step of an epoch. Must be positive.

    Raises
    ------
    ValueError
        If ``evaluation`` is true but no ``test_dataloader`` was given. The
        check runs before training so a full epoch is not wasted.
    """
    if evaluation and test_dataloader is None:
        raise ValueError("evaluation=True requires a test_dataloader")

    print("Start training...\n")
    for epoch_i in range(epochs):
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Test Loss':^10} | {'Test Acc':^9} | {'Elapsed':^9}")
        print("-"*70)

        t0_epoch, t0_batch = time.time(), time.time()

        # total_loss spans the epoch; batch_loss/batch_counts reset after every printed row.
        total_loss, batch_loss, batch_counts = 0, 0, 0

        model.train()

        for step, batch in enumerate(train_dataloader):
            batch_counts += 1

            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            # Clear gradients left over from the previous step.
            model.zero_grad()

            logits = model(b_input_ids, b_attn_mask)

            loss = loss_fn(logits, b_labels.float())
            batch_loss += loss.item()
            total_loss += loss.item()

            loss.backward()

            # Clip the global gradient norm to 1.0 before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Notebook-level optimizer and scheduler; the schedule advances once per batch.
            optimizer.step()
            scheduler.step()

            if (step % log_every == 0 and step != 0) or (step == len(train_dataloader) - 1):
                # Time elapsed since the previous progress row (or the epoch start).
                time_elapsed = time.time() - t0_batch

                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        avg_train_loss = total_loss / len(train_dataloader)

        print("-"*70)

        if evaluation:

            test_loss, test_accuracy = evaluate(model, test_dataloader)

            time_elapsed = time.time() - t0_epoch

            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {test_loss:^10.6f} | {test_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print("-"*70)
        print("\n")

    print("Training complete!!!")


def evaluate(model, test_dataloader):
    """Compute the mean loss and thresholded accuracy of ``model`` over a loader.

    Uses the notebook-level ``loss_fn``, ``device`` and ``accuracy_thresh``.
    Per-batch values are weighted by batch size so that a smaller final batch
    does not skew the result (a plain mean of per-batch means would).
    This is exact when ``loss_fn`` averages over the batch, which is assumed
    here — confirm if ``loss_fn`` is changed to another reduction.

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(input_ids, attention_mask)`` and expected to return logits.
    test_dataloader : iterable
        Yields ``(input_ids, attention_mask, labels)`` batches.

    Returns
    -------
    tuple of float
        ``(test_loss, test_accuracy)``; both are NaN when the loader is empty.
    """
    model.eval()

    loss_sum, accuracy_sum, n_examples = 0.0, 0.0, 0

    for batch in test_dataloader:
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        # No gradients are needed for evaluation.
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)
            loss = loss_fn(logits, b_labels.float())

        # Take the label count from the logits instead of hard-coding it.
        n_labels = logits.size(-1)
        accuracy = accuracy_thresh(logits.view(-1, n_labels), b_labels.view(-1, n_labels))

        batch_len = b_labels.size(0)
        loss_sum += loss.item() * batch_len
        accuracy_sum += accuracy * batch_len
        n_examples += batch_len

    if n_examples == 0:
        return float("nan"), float("nan")

    test_loss = loss_sum / n_examples
    test_accuracy = accuracy_sum / n_examples

    return test_loss, test_accuracy

def accuracy_thresh(y_pred, y_true, thresh:float=0.5, sigmoid:bool=True):
    """Return the fraction of entries where the thresholded prediction equals the label.

    ``y_pred`` is passed through a sigmoid first unless ``sigmoid`` is false;
    an entry counts as positive when its score is strictly greater than ``thresh``.
    """
    scores = y_pred.sigmoid() if sigmoid else y_pred
    predicted_positive = scores > thresh
    matches = predicted_positive == y_true.byte()
    return matches.float().mean().item()

In [28]:
# Seed all RNGs, then build and train a fresh model for one epoch with per-epoch evaluation.
# The epoch count appears twice on purpose: initialize_model uses it to size the LR schedule,
# train uses it for the loop, so the two values must match.
set_seed(42)    
bert_classifier, optimizer, scheduler = initialize_model(epochs=1)
train(bert_classifier, train_dataloader, test_dataloader, epochs=1, evaluation=True)

In [30]:
def bert_predict(model, test_dataloader):
    """Run ``model`` over ``test_dataloader`` and return per-label probabilities.

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(input_ids, attention_mask)`` and expected to return logits.
    test_dataloader : iterable
        Yields batches whose first two elements are ``input_ids`` and
        ``attention_mask``; any further elements (e.g. labels) are ignored.

    Returns
    -------
    numpy.ndarray
        Sigmoid of the logits, with the batches concatenated along the first
        axis in the order the loader produced them.
    """
    model.eval()

    all_logits = []

    for batch in test_dataloader:
        # Slice before the transfer so unused tensors (labels) are not moved to the device.
        b_input_ids, b_attn_mask = tuple(t.to(device) for t in batch[:2])

        # Compute logits without tracking gradients.
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)
        all_logits.append(logits)

    # Concatenate logits from each batch.
    all_logits = torch.cat(all_logits, dim=0)

    # Sigmoid (not softmax): every label gets its own independent probability.
    probs = all_logits.sigmoid().cpu().numpy()

    return probs

In [82]:
# Predict on the training split in its original row order.
# train_dataloader draws batches through a RandomSampler, so its predictions could not be
# lined up with y_train; a sequential loader over the same dataset is used instead.
train_eval_dataloader = DataLoader(train_data, sampler=SequentialSampler(train_data), batch_size=batch_size)
probs = bert_predict(bert_classifier, train_eval_dataloader)

In [83]:
# Show the shape of the training-split probability array.
probs.shape

In [86]:
# Wrap the probabilities in a DataFrame with one named column per job class.
# NOTE(review): the column order is assumed to match `classes` (df.columns[2:12]) — confirm.
submission = pd.DataFrame(probs,columns=['Software_Developer', 'Database_Administrator', 'Systems_Administrator',
       'Project_manager', 'Web_Developer', 'Network_Administrator',
       'Security_Analyst', 'Python_Developer', 'Java_Developer',
       'Front_End_Developer'])

In [93]:
# Write the current `submission` frame to train.csv in the working directory, without the index column.
submission.to_csv("./train.csv",index=False)

In [99]:
# Show the first five training label vectors.
y_train[:5]

In [92]:
# Display the `submission` frame.
submission

In [90]:
# Show the first five held-out label vectors.
y_test[:5]

In [34]:
# Retrain from scratch on the training and test splits combined.
# NOTE(review): test_data is part of this training set, so predictions this model later
# makes on X_test are not a held-out evaluation.
full_train_data = torch.utils.data.ConcatDataset([train_data, test_data])
full_train_sampler = RandomSampler(full_train_data)
full_train_dataloader = DataLoader(full_train_data, sampler=full_train_sampler, batch_size=batch_size)

set_seed(42)
bert_classifier, optimizer, scheduler = initialize_model(epochs=4)
# initialize_model sizes the schedule from train_dataloader by default, which has fewer
# batches than the combined loader; the learning rate would reach zero before training
# finishes. Rebuild the scheduler for the loader that is actually used.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(full_train_dataloader) * 4)
train(bert_classifier, full_train_dataloader, epochs=4)

In [57]:
# Save only the model's state_dict (not the whole module object) to bert.pt.
torch.save(bert_classifier.state_dict(), "bert.pt")

In [45]:
# Re-tokenize the held-out texts.
# Note: this rebinds test_inputs, test_masks and test_dataloader from the earlier cell.
print('Tokenizing data...')
test_inputs, test_masks = preprocessing_for_bert(X_test)

# Label-free loader for prediction: rows stay in order (shuffle=False), batch size 16.
test_dataset = TensorDataset(test_inputs, test_masks)
test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=16)

In [46]:
# Per-label probabilities for the held-out texts, in the same row order as X_test.
# NOTE(review): per the cell order above, bert_classifier was last trained on a dataset
# that includes these rows — confirm before treating this as an evaluation.
probs2 = bert_predict(bert_classifier, test_dataloader)

In [47]:
# Check the shape of the predictions just computed for the held-out texts
# (`probs` holds the earlier training-split predictions).
probs2.shape

In [100]:
# Tabulate the held-out predictions with one named column per job class.
# NOTE(review): the column order is assumed to match `classes` (df.columns[2:12]) — confirm.
submission = pd.DataFrame(probs2,columns=['Software_Developer', 'Database_Administrator', 'Systems_Administrator',
       'Project_manager', 'Web_Developer', 'Network_Administrator',
       'Security_Analyst', 'Python_Developer', 'Java_Developer',
       'Front_End_Developer'])

In [53]:
# Show (rows, columns) of the `submission` frame.
submission.shape

In [101]:
# Write the current `submission` frame to test.csv in the working directory, without the index column.
submission.to_csv("./test.csv",index=False)

In [102]:
# Preview the first rows of the `submission` frame.
submission.head()

In [104]:
# Show (rows, columns) of the `submission` frame again after saving.
submission.shape

In [73]:
# Show the first five held-out label vectors for comparison with the predictions.
y_test[:5]

In [74]:
# Show the shape of the held-out label array.
y_test.shape

In [105]:
import numpy as np
# Write the held-out label array to a text file, formatting every value with "%f".
np.savetxt("y-test-kaggle.txt", y_test, fmt="%f")