In [1]:
import random
import sys
import os

from joblib import load
from copy import copy
import seaborn as sns

from sklearn.model_selection import train_test_split

from tqdm.auto import tqdm
tqdm.pandas()
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader, Subset
from transformers import AutoModelForTokenClassification, BertTokenizerFast, BertConfig, AutoModel, AutoTokenizer, RobertaTokenizerFast, BertForMaskedLM
from transformers import get_cosine_schedule_with_warmup

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from dataclasses import dataclass


@dataclass
class TrainingConfig:
    # Model
    model_name = "ai-forever/ruRoberta-large"
    
    # Training
    batch_size = 256
    epochs = 20
    learning_rate = 5e-5
    lr_warmup_steps = 500

    # Accelerator
    gradient_accumulation_steps = 1
    mixed_precision = 'fp16'  # `no` for float32, `fp16` for automatic mixed precision

    device = "cuda"
    random_state = 42 


config = TrainingConfig()

In [3]:
def seed_everything(seed: int,
                    use_deterministic_algos: bool = False) -> None:
    
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.use_deterministic_algorithms(use_deterministic_algos)
    random.seed(seed)
    

seed_everything(config.random_state)

In [4]:
model_config = BertConfig.from_pretrained(config.model_name)
model = BertForMaskedLM(model_config)
model.to(config.device)
model.eval()

You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-05, 

In [5]:
from transformers import AutoModelForSeq2SeqLM
from peft import get_peft_model, LoraConfig, TaskType

peft_config = LoraConfig(
    task_type=TaskType.TOKEN_CLS, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 786,432 || all params: 356,198,489 || trainable%: 0.22078476587810567


In [6]:
tokenizer = RobertaTokenizerFast.from_pretrained(config.model_name)

# Datasets

In [7]:
import os

from tqdm.auto import tqdm
import textract


class ResumeParser:
    def __init__(self):
        pass
        
    def read_pdf(self, filename: str) -> str:
        text = textract.process(filename).decode("utf-8")
        return text

    def read_docx(self, filename: str) -> str:
        text = textract.process(filename).decode("utf-8")
        return text

    def read_doc(self, filename: str) -> str:
        text = textract.process(filename).decode("utf-8")
        return text

    def parse(self, filename: str) -> str:
        if filename.endswith('.pdf'):
            return self.read_pdf(filename)
            
        elif filename.endswith('.docx'):
            return self.read_docx(filename)

        elif filename.endswith('.doc'):
            return self.read_doc(filename)

        elif filename.endswith('.txt'):
            return Error()

        elif filename.endswith('.tex'):
            raise Error()


parser = ResumeParser()

## Dataset 1

In [8]:
dataset1 = pd.read_csv("data/pretrain_data/UpdatedResumeDataSet.csv")
dataset1

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."
...,...,...
957,Testing,Computer Skills: â¢ Proficient in MS office (...
958,Testing,â Willingness to accept the challenges. â ...
959,Testing,"PERSONAL SKILLS â¢ Quick learner, â¢ Eagerne..."
960,Testing,COMPUTER SKILLS & SOFTWARE KNOWLEDGE MS-Power ...


In [9]:
resumes_1 = dataset1["Resume"].tolist()

## Dataset 2

In [10]:
resumes_2 = []

dataset2_path = "data/pretrain_data/data/data/"
categories = os.listdir(dataset2_path)

for cat in categories:
    for file in tqdm(os.listdir(f"{dataset2_path}/{cat}")):
        resumes_2.append(parser.parse(f"{dataset2_path}/{cat}/{file}"))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 115/115 [00:02<00:00, 48.16it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 116/116 [00:02<00:00, 54.60it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 102/102 [00:01<00:00, 52.29it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████

## Dataset 3 (Given)

In [11]:
DATA = "data/Резюме для 1 кейса Хакатона"

resumes_3 = []
files = os.listdir(DATA)

for file in tqdm(files):
    resumes_3.append(parser.parse(f"{DATA}/{file}"))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 214/214 [00:02<00:00, 81.95it/s]


In [12]:
resumes_3

["Victor Babenko | FRONT-END DEVELOPER\n\nBerlin, Germany | +49 000 4255723 | a.baaanko.e@gmail.com\n\n\n\n\n\nEXPERIENCE\n\nShoop Germany GmbH\n\nFront-End Developer (Angular, Ionic)\n\n\n\n\n\n\nBerlin, Germany\n\nAugust 2020 – Present\n\n\n\n\n\nCurrently, I am working as a part of the Core Development Team, participating in the development of Web Application, Ionic Mobile Application, and Support System.\n\n\t\tImplement complex features as Two-Factor-Auth, Onboarding Flow, Support Ticket System\n\n\t\tIncreased E2E and Unit tests coverage by 25%\n\n\t\tMigrated apps to the strict mode, that improved DX and project’s overall reliability\n\n\t\tParticipated in constant project’s architecture refactoring\n\n\t\tResponsible for making releases, including App Store and Google Play\n\n\n\n\n\nShiji Deutschland GmbH\n\nFront-End Developer & Report Engine Expert\n\n(ReactJS, EmberJS, AngularJS)\n\n\nBerlin, Germany\n\nMay 2018 – August 2020\n\n\n\n\n\nWorked on the development of one of t

## Dataset 4 (given other)

In [13]:
dataset4 = pd.read_csv("data/resumes_train.csv")[["text"]]
dataset4

Unnamed: 0,text
0,Клим Тетерина 1991-05-23 Россия Москва Личные ...
1,Алиса Ситникова 1990-07-18 Россия Нижний Новго...
2,Розалина Андреев 1990-01-01 Россия Санкт-Петер...
3,Антон Кудрявцева 1990-01-01 Россия Москва Общи...
4,Александра Панова 1995-01-01 Россия Москва О с...
...,...
108,Кристина Яковлева 1995-06-17 Россия Москва Non...
109,Аполлон Белякова 1987-01-01 Россия Мураши Боле...
110,Клавдия Пономарёв 1984-11-22 Россия Москва В с...
111,Фёдор Харитонова 1993-06-20 Турция Анталия Пои...


In [14]:
resumes_4 = dataset1["Resume"].tolist()

## Dataset 5

In [15]:
DATA = "data/pretrain_data/resume_corpus"

resumes_5 = []
# files = os.listdir(DATA)

# for file in tqdm(files):
#     if file.endswith(".txt"):
#         with open(f"{DATA}/{file}", "rb") as fin:
#             text = "\n".join([el.decode("utf-8") for el in fin.readlines()])
            
#         resumes_5.append(text)

## Dataset 6

In [16]:
from datasets import load_dataset

resumes_6 = pd.DataFrame(load_dataset("Lakshmi12/Resume_Dataset", split="train"))['Resume_str'].tolist()
resumes_6

["         HR ADMINISTRATOR/MARKETING ASSOCIATE\n\nHR ADMINISTRATOR       Summary     Dedicated Customer Service Manager with 15+ years of experience in Hospitality and Customer Service Management.   Respected builder and leader of customer-focused teams; strives to instill a shared, enthusiastic commitment to customer service.         Highlights         Focused on customer satisfaction  Team management  Marketing savvy  Conflict resolution techniques     Training and development  Skilled multi-tasker  Client relations specialist           Accomplishments      Missouri DOT Supervisor Training Certification  Certified by IHG in Customer Loyalty and Marketing by Segment   Hilton Worldwide General Manager Training Certification  Accomplished Trainer for cross server hospitality systems such as    Hilton OnQ  ,   Micros    Opera PMS   , Fidelio    OPERA    Reservation System (ORS) ,   Holidex    Completed courses and seminars in customer service, sales strategies, inventory control, loss p

## Dataset 7

In [17]:
resumes_7 = pd.DataFrame(load_dataset("ganchengguang/resume_seven_class", split="train"))["text"].tolist()
resumes_7

['Exp\tName: Abiral Pandey',
 'PI\tEmail:',
 'PI\tPhone: 940-242-3303',
 'PI\tCurrent Location: Woonsocket, Rhode Island',
 'PI\tVisa Status: US Citizen',
 'Sum\tSUMMARY:',
 'Sum\tDynamic individual with 6 years of software development experience in design, development, deployment, maintenance, production and support of web - based and Client-Server business applications using OOP and Java/J2EE technologies.',
 'Sum\tExposure to all phases of Software Development Life Cycle(SDLC) using Agile, RUP, Waterfall.',
 'Sum\tDesigned and developed web UI screen using Angular-JS.',
 'Sum\tDeveloped AngularJS Controllers, Services, filters and directives for various modules in the application.',
 'Sum\tKnowledge on ETL tools like Kettle Pentaho and Microsoft SSIS tools.',
 'Sum\tCreated custom directives, decorators and services using AngularJS to interface with both RESTful and legacy network services also DOM applications.',
 'Sum\tExperience with MVC frameworks like Struts, SPRING and ORM too

## Dataset 8

In [18]:
resumes_8 = pd.DataFrame(load_dataset("ganchengguang/resume-5label-classification", split="train"))["text"].tolist()
resumes_8

Repo card metadata block was not found. Setting CardData to empty.


['meta\tothers\tJitesh Vishwakarma',
 'meta\tothers\tE-mail-Id: - jvishwakarma123@gmail.com',
 'meta\tothers\tContact Number: - 9960902548',
 'header\texperience\tPROFESSIONAL SUMMARY:',
 'content\texperience\t· 4 years of technical experience in implementation, customization, integration and support of business application system.',
 'content\texperience\t· Having Domain Experience in PAYMENT, AUTOMOBILE and HEALTH-CARE.',
 'content\texperience\t· Experienced in developing Web based applications with J2EE, JSP, Servlets, JDBC, Spring, Hibernate.',
 'content\texperience\t· Experience in designing, developing and deploying J2EE application on IBM WebSphere/Web Logic Application Servers, Tomcat, etc.',
 'content\texperience\t· Exposure to AGILE methodologies.',
 'content\texperience\t· Hands on exposure to multiple Application Servers like GLASSFISH, and IBM Web Sphere Server.',
 'content\texperience\t· Expertise in back-end procedure development, for Database Applications using ORACLE a

# Dataset

In [19]:
dataset = resumes_1 + resumes_2 + resumes_3 + resumes_4 + resumes_5 + resumes_6  + resumes_7  + resumes_8
len(dataset)

125777

In [20]:
from datasets import load_dataset, Dataset

df = pd.DataFrame({"text": dataset})
df

Unnamed: 0,text
0,Skills * Programming Languages: Python (pandas...
1,Education Details \r\nMay 2013 to May 2017 B.E...
2,"Areas of Interest Deep Learning, Control Syste..."
3,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,"Education Details \r\n MCA YMCAUST, Faridab..."
...,...
125772,"meta\tknowledge\tLanguages Known : English, Hindi"
125773,meta\tothers\tNationality : Indian
125774,meta\tothers\tP A N K A J M I S H R A 2 @ H O ...
125775,"meta\tothers\t3 / 5 0 2 , G A R D E N E S T A ..."


In [21]:
import re


def remove_keys(input_string): 
    pattern = r'\[(\w+)\s*:\s*([^\]]+)\]' 
    cleaned_string = re.sub(pattern, r'\2', input_string) 
    return cleaned_string

In [22]:
df["text"] = df["text"].progress_apply(remove_keys)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 125777/125777 [00:00<00:00, 592647.53it/s]


In [23]:
dataset = Dataset.from_pandas(df)
dataset

Dataset({
    features: ['text'],
    num_rows: 125777
})

In [25]:
lens = [len(tokenizer.encode_plus(el)["input_ids"]) for el in tqdm(df["text"])]
lens


  0%|                                                                                                                                                                                                                                                                         | 0/125777 [00:00<?, ?it/s][A
  0%|▎                                                                                                                                                                                                                                                            | 148/125777 [00:00<01:25, 1472.74it/s][A
  0%|▌                                                                                                                                                                                                                                                            | 296/125777 [00:00<01:38, 1274.65it/s][A
  0%|▊                                                                                          

[2193,
 600,
 887,
 3377,
 231,
 342,
 806,
 3919,
 846,
 1914,
 2193,
 600,
 887,
 3377,
 231,
 342,
 806,
 3919,
 846,
 1914,
 2193,
 600,
 887,
 3377,
 231,
 342,
 806,
 3919,
 846,
 1914,
 2193,
 600,
 887,
 3377,
 231,
 342,
 806,
 3919,
 846,
 1914,
 217,
 1281,
 97,
 98,
 98,
 91,
 82,
 500,
 253,
 524,
 434,
 217,
 1281,
 97,
 98,
 98,
 91,
 82,
 500,
 253,
 524,
 434,
 217,
 1281,
 97,
 98,
 98,
 91,
 82,
 500,
 253,
 524,
 434,
 217,
 1281,
 97,
 98,
 98,
 91,
 82,
 500,
 253,
 524,
 434,
 826,
 222,
 106,
 399,
 1341,
 394,
 277,
 583,
 279,
 191,
 826,
 222,
 106,
 399,
 1341,
 394,
 277,
 583,
 279,
 191,
 842,
 463,
 3626,
 211,
 292,
 576,
 842,
 463,
 3626,
 211,
 292,
 576,
 842,
 463,
 3626,
 211,
 292,
 576,
 842,
 463,
 3626,
 211,
 292,
 576,
 842,
 463,
 3626,
 211,
 292,
 576,
 842,
 463,
 3626,
 211,
 292,
 576,
 1740,
 1837,
 1740,
 344,
 1719,
 1740,
 1837,
 1740,
 344,
 1719,
 1740,
 1837,
 1740,
 344,
 1719,
 1740,
 1837,
 1740,
 344,
 1719,
 1740,
 1837,
 1

In [None]:
import seaborn as sns

In [9]:
encoded_dataset = dataset.map(
    lambda sample: tokenizer(
        sample['question'] truncation=True, padding='max_length', max_length=256
    ),
    batched=True,
)

Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 2476083/2476083 [02:54<00:00, 14155.57 examples/s]


In [10]:
list(encoded_dataset[0].keys())

['question',
 'answer',
 'relevance',
 '__index_level_0__',
 'input_ids',
 'token_type_ids',
 'attention_mask']

In [11]:
encoded_dataset = encoded_dataset.remove_columns([
 'question',
 'answer',
 '__index_level_0__',
])

list(encoded_dataset[0].keys())

['relevance', 'input_ids', 'token_type_ids', 'attention_mask']

In [12]:
encoded_dataset.set_format(type='torch', columns=['relevance', 'input_ids', 'token_type_ids', 'attention_mask'])

In [13]:
train_inds, test_inds = train_test_split([i for i in range(len(encoded_dataset))], test_size=0.2)

train_dataloader = DataLoader(
    Subset(encoded_dataset, train_inds), 
    batch_size=76,
    shuffle=True
)

val_dataloader = DataLoader(
    Subset(encoded_dataset, test_inds), 
    batch_size=76,
    shuffle=False
)

In [14]:
from torchmetrics.classification import 

ROCAUC = ()

In [15]:
import torch
import torch.nn as nn
import torch.optim as optim


def train_epoch(model, dataloader, optimizer, scheduler, criterion):
    model.train()

    all_probas = []
    all_labels = []

    total_loss = 0

    for batch in tqdm(dataloader):
        input_ids, attention_masks, token_type_ids, labels = batch["input_ids"], batch["attention_mask"], batch["token_type_ids"], batch["relevance"]
        input_ids, attention_masks, token_type_ids, labels = input_ids.to(DEVICE), attention_masks.to(DEVICE), token_type_ids.to(DEVICE), labels.to(DEVICE)
        
        optimizer.zero_grad()
        output = model(
            input_ids=input_ids, 
            attention_mask=attention_masks,
            token_type_ids=token_type_ids
        ).logits

        loss = criterion(output, labels)
        total_loss += loss.item()
        
        accelerator.backward(loss)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        
        probas = output.softmax(dim=-1)
        all_probas.append(probas.detach().cpu())
        all_labels.append(labels.cpu())
        
    metrics = {
        "Loss": total_loss / len(dataloader)
    }
        
    return metrics



def val_epoch(model, dataloader, criterion):
    model.eval()
    
    all_probas = []
    all_labels = []

    total_loss = 0

    for batch in tqdm(dataloader):
        input_ids, attention_masks, token_type_ids, labels = batch["input_ids"], batch["attention_mask"], batch["token_type_ids"], batch["relevance"]
        input_ids, attention_masks, token_type_ids, labels = input_ids.to(DEVICE), attention_masks.to(DEVICE), token_type_ids.to(DEVICE), labels.to(DEVICE)

        with torch.no_grad():
            output = model(
                input_ids=input_ids, 
                attention_mask=attention_masks,
                token_type_ids=token_type_ids
            ).logits

        loss = criterion(output, labels)
        total_loss += loss.item()

        probas = output.softmax(dim=-1)
        all_probas.append(probas.detach().cpu())
        all_labels.append(labels.cpu())
        

    metrics = {
        "Loss": total_loss / len(dataloader)
    }
        
    return metrics


def train_loop(
    project_name,
    model, 
    epochs,
    train_dataloader,
    test_dataloader,
    optimizer,
    scheduler,
    criterion
):
    for i in range(epochs):
        train_loss = train_epoch(model, train_dataloader, optimizer, scheduler, criterion)
        test_loss = val_epoch(model, test_dataloader, criterion)

        print(train_loss, test_loss)

        torch.save(model, f"{project_name}/{i}.pt")

In [16]:
from madgrad import MADGRAD

epochs = 20
optimizer = MADGRAD([
        {"params": model.parameters(), "lr": config.learning_rate},
])
total_steps = int(len(train_dataloader) * epochs)
scheduler = get_cosine_schedule_with_warmup(optimizer, 
                                    num_warmup_steps = cofig.lr_warmup_steps, # Default value in run_glue.py
                                    num_training_steps = total_steps)

train_loop(
    project_name="",
    model=model, 
    epochs=epochs,
    train_dataloader=train_dataloader,
    test_dataloader=val_dataloader,
    optimizer=optimizer,
    scheduler=scheduler,
    criterion=nn.CrossEntropyLoss(),
)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26065/26065 [1:56:39<00:00,  3.72it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6517/6517 [10:42<00:00, 10.14it/s]


{'ROCAUC': tensor(0.7484), 'Loss': 0.5820043456033029} {'ROCAUC': tensor(0.7690), 'Loss': 0.5696628986165735}


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26065/26065 [1:57:44<00:00,  3.69it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6517/6517 [10:45<00:00, 10.09it/s]


{'ROCAUC': tensor(0.7757), 'Loss': 0.5572885225995955} {'ROCAUC': tensor(0.7905), 'Loss': 0.54619039610904}


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26065/26065 [1:55:01<00:00,  3.78it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6517/6517 [09:34<00:00, 11.34it/s]


{'ROCAUC': tensor(0.8260), 'Loss': 0.5038450323406849} {'ROCAUC': tensor(0.8565), 'Loss': 0.4671186056466525}


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26065/26065 [2:00:14<00:00,  3.61it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6517/6517 [10:43<00:00, 10.13it/s]


{'ROCAUC': tensor(0.8759), 'Loss': 0.43368750158550873} {'ROCAUC': tensor(0.8766), 'Loss': 0.4372068587116107}


  7%|███████▉                                                                                                              | 1741/26065 [07:46<1:48:43,  3.73it/s]


KeyboardInterrupt: 

In [51]:
print("\n".join(['resume', 'resume_id', 'first_name', 'last_name', 'middle_name', 'birth_date', 'birth_date_year_only', 'country', 'city', 'about', 'key_skills', 'salary_expectations_amount', 'salary_expectations_currency', 'photo_path', 'gender', 'language', 'resume_name', 'source_link', 'contactItems', 'resume_contact_item_id', 'value', 'comment', 'contact_type', 'educationItems', 'resume_education_item_id', 'year', 'organization', 'faculty', 'specialty', 'result', 'education_type', 'education_level', 'experienceItems', 'resume_experience_item_id', 'starts', 'ends', 'employer', 'city', 'url', 'position', 'description', 'order', 'languageItems', 'resume_language_item_id', 'language', 'language_level', 'O']))

resume
resume_id
first_name
last_name
middle_name
birth_date
birth_date_year_only
country
city
about
key_skills
salary_expectations_amount
salary_expectations_currency
photo_path
gender
language
resume_name
source_link
contactItems
resume_contact_item_id
value
comment
contact_type
educationItems
resume_education_item_id
year
organization
faculty
specialty
result
education_type
education_level
experienceItems
resume_experience_item_id
starts
ends
employer
city
url
position
description
order
languageItems
resume_language_item_id
language
language_level
O
