In [None]:
# !pip install --no-deps seqeval[gpu]
# !python -m spacy download en_core_web_lg
# !pip install pytorch-pretrained-bert
# !pip install PyMuPDF
import locale
import nltk

nltk.download("punkt")
nltk.download("stopwords")
nltk.download("averaged_perceptron_tagger")


def getpreferredencoding(do_setlocale=True):
    return "UTF-8"


locale.getpreferredencoding = getpreferredencoding

#Import Libraries

In [None]:
import random

random.seed(42)

In [None]:
import numpy as np
import pandas as pd

import spacy
from spacy.training import offsets_to_biluo_tags

nlp = spacy.load("en_core_web_lg")

from tqdm import trange
import torch
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertForTokenClassification, BertAdam

from seqeval.metrics import classification_report, accuracy_score, f1_score
import warnings

warnings.filterwarnings("ignore")

# Text preprocessing tools
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
import fitz

# Make each token predict result into softmax mode
from scipy.special import softmax

In [7]:
import spacy

# python  -m spacy download en_core_web_lg
nlp = spacy.load("en_core_web_lg")

In [None]:
# Adding '\n' to the default spacy tokenizer
prefixes = [i + "\\n" for i in nlp.Defaults.prefixes]
prefix_regex = spacy.util.compile_prefix_regex(prefixes)
nlp.tokenizer.prefix_search = prefix_regex.search

In [None]:
entity_dict = {
    "NAME": "Name",
    "EMAIL": "Email Address",
    "GITHUB": "Github",
    "LOC": "Location",
    "PHONE": "Phone",
    "UNI": "University",
    "DEG": "Degree",
    "GPA": "GPA",
    "GRADUATION_YEAR": "Graduation Year",
    "COMPANY_MONTHS": "Company with Duration",
    "EXPERIENCE_MONTHS": "Total Experience",
    "DESIG": "Designation",
    "TECHSTACK_SKILLS": "Technical Skills",
    "PROJECT": "Projects",
    "CERTIFICATION": "Certifications",
}

# Load Data

In [None]:
import os
import json
import pandas as pd


def load_ner_json_to_dataframe(folder_path: str) -> pd.DataFrame:
    """
    Read all JSON files from a folder and convert them to a DataFrame.

    Args:
        folder_path: Path to the folder containing JSON files

    Returns:
        pd.DataFrame with columns: filename, content, entities
    """
    data = []

    # Check if folder exists
    if not os.path.exists(folder_path):
        print(f"Error: Folder '{folder_path}' does not exist.")
        return pd.DataFrame()

    # Get all JSON files from the folder
    json_files = [f for f in os.listdir(folder_path) if f.endswith(".json")]

    if not json_files:
        print(f"No JSON files found in folder '{folder_path}'")
        return pd.DataFrame()

    print(f"Found {len(json_files)} JSON files")

    for filename in json_files:
        file_path = os.path.join(folder_path, filename)

        try:
            with open(file_path, "r", encoding="utf-8") as file:
                json_data = json.load(file)

                # Extract content and entities
                annotations = json_data.get("annotations", [])

                for annotation in annotations:
                    content = annotation[0]  # This is the "<text>" string
                    entities_dict = annotation[1]

                    # Convert entities to list of tuples
                    entities = []
                    if "entities" in entities_dict:
                        entities = [
                            tuple(entity) for entity in entities_dict["entities"]
                        ]

                    data.append(
                        {"filename": filename, "content": content, "entities": entities}
                    )

        except json.JSONDecodeError:
            print(f"Error: Invalid JSON in file '{filename}'")
            continue
        except Exception as e:
            print(f"Error processing file '{filename}': {str(e)}")
            continue

    # Create DataFrame
    df = pd.DataFrame(data)

    if not df.empty:
        print(f"\nDataFrame created successfully with {len(df)} rows")
        print("\nDataFrame info:")
        print(df.info())
        print("\nFirst few rows:")
        print(df.head())

    return df

In [None]:
df = load_ner_json_to_dataframe("ner_resumes")

In [17]:
df.iloc[0].content

'Ngo Thi Thanh Embedded Developer - AI AVATAR Ho Chi Minh Ho Chi Minh, Viet Nam - ngo.thi.thanh.09092008@gmail.com - 0923123456 - linkedin.com/in/ngo-thi-thanh-89012345 - github.com/ngothithanh I am a Cybersecurity graduate with a focus on embedded systems and IoT development. My goal is to innovate in secure embedded solutions for smart technologies. WORK EXPERIENCE AI AVATAR Ho Chi Minh Embedded Developer Designed firmware for IoT devices using C and RTOS, ensuring low power consumption. Integrated security protocols, reducing vulnerabilities by 20%. May 2020 - Present Ho Chi Minh AINKA Technology Solutions Hanoi Senior Embedded Developer Developed embedded software for industrial sensors with C++, improving data accuracy. Optimized memory usage for resource-constrained devices. Jan 2016 - Apr 2020 Hanoi EDUCATION Bachelor of Cybersecurity University of Law Hanoi GPA: 3.1/4.0 Sep 2014 - Jun 2018 PROJECTS IoT Device Firmware Description: Built firmware with C for smart home devices, e

# Parser data
- Parser data into document structure

In [30]:
# Your original approach
doc = nlp("0987654321")
biluo_tags = offsets_to_biluo_tags(doc, [(0, 10, 'PHONE')])
print(f"spaCy BILUO tags: {biluo_tags}")  # ['U-PHONE']
tokenizer = BertTokenizer.from_pretrained("bert-base-cased", do_lower_case=False)
text = "0987654321"
tokenized = tokenizer.tokenize(text)
print(f"BERT tokens: {tokenized}") 
# Get word ids and alignment
encoding = tokenizer(text, return_offsets_mapping=True)
word_ids = encoding.word_ids()
offset_mapping = encoding['offset_mapping']

# Create BERT labels
def align_labels_with_tokenization(biluo_tags, tokenizer, text):
    encoding = tokenizer(text, return_offsets_mapping=True)
    bert_labels = []
    
    # For each token in the tokenization
    for i, offset in enumerate(encoding['offset_mapping']):
        start, end = offset
        
        # Special tokens get ignored
        if start == end:
            bert_labels.append('X')
            continue
            
        # Find which spacy token this corresponds to
        for j, (token_start, token_end) in enumerate([(0, 10)]):  # Your entities
            if start >= token_start and end <= token_end:
                # Check if this is first subword of entity
                if i == 0 or encoding['offset_mapping'][i-1][1] < token_start:
                    bert_labels.append('B-PHONE')  # Beginning
                else:
                    bert_labels.append('I-PHONE')  # Inside
                break
        else:
            bert_labels.append('O')  # Outside
    
    return bert_labels

# Apply the alignment
bert_labels = align_labels_with_tokenization(biluo_tags, tokenizer, text)
print(f"BERT labels: {bert_labels}")
# Should output: ['B-PHONE', 'I-PHONE', 'I-PHONE', 'I-PHONE', 'I-PHONE', 'I-PHONE']

spaCy BILUO tags: ['U-PHONE']
BERT tokens: ['09', '##8', '##7', '##65', '##43', '##21']


TypeError: 'BertTokenizer' object is not callable

In [None]:
def get_train_data(df):
    tags = []
    sentences = []

    for i in range(len(df)):
        text = df["content"][i]
        entities = df["entities"][i]
        doc = nlp(text)

        tag = offsets_to_biluo_tags(doc, entities)
        tmp = pd.DataFrame([list(doc), tag]).T
        loc = []
        for i in range(len(tmp)):
            if tmp[0][i].text == "." and tmp[1][i] == "O":
                loc.append(i)
        loc.append(len(doc))

        last = 0
        data = []
        for pos in loc:
            data.append([list(doc)[last:pos], tag[last:pos]])
            last = pos

        for d in data:
            tag = ["O" if t == "-" else t for t in d[1]]
            if len(set(tag)) > 1:
                sentences.append(d[0])
                tags.append(tag)

    return sentences, tags

In [None]:
sentences, tags = get_train_data(df)
print(len(sentences), len(tags))

In [None]:
print(sentences[0])
print(tags[0])

[Abhishek, Jha, 
, Application, Development, Associate, -, Accenture, 
, 
, Bengaluru, ,, Karnataka, -, Email, me, on, Indeed, :, indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a, 
, 
, •, To, work, for, an, organization, which, provides, me, the, opportunity, to, improve, my, skills, 
, and, knowledge, for, my, individual, and, company, 's, growth, in, best, possible, ways]
['B-NAME', 'L-NAME', 'O', 'B-DESIG', 'I-DESIG', 'L-DESIG', 'O', 'U-COMPANY', 'O', 'O', 'U-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'B-EMAIL', 'I-EMAIL', 'I-EMAIL', 'L-EMAIL', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [None]:
# Find all unique tags
tag_vals = set(["X", "[CLS]", "[SEP]"])
for i in range(len(tags)):
    tag_vals = tag_vals.union(tags[i])

# tag2idx convert tag to idx, is a dict contains (tag,idx)
tag2idx = {t: i for i, t in enumerate(tag_vals)}

# idx2tag convert tag to idx, is a dict contains (idx,tag)
idx2tag = {tag2idx[key]: key for key in tag2idx.keys()}

In [None]:
print(tag_vals)
print(tag2idx)
print(idx2tag)

{'U-EMAIL', 'I-CLG', 'U-CLG', 'B-SKILLS', 'U-LOC', 'U-SKILLS', '[CLS]', 'U-GRADYEAR', 'U-DESIG', 'L-EMAIL', 'B-YOE', 'X', 'I-NAME', 'I-SKILLS', 'I-GRADYEAR', 'B-DESIG', 'B-EMAIL', 'L-YOE', 'I-YOE', 'L-DEG', 'B-DEG', 'L-SKILLS', 'L-LOC', '[SEP]', 'L-DESIG', 'B-NAME', 'B-CLG', 'B-GRADYEAR', 'L-GRADYEAR', 'I-EMAIL', 'I-DESIG', 'B-COMPANY', 'L-NAME', 'U-YOE', 'O', 'B-LOC', 'I-LOC', 'I-COMPANY', 'L-CLG', 'I-DEG', 'U-COMPANY', 'L-COMPANY', 'U-DEG'}
{'U-EMAIL': 0, 'I-CLG': 1, 'U-CLG': 2, 'B-SKILLS': 3, 'U-LOC': 4, 'U-SKILLS': 5, '[CLS]': 6, 'U-GRADYEAR': 7, 'U-DESIG': 8, 'L-EMAIL': 9, 'B-YOE': 10, 'X': 11, 'I-NAME': 12, 'I-SKILLS': 13, 'I-GRADYEAR': 14, 'B-DESIG': 15, 'B-EMAIL': 16, 'L-YOE': 17, 'I-YOE': 18, 'L-DEG': 19, 'B-DEG': 20, 'L-SKILLS': 21, 'L-LOC': 22, '[SEP]': 23, 'L-DESIG': 24, 'B-NAME': 25, 'B-CLG': 26, 'B-GRADYEAR': 27, 'L-GRADYEAR': 28, 'I-EMAIL': 29, 'I-DESIG': 30, 'B-COMPANY': 31, 'L-NAME': 32, 'U-YOE': 33, 'O': 34, 'B-LOC': 35, 'I-LOC': 36, 'I-COMPANY': 37, 'L-CLG': 38, 'I-D

# Make training data
- Set GPU environment
- Load tokenizer and tokenize
- Set 3 embedding - Token embedding, Mask word embedding, Segmentation embedding
- Split dataset into train and validate, then send them to DataLoader

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
n_gpu

1

In [None]:
# BERT pre-trained tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-cased", do_lower_case=False)

100%|██████████| 213450/213450 [00:00<00:00, 2445182.39B/s]


## Tokenizer Text

- In hunggieface for bert, when come across OOV, will word piece the word.

- We need to adjust the labels base on the tokenize result, “##abc” need to set label "X".

- Need to set "[CLS]" at front and "[SEP]" at the end, as what the paper do, BERT indexer should add [CLS] and [SEP] tokens.

In [None]:
def get_tokenized_train_data(sentences, tags):

    tokenized_texts = []
    word_piece_labels = []

    for word_list, label in zip(sentences, tags):

        # Add [CLS] at the front
        temp_lable = ["[CLS]"]
        temp_token = ["[CLS]"]

        for word, lab in zip(word_list, label):
            token_list = tokenizer.tokenize(word.text)
            for m, token in enumerate(token_list):
                temp_token.append(token)
                if m == 0:
                    temp_lable.append(lab)
                else:
                    temp_lable.append("X")

        # Add [SEP] at the end
        temp_lable.append("[SEP]")
        temp_token.append("[SEP]")

        tokenized_texts.append(temp_token)
        word_piece_labels.append(temp_lable)

    return tokenized_texts, word_piece_labels

In [None]:
tokenized_texts, word_piece_labels = get_tokenized_train_data(sentences, tags)

In [None]:
# Vector representations of the corresponding words from the input
print(tokenized_texts[0])
print(word_piece_labels[0])

['[CLS]', 'A', '##b', '##his', '##he', '##k', 'J', '##ha', 'Application', 'Development', 'Associate', '-', 'A', '##cc', '##ent', '##ure', 'Bengal', '##uru', ',', 'Karnataka', '-', 'Em', '##ail', 'me', 'on', 'Indeed', ':', 'indeed', '.', 'com', '/', 'r', '/', 'A', '##b', '##his', '##he', '##k', '-', 'J', '##ha', '/', '10', '##e', '##7', '##a', '##8', '##c', '##b', '##7', '##32', '##b', '##c', '##43', '##a', '•', 'To', 'work', 'for', 'an', 'organization', 'which', 'provides', 'me', 'the', 'opportunity', 'to', 'improve', 'my', 'skills', 'and', 'knowledge', 'for', 'my', 'individual', 'and', 'company', "'", 's', 'growth', 'in', 'best', 'possible', 'ways', '[SEP]']
['[CLS]', 'B-NAME', 'X', 'X', 'X', 'X', 'L-NAME', 'X', 'B-DESIG', 'I-DESIG', 'L-DESIG', 'O', 'U-COMPANY', 'X', 'X', 'X', 'U-LOC', 'X', 'O', 'O', 'O', 'O', 'X', 'O', 'O', 'B-EMAIL', 'I-EMAIL', 'I-EMAIL', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X',

## Set Token Embedding
- Pad or trim the text and label to fit the need for MAX_LEN

In [None]:
MAX_LEN = 512
bs = 4

# Make text tokens into ids
input_ids = pad_sequences(
    [tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
    maxlen=MAX_LEN,
    dtype="long",
    truncating="post",
    padding="post",
)
print(len(input_ids[0]))
print(input_ids[0])



512
[  101   138  1830 27516  4638  1377   147  2328 22491  3273  9666   118
   138 19515  3452  3313  7756 12328   117 12247   118 18653 11922  1143
  1113 10364   131  5750   119  3254   120   187   120   138  1830 27516
  4638  1377   118   147  2328   120  1275  1162  1559  1161  1604  1665
  1830  1559 17101  1830  1665 25631  1161   794  1706  1250  1111  1126
  2369  1134  2790  1143  1103  3767  1106  4607  1139  4196  1105  3044
  1111  1139  2510  1105  1419   112   188  3213  1107  1436  1936  3242
   102     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     

In [None]:
"""
pad_sequences -> https://www.tensorflow.org/api_docs/python/tf/keras/utils/pad_sequences
====================
maxlen=512: maximum length of all sequences
padding    ='post': pad after each sequence
truncating ='post': remove values from sequences larger than maxlen at the end of the sequences

convert_tokens_to_ids -> converts a string to a sequence of ids (int)
"""

tags = pad_sequences(
    [[tag2idx.get(l) for l in lab] for lab in word_piece_labels],
    maxlen=MAX_LEN,
    value=tag2idx["O"],
    padding="post",
    dtype="long",
    truncating="post",
)
print(len(tags[0]))
print(tags[0])

512
[ 6 25 11 11 11 11 32 11 15 30 24 34 40 11 11 11  4 11 34 34 34 34 11 34
 34 16 29 29 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11
 11 11 11 11 11 11 11 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34
 34 34 34 34 34 34 11 34 34 34 34 34 23 34 34 34 34 34 34 34 34 34 34 34
 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34
 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34
 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34
 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34
 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34
 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34
 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34
 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34
 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34
 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 3

## Set Mask Word Embeeding
- For fine tune of predict, with token mask is 1, pad token is 0

In [None]:
# if the current value >0, then assign 1, else =0
attention_masks = [[float(i > 0) for i in ii] for ii in input_ids]
print(attention_masks[0])

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,

## Split data into Train and Validate
- 70% for training, 30% for validation

In [None]:
# train inputs, validation inputs, train tags, validation tags, train masks, validation masks
tr_inputs, val_inputs, tr_tags, val_tags, tr_masks, val_masks = train_test_split(
    input_ids, tags, attention_masks, random_state=2000, test_size=0.3
)

In [None]:
len(tr_inputs), len(val_inputs), len(tr_tags), len(val_tags), len(tr_masks), len(
    val_masks
)

(545, 234, 545, 234, 545, 234)

### Set data into tensor

In [None]:
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

### Put data into Data Loader

In [None]:
"""
TORCH.UTILS.DATA -> https://pytorch.org/docs/stable/data.html

TensorDataset: Dataset wrapping tensors. Each sample will be retrieved by indexing tensors along the first dimension.
RandomSampler: Samples elements randomly
DataLoader   : Python iterable over a dataset

Notes: Only set token embeeding, attention embedding, no segment embedding
"""

train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

# Train Model


In [None]:
# Defining the model
bert_model = BertForTokenClassification.from_pretrained(
    "bert-base-cased", num_labels=len(tag2idx)
)

100%|██████████| 404400730/404400730 [00:08<00:00, 48381462.02B/s]


In [None]:
# Set model to GPU
bert_model.cuda()

### Set Fine-Tuning method
- Manual optimizer

In [None]:
FULL_FINETUNING = True
# If full tuning=True: Fine tuning all the layers
if FULL_FINETUNING:
    param_optimizer = list(bert_model.named_parameters())
    no_decay = ["bias", "gamma", "beta"]

    # n=name, p=parameter
    optimizer_grouped_parameters = [
        # Params that not inside no_decay
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay_rate": 0.01,
        },
        # Params that are inside no_decay
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay_rate": 0.0,
        },
    ]

# If full tuning=False -> Not full tuning, only fine tune classifier params
else:
    param_optimizer = list(bert_model.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

# Optimizer and learning scheduler
optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)

### Fine-Tuning Model

In [None]:
# Set epoch and grad max num
epochs = 10
max_grad_norm = 1.0

for _ in trange(epochs, desc="Epoch"):
    # Train loop
    bert_model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # Forward pass
        loss = bert_model(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
            labels=b_labels,
        )

        # Backward pass
        loss.backward()

        # Track train loss
        tr_loss += loss.item()

        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(
            parameters=bert_model.parameters(), max_norm=max_grad_norm
        )

        # Update parameters
        optimizer.step()
        bert_model.zero_grad()

    # Print train loss per epoch
    print("Train loss: {}".format(tr_loss / nb_tr_steps))

Epoch:  10%|█         | 1/10 [01:02<09:23, 62.67s/it]

Train loss: 0.012598166418564335


Epoch:  20%|██        | 2/10 [02:04<08:18, 62.34s/it]

Train loss: 0.01448450641397278


Epoch:  30%|███       | 3/10 [03:06<07:15, 62.19s/it]

Train loss: 0.01193943092247716


Epoch:  40%|████      | 4/10 [04:09<06:13, 62.25s/it]

Train loss: 0.01086661145966296


Epoch:  50%|█████     | 5/10 [05:11<05:11, 62.27s/it]

Train loss: 0.011256178366198768


Epoch:  60%|██████    | 6/10 [06:13<04:08, 62.20s/it]

Train loss: 0.00856177856048219


Epoch:  70%|███████   | 7/10 [07:15<03:06, 62.15s/it]

Train loss: 0.008043766321051853


Epoch:  80%|████████  | 8/10 [08:17<02:04, 62.17s/it]

Train loss: 0.006707057559923914


Epoch:  90%|█████████ | 9/10 [09:19<01:02, 62.15s/it]

Train loss: 0.007049659804851462


Epoch: 100%|██████████| 10/10 [10:21<00:00, 62.19s/it]

Train loss: 0.0072043191242825416





# Save Bert Model

In [None]:
bert_out_address = "/content/drive/MyDrive/Colab Notebooks/NLP/BERT/models/"

# Save a trained model, configuration and tokenizer
model_to_save = bert_model.module if hasattr(bert_model, "module") else bert_model

output_model_file = (
    "/content/drive/MyDrive/Colab Notebooks/NLP/BERT/models/pytorch_model.bin"
)
output_config_file = (
    "/content/drive/MyDrive/Colab Notebooks/NLP/BERT/models/config.json"
)

# Save model into file
torch.save(model_to_save.state_dict(), output_model_file)

model_to_save.config.to_json_file(output_config_file)

tokenizer.save_vocabulary(bert_out_address)

# Load back the bert model
bert_model = BertForTokenClassification.from_pretrained(
    bert_out_address, num_labels=len(tag2idx)
)

bert_model.cuda()

if n_gpu > 1:
    bert_model = torch.nn.DataParallel(bert_model)

In [None]:
bert_out_address = "/content/drive/MyDrive/Colab Notebooks/NLP/BERT/models/"
# Load back the bert model
bert_model = BertForTokenClassification.from_pretrained(
    bert_out_address, num_labels=len(tag2idx)
)

# Model Evaluation

In [None]:
# Evaluate loop
bert_model.eval()

y_true = []
y_pred = []

eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

for batch in valid_dataloader:
    batch = tuple(t.to(device) for t in batch)
    input_ids, input_mask, label_ids = batch

    """
  no_grad: context-manager that disabled gradient calculation (disabling gradient calc is useful for inference when u r 
  sure that u will not call Tensor.backward() -> reduce memory consumption for computations)
  https://pytorch.org/docs/stable/generated/torch.no_grad.html
  """
    with torch.no_grad():
        # logits are the output of the BERT model before a software activation function is applied to the output of BERT
        logits = bert_model(
            input_ids,
            token_type_ids=None,
            attention_mask=input_mask,
        )

    # Get NER predict result
    logits = logits.detach().cpu().numpy()
    logits = [list(p) for p in np.argmax(logits, axis=2)]

    # Get NER true result
    label_ids = label_ids.to("cpu").numpy()

    # Only predict real word, mark=0 will not calculate
    input_mask = input_mask.to("cpu").numpy()

    # Compare the valuable predict result
    for i, mask in enumerate(input_mask):
        temp_1 = []  # Real one
        temp_2 = []  # Predict one

        for j, m in enumerate(mask):
            # Mark=0 meaning its a pad word then no need compare
            if m:
                if (
                    idx2tag[label_ids[i][j]] != "X"
                    and idx2tag[label_ids[i][j]] != "[CLS]"
                    and idx2tag[label_ids[i][j]] != "[SEP]"
                ):
                    temp_1.append(idx2tag[label_ids[i][j]])
                    temp_2.append(idx2tag[logits[i][j]])
            else:
                break

        y_true.append(temp_1)
        y_pred.append(temp_2)
print("f1 socre: %f" % (f1_score(y_true, y_pred)))
print("Accuracy score: %f" % (accuracy_score(y_true, y_pred)))
print(classification_report(y_true, y_pred, digits=4))

f1 socre: 0.532571
Accuracy score: 0.859883
              precision    recall  f1-score   support

         CLG     0.3068    0.4154    0.3529        65
     COMPANY     0.6194    0.5963    0.6076       161
         DEG     0.5116    0.6567    0.5752        67
       DESIG     0.4890    0.6846    0.5705       130
       EMAIL     0.5405    0.4167    0.4706        48
    GRADYEAR     0.6000    0.3889    0.4719        54
         LOC     0.7791    0.5929    0.6734       113
        NAME     0.9296    0.9041    0.9167        73
      SKILLS     0.2179    0.2957    0.2509       115
         YOE     0.4000    0.2222    0.2857         9
           _     0.0000    0.0000    0.0000         0

   micro avg     0.5093    0.5581    0.5326       835
   macro avg     0.4904    0.4703    0.4705       835
weighted avg     0.5514    0.5581    0.5461       835



BERT preprocessing

In [None]:
# JSON formatting functions
import logging
import re


def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    try:
        training_data = []
        lines = []
        with open(dataturks_JSON_FilePath, "r", encoding="utf-8") as f:
            lines = f.readlines()

        for line in lines:
            data = json.loads(line)
            text = data["content"].replace("\n", " ")
            entities = []
            data_annotations = data["annotation"]
            if data_annotations is not None:
                for annotation in data_annotations:
                    # only a single point in text annotation.
                    point = annotation["points"][0]
                    labels = annotation["label"]
                    # handle both list of labels or a single label.
                    if not isinstance(labels, list):
                        labels = [labels]

                    for label in labels:
                        point_start = point["start"]
                        point_end = point["end"]
                        point_text = point["text"]

                        lstrip_diff = len(point_text) - len(point_text.lstrip())
                        rstrip_diff = len(point_text) - len(point_text.rstrip())
                        if lstrip_diff != 0:
                            point_start = point_start + lstrip_diff
                        if rstrip_diff != 0:
                            point_end = point_end - rstrip_diff
                        entities.append((point_start, point_end + 1, label))
            training_data.append((text, {"entities": entities}))
        return training_data
    except Exception as e:
        logging.exception(
            "Unable to process " + dataturks_JSON_FilePath + "\n" + "error = " + str(e)
        )
        return None


def trim_entity_spans(data: list) -> list:
    """Removes leading and trailing white spaces from entity spans.

    Args:
        data (list): The data to be cleaned in spaCy JSON format.

    Returns:
        list: The cleaned data.
    """
    invalid_span_tokens = re.compile(r"\s")

    cleaned_data = []
    for text, annotations in data:
        entities = annotations["entities"]
        valid_entities = []
        for start, end, label in entities:
            valid_start = start
            valid_end = end
            while valid_start < len(text) and invalid_span_tokens.match(
                text[valid_start]
            ):
                valid_start += 1
            while valid_end > 1 and invalid_span_tokens.match(text[valid_end - 1]):
                valid_end -= 1
            valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {"entities": valid_entities}])
    return cleaned_data

In [None]:
data = trim_entity_spans(convert_dataturks_to_spacy(data_file_address))
data[0]

["Abhishek Jha Application Development Associate - Accenture  Bengaluru, Karnataka - Email me on Indeed: indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a  • To work for an organization which provides me the opportunity to improve my skills and knowledge for my individual and company's growth in best possible ways.  Willing to relocate to: Bangalore, Karnataka  WORK EXPERIENCE  Application Development Associate  Accenture -  November 2017 to Present  Role: Currently working on Chat-bot. Developing Backend Oracle PeopleSoft Queries for the Bot which will be triggered based on given input. Also, Training the bot for different possible utterances (Both positive and negative), which will be given as input by the user.  EDUCATION  B.E in Information science and engineering  B.v.b college of engineering and technology -  Hubli, Karnataka  August 2013 to June 2017  12th in Mathematics  Woodbine modern school  April 2011 to March 2013  10th  Kendriya Vidyalaya  April 2001 to March 2011  SKILLS  C (Le

# Inference
- After we trained a model, we can make it into service, sending the new resume then get the prediction.

Process

1) Load model

2) Load tokenizer

3) Set test query (PDF file)

4) Make query into embedding

5) Predict with model

6) Parser result


In [None]:
def getWordnetPos(words):
    tag = pos_tag([words])[0][1][0].upper()
    tag_dict = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    return tag_dict.get(tag, wordnet.NOUN)


def cv_preprocessing(cv_data):
    # Tokenization
    tokenized_text = word_tokenize(cv_data)

    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    filter_text = []
    for token in tokenized_text:
        if token not in stop_words:
            filter_text.append(token)

    # POS and lemmatize
    lemmatizer = WordNetLemmatizer()
    lemmatizeResults = [
        lemmatizer.lemmatize(token, getWordnetPos(token)) for token in filter_text
    ]
    return " ".join(lemmatizeResults)


def pdftotext(m, preprocessing=False):
    # Open pdf file
    doc = fitz.open(m)

    # Convert pdf to text
    text = ""
    for page in doc:
        text += page.get_text()

    # Remove new line
    text = " ".join(text.split("\n"))

    if preprocessing:
        return cv_preprocessing(text)
    else:
        return text

In [None]:
def bert_predict(cv_data: str):
    # Token id embeddig, mask word embeddig
    tokenized_texts = []
    temp_token = []

    # Add [CLS] at the front
    temp_token.append("[CLS]")
    token_list = tokenizer.tokenize(cv_data)

    for m, token in enumerate(token_list):
        temp_token.append(token)

    # Trim the token to fit the length requirement
    if len(temp_token) > MAX_LEN - 1:
        temp_token = temp_token[: MAX_LEN - 1]

    # Add [SEP] at the end
    temp_token.append("[SEP]")

    tokenized_texts.append(temp_token)

    # Make id embedding
    input_ids = pad_sequences(
        [tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
        maxlen=MAX_LEN,
        dtype="long",
        truncating="post",
        padding="post",
    )
    # Make mask embeeding -> For fine tune of predict, with token mask is 1, pad token is 0
    attention_masks = [[float(i > 0) for i in ii] for ii in input_ids]
    segment_ids = [[0] * len(input_id) for input_id in input_ids]

    # Make embeddings into torch tensor
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)
    segment_ids = torch.tensor(segment_ids)

    with torch.no_grad():
        outputs = bert_model(
            input_ids.cuda(),
            token_type_ids=None,
            attention_mask=None,
        )
        # For eval mode, the first result of outputs is logits
        logits = outputs[0]

    predict_results = logits.detach().cpu().numpy()
    results_arrays_soft = softmax(
        predict_results
    )  # Make each token predict result into softmax mode
    result_array = results_arrays_soft
    result_list = np.argmax(result_array, axis=-1)

    # Get token predict tag
    for i, mark in enumerate(attention_masks[0]):
        if mark > 0:
            print(f"{temp_token[i]:50} {idx2tag[result_list[i]]}")

# Test with one train data

In [None]:
bert_predict(df.iloc[0]["content"])

[CLS]                                              [CLS]
A                                                  B-NAME
##b                                                X
##his                                              X
##he                                               X
##k                                                X
J                                                  L-NAME
##ha                                               X
Application                                        B-DESIG
Development                                        I-DESIG
Associate                                          L-DESIG
-                                                  O
A                                                  O
##cc                                               X
##ent                                              X
##ure                                              X
Bengal                                             U-LOC
##uru                                              X
,         

In [None]:
bert_predict(data[0][0])

[CLS]                                              [CLS]
A                                                  B-NAME
##b                                                X
##his                                              X
##he                                               X
##k                                                X
J                                                  L-NAME
##ha                                               X
Application                                        B-DESIG
Development                                        I-DESIG
Associate                                          L-DESIG
-                                                  O
A                                                  O
##cc                                               X
##ent                                              X
##ure                                              X
Bengal                                             U-LOC
##uru                                              X
,         