In [1]:
# Label vocabulary of the token classifier: tag string -> class id.
# The ids index the model's output dimension (the model is built with
# num_labels=len(tag2idx)), so they must stay exactly as listed here.
tag2idx = {
    "U-DESIG": 0,
    "U-PHONE": 1,
    "I-CERTIFICATION": 2,
    "O": 3,
    "I-UNI": 4,
    "U-GITHUB": 5,
    "I-PROJECT_DESCRIPTION": 6,
    "B-GRADUATION_YEAR": 7,
    "I-GRADUATION_YEAR": 8,
    "L-GITHUB": 9,
    "I-LOC": 10,
    "B-WORKING_DESCRIPTION": 11,
    "L-LOC": 12,
    "I-WORKING_COMPANY_EXPERIENCES": 13,
    "B-CERTIFICATION": 14,
    "I-NAME": 15,
    "[SEP]": 16,
    "U-WORKING_COMPANY_EXPERIENCES": 17,
    "L-UNI": 18,
    "B-WORKING_TIME_EXPERIENCES": 19,
    "L-PROJECT": 20,
    "B-PROJECT": 21,
    "B-NAME": 22,
    "I-WORKING_DESCRIPTION": 23,
    "L-TECHSTACK_SKILLS": 24,
    "U-TECHSTACK_SKILLS": 25,
    "B-DEG": 26,
    "U-LOC": 27,
    "L-WORKING_TIME_EXPERIENCES": 28,
    "L-CERTIFICATION": 29,
    "L-DEG": 30,
    "L-GRADUATION_YEAR": 31,
    "B-TECHSTACK_SKILLS": 32,
    "L-DESIG": 33,
    "L-WORKING_DESCRIPTION": 34,
    "I-DEG": 35,
    "I-PROJECT": 36,
    "U-EMAIL": 37,
    "I-TECHSTACK_SKILLS": 38,
    "L-PROJECT_DESCRIPTION": 39,
    "B-GITHUB": 40,
    "B-UNI": 41,
    "[CLS]": 42,
    "I-DESIG": 43,
    "B-DESIG": 44,
    "I-WORKING_TIME_EXPERIENCES": 45,
    "B-LOC": 46,
    "L-NAME": 47,
    "B-PROJECT_DESCRIPTION": 48,
    "B-WORKING_COMPANY_EXPERIENCES": 49,
    "L-WORKING_COMPANY_EXPERIENCES": 50,
    "U-GPA": 51,
    "U-CERTIFICATION": 52,
    "X": 53,
}

# Class ids must be exactly 0..N-1 with no gaps or duplicates, otherwise an
# argmax over the model output could produce an id with no tag.
assert sorted(tag2idx.values()) == list(range(len(tag2idx))), \
    "tag2idx ids must be contiguous and start at 0"

# Reverse lookup (class id -> tag string), derived from tag2idx so the two
# tables cannot drift apart when a label is added or renumbered.
idx2tag = {idx: tag for tag, idx in tag2idx.items()}

# Maximum sequence length (in WordPiece tokens, including [CLS] and [SEP])
# that inputs are truncated/padded to before being fed to the model.
MAX_LEN = 512

In [2]:
# Load back the bert model
from pytorch_pretrained_bert import BertForTokenClassification

# Directory the saved BERT token-classification model is loaded from.
bert_out_address = "models"

# The classification head size must match the label vocabulary.
bert_model = BertForTokenClassification.from_pretrained(
    bert_out_address, num_labels=len(tag2idx)
).cpu()

# This notebook only runs inference: switch the module to eval mode so dropout
# is disabled and repeated predictions on the same text agree.
# eval() is idempotent and returns the module itself, so re-running is safe.
bert_model = bert_model.eval()

In [3]:
from pytorch_pretrained_bert import BertTokenizer

# WordPiece tokenizer for the cased BERT vocabulary; lower-casing is kept off
# so the capitalisation of the input text is preserved.
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-cased",
    do_lower_case=False,
)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 213450/213450 [00:03<00:00, 63766.30B/s]


In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import torch
import numpy as np
from scipy.special import softmax

In [8]:
def bert_predict(cv_data:str):
    """Tag every WordPiece token of a CV text with the loaded BERT model.

    The text is tokenized with the module-level ``tokenizer``, wrapped in
    ``[CLS]`` / ``[SEP]``, truncated to ``MAX_LEN`` tokens, padded to
    ``MAX_LEN`` ids and run through the module-level ``bert_model``.

    Args:
        cv_data: Raw text to tag. Tokens beyond ``MAX_LEN - 2`` word pieces
            are dropped.

    Returns:
        A list with one dict per real (non-padding) token, in input order,
        including ``[CLS]`` and ``[SEP]``::

            {'token': <word piece>, 'tag': <label from idx2tag>, 'position': <index>}
    """
    # [CLS] + word pieces, trimmed so the closing [SEP] still fits in MAX_LEN.
    tokens = ['[CLS]'] + tokenizer.tokenize(cv_data)
    tokens = tokens[:MAX_LEN - 1]
    tokens.append('[SEP]')
    n_real = len(tokens)

    # Token ids, right-padded with 0 up to MAX_LEN (batch of one sequence).
    input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(tokens)],
                              maxlen=MAX_LEN, dtype='long', truncating='post', padding='post')

    # Attention mask: 1 for real tokens, 0 for padding. It is built from the
    # real token count rather than from "id > 0" so that a genuine token whose
    # vocabulary id happens to be 0 is not mistaken for padding.
    attention_masks = [[1.0] * n_real + [0.0] * (MAX_LEN - n_real)]

    # Convert to tensors; ids are forced to int64 as required by embedding lookups.
    input_ids = torch.tensor(input_ids, dtype=torch.long)
    attention_masks = torch.tensor(attention_masks)
    segment_ids = torch.zeros_like(input_ids)  # single-segment input

    # Make sure dropout is off; eval() is idempotent so calling it per request is harmless.
    bert_model.eval()
    with torch.no_grad():
        # The mask must be passed, otherwise the padded positions are attended
        # to like real tokens and influence every prediction.
        outputs = bert_model(input_ids,
                             token_type_ids=segment_ids,
                             attention_mask=attention_masks)
        # Select the only sequence in the batch.
        # NOTE(review): relies on the model returning the logits tensor itself
        # (batch, seq_len, num_labels) when no labels are given, as the
        # indexing below expects a (seq_len, num_labels) array - confirm if
        # the BERT library is ever swapped.
        logits = outputs[0]

    token_logits = logits.detach().cpu().numpy()
    # Normalise per token (last axis). Without an explicit axis scipy would
    # normalise over the whole array, which does not yield per-token probabilities.
    token_probs = softmax(token_logits, axis=-1)
    label_ids = np.argmax(token_probs, axis=-1)

    # One entry per real token; padding positions are skipped.
    return [
        {
            'token': token,
            'tag': idx2tag[int(label_ids[i])],
            'position': i
        }
        for i, token in enumerate(tokens)
    ]

In [9]:
# Run the tagger on a sample CV given as one flat string; the returned list of
# {'token', 'tag', 'position'} dicts is kept in `predict` for inspection.
# bert_predict truncates the input to MAX_LEN tokens, so the tail of a long
# text is not tagged.
predict = bert_predict("""
Tran Thi Mai Fullstack Developer - Accenture Da Nang Da Nang, Viet Nam - tran.thi.mai.02022001@gmail.com - 0987654321 - linkedin.com/in/tran-thi-mai-87654321 - github.com/tranthimai As a passionate Information Technology graduate, I excel in building robust full-stack solutions with a focus on user-centric design. I aspire to innovate in software development, delivering high-quality applications that solve real-world problems. WORK EXPERIENCE Accenture Da Nang Fullstack Developer Designed and deployed a client management system using React and NestJS, enhancing data processing efficiency. Optimized database queries with PostgreSQL to improve performance by 25%. Mar 2022 - Present Da Nang Adnovum Vietnam Software Engineer Developed secure RESTful services with Spring Boot and integrated frontend components with Vue.js. Conducted code reviews and improved CI/CD pipelines using GitLab. Jan 2020 - Feb 2022 Hanoi EDUCATION Bachelor of Information Technology (BIT) University of Science Hanoi GPA: 3.2/4.0 Sep 2016 - Jun 2020 PROJECTS Inventory Management System Description: Created a tool for tracking stock levels using Angular and Flask, with real-time updates. Outcome: Reduced inventory errors by 15% and improved restocking efficiency. Jun 2021 - Aug 2021 Social Media Platform Description: Built a platform with React and Node.js, featuring user profiles and messaging. Outcome: Enhanced user engagement by 30% through intuitive design and fast backend responses. Sep 2020 - Nov 2020 SKILLS Frontend: HTML, CSS, React, Angular, Tailwind CSS Backend: NestJS, Flask, Node.js DevOps: GitLab, Docker CERTIFICATIONS AWS Certified Solutions Architect – Associate Google Cloud Certified – Associate Cloud Engineer
""")

In [None]:
# Display the token/tag pairs produced by bert_predict for the sample CV.
predict

[{'token': '[CLS]', 'tag': '[CLS]', 'position': 0},
 {'token': 'T', 'tag': 'B-NAME', 'position': 1},
 {'token': '##ran', 'tag': 'X', 'position': 2},
 {'token': 'T', 'tag': 'I-NAME', 'position': 3},
 {'token': '##hi', 'tag': 'X', 'position': 4},
 {'token': 'Mai', 'tag': 'L-NAME', 'position': 5},
 {'token': 'Full', 'tag': 'B-DESIG', 'position': 6},
 {'token': '##sta', 'tag': 'X', 'position': 7},
 {'token': '##ck', 'tag': 'X', 'position': 8},
 {'token': 'Dev', 'tag': 'L-DESIG', 'position': 9},
 {'token': '##elo', 'tag': 'X', 'position': 10},
 {'token': '##per', 'tag': 'X', 'position': 11},
 {'token': '-', 'tag': 'O', 'position': 12},
 {'token': 'A', 'tag': 'I-PROJECT_DESCRIPTION', 'position': 13},
 {'token': '##cc', 'tag': 'X', 'position': 14},
 {'token': '##ent', 'tag': 'X', 'position': 15},
 {'token': '##ure', 'tag': 'X', 'position': 16},
 {'token': 'Da', 'tag': 'B-LOC', 'position': 17},
 {'token': 'Nan', 'tag': 'L-LOC', 'position': 18},
 {'token': '##g', 'tag': 'X', 'position': 19},
 {

: 