# BERT

In [3]:
#Install Torch
!pip install torch



In [4]:
#Install PDF processing library
!pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m777.9 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.4-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.2/59.2 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m26.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pypdfium2-4.30.0

In [51]:
# Load the CV PDF and extract text
import pdfplumber

def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        pages = [page.extract_text() for page in pdf.pages]
    return ''.join(pages)

pdf_path = "about.pdf"
cv_text = extract_text_from_pdf(pdf_path)[:125]
print (cv_text)

9/24/24, 9:23 AM About Me - Kaushik Roy
About Me
I am Kaushik Roy, a Ph.D. candidate at the Artificial Intelligence
Institute


In [54]:
#BERT code
import torch
import random
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer

class SelfAttention(nn.Module):
    def __init__(self, embed_size, heads):
        super(SelfAttention, self).__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads

        assert (
            self.head_dim * heads == embed_size
        ), "Embedding size needs to be divisible by heads"

        self.values = nn.Linear(self.head_dim, embed_size, bias=False)
        self.keys = nn.Linear(self.head_dim, embed_size, bias=False)
        self.queries = nn.Linear(self.head_dim, embed_size, bias=False)
        self.fc_out = nn.Linear(embed_size, embed_size)

    def forward(self, values, keys, query, mask):
        N = query.shape[0]
        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]

        # Split embedding into self.heads pieces
        values = values.reshape(N, value_len, self.heads, self.head_dim)
        keys = keys.reshape(N, key_len, self.heads, self.head_dim)
        queries = query.reshape(N, query_len, self.heads, self.head_dim)

        energy = torch.einsum("nqhd,nkhd->nhqk", [queries, keys])  # Dot product attention
        if mask is not None:
            energy = energy.masked_fill(mask == 0, float("-1e20"))

        attention = torch.softmax(energy / (self.embed_size ** (1 / 2)), dim=3)

        out = torch.einsum("nhql,nlhd->nqhd", [attention, values]).reshape(
            N, query_len, self.embed_size
        )

        return self.fc_out(out)

class TransformerBlock(nn.Module):
    def __init__(self, embed_size, heads, dropout, forward_expansion):
        super(TransformerBlock, self).__init__()
        self.attention = SelfAttention(embed_size, heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)

        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, forward_expansion * embed_size),
            nn.ReLU(),
            nn.Linear(forward_expansion * embed_size, embed_size)
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, value, key, query, mask):
        attention = self.attention(value, key, query, mask)
        x = self.dropout(self.norm1(attention + query))
        forward = self.feed_forward(x)
        out = self.dropout(self.norm2(forward + x))
        return out

class BERT(nn.Module):
    def __init__(
        self,
        vocab_size,
        embed_size=768,
        num_layers=6,
        heads=8,
        forward_expansion=4,
        dropout=0.1,
        max_length=512
    ):
        super(BERT, self).__init__()
        self.embed_size = embed_size
        self.word_embedding = nn.Embedding(vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)

        self.layers = nn.ModuleList(
            [TransformerBlock(embed_size, heads, dropout, forward_expansion) for _ in range(num_layers)]
        )

        self.dropout = nn.Dropout(dropout)
        self.fc_out = nn.Linear(embed_size, vocab_size)

    def forward(self, x, mask=None):
        N, seq_length = x.shape
        positions = torch.arange(0, seq_length).expand(N, seq_length).to(x.device)
        out = self.dropout(self.word_embedding(x) + self.position_embedding(positions))

        for layer in self.layers:
            out = layer(out, out, out, mask)

        return self.fc_out(out)

# Step 1: Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
mask_token_id = tokenizer.convert_tokens_to_ids("[MASK]")
vocab_size = tokenizer.vocab_size

# Step 2: Define the function to tokenize input and mask tokens for MLM
def create_masked_input(text, tokenizer, mask_token_id, vocab_size, mask_prob=0.15):
    tokens = tokenizer.encode(text, return_tensors='pt')
    labels = tokens.clone()

    probability_matrix = torch.full(labels.shape, mask_prob)
    masked_indices = torch.bernoulli(probability_matrix).bool()

    labels[~masked_indices] = -100  # We only compute loss on masked tokens

    random_tokens = torch.randint(vocab_size, labels.shape, dtype=torch.long)
    tokens[masked_indices] = torch.where(
        torch.rand_like(tokens[masked_indices].float()) < 0.8,
        mask_token_id,
        random_tokens[masked_indices]
    )

    return tokens, labels

# Step 3: Example input text and creating masked inputs
text = cv_text[:512]
input_ids, labels = create_masked_input(text, tokenizer, mask_token_id, vocab_size)

# Step 4: Create a simple dataset and data loader
input_ids = input_ids.repeat(32, 1)  # Simulate a batch of 32 examples for training
labels = labels.repeat(32, 1)
train_dataset = TensorDataset(input_ids, labels)
train_loader = DataLoader(train_dataset, batch_size=8)

# Initialize a simple BERT model from scratch (previously defined in sections above)
model = BERT(vocab_size=vocab_size)

# Optimizer
optimizer = Adam(model.parameters(), lr=3e-5)

# Training loop
model.train()
for epoch in range(20):
    for batch in train_loader:
        inputs, labels = batch

        # Forward pass
        outputs = model(inputs)
        loss = F.cross_entropy(outputs.view(-1, vocab_size), labels.view(-1), ignore_index=-100)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1} completed with loss: {loss.item()}")

Epoch 1 completed with loss: 5.764271259307861
Epoch 2 completed with loss: 2.1977362632751465
Epoch 3 completed with loss: 0.7900721430778503
Epoch 4 completed with loss: 0.3722098171710968
Epoch 5 completed with loss: 0.19815658032894135
Epoch 6 completed with loss: 0.12749354541301727
Epoch 7 completed with loss: 0.08989217877388
Epoch 8 completed with loss: 0.0662025585770607
Epoch 9 completed with loss: 0.05837758630514145
Epoch 10 completed with loss: 0.0419316403567791
Epoch 11 completed with loss: 0.032264646142721176
Epoch 12 completed with loss: 0.025068260729312897
Epoch 13 completed with loss: 0.025160793215036392
Epoch 14 completed with loss: 0.023082973435521126
Epoch 15 completed with loss: 0.024032730609178543
Epoch 16 completed with loss: 0.019851211458444595
Epoch 17 completed with loss: 0.021124975755810738
Epoch 18 completed with loss: 0.019287118688225746
Epoch 19 completed with loss: 0.019684620201587677
Epoch 20 completed with loss: 0.01445895154029131


In [55]:
#Inference code
def masked_language_modeling_inference(text, tokenizer, model, top_k=5):
    tokens = tokenizer.encode(text, return_tensors='pt')
    mask_token_index = (tokens == mask_token_id).nonzero(as_tuple=True)[1]

    # Forward pass
    with torch.no_grad():
        outputs = model(tokens)
    logits = outputs[0, mask_token_index]

    # Get top predictions for masked tokens
    top_k_tokens = torch.topk(logits, top_k, dim=-1).indices.tolist()[0]
    predicted_tokens = [tokenizer.decode([token]) for token in top_k_tokens]
    return predicted_tokens

In [60]:
#Inference test
prompt = cv_text[:torch.randint(1,len(list(cv_text)),(1,))] + '[MASK]'
print ('prompt: ', prompt)
predictions = masked_language_modeling_inference(prompt, tokenizer, model)
print(f"Predicted words for [MASK]: {predictions}")

prompt:  9/24/24, 9:23 AM About Me - Kaushik Roy
About Me
I am Kaushik Roy, a Ph.[MASK]
Predicted words for [MASK]: ['am', '.', 'ᅩ', 'penelope', '##jet']


# BERT for Resume Process

In [61]:
#install transformers library
!pip install transformers



In [75]:
import pdfplumber
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Step 1: Load the CV PDF and extract text
def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        pages = [page.extract_text() for page in pdf.pages]
    return ''.join(pages)

# Extract text from the CV
pdf_path = "CV.pdf"
cv_text = extract_text_from_pdf(pdf_path)

# Step 2: Load pre-trained BERT model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

# Step 3: Use pipeline for Named Entity Recognition
nlp = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=True)

# Step 4: Extract entities from the CV text
ner_results = nlp(cv_text)

# Display the recognized entities
#for entity in ner_results:
#    print(f"Entity: {entity['word']}, Label: {entity['entity_group']}")

# Step 5: Post-process the entities for CV data extraction (optional)
# For example, grouping entities like degree, institution, and dates
def extract_education_details(ner_results):
    education = []
    current_education = {}
    for entity in ner_results:
        if entity['entity_group'] == 'ORG':
            current_education['institution'] = entity['word']
        elif entity['entity_group'] == 'MISC':  # Assuming degrees are labeled as MISC
            current_education['degree'] = entity['word']
        elif entity['entity_group'] == 'DATE':
            current_education['year'] = entity['word']

        # Save the current education entry
        if 'institution' in current_education:
            education.append({'institution ': current_education['institution']})
    return education

education_details = extract_education_details(ner_results)

# Display the structured education data
print("Extracted Education Details:", education_details)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Extracted Education Details: [{'institution ': 'University of South Carolina Scholar'}, {'institution ': '##s'}, {'institution ': '##s'}, {'institution ': 'Google'}, {'institution ': 'Google'}, {'institution ': 'University of Texas at Dallas G'}, {'institution ': 'Computer Science'}, {'institution ': 'Indiana University Bloomington'}, {'institution ': 'Indiana University Bloomington'}, {'institution ': 'Machine'}, {'institution ': 'Machine'}, {'institution ': 'B'}, {'institution ': 'E'}, {'institution ': 'Computer Science'}, {'institution ': 'RV College of Engineering'}, {'institution ': 'RV College of Engineering'}, {'institution ': 'RV College of Engineering'}]


# Evaluation

In [76]:
#Evaluation Instrinsic - Perplexity
# Get the needed libraries
from transformers import AutoModelForCausalLM, AutoTokenizer
model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")

inputs = tokenizer("ABC is a startup based in New York City and Paris", return_tensors = "pt")
loss = model(input_ids = inputs["input_ids"], labels = inputs["input_ids"]).loss
ppl = torch.exp(loss)
print(ppl)

inputs_wiki_text = tokenizer("Generative Pretrained Transformer is an opensource artificial intelligence created by OpenAI in February 2019", return_tensors = "pt")
loss = model(input_ids = inputs_wiki_text["input_ids"], labels = inputs_wiki_text["input_ids"]).loss
ppl = torch.exp(loss)
print(ppl)

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



tensor(29.4841, grad_fn=<ExpBackward0>)
tensor(211.8131, grad_fn=<ExpBackward0>)
