# **Library(s) Setup**

In [7]:
!pip install rake-nltk

Collecting rake-nltk
  Downloading rake_nltk-1.0.6-py3-none-any.whl.metadata (6.4 kB)
Downloading rake_nltk-1.0.6-py3-none-any.whl (9.1 kB)
Installing collected packages: rake-nltk
Successfully installed rake-nltk-1.0.6


# **Frequency-based Keyword Extraction**

In [4]:
import pandas as pd
import re
from collections import Counter

# Sample CV text
cv_text = """
Experienced data scientist with expertise in machine learning, natural language processing, and AI applications.
Proficient in Python, TensorFlow, and cloud computing. Published in top-tier AI journals and conferences.
Skilled in data analysis, statistical modeling, and developing end-to-end AI pipelines.
"""

# Preprocess CV text (basic cleanup)
def preprocess_text(text):
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    text = text.lower()  # Convert text to lowercase
    return text

preprocessed_text = preprocess_text(cv_text)

# Tokenize the text
words = preprocessed_text.split()

# Remove common stop words
stop_words = set(['in', 'the', 'and', 'with', 'to', 'of', 'a', 'on'])
filtered_words = [word for word in words if word not in stop_words]

# Get word frequencies
word_counts = Counter(filtered_words)

# Create a DataFrame from the word frequencies
word_freq_df = pd.DataFrame(word_counts.items(), columns=['word', 'frequency'])

# Sort the DataFrame by frequency in descending order
word_freq_df = word_freq_df.sort_values(by='frequency', ascending=False)

# Select the top 3 keywords
top_keywords = word_freq_df.head(3)

# Output results
print("Top 3 keywords based on frequency:")
print(top_keywords)

# Quantitative measure: Total frequency count of top 3 keywords
informative_content_score = top_keywords['frequency'].sum()
print(f"Informative Content Score (Sum of top 3 keyword frequencies): {informative_content_score}")

Top 3 keywords based on frequency:
          word  frequency
9           ai          3
1         data          2
0  experienced          1
Informative Content Score (Sum of top 3 keyword frequencies): 6


# **Term Frequency and Inverse Document Frequency (TF and IDF)-based**

In [5]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

# Sample CV text
cv_text = """
Experienced data scientist with expertise in machine learning, natural language processing, and AI applications.
Proficient in Python, TensorFlow, and cloud computing. Published in top-tier AI journals and conferences.
Skilled in data analysis, statistical modeling, and developing end-to-end AI pipelines.
"""

# Preprocess CV text (basic cleanup)
def preprocess_text(text):
    # Remove special characters and convert text to lowercase
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    return text

preprocessed_text = preprocess_text(cv_text)

# Vectorize the CV text using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform([preprocessed_text])
feature_names = vectorizer.get_feature_names_out()

# Get the TF-IDF scores for the CV
tfidf_scores = tfidf_matrix.toarray()[0]

# Create a DataFrame with the words and their corresponding TF-IDF scores
word_tfidf_df = pd.DataFrame({
    'word': feature_names,
    'tfidf': tfidf_scores
})

# Sort the words by their TF-IDF scores in descending order
word_tfidf_df = word_tfidf_df.sort_values(by='tfidf', ascending=False)

# Select the top 3 keywords
top_keywords = word_tfidf_df.head(3)

# Quantitative measure: Total TF-IDF score (sum of top 3 keyword scores)
informative_content_score = top_keywords['tfidf'].sum()

# Output results
print("Top 3 keywords based on TF-IDF:")
print(top_keywords)

print(f"Informative Content Score: {informative_content_score}")

Top 3 keywords based on TF-IDF:
       word     tfidf
0        ai  0.486664
6      data  0.324443
14  machine  0.162221
Informative Content Score: 0.9733285267845753


# **Rapid Automatic Keyword Extraction (RAKE)**



In [10]:
import rake_nltk
import pandas as pd

import nltk
nltk.download('stopwords')
nltk.download('punkt')

# Sample CV text
cv_text = """
Experienced data scientist with expertise in machine learning, natural language processing, and AI applications.
Proficient in Python, TensorFlow, and cloud computing. Published in top-tier AI journals and conferences.
Skilled in data analysis, statistical modeling, and developing end-to-end AI pipelines.
"""

# Initialize RAKE
rake = rake_nltk.Rake()

# Extract keywords using RAKE
rake.extract_keywords_from_text(cv_text)
ranked_phrases_with_scores = rake.get_ranked_phrases_with_scores()

# Convert the phrases and scores into a DataFrame
keywords_df = pd.DataFrame(ranked_phrases_with_scores, columns=['score', 'keyword'])

# Sort the keywords by score in descending order and select the top 3
top_keywords = keywords_df.head(3)

# Output results
print("Top 3 keywords based on RAKE:")
print(top_keywords)

# Quantitative measure: Total RAKE score of top 3 keywords
informative_content_score = top_keywords['score'].sum()
print(f"Informative Content Score (Sum of top 3 RAKE scores): {informative_content_score}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Top 3 keywords based on RAKE:
      score                      keyword
0  9.000000  natural language processing
1  8.666667             tier ai journals
2  8.500000   experienced data scientist
Informative Content Score (Sum of top 3 RAKE scores): 26.166666666666664


# **(Neural) BERT-based**

In [11]:
import torch
from transformers import BertTokenizer, BertModel
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Sample CV text
cv_text = """
Experienced data scientist with expertise in machine learning, natural language processing, and AI applications.
Proficient in Python, TensorFlow, and cloud computing. Published in top-tier AI journals and conferences.
Skilled in data analysis, statistical modeling, and developing end-to-end AI pipelines.
"""

# Tokenize the CV text and convert to input IDs for BERT
tokens = tokenizer(cv_text, return_tensors='pt', truncation=True, padding=True)
with torch.no_grad():
    outputs = model(**tokens)

# Extract token embeddings (ignoring [CLS] and [SEP])
embeddings = outputs.last_hidden_state[0].numpy()

# Get the actual tokens (excluding special tokens like [CLS] and [SEP])
tokens_list = tokenizer.convert_ids_to_tokens(tokens['input_ids'][0])[1:-1]

# Compute the importance of each token using cosine similarity with the [CLS] token (representing the sentence meaning)
cls_embedding = embeddings[0]  # First token is [CLS]
token_importances = [cosine_similarity([cls_embedding], [embedding])[0][0] for embedding in embeddings[1:-1]]

# Create a DataFrame with tokens and their importance scores
token_df = pd.DataFrame({'token': tokens_list, 'importance': token_importances})

# Sort the tokens by importance score in descending order and select the top 3 keywords
top_keywords = token_df.sort_values(by='importance', ascending=False).head(3)

# Output results
print("Top 3 keywords based on BERT embeddings:")
print(top_keywords)

# Quantitative measure: Sum of cosine similarity scores of top 3 keywords
informative_content_score = top_keywords['importance'].sum()
print(f"Informative Content Score (Sum of top 3 keyword importance): {informative_content_score}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Top 3 keywords based on BERT embeddings:
          token  importance
0   experienced    0.539161
56            .    0.334990
37            .    0.333345
Informative Content Score (Sum of top 3 keyword importance): 1.2074966430664062


# **(Symbolic) Knowledge Graph-based**

In [12]:
import requests
import pandas as pd

# Sample CV text
cv_text = """
Experienced data scientist with expertise in machine learning, natural language processing, and AI applications.
Proficient in Python, TensorFlow, and cloud computing. Published in top-tier AI journals and conferences.
Skilled in data analysis, statistical modeling, and developing end-to-end AI pipelines.
"""

# Split the text into words
words = set(cv_text.lower().split())

# Function to query ConceptNet API for knowledge graph connections
def query_conceptnet(word):
    url = f"http://api.conceptnet.io/c/en/{word}"
    response = requests.get(url).json()
    return response

# Extract keywords based on ConceptNet relevance (by checking if words are linked to important concepts)
def get_conceptnet_keywords(words):
    keyword_scores = []
    for word in words:
        try:
            response = query_conceptnet(word)
            # Count the number of linked edges to determine importance
            score = len(response['edges'])
            if score > 0:
                keyword_scores.append((word, score))
        except:
            pass  # Skip words not found in ConceptNet

    # Sort keywords by their scores
    keyword_scores.sort(key=lambda x: x[1], reverse=True)
    return keyword_scores

# Get top 3 keywords using ConceptNet
conceptnet_keywords = get_conceptnet_keywords(words)
top_keywords_df = pd.DataFrame(conceptnet_keywords[:3], columns=['keyword', 'score'])

# Output results
print("Top 3 keywords based on ConceptNet:")
print(top_keywords_df)

# Quantitative measure: Sum of ConceptNet link scores for the top 3 keywords
informative_content_score = top_keywords_df['score'].sum()
print(f"Informative Content Score (Sum of top 3 ConceptNet scores): {informative_content_score}")

Top 3 keywords based on ConceptNet:
       keyword  score
0  experienced     20
1  statistical     20
2        cloud     20
Informative Content Score (Sum of top 3 ConceptNet scores): 60


# **(Neurosymbolic) Hybrid Approach**

In [13]:
import torch
from transformers import BertTokenizer, BertModel
import requests
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Sample CV text
cv_text = """
Experienced data scientist with expertise in machine learning, natural language processing, and AI applications.
Proficient in Python, TensorFlow, and cloud computing. Published in top-tier AI journals and conferences.
Skilled in data analysis, statistical modeling, and developing end-to-end AI pipelines.
"""

# Tokenize the CV text and convert to input IDs for BERT
tokens = tokenizer(cv_text, return_tensors='pt', truncation=True, padding=True)
with torch.no_grad():
    outputs = model(**tokens)

# Extract token embeddings (ignoring [CLS] and [SEP])
embeddings = outputs.last_hidden_state[0].numpy()

# Get the actual tokens (excluding special tokens like [CLS] and [SEP])
tokens_list = tokenizer.convert_ids_to_tokens(tokens['input_ids'][0])[1:-1]

# Compute the importance of each token using cosine similarity with the [CLS] token (representing the sentence meaning)
cls_embedding = embeddings[0]  # First token is [CLS]
token_importances = [cosine_similarity([cls_embedding], [embedding])[0][0] for embedding in embeddings[1:-1]]

# Create a DataFrame with tokens and their importance scores
token_df = pd.DataFrame({'token': tokens_list, 'importance': token_importances})

# Sort the tokens by importance score in descending order
sorted_token_df = token_df.sort_values(by='importance', ascending=False)

# Take the top 10 tokens based on BERT importance for further analysis
top_bert_tokens = sorted_token_df.head(10)['token'].values

# Function to query ConceptNet API for knowledge graph connections
def query_conceptnet(word):
    url = f"http://api.conceptnet.io/c/en/{word}"
    response = requests.get(url).json()
    return response

# Validate BERT tokens using ConceptNet by checking their relevance
def get_conceptnet_scores(words):
    conceptnet_scores = []
    for word in words:
        try:
            response = query_conceptnet(word)
            # Count the number of linked edges to determine importance
            score = len(response['edges'])
            if score > 0:
                conceptnet_scores.append((word, score))
        except:
            pass  # Skip words not found in ConceptNet
    return conceptnet_scores

# Get ConceptNet scores for top BERT tokens
conceptnet_results = get_conceptnet_scores(top_bert_tokens)

# Combine BERT and ConceptNet scores into a single DataFrame
bert_conceptnet_df = pd.DataFrame(conceptnet_results, columns=['token', 'conceptnet_score'])

# Merge BERT importance and ConceptNet score
final_df = pd.merge(sorted_token_df, bert_conceptnet_df, on='token', how='inner')

# Calculate a hybrid score: weighted average of BERT importance and ConceptNet score
final_df['hybrid_score'] = 0.7 * final_df['importance'] + 0.3 * final_df['conceptnet_score']

# Sort the final DataFrame by the hybrid score
final_df = final_df.sort_values(by='hybrid_score', ascending=False)

# Select the top 3 keywords based on the hybrid score
top_keywords = final_df.head(3)

# Output results
print("Top 3 keywords based on Hybrid Approach:")
print(top_keywords)

# Quantitative measure: Sum of hybrid scores for the top 3 keywords
informative_content_score = top_keywords['hybrid_score'].sum()
print(f"Informative Content Score (Sum of top 3 hybrid scores): {informative_content_score}")



Top 3 keywords based on Hybrid Approach:
         token  importance  conceptnet_score  hybrid_score
0  experienced    0.539161                20      6.377412
2            .    0.334990                20      6.234493
3            .    0.334990                20      6.234493
Informative Content Score (Sum of top 3 hybrid scores): 18.84639909863472


# **Sample Evaluation**

In [15]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Example ground truth and predicted keywords for evaluation
ground_truth = [
    ["Python", "Java", "cloud computing"],
    ["machine learning", "deep learning", "data visualization"],
    ["natural language processing", "computer vision", "TensorFlow"],
    ["statistical analysis", "medical data", "healthcare informatics"],
    ["network security", "threat analysis", "ethical hacking"]
]

# Example predicted keywords by different methods (frequency-based, RAKE, BERT-based, etc.)
# Replace these with the actual predicted keywords from your methods
predicted_keywords = {
    "frequency_based": [
        ["Python", "Java", "microservices"],
        ["machine learning", "data visualization", "research papers"],
        ["natural language processing", "computer vision", "PyTorch"],
        ["statistical analysis", "R", "SQL"],
        ["network security", "penetration testing", "threat analysis"]
    ],
    "rake_based": [
        ["software engineer", "cloud computing", "microservices"],
        ["machine learning", "deep learning", "top journals"],
        ["natural language processing", "computer vision", "open-source"],
        ["statistical analysis", "medical data", "informatics"],
        ["cybersecurity", "network security", "ethical hacking"]
    ],
    "bert_based": [
        ["Python", "Java", "cloud computing"],
        ["machine learning", "deep learning", "data visualization"],
        ["natural language processing", "computer vision", "TensorFlow"],
        ["statistical analysis", "medical data", "healthcare informatics"],
        ["network security", "threat analysis", "ethical hacking"]
    ],
    "conceptnet_based": [
        ["Python", "Java", "cloud computing"],
        ["machine learning", "deep learning", "research papers"],
        ["natural language processing", "TensorFlow", "open-source"],
        ["healthcare", "medical data", "statistical analysis"],
        ["network security", "cybersecurity", "threat analysis"]
    ],
    "hybrid": [
        ["Python", "Java", "cloud computing"],
        ["machine learning", "deep learning", "data visualization"],
        ["natural language processing", "TensorFlow", "computer vision"],
        ["statistical analysis", "medical data", "healthcare informatics"],
        ["network security", "threat analysis", "ethical hacking"]
    ]
}

# Function to compute evaluation metrics for each approach
def evaluate_keywords(ground_truth, predicted_keywords):
    results = {}

    for method, predictions in predicted_keywords.items():
        precision_list = []
        recall_list = []
        f1_list = []

        for gt_keywords, pred_keywords in zip(ground_truth, predictions):
            # Convert lists to sets for easier comparison
            gt_set = set(gt_keywords)
            pred_set = set(pred_keywords)

            # Calculate precision, recall, and F1 for each CV
            precision = len(gt_set & pred_set) / len(pred_set) if len(pred_set) > 0 else 0
            recall = len(gt_set & pred_set) / len(gt_set) if len(gt_set) > 0 else 0
            f1 = (2 * precision * recall) / (precision + recall) if precision + recall > 0 else 0

            precision_list.append(precision)
            recall_list.append(recall)
            f1_list.append(f1)

        # Average the scores over all the CVs
        avg_precision = sum(precision_list) / len(precision_list)
        avg_recall = sum(recall_list) / len(recall_list)
        avg_f1 = sum(f1_list) / len(f1_list)

        # Store the results
        results[method] = {
            'precision': avg_precision,
            'recall': avg_recall,
            'f1_score': avg_f1
        }

    return results

# Compute the evaluation metrics for all methods
evaluation_results = evaluate_keywords(ground_truth, predicted_keywords)

# Display the results
evaluation_df = pd.DataFrame(evaluation_results).T
print(evaluation_df)

                  precision    recall  f1_score
frequency_based    0.600000  0.600000  0.600000
rake_based         0.600000  0.600000  0.600000
bert_based         1.000000  1.000000  1.000000
conceptnet_based   0.733333  0.733333  0.733333
hybrid             1.000000  1.000000  1.000000


# **Fine-tuning Pretrained Models**

## *Sample Dataset Creation*

In [16]:
import pandas as pd
import random

# List of synthetic CV texts
cv_texts = [
    "Experienced software engineer with skills in Python, Java, and cloud computing. Developed microservices for large-scale systems.",
    "Data scientist with expertise in machine learning, deep learning, and data visualization. Published multiple research papers in top journals.",
    "AI researcher focused on natural language processing and computer vision. Skilled in TensorFlow and PyTorch. Contributed to open-source projects.",
    "Healthcare data analyst with experience in statistical analysis, medical data processing, and healthcare informatics. Proficient in R and SQL.",
    "Cybersecurity expert with a background in network security, threat analysis, and ethical hacking. Experienced in penetration testing tools.",
]

# Associated keywords for each CV
keywords = [
    ["Python", "Java", "cloud computing"],
    ["machine learning", "deep learning", "data visualization"],
    ["natural language processing", "computer vision", "TensorFlow"],
    ["statistical analysis", "medical data", "healthcare informatics"],
    ["network security", "threat analysis", "ethical hacking"],
]

# Create synthetic dataset of users, CV texts, and keywords
users = [f"user_{i}" for i in range(1, 6)]
data = pd.DataFrame({
    'user': users,
    'cv_text': cv_texts,
    'keywords': keywords
})

print("Synthetic dataset of users, their CVs, and keywords created.")

Synthetic dataset of users, their CVs, and keywords created.


## *Fine-tuning Code*

In [20]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForTokenClassification, AdamW
from torch.nn.utils.rnn import pad_sequence

# Prepare tokenizer and model
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Synthetic dataset preparation for fine-tuning
class KeywordDataset(Dataset):
    def __init__(self, cv_texts, keywords, tokenizer):
        self.cv_texts = cv_texts
        self.keywords = keywords
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.cv_texts)

    def __getitem__(self, idx):
        cv_text = self.cv_texts[idx]
        keyword_list = self.keywords[idx]

        # Tokenize CV text and get offset mappings
        tokens = self.tokenizer(cv_text, return_tensors='pt', truncation=True, padding=True, return_offsets_mapping=True)
        input_ids = tokens['input_ids'].squeeze()
        offsets = tokens['offset_mapping'].squeeze()

        # Create label for each token: 1 if it matches a keyword, else 0
        labels = []
        for offset in offsets:
            start, end = offset.tolist()
            token_text = cv_text[start:end]
            if any(kw in token_text for kw in keyword_list):
                labels.append(1)
            else:
                labels.append(0)

        # Ensure labels match token length
        labels += [-100] * (len(input_ids) - len(labels))

        return input_ids, torch.tensor(labels)

# Custom collate function to handle padding of inputs and labels
def collate_fn(batch):
    # Get the inputs and labels from the batch
    inputs = [item[0] for item in batch]
    labels = [item[1] for item in batch]

    # Pad the inputs and labels to have the same length
    inputs_padded = pad_sequence(inputs, batch_first=True, padding_value=tokenizer.pad_token_id)
    labels_padded = pad_sequence(labels, batch_first=True, padding_value=-100)  # Use -100 for ignored labels

    return inputs_padded, labels_padded

# Create dataset
cv_texts = data['cv_text'].tolist()
keywords = data['keywords'].tolist()
dataset = KeywordDataset(cv_texts, keywords, tokenizer)

# Split into train and test set
train_size = int(0.8 * len(dataset))
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, len(dataset) - train_size])

# Create data loaders with the custom collate_fn
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False, collate_fn=collate_fn)

# Fine-tuning process
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
for epoch in range(3):
    model.train()
    total_loss = 0
    for batch in train_loader:
        inputs, labels = batch
        outputs = model(inputs, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}")

print("Model fine-tuned successfully.")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Epoch 1, Loss: 0.5243277847766876
Epoch 2, Loss: 0.06195594742894173
Epoch 3, Loss: 0.006692057941108942
Model fine-tuned successfully.


## *Testing the Model*

In [21]:
# Sample test CV
test_cv = "Skilled software engineer experienced in Java, cloud computing, and microservices. Contributed to large-scale system designs."

# Tokenize and prepare the test CV
inputs = tokenizer.encode(test_cv, return_tensors='pt', truncation=True, padding=True)

# Model prediction
model.eval()
with torch.no_grad():
    outputs = model(inputs)
    predictions = torch.argmax(outputs.logits, dim=-1).squeeze().tolist()

# Extract tokens and corresponding predictions
tokens = tokenizer.convert_ids_to_tokens(inputs[0])
predicted_keywords = [tokens[i] for i, label in enumerate(predictions) if label == 1]

# Display the predicted keywords
print("Predicted Keywords:", predicted_keywords)

Predicted Keywords: []
