In [10]:
import os 
import json
import kagglehub
import pandas as pd
from matplotlib import pyplot as plt
from collections import defaultdict

In [2]:
# Resume NER dataset from Kaggle: kagglehub downloads it and returns the
# location of the dataset files, which is kept in `path`.
DATASET_HANDLE = "dataturks/resume-entities-for-ner"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/dataturks/resume-entities-for-ner?dataset_version_number=1...


100%|█████████████████████████████████████████| 323k/323k [00:00<00:00, 600kB/s]

Extracting files...
Path to dataset files: /Users/juanbeiroa/.cache/kagglehub/datasets/dataturks/resume-entities-for-ner/versions/1





In [51]:
# functions by K Scott Mader in https://www.kaggle.com/code/kmader/finding-good-parts-of-resumes/notebook
# Join with os.path.join so the separator matches the host platform instead of
# hard-coding '/'.
base_json = os.path.join(path, 'Entity Recognition in Resumes.json')
def pop_annot(raw_line):
    """
    Flattens one annotated resume record into per-label lists of points.

    Parameters:
        raw_line (dict): One parsed JSON record. Its optional 'annotation'
            entry is iterated as a sequence of dicts, each with a 'label'
            list (only the first element is used) and a 'points' list.
            A missing or null 'annotation' means "no labels".

    Returns:
        defaultdict: The record's own top-level keys plus, for every
            annotation whose 'label' list is non-empty, a key named after the
            first label holding the concatenation of all matching 'points'.
    """
    # Passing the mapping positionally (rather than **raw_line) makes a shallow
    # copy without requiring every key to be a string.
    in_line = defaultdict(list, raw_line)
    # `or []` covers an explicit null annotation, which would otherwise raise
    # TypeError when iterated.
    for c_lab in in_line.get('annotation') or []:
        if len(c_lab['label']) > 0:
            label = c_lab['label'][0]
            # Build a new list instead of `+=` so a list shared with the
            # caller's record is never extended in place.
            in_line[label] = in_line[label] + c_lab['points']
    return in_line
# Read with an explicit UTF-8 encoding: the resumes contain non-ASCII
# characters (e.g. bullet points) and the platform default encoding is not
# UTF-8 everywhere.
with open(base_json, 'r', encoding='utf-8') as f:
    # data is jsonl and so we parse it line-by-line, skipping blank lines
    resume_data = [json.loads(f_line) for f_line in f if f_line.strip()]
# The file handle is no longer needed once every line has been parsed.
resume_df = pd.DataFrame([pop_annot(line) for line in resume_data])
# Character count of each resume's raw text.
resume_df['length'] = resume_df['content'].map(len)

def extract_higlights(raw_line):
    """
    Collects every labelled point of a resume record into one 'highlight' list.

    Parameters:
        raw_line (dict): One parsed JSON record. Its optional 'annotation'
            entry is iterated as a sequence of dicts, each with a 'label'
            list (only the first element is used) and a 'points' list of
            dicts. A missing or null 'annotation' means "no labels".

    Returns:
        defaultdict: The record's own top-level keys plus, when at least one
            annotation has a non-empty 'label', a 'highlight' list with one
            dict per point: the point's own keys and a 'category' key set to
            the annotation's first label.
    """
    # Passing the mapping positionally (rather than **raw_line) makes a shallow
    # copy without requiring every key to be a string.
    in_line = defaultdict(list, raw_line)
    # `or []` covers an explicit null annotation, which would otherwise raise
    # TypeError when iterated.
    for c_lab in in_line.get('annotation') or []:
        if len(c_lab['label']) > 0:
            category = c_lab['label'][0]
            tagged_points = [dict(category=category, **cpts) for cpts in c_lab['points']]
            # Build a new list instead of `+=` so a 'highlight' list already
            # present on the caller's record is never extended in place.
            in_line['highlight'] = in_line['highlight'] + tagged_points
    return in_line
# One row per resume record after extract_higlights, plus a 'length' column
# holding len() of each row's 'content'.
resume_hl_df = pd.DataFrame(list(map(extract_higlights, resume_data))).assign(
    length=lambda frame: frame['content'].map(len)
)

In [32]:
def merge_repeating_categories(data):
    """
    Courtesy of ChatGPT.
    Merges the text of repeating categories in a list of dictionaries.

    Parameters:
        data (list): A list of dictionaries with 'category' and 'text' keys.
            None or NaN (e.g. the value pandas uses for a missing cell) is
            treated as an empty list.

    Returns:
        dict: A dictionary where keys are unique categories (in first-seen
            order) and values are the stripped texts joined by newlines.
    """
    # Missing value: None, or float NaN (NaN is the only float with x != x).
    if data is None or (isinstance(data, float) and data != data):
        return {}

    texts_by_category = defaultdict(list)  # category -> stripped texts, in order

    for entry in data:
        texts_by_category[entry['category']].append(entry['text'].strip())

    # Join once per category; the outer strip drops leading/trailing newlines
    # left behind by entries whose text was empty after stripping.
    return {
        category: '\n'.join(texts).strip()
        for category, texts in texts_by_category.items()
    }

{'Skills': 'Teradata\nMainframe\nTeradata\nMainframe\nTeradata\nJcl\ncobol\nMainframe\nservicenow',
 'College Name': 'Anurag College of Engineering (Jntuh)',
 'Degree': 'Electrical and Electronics Engineering',
 'Location': 'Hyderabad\nHyderabad\nHyderabad',
 'Companies worked at': 'Infosys Limited\nInfosys Limited',
 'Designation': 'Senior Systems Engineer\nSenior Systems Engineer',
 'Email Address': 'indeed.com/r/Akhil-Yadav-Polemaina/\nf6931801c51c63b1',
 'Name': 'Akhil Yadav Polemaina'}

In [52]:
# extra cleaning of highlights: collapse each row's list of highlight dicts
# into a single {category: merged text} dict. Values that are already dicts are
# left untouched, so re-running this cell does not fail on merged rows.
resume_hl_df['highlight'] = resume_hl_df['highlight'].map(
    lambda hl: hl if isinstance(hl, dict) else merge_repeating_categories(hl)
)

In [62]:
def summarize_resume(data):
    """
    Builds a short text summary from a resume's merged highlights.

    Parameters:
        data (dict): Maps section names to their text.

    Returns:
        str: One "*<section>*: <text>.\\n" line for each of the fixed sections
            present in `data`, in the fixed order below; "" if none is present.
    """
    ordered_sections = ('Name', 'Skills', 'College Name', 'Degree', 'Companies worked at', 'Designation')
    available = data.keys()
    summary_lines = [
        f"*{section}*: {data[section]}.\n"
        for section in ordered_sections
        if section in available
    ]
    return "".join(summary_lines)

In [66]:
# Preview the generated summary for the row labelled 1.
summarize_resume(resume_hl_df.loc[1, 'highlight'])

'*Name*: Afreen Jamadar.\n*Skills*: Database (Less than 1 year), HTML (Less than 1 year), Linux. (Less than 1 year), MICROSOFT\nACCESS (Less than 1 year), MICROSOFT WINDOWS (Less than 1 year)\n\nADDITIONAL INFORMATION\n\nTECHNICAL SKILLS:\n\n• Programming Languages: C, C++, Java, .net, php.\n• Web Designing: HTML, XML\n• Operating Systems: Windows […] Windows Server 2003, Linux.\n• Database: MS Access, MS SQL Server 2008, Oracle 10g, MySql..\n*College Name*: Shivaji University Kolhapur\nCDAC ACTS.\n*Degree*: Bachelor of Engg in Information Technology\nPG-DAC.\n'

In [63]:
# Length in characters of that summary.
len(summarize_resume(resume_hl_df.loc[1, 'highlight']))

550

In [65]:
# Length in characters of the same row's full resume text, for comparison.
len(resume_hl_df.loc[1, 'content'])

1240

In [68]:
# Add a 'summary' column: summarize_resume applied to every row's highlights.
resume_hl_df['summary'] = resume_hl_df['highlight'].apply(summarize_resume)

In [74]:
# Drop the 'extras' column, which nothing below uses. Reassigning (instead of
# inplace=True) with errors='ignore' keeps this cell re-runnable: a second run
# no longer raises KeyError because the column is already gone.
resume_hl_df = resume_hl_df.drop(columns='extras', errors='ignore')

In [77]:
from datasets import Dataset

In [105]:
# Convert the pandas frame into a `datasets.Dataset`.
dataset = Dataset.from_pandas(resume_hl_df)

In [108]:
# Hold out 15% of the rows for testing. A fixed seed makes the shuffle, and
# therefore the published train/test membership, reproducible across runs.
ds = dataset.train_test_split(test_size=0.15, seed=42)

In [109]:
# Display the resulting DatasetDict (split names, features, row counts).
ds

DatasetDict({
    train: Dataset({
        features: ['content', 'annotation', 'highlight', 'length', 'summary'],
        num_rows: 187
    })
    test: Dataset({
        features: ['content', 'annotation', 'highlight', 'length', 'summary'],
        num_rows: 33
    })
})

In [110]:
# Upload both splits to the 'jbeiroa/resume_entities_ner_summaries' dataset
# repository on the Hugging Face Hub.
ds.push_to_hub('jbeiroa/resume_entities_ner_summaries')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/33.0 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/jbeiroa/resume_entities_ner_summaries/commit/856d2b9e2e73ec530d6146c557782845a4d912e4', commit_message='Upload dataset', commit_description='', oid='856d2b9e2e73ec530d6146c557782845a4d912e4', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/jbeiroa/resume_entities_ner_summaries', endpoint='https://huggingface.co', repo_type='dataset', repo_id='jbeiroa/resume_entities_ner_summaries'), pr_revision=None, pr_num=None)