In [1]:
import os 
import json
import kagglehub
import pandas as pd
from matplotlib import pyplot as plt
from collections import defaultdict

In [2]:
# Fetch the resume NER dataset from Kaggle; kagglehub returns the local directory it was saved to.
path = kagglehub.dataset_download("dataturks/resume-entities-for-ner")

print(f"Path to dataset files: {path}")

Path to dataset files: /Users/juanbeiroa/.cache/kagglehub/datasets/dataturks/resume-entities-for-ner/versions/1


In [3]:
# functions by K Scott Mader in https://www.kaggle.com/code/kmader/finding-good-parts-of-resumes/notebook
# Join with os.path.join rather than a hand-written '/' so the separator is
# correct on every platform and a trailing separator in `path` is harmless.
base_json = os.path.join(path, 'Entity Recognition in Resumes.json')
def pop_annot(raw_line):
    """Flatten one annotated resume record into a dict keyed by entity label.

    Parameters:
        raw_line (dict): One parsed JSON record. If it has an 'annotation'
            list, each item must carry a 'label' list and a 'points' list.
            A missing or null 'annotation' is treated as "no annotations".

    Returns:
        defaultdict: A copy of ``raw_line`` (missing keys default to ``[]``)
        where, for every annotation with a non-empty 'label', the annotation's
        'points' are appended under the key named by its first label.
    """
    in_line = defaultdict(list, **raw_line)
    # Use .get() instead of indexing so a record without annotations does not
    # gain an empty 'annotation' key; `or []` also tolerates an explicit null.
    for c_lab in in_line.get('annotation') or []:
        labels = c_lab['label']
        if labels:
            # Only the first label is used. `+=` extends the record's own list
            # (created on first access), so the source 'points' list is not mutated.
            in_line[labels[0]] += c_lab['points']
    return in_line
# The file is JSON Lines (one JSON object per line). Decode it as UTF-8
# explicitly: the default text encoding is platform-dependent and the resumes
# contain non-ASCII characters (e.g. bullet points).
with open(base_json, 'r', encoding='utf-8') as f:
    # Iterate the file lazily and skip blank lines, which json.loads() rejects.
    resume_data = [json.loads(f_line) for f_line in f if f_line.strip()]
# Build the frame after the file is closed; it only needs the parsed records.
resume_df = pd.DataFrame([pop_annot(line) for line in resume_data])
# Character count of each resume's raw text.
resume_df['length'] = resume_df['content'].map(len)

def extract_higlights(raw_line):
    """Collect every labelled span of one resume record under a 'highlight' key.

    Parameters:
        raw_line (dict): One parsed JSON record. If it has an 'annotation'
            list, each item must carry a 'label' list and a 'points' list of
            dicts. A missing or null 'annotation' is treated as "no annotations".

    Returns:
        defaultdict: A copy of ``raw_line`` (missing keys default to ``[]``).
        For every annotation with a non-empty 'label', each of its points is
        copied into ``['highlight']`` with an extra 'category' entry holding
        the annotation's first label. The 'highlight' key is absent when no
        annotation carries a label.
    """
    in_line = defaultdict(list, **raw_line)
    # Use .get() instead of indexing so a record without annotations does not
    # gain an empty 'annotation' key; `or []` also tolerates an explicit null.
    for c_lab in in_line.get('annotation') or []:
        labels = c_lab['label']
        if labels:
            # dict(...) builds a new dict per point, so the input points are not modified.
            in_line['highlight'] += [dict(category=labels[0], **cpts) for cpts in c_lab['points']]
    return in_line
# One row per resume, with all labelled spans gathered in the 'highlight' column.
resume_hl_df = pd.DataFrame(list(map(extract_higlights, resume_data)))
# Character count of each resume's raw text.
resume_hl_df['length'] = [len(text) for text in resume_hl_df['content']]

In [4]:
def merge_repeating_categories(data):
    """
    Courtesy of ChatGPT.
    Merges the text of repeating categories in a list of dictionaries.

    Parameters:
        data (list): A list of dictionaries with 'category' and 'text' keys.
            A missing value (None, or the float NaN that pandas stores for an
            absent cell) is treated as an empty list.

    Returns:
        dict: A dictionary where keys are unique categories (in order of first
        appearance) and values are the stripped texts joined by newlines.
    """
    # `data != data` is true only for NaN; this avoids calling a NaN check on a list.
    if data is None or (isinstance(data, float) and data != data):
        return {}

    # Collect the pieces per category and join once, instead of growing a string.
    texts_by_category = defaultdict(list)
    for entry in data:
        texts_by_category[entry['category']].append(entry['text'].strip())

    # The final strip() drops leading/trailing newlines left by empty pieces.
    return {
        category: '\n'.join(texts).strip()
        for category, texts in texts_by_category.items()
    }

In [5]:
# Extra cleaning of highlights: collapse each resume's list of labelled spans
# into a single {category: merged text} dict.
resume_hl_df['highlight'] = resume_hl_df['highlight'].apply(merge_repeating_categories)

In [6]:
def summarize_resume(data, sections=None):
    """Render selected resume sections as a short plain-text summary.

    Parameters:
        data (dict): Maps a section/category name to its text.
        sections (iterable of str, optional): Section names to include, in
            output order. Defaults to Name, Skills, College Name, Degree,
            Companies worked at and Designation.

    Returns:
        str: One "*<section>*: <text>.\\n" line per requested section that is
        present in ``data``; an empty string when none are present.
    """
    if sections is None:
        sections = ['Name', 'Skills', 'College Name', 'Degree', 'Companies worked at', 'Designation']
    # Sections absent from this resume are skipped rather than rendered empty.
    return "".join(
        f"*{section}*: {data[section]}.\n"
        for section in sections
        if section in data
    )

In [7]:
# Preview the summary built for the resume with index label 1.
summarize_resume(resume_hl_df.loc[1, 'highlight'])

'*Name*: Afreen Jamadar.\n*Skills*: Database (Less than 1 year), HTML (Less than 1 year), Linux. (Less than 1 year), MICROSOFT\nACCESS (Less than 1 year), MICROSOFT WINDOWS (Less than 1 year)\n\nADDITIONAL INFORMATION\n\nTECHNICAL SKILLS:\n\n• Programming Languages: C, C++, Java, .net, php.\n• Web Designing: HTML, XML\n• Operating Systems: Windows […] Windows Server 2003, Linux.\n• Database: MS Access, MS SQL Server 2008, Oracle 10g, MySql..\n*College Name*: Shivaji University Kolhapur\nCDAC ACTS.\n*Degree*: Bachelor of Engg in Information Technology\nPG-DAC.\n'

In [8]:
# Character count of that summary.
len(summarize_resume(resume_hl_df.loc[1, 'highlight']))

550

In [9]:
# Character count of the same resume's full text.
len(resume_hl_df.loc[1, 'content'])

1240

In [10]:
# Build the text summary of every resume from its merged highlights.
resume_hl_df['summary'] = resume_hl_df['highlight'].apply(summarize_resume)

In [11]:
# Drop the 'extras' column. Reassigning (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run once the column is already gone.
resume_hl_df = resume_hl_df.drop(columns='extras', errors='ignore')

In [12]:
# Show five random rows; the fixed random_state makes the preview reproducible across runs.
resume_hl_df.sample(5, random_state=42)

Unnamed: 0,content,annotation,highlight,length,summary
99,"Vikas Singh\nChandigarh, Chandigarh - Email me...","[{'label': ['Skills'], 'points': [{'start': 53...","{'Skills': 'SECURITY (5 years), INFORMATION SE...",5750,*Name*: Vikas Singh.\n*Skills*: SECURITY (5 ye...
34,"Khushboo Choudhary\nDeveloper\n\nNoida, Uttar ...","[{'label': ['Skills'], 'points': [{'start': 14...","{'Skills': 'ANDROID (Less than 1 year), CISCO ...",1874,*Name*: Khushboo Choudhary.\n*Skills*: ANDROID...
46,"Navas Koya\nTest Engineer\n\nMangalore, Karnat...","[{'label': ['Skills'], 'points': [{'start': 21...","{'Skills': 'SKILL SET • ASP.NET, C# • QA tools...",2404,*Name*: Navas Koya.\n*Skills*: SKILL SET • ASP...
111,Zaheer Uddin\nTechnical Project Manager\n\nHyd...,"[{'label': ['Location'], 'points': [{'start': ...","{'Location': 'Hyderabad Hyderabad', 'College N...",4993,*Name*: Zaheer Uddin.\n*College Name*: Osmania...
80,Shreyanshu Gupta\nSoftware Development Enginee...,"[{'label': ['Skills'], 'points': [{'start': 41...",{'Skills': 'GIT HTML PHP Web Development Veloc...,4193,*Name*: Shreyanshu Gupta.\n*Skills*: GIT\nHTML...


In [13]:
from datasets import Dataset

In [14]:
# Wrap the cleaned DataFrame in a Hugging Face `datasets.Dataset`.
dataset = Dataset.from_pandas(resume_hl_df)

In [17]:
# 70/30 train/test split. The fixed seed makes the shuffle, and therefore the
# published split, reproducible on a fresh run.
ds = dataset.train_test_split(test_size=0.3, seed=42)

In [18]:
# Show the resulting DatasetDict (split names, features and row counts).
ds

DatasetDict({
    train: Dataset({
        features: ['content', 'annotation', 'highlight', 'length', 'summary'],
        num_rows: 154
    })
    test: Dataset({
        features: ['content', 'annotation', 'highlight', 'length', 'summary'],
        num_rows: 66
    })
})

In [19]:
# Upload both splits to the Hugging Face Hub dataset repo named below.
# NOTE(review): presumably requires being logged in with write access to that repo — confirm.
ds.push_to_hub('jbeiroa/resume_entities_ner_summaries')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/jbeiroa/resume_entities_ner_summaries/commit/8939e0b6d24e740a8d18c4b2cc64a549bc47f7af', commit_message='Upload dataset', commit_description='', oid='8939e0b6d24e740a8d18c4b2cc64a549bc47f7af', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/jbeiroa/resume_entities_ner_summaries', endpoint='https://huggingface.co', repo_type='dataset', repo_id='jbeiroa/resume_entities_ner_summaries'), pr_revision=None, pr_num=None)