# GPT2-based Career Recomendation / Skill Suggestion model

In [1]:
#pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117
#pip install torch[distributed]
#pip install --upgrade pip
#pip install torchvision --user
#pip install datasets
#pip install -U accelerate
#pip install -U transformers
#pip install tensorflow-gpu

In [2]:
#pip show tensorflow

In [3]:
import torch
import textwrap

In [4]:
import tensorflow as tf
import os


In [5]:
print("GPU available:", torch.cuda.is_available())
print("GPU name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "None")

GPU available: True
GPU name: NVIDIA GeForce GTX 1660 SUPER


# Load Dataset

In [50]:
from datasets import load_dataset
dataset = load_dataset("nakamoto-yama/job-descriptions-public")

Repo card metadata block was not found. Setting CardData to empty.


In [7]:
dataset["train"]["Job"][:10]

['Sr. Software Engineer - NCR Corporation | Built In',
 'Senior Data Engineer - Quantexa | Built In',
 'Senior Software Engineer (Fullstack) - Zeal | Built In',
 'Finance Business Analyst - Apex Fintech Solutions | Built In',
 'Sales Data Analyst - Madhive | Built In',
 'Senior AI Engineer (F/M/D) - remberg | Built In',
 'Senior Android Engineer, Instacart Business - Instacart | Built In',
 'Program Acquisition Analyst - Aetos Systems, Inc. | Built In',
 'Team Lead Engineering (m/f/d) - SoSafe | Built In',
 'Backend Software Engineer - Patrick J McGovern Foundation | Built In']

In [51]:
dataset["train"][:1]

{'id': [18583],
 'Job': ['Sr. Software Engineer - NCR Corporation | Built In'],
 'Description': ['About NCR VOYIX NCR VOYIX Corporation (NYSE: VYX) is a leading global provider of digital commerce solutions for the retail, restaurant and banking industries. NCR VOYIX is headquartered in Atlanta, Georgia, with approximately 16,000 employees in 35 countries across the globe. For nearly 140 years, we have been the global leader in consumer transaction technologies, turning everyday consumer interactions into meaningful moments. Today, NCR VOYIX transforms the stores, restaurants and digital banking experiences with cloud-based, platform-led SaaS and services capabilities. Not only are we the leader in the market segments we serve and the technology we deliver, but we create exceptional consumer experiences in partnership with the world\'s leading retailers, restaurants and financial institutions. We leverage our expertise, R&D capabilities and unique platform to help navigate, simplify an

## Cleaning the Dataset

In [None]:
import re # Regular Expression

# Function to clean the Job titles
def clean_job_title(job_title):
    # Remove company names, typically after a dash or pipe
    job_title = re.sub(r" - .*", "", job_title)  # Remove anything after ' - '
    job_title = re.sub(r" \| .*", "", job_title)  # Remove anything after ' | '

    # Remove parenthetical notes like (F/M/D), (m/f/d), etc.
    job_title = re.sub(r"\s*\(.*?\)", "", job_title)

    # Remove extra spaces, punctuation, or symbols at the start or end
    job_title = job_title.strip()
    job_title = re.sub(r"[^\w\s\-]+$", "", job_title)  # Remove trailing punctuation
    return job_title

In [9]:
# Apply the cleaning function to the dataset
def preprocess_example(example):
    example["Job"] = clean_job_title(example["Job"])
    return example

In [10]:
cleaned_dataset = dataset.map(preprocess_example)


In [11]:
cleaned_dataset["train"]["Job"][:10]  # Display first 10 cleaned job titles

['Sr. Software Engineer',
 'Senior Data Engineer',
 'Senior Software Engineer',
 'Finance Business Analyst',
 'Sales Data Analyst',
 'Senior AI Engineer',
 'Senior Android Engineer, Instacart Business',
 'Program Acquisition Analyst',
 'Team Lead Engineering',
 'Backend Software Engineer']

In [12]:
cleaned_dataset.save_to_disk("./cleaned_job_dataset")

Saving the dataset (0/1 shards):   0%|          | 0/3125 [00:00<?, ? examples/s]

## Formating the Prompt - Completion

In [14]:
# Define a function to format the data for training
def format_example(example):
    prompt = f"Description: {example['Description']}"
    completion = f"Job: {example['Job']}"
    return {"input": prompt, "output": completion}

In [15]:
# Apply formatting to the dataset
formatted_data = cleaned_dataset.map(format_example)

Map:   0%|          | 0/3125 [00:00<?, ? examples/s]

In [16]:
formatted_data

DatasetDict({
    train: Dataset({
        features: ['id', 'Job', 'Description', 'input', 'output'],
        num_rows: 3125
    })
})

In [17]:
formatted_data = formatted_data.remove_columns(["id"]) # irrelevant column

In [18]:
formatted_data

DatasetDict({
    train: Dataset({
        features: ['Job', 'Description', 'input', 'output'],
        num_rows: 3125
    })
})

## Spliting the formated and clean dataset

In [19]:
train_test_split = formatted_data["train"].train_test_split(test_size=0.2)
train_data = train_test_split["train"] # 2500
val_data = train_test_split["test"] # 625

## Loading the GPT2 Model

In [22]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Load the model and tokenizer
model_name = "gpt2"  # You could also try "gpt2-medium" or "gpt2-large" if resources allow
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Ensure the tokenizer can handle unknown tokens
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id

In [23]:
# Keep an instance of the GPT2 model without FT for comparison 

model_default = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer_default = GPT2Tokenizer.from_pretrained(model_name)

In [24]:
model_default.save_pretrained("./default_gpt2_4")
tokenizer_default.save_pretrained("./default_gpt2_4")

('./default_gpt2_4\\tokenizer_config.json',
 './default_gpt2_4\\special_tokens_map.json',
 './default_gpt2_4\\vocab.json',
 './default_gpt2_4\\merges.txt',
 './default_gpt2_4\\added_tokens.json')

In [25]:
model_size = sum(t.numel() for t in model_default.parameters())
print(f"GTP2 Default Model size: {model_size/1000**2:.1f}M parameters")

GTP2 Default Model size: 124.4M parameters


## Tokenizing the data

In [26]:
def tokenize_function(examples):
    '''
    input: examples - each example from the dataset.
    '''
    # Combine "input" and "output" for each example
    combined = [
        example_input + tokenizer.eos_token + example_output
        for example_input, example_output in zip(examples["input"], examples["output"])
    ]
    # Tokenize the combined text
    tokenized = tokenizer(combined, truncation=True, padding=True)
    # Labels are the same as input_ids for causal language modeling
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized


In [27]:
tokenized_data = formatted_data.map(tokenize_function, batched=True) # map() applies a function to each example in dataset. Batched inproves the speed
tokenized_data.set_format(type="torch", columns=["input_ids", "attention_mask" , "labels"])

Map:   0%|          | 0/3125 [00:00<?, ? examples/s]

In [28]:
train_test_split = tokenized_data["train"].train_test_split(test_size=0.2)
train_data = train_test_split["train"] # 2812
val_data = train_test_split["test"] # 313

## Training Arguments

In [31]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    overwrite_output_dir=True, #overwrite the content of the output directory
    learning_rate=5e-5,
    #per_device_train_batch_size=2,
    per_device_train_batch_size=4, # batch size for training
    per_device_eval_batch_size=4,  # batch size for evaluation
    num_train_epochs=3,
    weight_decay=0.01,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir="./logs",
    fp16=True,  # Enable mixed precision if using GPU
)



In [32]:
# Initialize the data collator which creates batches of the training and eval set.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # We're fine-tuning GPT-2, so MLM is disabled
)

## Training the model

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=data_collator,  # Add the data collator
)

trainer.train()

In [None]:
model.save_pretrained("./fine_tuned_gpt2_3")
tokenizer.save_pretrained("./fine_tuned_gpt2_3")

## Inference

In [43]:
def load_model(model_path):
    model = GPT2LMHeadModel.from_pretrained(model_path)
    return model

In [44]:
def load_tokenizer(tokenizer_path):
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
    tokenizer.pad_token = tokenizer.eos_token
    return tokenizer

In [None]:
def generate_text(model_path, prompt, max_length):
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)

    # Determine device (GPU or CPU)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Move model to device
    model = model.to(device)

    inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(device)

    outputs = model.generate(
    inputs["input_ids"],
    max_length=max_length,  # Limit the output length
    num_return_sequences=1,
    temperature=0.7,  # Introduce randomness in generation
    top_k=50,         # Consider top 50 likely tokens for more deterministic output
    top_p=0.9,        # most probable tokens are kept
    repetition_penalty=2.5,  # Penalize repetition
    pad_token_id=tokenizer.eos_token_id  # Explicitly set the pad token
    )
    
    generated_text =tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
    # Wrap the output to fit within the notebook cell
    wrapped_text = textwrap.fill(generated_text, width=100)  # Set desired line width
    print(wrapped_text)
    


In [46]:
prompt = "Tell me about the qualifications required for a Data scientist."

In [47]:
model_def_path = "./default_gpt2_4"
max_len = 150
generate_text(model_def_path,prompt,max_len)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Tell me about the qualifications required for a Data scientist. A: I am an engineer, and my
background is in data science at Microsoft Research (MSR). My main job involves analyzing large
datasets to understand how they are used by organizations or individuals within their organization;
this includes using machine learning techniques such as deep neural networks that can be applied
across multiple domains of analysis including human-computer interaction research with social media
platforms like Facebook Messenger etc., but also other types where you need more than just one
person working on your project - it's much easier if there aren't many people involved! In addition
we have our own team who work closely together so when someone comes along looking into something
new then everyone will know what was done before them... So please


In [48]:
model_Trained_path = "./fine_tuned_gpt2_3"
max_len = 150
generate_text(model_Trained_path,prompt,max_len)

Tell me about the qualifications required for a Data scientist. What you'll do: Work with data
scientists to understand how our customers use our products and services, identify opportunities
that could improve their business processes or drive revenue growth by leveraging AI models in order
create insights into customer behavior through analytics tools such as Salesforce Analytics,
SalesForce Cloud Platforms (SaaN), etc., Develop best practices across all of your projects Build
scalable solutions using Python/Python-based technologies like Django & Flask Create automated tests
on existing systems Write clean code based upon user feedback Participate actively within teams
where necessary Provide technical support via email Confirm work is completed successfully
Qualifications Required Skills You will be working at a startup environment Experience building
software applications from scratch Strong understanding OFM
