In [1]:
import pandas as pd
import numpy as np

from unsloth import FastLanguageModel 
from unsloth import is_bfloat16_supported
import torch
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import load_dataset, Dataset
from tqdm import tqdm

np.random.seed(42)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


# Install Note

Note: to use this notebook, you need:
1. a gpu machine
2. install the following extra dependencies: https://github.com/unslothai/unsloth

I used an AWS `g5.xlarge` instance with the Ubuntu pytorch image.

# Utils

In [2]:
max_seq_length = 2048
num_valid = 100

train_steps = 50

In [3]:
instructions= """
You are a website classifier. I'm going to give you access to a website content (converted to markdown). Your job is to determine whether the website contains jobs.

You shall classify websites into the following buckets:
- Job list: the website explicitly lists actual jobs (with titles), or links to jobs that you can apply for. There has to be 1+ job listed.
- Empty list: the website has a section where jobs did and will apear, but right now it's empty, or company is currently not hiring. If there is a list with zero items, that also counts as empty list Even if they encourga people to send their CV, it counts as empty list if they state that they have no jobs right now. If the website mentions current roles but does not list any specific jobs, it is an empty list.
- Link to jobs: the page does not contain job lists, but has a link to a career / job page that has it. The link could be on the same domain, a separate domain, or it could be a link to an ATS provider or job board. Note: if the navbar (or footer) has a "Company"/"About"/"About us" (or similar section that often contains jobs), select this option. Also, it has a button saying "Apply" with no further context, it still counts as a link, not open apply; unless generic applications are specifically stated.
- Job open apply: the website says their accepting jobs, but does not list the jobs. Instead, it encourages people to apply, or email HR, or just has a rolling generic process. Note: the previous category takes precedence!
- No jobs: the website does not have any explicit jobs, does not have a vague apply by sending HR your CV, and does not even include a link to a separate page for jobs. This is different from section 1 in that there is no mention of jobs/careers/etc.

Use a waterfall approach: if the first category is satisfied (Job list), select that, then move to the next. If empty list is satisfied, select that, and so on. Categories higher up in the list take precedence.

Note:
- If the site does not list specific links AND mentions that they currently have no jobs, it is an empty list - NO MATTER if they still encourage you to apply.

Examples:
- "While FitLife has no current openings, we're always looking for talented fitness professionals. Please check back soon" -> Empty list
- "    StreamIt - Unlimited Movies and TV Shows. Enjoy the latest movies and shows without ads. Start streaming today!" -> No jobs
- "    Interested in working with us? Check our [career page](https://www.acme.com/jobs) for the latest openings." -> Link to jobs
- "GreenEnergy is always looking for skilled professionals to join our mission for sustainable energy. Send us your resume." -> Job open apply
- '''
TravelPro is an award-winning travel agency offering luxury vacations and personalized travel services.

    Positions available:
    - [Travel Consultant](https://travelpro.com/jobs/travel-consultant)
    - [Customer Service Associate](https://travelpro.com/jobs/customer-service)
''' -> Job list
- "    LearnTech is always looking for new talent. While we have no current positions available, feel free to submit your resume for future consideration on our [careers portal](https://learntech.com/careers)." -> Empty list
- "   ACME is always looking for talented individuals to join our team, but we currently don't have any open positions.\n\n    Please check back soon or submit your resume for future opportunities via our [general application form](https://acme.com/apply)." -> Empty list
- "    We currently have no job openings, but you can always stay updated via our [careers page](https://bluesky.com/careers)." -> Empty list
- " ACME is currently not having any open roles, but we're always looking for talented individuals. Please send us your resume." -> Empty list
- " Jobs at FitLife: \n\n - \n" -> Empty list

Only answer with the classification `Job list`, `Empty list`, `Link to jobs`, `Job open apply`, `No jobs` and nothing else.
    """.strip()

# Set Up Data

In [4]:
df = pd.read_csv("../data/04b_first_status.csv")

df.head(2)

Unnamed: 0,CompanyName,Website,scrape_status,md_status,md,html_length,md_length,valid_website,status
0,Sawa Impact Labs,https://sawa.gallery/,Success,Success,Squarespace \- Website Expired\n\n\n \nWebsite...,6998,124,valid,No jobs
1,OpenMI at the University of Michigan,https://maizepages.umich.edu/organization/openmi,Success,Success,OpenMI \- University of Michigan Maize Pages\n...,39237,784,valid,No jobs


In [5]:
df.status.value_counts()

status
No jobs           1476
Link to jobs       277
Job open apply      48
Job list            11
Name: count, dtype: int64

In [6]:
# Hacky way to get exactly 200 validation examples
df['split'] = 'train'
df.loc[np.random.choice(len(df), size=num_valid, replace=False), 'split'] = 'valid'

In [7]:
df.split.value_counts()

split
train    1900
valid     100
Name: count, dtype: int64

In [8]:
train = df[df.split == 'train'].copy()
valid = df[df.split == 'valid'].copy()

len(train), len(valid)

(1900, 100)

# Settings

Pretty much using unsloth defaults everywhere.

In [9]:

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = None,
    load_in_4bit = True,
)

==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA A10G. Max memory: 21.988 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.27.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


In [10]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,
    loftq_config = None, 
)

Unsloth 2024.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [11]:
prompt = """### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    inputs       = examples["md"]
    outputs      = examples["status"]
    texts = []
    for input, output in zip(inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = prompt.format(instructions, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }

# Finetune LoRA adapter

In [12]:
dataset = Dataset.from_pandas(train, split='train')

dataset = dataset.map(formatting_prompts_func, batched = True,)

Map:   0%|          | 0/1900 [00:00<?, ? examples/s]

In [13]:
len(dataset)

1900

Note: only training for 50 steps for demo purposes (takes ~10min), obviously would train longer (until validation accuracy increases)

In [14]:
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
#         num_train_epochs = 3,
        max_steps = train_steps,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

Map (num_proc=2):   0%|          | 0/1900 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [15]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 1,900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 50
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,1.9922
2,2.1103
3,2.1324
4,1.883
5,2.0154
6,1.8957
7,1.6979
8,1.6911
9,1.726
10,1.3539


In [16]:
model.save_pretrained("lora_model") # Local saving
tokenizer.save_pretrained("lora_model")

('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/tokenizer.json')

# Inference

In [17]:
model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = None,
        load_in_4bit = True,
    )
FastLanguageModel.for_inference(model)

==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA A10G. Max memory: 21.988 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.27.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear4bit(
      

In [18]:
sample = valid.sample(1)
url = sample.Website.iloc[0]
md = sample.md.iloc[0]

In [19]:
def limit_string(x: str, n=7000) -> str:
    if n < 3:
        raise ValueError("n must be at least 3")
    if len(x) > n:
        return x[: (n - 3)] + "..."
    return x

def extract_response(out):
    try:
        return out.split("<|end_of_text|>")[0].split("Response:")[1].strip()
    except Exception as err:
        print(err)
        return None

def predict_batch(mds):
    
    mds = [limit_string(md) for md in mds]  # super hacky
    
    lengths = [len(md) for md in mds]
    
    assert all(x > 5 for x in lengths)
    
    _formatted = [prompt.format(instructions, md, "") for md in mds]
    
    inputs = tokenizer(_formatted, return_tensors = "pt", padding=True).to("cuda")

    outputs = model.generate(**inputs, max_new_tokens = 32, use_cache = True)
    whole_texts = tokenizer.batch_decode(outputs)
    return [extract_response(t) for t in whole_texts]

In [20]:
predict_batch(valid.sample(3).md.tolist())

['No jobs', 'No jobs', 'No jobs']

# Get Accuracy

In [21]:
batched = [valid.md.iloc[i:(i+4)] for i in range(0, num_valid, 4)]

In [22]:
batched_res = [predict_batch(b) for b in tqdm(batched)]

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [01:38<00:00,  3.94s/it]


In [23]:
unbatched = [x for b in batched_res for x in b]
len(unbatched)

100

In [24]:
valid['llama_results'] = unbatched

In [25]:
def _correct_syntax(x):
    if x in ["Job list", "Job open apply", "Link to jobs", "No jobs"]:
        return True
    return False

In [26]:
valid['syntax_correct'] = valid.llama_results.apply(_correct_syntax)

In [27]:
valid.sample(5).llama_results.tolist()

['No jobs list', 'No jobs\n\n###', 'No jobs', 'No jobs found', 'No jobs list']

Note: this was the result for training for 5 steps:

In [59]:
valid.syntax_correct.mean(), (valid.results == valid.status).mean()

(np.float64(0.21), np.float64(0.04))

Results for 50 steps:

In [29]:
valid.syntax_correct.mean(), (valid.llama_results == valid.status).mean()

(np.float64(0.66), np.float64(0.49))

What we really want is precision & recall for job links found

In [30]:
sum(valid.status == "Link to jobs"), sum(valid.llama_results == "Link to jobs")

(16, 1)

In [31]:
# recall
(valid.llama_results == valid.status)[valid.status == "Link to jobs"].mean()

np.float64(0.0625)

In [32]:
# precision
(valid.llama_results == valid.status)[valid.llama_results == "Link to jobs"].mean()

np.float64(1.0)

In [34]:
valid.to_csv("../data/05_ft_validation.csv", index=False)