# LinkedIn Classification - Qwen2.5-3B QLoRA Fine-Tuning

## Capstone Project: Julius-Maximilians-Universität Würzburg

**Model:** Qwen2.5-3B-Instruct  
**Method:** QLoRA (4-bit quantization + LoRA)  
**Hardware:** Google Colab T4 (16GB VRAM)  



In [None]:
%%capture
!pip install -U -qq transformers>=4.44.0
!pip install -U -qq peft>=0.12.0
!pip install -U -qq trl>=0.9.0
!pip install -U -qq bitsandbytes>=0.43.0
!pip install -U -qq accelerate>=0.33.0
!pip install -U -qq datasets scipy
!pip install -qq pandas scikit-learn matplotlib seaborn

In [None]:
import os, json, random, warnings
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from trl import SFTTrainer, SFTConfig

# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')

# One fixed seed shared by the Python, NumPy and PyTorch random generators.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Report the runtime environment (GPU name only when CUDA is usable).
print(f"PyTorch: {torch.__version__}")
has_cuda = torch.cuda.is_available()
print(f"CUDA: {has_cuda}")
if has_cuda:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## 1. Load Data

In [None]:

def _load_active_jobs(path):
    """Read a JSON file holding a list of CVs (each a list of job dicts) and
    return a DataFrame with one row per job whose 'status' is 'ACTIVE'."""
    with open(path, 'r', encoding='utf-8') as f:
        cvs = json.load(f)
    return pd.DataFrame([job for cv in cvs for job in cv if job.get('status') == 'ACTIVE'])


# TRAINING DATA - CSVs only
df_sen_csv = pd.read_csv('seniority-v2.csv')[['text', 'label']].dropna()
df_dept_csv = pd.read_csv('department-v2.csv')[['text', 'label']].dropna()

# TEST DATA - 100% held out, NEVER train on this!
df_test = _load_active_jobs('testdata.txt')

# PRODUCTION DATA
df_prod = _load_active_jobs('more.txt')

# Actually measure the overlap instead of only claiming it: count test titles
# that appear verbatim among the training texts.
train_titles = set(df_sen_csv['text']) | set(df_dept_csv['text'])
# An empty test file yields a DataFrame without a 'position' column.
test_titles = set(df_test['position'].dropna()) if 'position' in df_test.columns else set()
n_overlap = len(train_titles & test_titles)

print("=" * 50)
if n_overlap == 0:
    print("DATA SPLIT (ZERO LEAKAGE VERIFIED)")
else:
    print(f"DATA SPLIT (WARNING: {n_overlap} of {len(test_titles)} unique test titles also occur in training data)")
print("=" * 50)
print(f"TRAINING (CSVs): {len(df_sen_csv)} seniority + {len(df_dept_csv)} department")
print(f"TESTING (held out): {len(df_test)} samples")
print(f"PRODUCTION: {len(df_prod)} samples")

In [None]:
# Closed label sets; model outputs are validated against these lists.
SENIORITY_LABELS = [
    'Junior',
    'Professional',
    'Senior',
    'Lead',
    'Director',
    'Management',
]
DEPARTMENT_LABELS = [
    'Administrative',
    'Business Development',
    'Consulting',
    'Customer Support',
    'Human Resources',
    'Information Technology',
    'Marketing',
    'Other',
    'Project Management',
    'Purchasing',
    'Sales',
]

# Estimation functions (for combining CSVs)
import re  # used for the word-boundary anchored rules below

# Ordered (department, pattern) rules; the first pattern found in the lower-cased
# title wins. Short abbreviations are anchored with \b so they cannot fire inside
# longer words ("cto" in "director", "hr" in "geschäftsführer", "it-" in "audit-"),
# and "office" is not allowed to match "officer".
_DEPARTMENT_RULES = [
    ('Information Technology', re.compile(r'software|developer|entwickler|engineer|data|devops|\bit\b|\bcto\b')),
    ('Sales', re.compile(r'sales|vertrieb|account exec')),
    ('Marketing', re.compile(r'marketing|brand|content|\bcmo\b')),
    ('Human Resources', re.compile(r'\bhr|human|recruit|personal')),
    ('Consulting', re.compile(r'consult|berat')),
    ('Project Management', re.compile(r'project|produkt|scrum|agile')),
    ('Customer Support', re.compile(r'support|customer|service')),
    ('Administrative', re.compile(r'admin|office(?!r)|secretary')),
    ('Purchasing', re.compile(r'purchas|einkauf|procurement')),
    ('Business Development', re.compile(r'business develop|partner')),
]


def estimate_department(title):
    """Guess a department label for a job title with keyword heuristics.

    Args:
        title: Job title; may be None, NaN or empty.

    Returns:
        The department of the first matching rule in ``_DEPARTMENT_RULES``,
        or 'Other' when the title is missing or nothing matches.
    """
    if not title or pd.isna(title):
        return 'Other'
    t = str(title).lower()
    for department, pattern in _DEPARTMENT_RULES:
        if pattern.search(t):
            return department
    return 'Other'

def estimate_seniority(title):
    """Guess a seniority label for a job title with keyword heuristics.

    Rules are checked from most to least senior; the first hit wins.
    C-level abbreviations and "intern" are matched as whole words so that
    "director" (contains "cto"), "coordinator" (contains "coo") and
    "international" (contains "intern") are no longer misclassified.

    Args:
        title: Job title; may be None, NaN or empty.

    Returns:
        One of 'Management', 'Director', 'Lead', 'Senior', 'Junior', or
        'Professional' (the default, also used for missing titles).
    """
    import re  # local import keeps this function self-contained

    if not title or pd.isna(title):
        return 'Professional'
    t = str(title).lower()

    if re.search(r'\b(?:ceo|cfo|coo|cmo|cto)\b|chief|geschäftsführer|founder|owner|inhaber|vorstand|prokurist', t):
        return 'Management'
    if 'director' in t:
        # A managing director runs the company; every other director heads a function.
        return 'Management' if 'managing' in t else 'Director'
    if re.search(r'head of|leiter|team lead|teamleiter|supervisor', t):
        return 'Lead'
    if re.search(r'senior|sr\.|principal|staff|expert', t):
        return 'Senior'
    if re.search(r'junior|jr\.|\bintern(?:s|ship)?\b|trainee|praktikant|werkstudent|azubi', t):
        return 'Junior'
    return 'Professional'

In [None]:
# Build training data from CSVs.
# Each CSV carries a gold label for one task only. A title present in both files
# therefore has gold labels for BOTH tasks; use them, and fall back to the keyword
# heuristics only for the task a title has no gold label for. (Previously the
# de-duplication kept the seniority-CSV row and threw the gold department away.)
sen_gold = df_sen_csv.drop_duplicates(subset=['text']).set_index('text')['label'].to_dict()
dept_gold = df_dept_csv.drop_duplicates(subset=['text']).set_index('text')['label'].to_dict()

# Unique titles in first-seen order: seniority CSV first, then department-only titles.
all_titles = list(dict.fromkeys([*sen_gold, *dept_gold]))

training_data = [
    {
        'title': title,
        'seniority': sen_gold[title] if title in sen_gold else estimate_seniority(title),
        'department': dept_gold[title] if title in dept_gold else estimate_department(title),
    }
    for title in all_titles
]

# Explicit columns keep the frame well-formed even when both CSVs are empty.
df_train = pd.DataFrame(training_data, columns=['title', 'seniority', 'department'])
print(f"Training samples: {len(df_train)}")
print(f"\nSeniority: {df_train['seniority'].value_counts().to_dict()}")
print(f"Department: {df_train['department'].value_counts().to_dict()}")

## 2. Create Prompts

In [None]:
# System message inserted into every prompt built by create_prompt(), for training
# and inference alike. It lists both label sets and fixes the two-line answer
# format ("Seniority: ..." / "Department: ...") that predict() parses.
SYSTEM_PROMPT = """You are an expert job title classifier. Classify into:

SENIORITY:
- Junior: Interns, trainees, entry-level (Praktikant, Werkstudent, Azubi)
- Professional: Standard roles, default level (Engineer, Manager, Analyst)
- Senior: Has "Senior", "Sr.", "Principal", "Expert" in title
- Lead: Team leads, department heads ("Head of", "Leiter", "Teamleiter")
- Director: Has "Director" for specific function (not Managing Director)
- Management: C-level, owners, founders (CEO, CFO, Geschäftsführer, Inhaber)

DEPARTMENT:
- Information Technology: Software, IT, data, engineering, CTO
- Sales: Sales reps, account executives
- Marketing: Marketing, brand, communications
- Human Resources: HR, recruiting
- Consulting: Consultants, advisors
- Project Management: Project/Product managers, Scrum masters
- Business Development: Partnerships, strategic growth
- Customer Support: Service, support
- Administrative: Office, assistants
- Purchasing: Procurement, supply chain
- Other: Finance, legal, operations, executives (CEO, CFO)

Respond ONLY:
Seniority: [label]
Department: [label]"""

def create_prompt(title, seniority=None, department=None, for_training=True):
    """Render a ChatML prompt for one job title.

    With ``for_training=True`` the assistant turn is filled in with the given
    labels and closed; otherwise the prompt ends right after the opening of the
    assistant turn so the model can generate the answer.
    """
    # System turn, user turn and the opening of the assistant turn are shared.
    prefix = (
        f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n"
        f"<|im_start|>user\nClassify this job title: {title}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )
    if not for_training:
        return prefix
    return f"{prefix}Seniority: {seniority}\nDepartment: {department}<|im_end|>"

# Turn every training row into a single-field record holding its full prompt,
# then wrap the records in a Hugging Face Dataset.
train_prompts = [
    {'text': create_prompt(title, seniority, department)}
    for title, seniority, department in zip(
        df_train['title'], df_train['seniority'], df_train['department']
    )
]
train_dataset = Dataset.from_list(train_prompts)
print(f"Dataset: {len(train_dataset)} samples")

## 3. Load Qwen2.5-3B (4-bit)

In [None]:
# NOTE(review): the notebook headings refer to "Qwen2.5-3B", but this ID loads the
# 1.5B checkpoint — confirm which model size is intended.
MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"

# 4-bit NF4 quantization with double quantization; matmuls are computed in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
# Only fall back to EOS when the tokenizer ships without a pad token. Overwriting
# an existing pad token with EOS can make collators that mask padding by token id
# also mask the real end-of-turn token, so the model never learns when to stop.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
tokenizer.model_max_length = 512

print(f"Loading {MODEL_ID}...")

In [None]:
# Load the quantized base model and let accelerate place it on the available device(s).
load_kwargs = dict(
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    dtype=torch.float16,
)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_kwargs)

# The KV cache is switched off for training.
model.config.use_cache = False
model = prepare_model_for_kbit_training(model)

vram_gb = torch.cuda.memory_allocated() / 1e9
print(f"Loaded! VRAM: {vram_gb:.1f}GB")

## 4. Configure LoRA

In [None]:
# LoRA adapters on the attention query/value projections only.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

model = get_peft_model(model, lora_config)

# Count parameters in a single pass over the model.
trainable = 0
total = 0
for param in model.parameters():
    n_elements = param.numel()
    total += n_elements
    if param.requires_grad:
        trainable += n_elements
print(f"Trainable: {trainable:,} / {total:,} ({100*trainable/total:.2f}%)")

## 5. Train

In [None]:
from trl import SFTConfig

# Choose the mixed-precision mode from the GPU that is actually present instead
# of hard-coding bf16: native bfloat16 needs compute capability >= 8.0 (Ampere or
# newer), while the T4 named in the notebook header is an older generation.
# Falling back to fp16 also matches the float16 dtype the model was loaded with.
_has_cuda = torch.cuda.is_available()
use_bf16 = _has_cuda and torch.cuda.get_device_capability(0)[0] >= 8
use_fp16 = _has_cuda and not use_bf16

sft_config = SFTConfig(
    output_dir="./qwen_lora",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    optim="paged_adamw_8bit",
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=use_fp16,
    bf16=use_bf16,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    logging_steps=200,
    save_strategy="epoch",
    report_to="none",
)

def formatting_func(example):
    """Return the ready-made prompt stored under the record's "text" key."""
    prompt_text = example["text"]
    return prompt_text


# Wire model, data, tokenizer and training arguments into the SFT trainer.
trainer_kwargs = {
    "model": model,
    "args": sft_config,
    "train_dataset": train_dataset,
    "processing_class": tokenizer,
    "formatting_func": formatting_func,
}
trainer = SFTTrainer(**trainer_kwargs)

print("Starting training...")

In [None]:
# Run fine-tuning, then write the trained model and the tokenizer to the same folder.
trainer.train()
for artifact in (trainer.model, tokenizer):
    artifact.save_pretrained("./qwen_final")
print("Training complete!")

## 6. Evaluate on HELD-OUT Test Data

In [None]:
def predict(title):
    """Classify one job title with the fine-tuned model.

    Args:
        title: Job title to classify.

    Returns:
        A ``(seniority, department)`` tuple. Each element is a member of
        SENIORITY_LABELS / DEPARTMENT_LABELS; when the model output contains no
        valid label for a field, the defaults "Professional" / "Other" are used.
    """
    prompt = create_prompt(title, for_training=False)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        # Greedy decoding; a temperature would be ignored with do_sample=False.
        out = model.generate(
            **inputs,
            max_new_tokens=50,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens. Decoding the whole sequence and
    # splitting on "assistant" is fragile: that word also occurs in the system
    # prompt ("assistants") and can occur in the title or in the completion.
    prompt_len = inputs["input_ids"].shape[1]
    resp = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True).strip()

    # Case-insensitive lookup back to the canonical label spelling.
    sen_lookup = {label.lower(): label for label in SENIORITY_LABELS}
    dept_lookup = {label.lower(): label for label in DEPARTMENT_LABELS}

    sen = dept = None
    for line in resp.split("\n"):
        key, sep, value = line.partition(":")
        if not sep:
            continue
        key = key.strip().lower()
        value = value.strip().lower()
        # The first valid label per field wins, so trailing text generated after
        # the answer cannot overwrite it.
        if key == "seniority" and sen is None:
            sen = sen_lookup.get(value)
        elif key == "department" and dept is None:
            dept = dept_lookup.get(value)

    return sen or "Professional", dept or "Other"

# Smoke test on a few hand-picked titles
for sample_title in ("Senior Software Engineer", "Geschäftsführer", "Praktikant"):
    print(predict(sample_title))

In [None]:
# Evaluate on HELD-OUT test data (testdata.txt - never seen during training!)
print("Evaluating on HELD-OUT test data...")
preds = []
n_test = len(df_test)
for position in df_test['position']:
    # Rows without a position get the default labels without calling the model.
    if pd.notna(position):
        sen, dept = predict(position)
    else:
        sen, dept = "Professional", "Other"
    preds.append({'sen': sen, 'dept': dept})
    n_done = len(preds)
    if n_done % 25 == 0:
        print(f"  {n_done}/{n_test}")

sen_pred = [entry['sen'] for entry in preds]
dept_pred = [entry['dept'] for entry in preds]
print("Done!")

In [None]:
# Results: accuracy and macro-averaged F1 for both tasks
sen_true = df_test['seniority']
dept_true = df_test['department']

sen_acc = accuracy_score(sen_true, sen_pred)
sen_f1 = f1_score(sen_true, sen_pred, average='macro', zero_division=0)
dept_acc = accuracy_score(dept_true, dept_pred)
dept_f1 = f1_score(dept_true, dept_pred, average='macro', zero_division=0)

banner = "=" * 60
print(banner)
print("FINAL RESULTS (on held-out testdata.txt)")
print(banner)
print(f"Seniority:  {sen_acc*100:.1f}% acc, {sen_f1:.3f} F1")
print(f"Department: {dept_acc*100:.1f}% acc, {dept_f1:.3f} F1")

In [None]:
# Per-class precision / recall / F1 for each task
for heading, y_true, y_pred in (
    ("\nSENIORITY REPORT", df_test['seniority'], sen_pred),
    ("\nDEPARTMENT REPORT", df_test['department'], dept_pred),
):
    print(heading)
    print(classification_report(y_true, y_pred, zero_division=0))

In [None]:
# Confusion matrices: one heat map per task, side by side
panels = [
    (df_test['seniority'], sen_pred, f'Seniority ({sen_acc*100:.1f}%)', 'Blues'),
    (df_test['department'], dept_pred, f'Department ({dept_acc*100:.1f}%)', 'Oranges'),
]
fig, axes = plt.subplots(1, 2, figsize=(18, 7))
for ax, panel in zip(axes, panels):
    true, pred, title, cmap = panel
    # Axis labels cover every class seen in either the truth or the predictions.
    labels = sorted(set(true).union(pred))
    cm = confusion_matrix(true, pred, labels=labels)
    sns.heatmap(cm, annot=True, fmt='d', cmap=cmap, xticklabels=labels, yticklabels=labels, ax=ax)
    ax.set_title(title, fontweight='bold')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.savefig('confusion_qwen.png', dpi=150)
plt.show()

## 7. Production Predictions

In [None]:
# Label every production row and export the predictions as CSV
print("Predicting on production data...")
prod_preds = []
n_prod = len(df_prod)
for position in df_prod['position']:
    # Rows without a position get the default labels without calling the model.
    if pd.notna(position):
        sen, dept = predict(position)
    else:
        sen, dept = "Professional", "Other"
    prod_preds.append({'sen': sen, 'dept': dept})
    n_done = len(prod_preds)
    if n_done % 50 == 0:
        print(f"  {n_done}/{n_prod}")

df_prod['pred_seniority'] = [entry['sen'] for entry in prod_preds]
df_prod['pred_department'] = [entry['dept'] for entry in prod_preds]
export_columns = ['position','organization','pred_department','pred_seniority']
df_prod[export_columns].to_csv('predictions_qwen.csv', index=False)
print(f"Saved {len(df_prod)} predictions!")

In [None]:
# Final summary. The model line is taken from MODEL_ID so it always names the
# checkpoint that was actually loaded (the previous text was hard-coded to
# "Qwen2.5-3B" and could disagree with MODEL_ID).
print(f"""
{'='*60}
SUMMARY
{'='*60}
Model: {MODEL_ID} + QLoRA (4-bit)
Training: {len(df_train)} samples (CSVs ONLY)
Testing: {len(df_test)} samples (100% held-out testdata.txt)

RESULTS:
  Seniority:  {sen_acc*100:.1f}%
  Department: {dept_acc*100:.1f}%

Files: ./qwen_final/, predictions_qwen.csv
""")