### Job title classification

In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    pipeline,
)

# Read the occupation sheet and discard the code column, which the cells
# shown here never reference.
file_path = "Occupation Data.xlsx"  # Update with the path to your Excel file
df = pd.read_excel(file_path).drop(columns=['O*NET-SOC Code'])

# Map every distinct job title to an integer class id and store it in 'Label'.
# The fitted encoder is kept so ids can be mapped back to titles later.
label_encoder = LabelEncoder()
label_encoder.fit(df['Title'])
df['Label'] = label_encoder.transform(df['Title'])



In [None]:
# Split descriptions (model inputs) and encoded titles (targets) 80/20.
# NOTE(review): if each Title occurs only once in the sheet, every held-out
# label is unseen during training and eval accuracy will be ~0 — confirm the
# data has several descriptions per title before trusting the metric.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['Description'].values.tolist(),
    df['Label'].values.tolist(),
    test_size=0.2,
    random_state=42
)

# Tokenize the descriptions for BERT; each split is padded to its own longest
# sequence and truncated to the model's maximum length.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)


class JobTitleDataset(torch.utils.data.Dataset):
    """Pair tokenizer output with integer labels so Trainer can index examples.

    Trainer needs a dataset with ``__len__`` and ``__getitem__`` whose items
    include a ``labels`` entry; the raw tokenizer output has neither per-example
    indexing nor labels, so it cannot be passed to Trainer directly.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(values[idx]) for key, values in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item


def compute_accuracy(eval_pred):
    """Return the fraction of examples whose arg-max logit equals the label."""
    logits = eval_pred.predictions
    if isinstance(logits, tuple):
        # Some models return extra outputs; the logits come first.
        logits = logits[0]
    # Predictions are per-class logits, so reduce them to class ids before
    # comparing against the integer label ids.
    predicted_ids = logits.argmax(axis=-1)
    return {"accuracy": float((predicted_ids == eval_pred.label_ids).mean())}


train_dataset = JobTitleDataset(train_encodings, train_labels)
test_dataset = JobTitleDataset(test_encodings, test_labels)

# One output class per distinct job title seen by the label encoder.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_encoder.classes_),
)

# Evaluate once per epoch. `eval_steps` is omitted because it only applies to
# step-based evaluation and was ignored alongside the "epoch" strategy.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — adjust if the installed version rejects this keyword.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    evaluation_strategy="epoch",
    logging_steps=50,
)

# Fine-tune the classifier and report accuracy on the held-out split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_accuracy,
)

trainer.train()

### Parsed resume summariser and occupation predictor

In [None]:
# Build the resume summarizer and (re)load the BERT tokenizer that is applied
# to the summarizer's output before classification.
summarizer = pipeline(
    task="summarization",
    model="Falconsai/text_summarization",
)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def predict_job_title(resume_text):
    """Predict a job title for a parsed resume.

    The resume is first condensed with the module-level ``summarizer``; the
    summary is then tokenized with ``tokenizer`` and classified by the
    fine-tuned model held by ``trainer``.

    Args:
        resume_text: Plain text of the parsed resume.

    Returns:
        The job title (one of ``label_encoder.classes_``) whose logit is
        highest for the summarized resume.
    """
    # The summarization pipeline returns a list of dicts keyed "summary_text".
    summary = summarizer(resume_text, max_length=100)  # Limit summary length
    summary_text = summary[0]["summary_text"]

    # Run the classifier directly: Trainer.predict expects a dataset, not a
    # single tokenized example. No padding is needed for one sequence.
    classifier = trainer.model
    inputs = tokenizer(summary_text, truncation=True, return_tensors="pt")
    inputs = {name: tensor.to(classifier.device) for name, tensor in inputs.items()}

    classifier.eval()
    with torch.no_grad():
        logits = classifier(**inputs).logits

    # Map the winning class id back to its title with the fitted encoder.
    predicted_id = int(logits.argmax(dim=-1).item())
    predicted_label = label_encoder.inverse_transform([predicted_id])[0]

    return predicted_label

# Demo: swap the placeholder below for real parsed resume text, then print
# the title the classifier assigns to it.
resume_text = "Your parsed resume text here..."
predicted_job_title = predict_job_title(resume_text=resume_text)
print(
    "Predicted Job Title:",
    predicted_job_title,
)
