In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

# Download necessary NLTK data
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

# Load the data
df = pd.read_csv('pastoral.csv', encoding='ISO-8859-1')

# Function to clean text
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Apply text cleaning to 'job description' column
df['cleaned_description'] = df['job description'].apply(clean_text)

# Encode labels
le = LabelEncoder()
df['encoded_label'] = le.fit_transform(df['label'])

# Split the data
train_texts, val_texts, train_labels, val_labels = train_test_split(df['cleaned_description'], df['encoded_label'], test_size=0.2, random_state=42)

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Tokenize and encode sequences
MAX_LENGTH = 128

train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=MAX_LENGTH)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True, max_length=MAX_LENGTH)

# Convert to PyTorch tensors
train_dataset = TensorDataset(
    torch.tensor(train_encodings['input_ids']),
    torch.tensor(train_encodings['attention_mask']),
    torch.tensor(train_labels.tolist())
)

val_dataset = TensorDataset(
    torch.tensor(val_encodings['input_ids']),
    torch.tensor(val_encodings['attention_mask']),
    torch.tensor(val_labels.tolist())
)

# Create DataLoaders
batch_size = 16

train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=batch_size
)

val_dataloader = DataLoader(
    val_dataset,
    sampler=SequentialSampler(val_dataset),
    batch_size=batch_size
)

print("Data preparation completed. BERT tokenizer and DataLoaders are ready.")
print(f"Number of training samples: {len(train_dataset)}")
print(f"Number of validation samples: {len(val_dataset)}")

  from .autonotebook import tqdm as notebook_tqdm


Data preparation completed. BERT tokenizer and DataLoaders are ready.
Number of training samples: 115
Number of validation samples: 29




In [2]:
# Check the version of the transformers library
import transformers
print(transformers.__version__)

# Reinitialize the BERT tokenizer
from transformers import BertTokenizer

# Initialize the tokenizer
try:
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    print("Tokenizer initialized successfully.")
except Exception as e:
    print("Error initializing tokenizer:", e)

4.44.2
Tokenizer initialized successfully.


In [3]:
# Update transformers library
%pip install --upgrade transformers

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

# Download necessary NLTK data
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

# Load the data
df = pd.read_csv('pastoral.csv', encoding='ISO-8859-1')

# Function to clean text
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Apply text cleaning to 'job description' column
df['cleaned_description'] = df['job description'].apply(clean_text)

# Encode labels
le = LabelEncoder()
df['encoded_label'] = le.fit_transform(df['label'])

# Split the data
train_texts, val_texts, train_labels, val_labels = train_test_split(df['cleaned_description'], df['encoded_label'], test_size=0.2, random_state=42)

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Tokenize and encode sequences
MAX_LENGTH = 128

train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=MAX_LENGTH)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True, max_length=MAX_LENGTH)

# Convert to PyTorch tensors
train_dataset = TensorDataset(
    torch.tensor(train_encodings['input_ids']),
    torch.tensor(train_encodings['attention_mask']),
    torch.tensor(train_labels.tolist())
)

val_dataset = TensorDataset(
    torch.tensor(val_encodings['input_ids']),
    torch.tensor(val_encodings['attention_mask']),
    torch.tensor(val_labels.tolist())
)

# Create DataLoaders
batch_size = 16

train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=batch_size
)

val_dataloader = DataLoader(
    val_dataset,
    sampler=SequentialSampler(val_dataset),
    batch_size=batch_size
)

print("Data preparation completed. BERT tokenizer and DataLoaders are ready.")
print(f"Number of training samples: {len(train_dataset)}")
print(f"Number of validation samples: {len(val_dataset)}")

# Load pre-trained BERT model
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(le.classes_),
    output_attentions=False,
    output_hidden_states=False,
)

# Set up optimizer
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Check if CUDA is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print(f"Model loaded and ready for training on {device}")

# Training function
def train(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0
    for batch in dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    return total_loss / len(dataloader)

# Evaluation function
def evaluate(model, dataloader, device):
    model.eval()
    predictions = []
    true_labels = []
    with torch.no_grad():
        for batch in dataloader:
            batch = tuple(t.to(device) for t in batch)
            inputs = {'input_ids': batch[0],
                      'attention_mask': batch[1],
                      'labels': batch[2]}
            outputs = model(**inputs)
            logits = outputs.logits
            predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
            true_labels.extend(inputs['labels'].cpu().numpy())
    return accuracy_score(true_labels, predictions), classification_report(true_labels, predictions, target_names=le.classes_)

# Training loop
epochs = 4
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    train_loss = train(model, train_dataloader, optimizer, device)
    print(f"Training loss: {train_loss:.4f}")
    val_accuracy, val_report = evaluate(model, val_dataloader, device)
    print(f"Validation Accuracy: {val_accuracy:.4f}")
    print("Classification Report:")
    print(val_report)

print("Training completed.")

# Function to predict label for a new job description
def predict_label(job_description):
    cleaned = clean_text(job_description)
    inputs = tokenizer(cleaned, truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    prediction = torch.argmax(logits, dim=1).cpu().numpy()[0]
    return le.inverse_transform([prediction])[0]

# Test the model with a sample job description
sample_job = "Providing academic advice and support to students throughout their studies"
predicted_label = predict_label(sample_job)
print(f"\
Sample Job Description: {sample_job}")
print(f"Predicted Label: {predicted_label}")

print("\
Model is ready for predictions. You can now input job descriptions to get labeled responses.")

Collecting transformers
  Downloading transformers-4.45.1-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (6.7 kB)
Downloading transformers-4.45.1-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading tokenizers-0.20.0-cp311-cp311-macosx_11_0_arm64.whl (2.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m47.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.19.1
    Uninstalling tokenizers-0.19.1:
      Successfully uninstalled tokenizers-0.19.1
  Attempti



Data preparation completed. BERT tokenizer and DataLoaders are ready.
Number of training samples: 115
Number of validation samples: 29


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded and ready for training on cpu
Epoch 1/4
Training loss: 0.6983
Validation Accuracy: 0.7241
Classification Report:
              precision    recall  f1-score   support

  Response A       0.71      0.45      0.56        11
  Response B       0.73      0.89      0.80        18

    accuracy                           0.72        29
   macro avg       0.72      0.67      0.68        29
weighted avg       0.72      0.72      0.71        29

Epoch 2/4
Training loss: 0.6801


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Validation Accuracy: 0.6207
Classification Report:
              precision    recall  f1-score   support

  Response A       0.00      0.00      0.00        11
  Response B       0.62      1.00      0.77        18

    accuracy                           0.62        29
   macro avg       0.31      0.50      0.38        29
weighted avg       0.39      0.62      0.48        29

Epoch 3/4
Training loss: 0.6258
Validation Accuracy: 0.7931
Classification Report:
              precision    recall  f1-score   support

  Response A       0.73      0.73      0.73        11
  Response B       0.83      0.83      0.83        18

    accuracy                           0.79        29
   macro avg       0.78      0.78      0.78        29
weighted avg       0.79      0.79      0.79        29

Epoch 4/4
Training loss: 0.5653
Validation Accuracy: 0.8276
Classification Report:
              precision    recall  f1-score   support

  Response A       0.75      0.82      0.78        11
  Response B       0

In [5]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_tfidf, y_train)

# Define models and parameters for grid search
models = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'GradientBoosting': GradientBoostingClassifier(random_state=42),
    'SVM': SVC(random_state=42)
}

params = {
    'RandomForest': {'n_estimators': [100, 200], 'max_depth': [None, 10, 20]},
    'GradientBoosting': {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1]},
    'SVM': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
}

# Perform grid search for each model
best_models = {}
for model_name in models:
    print(f"\
Performing grid search for {model_name}...")
    grid_search = GridSearchCV(models[model_name], params[model_name], cv=3, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train_balanced, y_train_balanced)
    best_models[model_name] = grid_search.best_estimator_
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")
    print(f"Best cross-validated accuracy for {model_name}: {grid_search.best_score_:.2f}")

# Evaluate the best models on the test set
for model_name, model in best_models.items():
    y_pred = model.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\
Test Accuracy for {model_name}: {accuracy:.2f}")
    print(f"Classification Report for {model_name}:")
    print(classification_report(y_test, y_pred, target_names=le.classes_))

Performing grid search for RandomForest...
Best parameters for RandomForest: {'max_depth': 20, 'n_estimators': 200}
Best cross-validated accuracy for RandomForest: 0.63
Performing grid search for GradientBoosting...
Best parameters for GradientBoosting: {'learning_rate': 0.01, 'n_estimators': 100}
Best cross-validated accuracy for GradientBoosting: 0.60
Performing grid search for SVM...
Best parameters for SVM: {'C': 10, 'kernel': 'linear'}
Best cross-validated accuracy for SVM: 0.66
Test Accuracy for RandomForest: 0.69
Classification Report for RandomForest:
              precision    recall  f1-score   support

  Response A       0.60      0.55      0.57        11
  Response B       0.74      0.78      0.76        18

    accuracy                           0.69        29
   macro avg       0.67      0.66      0.66        29
weighted avg       0.68      0.69      0.69        29

Test Accuracy for GradientBoosting: 0.72
Classification Report for GradientBoosting:
              precisio

In [1]:
import sklearn
print(f"sklearn version: {sklearn.__version__}")

# Try importing SMOTE from imbalanced-learn
try:
    from imblearn.over_sampling import SMOTE
    print("Successfully imported SMOTE")
except ImportError as e:
    print(f"Error importing SMOTE: {e}")

# Check if imbalanced-learn is installed
%pip list | grep imbalanced-learn

# If not installed, install it
%pip install imbalanced-learn

# Try importing again after installation
try:
    from imblearn.over_sampling import SMOTE
    print("Successfully imported SMOTE after installation")
except ImportError as e:
    print(f"Error importing SMOTE after installation: {e}")

# Check scikit-learn version compatibility
print(f"Required scikit-learn version for imbalanced-learn: 1.0.2 or later")
print(f"Current scikit-learn version: {sklearn.__version__}")

# If scikit-learn version is incompatible, update it
if sklearn.__version__ < '1.0.2':
    print("Updating scikit-learn...")
    %pip install --upgrade scikit-learn
    import sklearn
    print(f"Updated scikit-learn version: {sklearn.__version__}")

# Try importing SMOTE one more time
try:
    from imblearn.over_sampling import SMOTE
    print("Successfully imported SMOTE after all updates")
except ImportError as e:
    print(f"Error importing SMOTE after all updates: {e}")

sklearn version: 1.3.2
Successfully imported SMOTE
imbalanced-learn             0.12.3
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Successfully imported SMOTE after installation
Required scikit-learn version for imbalanced-learn: 1.0.2 or later
Current scikit-learn version: 1.3.2
Successfully imported SMOTE after all updates


In [4]:
def predict_label_gb(job_description):
    cleaned = clean_text(job_description)
    tfidf_vector = vectorizer.transform([cleaned])
    prediction = best_models['GradientBoosting'].predict(tfidf_vector)
    return le.inverse_transform(prediction)[0]

# Example usage
sample_job = "Providing academic advice and support to students throughout their studies"
predicted_label = predict_label_gb(sample_job)
print(f"Sample Job Description: {sample_job}")
print(f"Predicted Label: {predicted_label}")

NameError: name 'clean_text' is not defined