In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline

# Load the data
df = pd.read_csv('sensory.csv', encoding='UTF-8-SIG')

# Display the first few rows and basic information
print(df.head())
print("\
Dataframe Info:")
print(df.info())
print("\
Unique Labels:")
print(df['Label'].value_counts())

   Code                               Job Responsibilites        Label
0  1405  To plan, organise and implement annual program...  Response C
1  1405  To manage and maintain the Sports fields to ap...  Response C
2  1405  To oversee and ensure correct usage of mechani...  Response C
3  1405             To supervise hard and soft landscaping  Response C
4  1405  To ensure roads and paths are cleared of snow ...  Response C
Dataframe Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 345 entries, 0 to 344
Data columns (total 3 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Code                  345 non-null    int64 
 1   Job Responsibilites   345 non-null    object
 2   Label                 345 non-null    object
dtypes: int64(1), object(2)
memory usage: 8.2+ KB
None
Unique Labels:
Label
Response C    217
Response D    120
Response B      8
Name: count, dtype: int64


In [2]:
# Preprocess the data and train a model

# Split the data into features and target
X = df['Job Responsibilites ']
y = df['Label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline with TfidfVectorizer and a classifier
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Train the model
pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("\
Classification Report:\
", report)

Accuracy: 0.782608695652174
Classification Report:               precision    recall  f1-score   support

  Response B       0.00      0.00      0.00         1
  Response C       0.84      0.82      0.83        45
  Response D       0.68      0.74      0.71        23

    accuracy                           0.78        69
   macro avg       0.51      0.52      0.51        69
weighted avg       0.78      0.78      0.78        69



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [3]:
# Try a different model: Logistic Regression

# Update the pipeline to use Logistic Regression
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42))
])

# Train the model
pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("\
Classification Report:\
", report)

Accuracy: 0.7971014492753623
Classification Report:               precision    recall  f1-score   support

  Response B       0.00      0.00      0.00         1
  Response C       0.82      0.89      0.85        45
  Response D       0.75      0.65      0.70        23

    accuracy                           0.80        69
   macro avg       0.52      0.51      0.52        69
weighted avg       0.78      0.80      0.79        69



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [4]:
%pip install transformers torch scikit-learn imbalanced-learn tqdm
print("Libraries installed successfully.")

Note: you may need to restart the kernel to use updated packages.
Libraries installed successfully.


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch
from tqdm import tqdm
from imblearn.over_sampling import SMOTE

# Load the data
df = pd.read_csv('sensory.csv', encoding='UTF-8-SIG')

# Encode labels
le = LabelEncoder()
df['Label_encoded'] = le.fit_transform(df['Label'])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(df['Job Responsibilites '], df['Label_encoded'], test_size=0.2, random_state=42)

# Apply SMOTE for class balancing
smote = SMOTE(random_state=42)
X_train_text = X_train.tolist()
X_train_smote, y_train_smote = smote.fit_resample(np.array(X_train_text).reshape(-1, 1), y_train)
X_train_smote = X_train_smote.flatten()

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Tokenize and encode sequences
MAX_LEN = 128

def tokenize_sequences(texts):
    input_ids = []
    attention_masks = []
    for text in texts:
        encoded_dict = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=MAX_LEN,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)

# Tokenize training and testing sets
train_inputs, train_masks = tokenize_sequences(X_train_smote)
test_inputs, test_masks = tokenize_sequences(X_test)

# Convert labels to tensors
train_labels = torch.tensor(y_train_smote)
test_labels = torch.tensor(y_test.values)

# Create DataLoader
batch_size = 32
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

print("Data preparation completed.")
print(f"Training samples: {len(train_data)}")
print(f"Testing samples: {len(test_data)}")
print(f"Number of classes: {len(le.classes_)}")
print("Class distribution after SMOTE:")
print(pd.Series(y_train_smote).value_counts())

  from .autonotebook import tqdm as notebook_tqdm


ValueError: dtype='numeric' is not compatible with arrays of bytes/strings.Convert your data to numeric values explicitly instead.

In [6]:
%pip install scikit-learn==1.0.2 imbalanced-learn==0.8.0 --upgrade
print("Libraries updated successfully.")

Collecting scikit-learn==1.0.2
  Downloading scikit-learn-1.0.2.tar.gz (6.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mPreparing metadata [0m[1;32m([0m[32mpyproject.toml[0m[1;32m)[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[1818 lines of output][0m
  [31m   [0m Partial import of sklearn during the build process.
  [31m   [0m 
  [31m   [0m   `numpy.distutils` is deprecated since NumPy 1.23.0, as a result
  [31m   [0m   of the deprecation of `distutils` itself. It will be removed for
  [31m   [0m   Python >= 3.12. For older Python versions it will remain present.
  [31m   [0m   It is r

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch
from tqdm import tqdm

# Load the data
df = pd.read_csv('sensory.csv', encoding='UTF-8-SIG')

# Encode labels
le = LabelEncoder()
df['Label_encoded'] = le.fit_transform(df['Label'])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(df['Job Responsibilites '], df['Label_encoded'], test_size=0.2, random_state=42)

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Tokenize and encode sequences
MAX_LEN = 128

def tokenize_sequences(texts):
    input_ids = []
    attention_masks = []
    for text in texts:
        encoded_dict = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=MAX_LEN,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)

# Tokenize training and testing sets
train_inputs, train_masks = tokenize_sequences(X_train)
test_inputs, test_masks = tokenize_sequences(X_test)

# Convert labels to tensors
train_labels = torch.tensor(y_train.values)
test_labels = torch.tensor(y_test.values)

# Create DataLoader
batch_size = 32
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

print("Data preparation completed.")
print(f"Training samples: {len(train_data)}")
print(f"Testing samples: {len(test_data)}")
print(f"Number of classes: {len(le.classes_)}")
print("Class distribution:")
print(pd.Series(y_train).value_counts())

  from .autonotebook import tqdm as notebook_tqdm


Data preparation completed.
Training samples: 276
Testing samples: 69
Number of classes: 3
Class distribution:
Label_encoded
1    172
2     97
0      7
Name: count, dtype: int64




In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch
from tqdm import tqdm

# Load the data
df = pd.read_csv('sensory.csv', encoding='UTF-8-SIG')

# Encode labels
le = LabelEncoder()
df['Label_encoded'] = le.fit_transform(df['Label'])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(df['Job Responsibilites '], df['Label_encoded'], test_size=0.2, random_state=42)

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Tokenize and encode sequences
MAX_LEN = 128

def tokenize_sequences(texts):
    input_ids = []
    attention_masks = []
    for text in texts:
        encoded_dict = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=MAX_LEN,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)

# Tokenize training and testing sets
train_inputs, train_masks = tokenize_sequences(X_train)
test_inputs, test_masks = tokenize_sequences(X_test)

# Convert labels to tensors
train_labels = torch.tensor(y_train.values)
test_labels = torch.tensor(y_test.values)

# Create DataLoader
batch_size = 32
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

print("Data preparation completed.")
print(f"Training samples: {len(train_data)}")
print(f"Testing samples: {len(test_data)}")
print(f"Number of classes: {len(le.classes_)}")
print("Class distribution:")
print(pd.Series(y_train).value_counts())

Data preparation completed.
Training samples: 276
Testing samples: 69
Number of classes: 3
Class distribution:
Label_encoded
1    172
2     97
0      7
Name: count, dtype: int64




In [3]:
# Train the BERT model

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load BERT model
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(le.classes_),
    output_attentions=False,
    output_hidden_states=False
)
model.to(device)

# Set up optimizer
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Training function
def train_model(model, train_dataloader, epochs=4):
    model.train()
    for epoch in range(epochs):
        print(f'Epoch {epoch + 1}/{epochs}')
        total_loss = 0
        for step, batch in enumerate(train_dataloader):
            b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
            model.zero_grad()
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
        avg_train_loss = total_loss / len(train_dataloader)
        print(f'Average training loss: {avg_train_loss:.2f}')

# Train the model
train_model(model, train_dataloader)

print("Training completed.")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/4
Average training loss: 0.90
Epoch 2/4
Average training loss: 0.69
Epoch 3/4
Average training loss: 0.56
Epoch 4/4
Average training loss: 0.50
Training completed.


In [4]:
# Evaluate the model

def evaluate_model(model, test_dataloader):
    model.eval()
    predictions = []
    true_labels = []
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        logits = outputs.logits
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        predictions.extend(np.argmax(logits, axis=1).flatten())
        true_labels.extend(label_ids.flatten())
    return predictions, true_labels

# Evaluate the model
predictions, true_labels = evaluate_model(model, test_dataloader)

# Calculate accuracy
accuracy = accuracy_score(true_labels, predictions)
print(f'Test Accuracy: {accuracy:.4f}')

# Print classification report
print('\
Classification Report:')
print(classification_report(true_labels, predictions, target_names=le.classes_))

# Function to predict a single job description
def predict_job_description(text):
    model.eval()
    encoded_dict = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=MAX_LEN,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True
    )
    input_ids = encoded_dict['input_ids'].to(device)
    attention_mask = encoded_dict['attention_mask'].to(device)
    with torch.no_grad():
        outputs = model(input_ids, token_type_ids=None, attention_mask=attention_mask)
    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()
    return le.inverse_transform([predicted_class])[0]

# Example prediction
example_job = "Manage and maintain sports fields to appropriate standards"
predicted_label = predict_job_description(example_job)
print(f'\
Example Job Description: "{example_job}"')
print(f'Predicted Label: {predicted_label}')

Test Accuracy: 0.8406
Classification Report:
              precision    recall  f1-score   support

  Response B       0.00      0.00      0.00         1
  Response C       0.90      0.84      0.87        45
  Response D       0.74      0.87      0.80        23

    accuracy                           0.84        69
   macro avg       0.55      0.57      0.56        69
weighted avg       0.84      0.84      0.84        69



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Example Job Description: "Manage and maintain sports fields to appropriate standards"
Predicted Label: Response C


In [6]:
# Function to predict and display results for a given job description

def demo_prediction(job_description):
    # Predict the label for the provided job description
    predicted_label = predict_job_description(job_description)
    
    # Display the job description with predicted label
    print(f'Example Job Description: "{job_description}"')
    print(f'Predicted Label: {predicted_label}')

# Example usage
example_job_desc = "Manage and maintain sports fields to appropriate standards"
demo_prediction(example_job_desc)

# The function demo_prediction can now be used with any job description input
print("\
You can now use the demo_prediction function with any job description.")



Example Job Description: "Manage and maintain sports fields to appropriate standards"
Predicted Label: Response C
You can now use the demo_prediction function with any job description.


In [7]:
# Function to predict and display results for a given job description along with its original label

def demo_prediction_with_label(job_description, original_label):
    # Predict the label for the provided job description
    predicted_label = predict_job_description(job_description)
    
    # Display the job description with original and predicted labels
    print(f'Example Job Description: "{job_description}"')
    print(f'Original Label: {original_label}')
    print(f'Predicted Label: {predicted_label}')

# Example usage
example_job_desc = "Manage and maintain sports fields to appropriate standards"
original_label = "Response C"  # Replace with the actual original label if known
demo_prediction_with_label(example_job_desc, original_label)

# The function demo_prediction_with_label can now be used with any job description and its original label
print("\
You can now use the demo_prediction_with_label function with any job description and its original label.")



Example Job Description: "Manage and maintain sports fields to appropriate standards"
Original Label: Response C
Predicted Label: Response C
You can now use the demo_prediction_with_label function with any job description and its original label.


In [8]:
# Function to predict and display results for a given job description, retrieving the original label from the dataset

def demo_prediction_with_label_from_csv(job_description):
    # Find the original label from the dataset
    original_label = df[df['Job Responsibilites '] == job_description]['Label'].values[0]
    
    # Predict the label for the provided job description
    predicted_label = predict_job_description(job_description)
    
    # Display the job description with original and predicted labels
    print(f'Example Job Description: "{job_description}"')
    print(f'Original Label: {original_label}')
    print(f'Predicted Label: {predicted_label}')

# Example usage
example_job_desc = "Manage and maintain sports fields to appropriate standards"
demo_prediction_with_label_from_csv(example_job_desc)

# The function demo_prediction_with_label_from_csv can now be used with any job description from the dataset
print("\
You can now use the demo_prediction_with_label_from_csv function with any job description from the dataset.")

IndexError: index 0 is out of bounds for axis 0 with size 0

In [9]:
# Check the dataset for the correct job description
print(df['Job Responsibilites '].head(10))

# Let's find a job description that exists in the dataset
example_job_desc = df['Job Responsibilites '].iloc[0]
print(f"Using job description: {example_job_desc}")

demo_prediction_with_label_from_csv(example_job_desc)

0    To plan, organise and implement annual program...
1    To manage and maintain the Sports fields to ap...
2    To oversee and ensure correct usage of mechani...
3               To supervise hard and soft landscaping
4    To ensure roads and paths are cleared of snow ...
5    Lead the development of Health and Safety poli...
6                 Develop strong working relationships
7               Oversee the investigation of accidents
8                        Lead Health and Safety audits
9              Manage financial planning and reporting
Name: Job Responsibilites , dtype: object
Using job description: To plan, organise and implement annual programmes of work




Example Job Description: "To plan, organise and implement annual programmes of work"
Original Label: Response C
Predicted Label: Response C


In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load the data and prepare the label encoder
df = pd.read_csv('sensory.csv', encoding='UTF-8-SIG')
le = LabelEncoder()
le.fit(df['Label'])

# Set up device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Load BERT model
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(le.classes_),
    output_attentions=False,
    output_hidden_states=False
)
model.to(device)

# Define constants
MAX_LEN = 128

# Redefine the predict_job_description function
def predict_job_description(text):
    model.eval()
    encoded_dict = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=MAX_LEN,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True
    )
    input_ids = encoded_dict['input_ids'].to(device)
    attention_mask = encoded_dict['attention_mask'].to(device)
    with torch.no_grad():
        outputs = model(input_ids, token_type_ids=None, attention_mask=attention_mask)
    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()
    return le.inverse_transform([predicted_class])[0]

# Function to get original label from CSV if it exists
def get_original_label(job_description):
    matching_row = df[df['Job Responsibilites '] == job_description]
    if not matching_row.empty:
        return matching_row['Label'].values[0]
    return "Not found in original dataset"

# Function to predict and display results for a given job description
def demo_prediction_with_label(job_description):
    # Get the original label from the CSV file
    original_label = get_original_label(job_description)
    
    # Predict the label for the provided job description
    predicted_label = predict_job_description(job_description)
    
    # Display the job description with original and predicted labels
    print(f'Job Description: "{job_description}"')
    print(f'Original Label: {original_label}')
    print(f'Predicted Label: {predicted_label}')

# Example usage
example_job_desc = "Assist with emergency situations, such as responding to fire alarms"
demo_prediction_with_label(example_job_desc)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Job Description: "Assist with emergency situations, such as responding to fire alarms"
Original Label: Response B
Predicted Label: Response C
