In [2]:
%pip install pandas
%pip install nltk
%pip install scikit-learn

In [None]:
%pip install transformers torch pandas

In [None]:
%pip install --upgrade jupyter ipywidgets

In [5]:
import numpy as np
import pandas as pd

In [6]:
# import all the necessary packages
import inspect
import os
import re

import nltk
import pandas as pd
import torch
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# Preprocessing and Model Setup
Let's load the dataset, tokenize the text for BERT, and then fine-tune the BERT model for text classification.

In [20]:
# Load the dataset
# The CSV location can be overridden with the OSM_COMMENTS_CSV environment
# variable so the notebook is not tied to one machine; when the variable is
# unset, the original local path is used.
data_path = os.environ.get(
    "OSM_COMMENTS_CSV",
    "/Users/kenny/Desktop/DATAINNOVATIONLAB/Kenny_claudeclassification.csv",
)
df = pd.read_csv(data_path)

# Fail early, with a clear message, if a column read by later cells is missing.
required_columns = {"id", "comment", "New", "Technical", "Local", "Correctional"}
missing_columns = required_columns - set(df.columns)
if missing_columns:
    raise KeyError(f"Dataset is missing required columns: {sorted(missing_columns)}")

df.head()

FileNotFoundError: [Errno 2] No such file or directory: '/Users/kenny/Desktop/DATAINNOVATIONLAB/Kenny_claudeclassification.csv'

In [None]:
# Initialize the BERT tokenizer from the 'bert-base-uncased' checkpoint.
# It is a module-level object: preprocess_text reads it as a global.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Preprocess the comments
def preprocess_text(texts):
    """Tokenize one or more comments with the module-level ``tokenizer``.

    Args:
        texts: A single string, a list/tuple of strings, or any object with a
            ``.tolist()`` method (e.g. a pandas Series or NumPy array).

    Returns:
        Whatever ``tokenizer`` returns for the batch when called with
        ``max_length=128``, ``padding=True``, ``truncation=True`` and
        ``return_tensors="pt"``.
    """
    if isinstance(texts, str):
        # A bare string would otherwise be iterated character by character.
        batch = [texts]
    elif hasattr(texts, "tolist"):
        # pandas Series / NumPy arrays
        batch = texts.tolist()
    else:
        # Plain lists and tuples have no .tolist(); callers in this notebook
        # pass lists, so they must be accepted as-is.
        batch = list(texts)

    return tokenizer(
        batch,
        max_length=128,
        padding=True,
        truncation=True,
        return_tensors="pt"
    )

# Prepare labels for multi-label classification
def prepare_labels(row):
    """Turn one record into its 4-element 0/1 label vector.

    The vector is ordered New, Technical, Local, Correctional. A position is
    1 only when the corresponding field compares equal to 1; anything else
    yields 0.
    """
    label_columns = ("New", "Technical", "Local", "Correctional")
    return [1 if row[column] == 1 else 0 for column in label_columns]

# Apply prepare_labels row by row and store each resulting 0/1 list in a new
# 'labels' column (this adds the column to df in place).
df['labels'] = df.apply(prepare_labels, axis=1)

# Create a PyTorch Dataset
class OSMCommentsDataset(Dataset):
    """Map-style dataset pairing each comment with its multi-label target.

    Each item is tokenized on access with the module-level ``tokenizer`` and
    padded/truncated to a fixed ``max_length`` so that every example has the
    same length and a batch of them can be stacked.

    Args:
        comments: Sequence of comment strings.
        labels: Sequence of per-comment label vectors, same length as
            ``comments``.
        max_length: Token length every example is padded or truncated to.

    Raises:
        ValueError: If ``comments`` and ``labels`` differ in length.
    """

    def __init__(self, comments, labels, max_length=128):
        if len(comments) != len(labels):
            raise ValueError(
                f"comments and labels must have the same length, "
                f"got {len(comments)} and {len(labels)}"
            )
        self.comments = comments
        self.labels = labels
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.comments)

    def __getitem__(self, idx):
        """Return the tokenized comment and float label tensor for ``idx``."""
        # Tokenize directly with padding="max_length": padding to the longest
        # item in a one-element batch pads nothing, which would leave examples
        # with different lengths.
        encoded = tokenizer(
            [self.comments[idx]],
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {
            # squeeze(0) drops only the batch dimension added by the
            # one-element list above.
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.float)
        }

# Pull the comment texts and their label vectors out of the frame as plain lists
comments, labels = df['comment'].tolist(), df['labels'].tolist()

# Wrap both lists in the Dataset that is handed to the Trainer
dataset = OSMCommentsDataset(comments, labels)

# Model Fine-tuning
We will fine-tune the BERT model to handle multi-label classification. For this, we define four output labels, corresponding to New, Technical, Local, and Correctional.


In [None]:
# Load pre-trained BERT model with a 4-output classification head.
# problem_type is set explicitly so the multi-label setup does not depend on
# how the library infers it from the labels.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=4,
    problem_type="multi_label_classification",
)

# The per-epoch evaluation argument is named `eval_strategy` in newer
# transformers releases and `evaluation_strategy` in older ones; pick whichever
# name the installed TrainingArguments accepts.
training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in training_arg_names else "evaluation_strategy"
)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    save_steps=500,
    logging_dir='./logs',
    **{eval_strategy_key: "epoch"},
)

# Define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,  # Normally you'd split into training/validation sets
)

# Train the model
trainer.train()

# Making Predictions
Once the model is trained, use it to predict the categories for new comments.

In [None]:
def predict_comment(comment, threshold=0.5):
    """Predict the four category flags for a single comment.

    Uses the module-level ``model`` and ``preprocess_text``.

    Args:
        comment: The comment text to classify.
        threshold: A label is set to 1 when its sigmoid probability is
            strictly greater than this value. The default of 0.5 gives the
            same result as rounding the probability.

    Returns:
        dict mapping "New", "Technical", "Local" and "Correctional" to 0 or 1.
    """
    label_names = ("New", "Technical", "Local", "Correctional")

    # preprocess_text converts its argument with .tolist(), so hand it a Series
    # rather than a plain list.
    encoded = preprocess_text(pd.Series([comment]))

    # Training may have moved the model off the CPU; the inputs must live on
    # the same device as the model.
    encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}

    # Switch off dropout so repeated calls give the same prediction.
    model.eval()
    with torch.no_grad():
        logits = model(**encoded).logits

    # Bring the result back to the CPU before converting to Python ints.
    flags = (torch.sigmoid(logits)[0] > threshold).int().cpu().tolist()
    return dict(zip(label_names, flags))

# Classify every row of the dataset, keeping each row's id next to its
# predicted flags (flags first, then 'id', which fixes the CSV column order)
output_data = [
    {**predict_comment(row['comment']), 'id': row['id']}
    for _, row in df.iterrows()
]

# Write the predictions out as CSV
output_file = "/path_to_your/test_expe.csv"  # Specify your output file path
output_df = pd.DataFrame(output_data)
output_df.to_csv(output_file, index=False)

print(f"Classification saved to {output_file}")

# Model Evaluation
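Compute accuracy and weighted-average precision, recall, and F1 score by comparing the model's predicted labels with the ground-truth labels.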

In [21]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Build the two label matrices the metrics compare. Both use the same column
# order: New, Technical, Local, Correctional.
label_columns = ["New", "Technical", "Local", "Correctional"]

# Ground truth: the per-row 0/1 lists stored in df['labels'].
true_labels = np.array(df['labels'].tolist())

# Predictions: output_df was filled while iterating over df row by row, so
# row i of output_df corresponds to row i of df.
predicted_labels = output_df[label_columns].to_numpy()

# NOTE: these predictions come from the same rows the model was trained on,
# so the scores below describe fit to the training data, not generalization.

# Calculate Accuracy (for multi-label input, a row counts as correct only when
# all four labels match)
accuracy = accuracy_score(true_labels, predicted_labels)

# zero_division=0 scores a label with no predicted/true positives as 0
# without emitting a warning.

# Calculate Precision
precision = precision_score(true_labels, predicted_labels, average='weighted', zero_division=0)

# Calculate Recall
recall = recall_score(true_labels, predicted_labels, average='weighted', zero_division=0)

# Calculate F1-score
f1 = f1_score(true_labels, predicted_labels, average='weighted', zero_division=0)

# Print the metrics
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
