In [4]:
# Mount Google Drive inside the Colab runtime at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [5]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt


In [7]:
# Seed the NumPy and PyTorch random number generators so repeated runs start
# from the same random state.
# NOTE: the previous `set.seed(414)` is R syntax; in Python the built-in `set`
# type has no `seed` attribute, so the cell failed with AttributeError.
SEED = 414
np.random.seed(SEED)
torch.manual_seed(SEED)

# Load the dataset from Excel
df = pd.read_excel('Telus Health Quote Spreadsheet.xlsx', engine='openpyxl')  # Using openpyxl engine for .xlsx files

# Preprocess the data - flatten line breaks inside quotes, then drop incomplete rows
df['Quote'] = df['Quote'].str.replace('\r\n', ' ')  # Windows-style line breaks
df['Quote'] = df['Quote'].str.replace('\n', ' ')    # remaining bare newlines
df = df[['Quote', 'Country']].dropna()  # only these two columns are used downstream

# Check country distribution (computed once and reused for the filter below)
country_counts = df['Country'].value_counts()
print("Country distribution:")
print(country_counts)

# Filter countries with sufficient samples (optional - adjust threshold as needed)
min_samples = 10  # Minimum samples per country
valid_countries = country_counts[country_counts >= min_samples].index
df = df[df['Country'].isin(valid_countries)]

quotes = df['Quote'].tolist()
countries = df['Country'].tolist()

# Encode the country names as integer class ids
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(countries)
num_labels = len(label_encoder.classes_)

print(f"\nNumber of unique countries: {num_labels}")
print("Country classes:", label_encoder.classes_)

# Split the data into training and testing sets (stratified so both splits keep
# the same country proportions)
train_texts, test_texts, train_labels, test_labels = train_test_split(
    quotes, encoded_labels, test_size=0.2, random_state=42, stratify=encoded_labels)

# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Tokenize and encode the training and testing sets
def tokenize_and_encode(texts, labels, max_length=128):  # Reduced max_length to 128 for efficiency
    """Tokenize `texts` with the module-level `tokenizer` and pair them with `labels`.

    All texts are encoded in a single batched tokenizer call (replacing the
    deprecated per-text `encode_plus` loop followed by `torch.cat`).

    Args:
        texts: Iterable of strings to encode.
        labels: Sequence of labels aligned one-to-one with `texts`.
        max_length: Every sequence is padded or truncated to exactly this many
            token ids.

    Returns:
        Tuple `(input_ids, attention_masks, labels)`. The first two are tensors
        of shape `(len(texts), max_length)`; the last is `torch.tensor(labels)`.
        For empty input, empty tensors of matching shape are returned.

    Raises:
        ValueError: If `texts` and `labels` do not have the same length.
    """
    texts = list(texts)
    if len(texts) != len(labels):
        raise ValueError(
            f"texts and labels must have the same length "
            f"(got {len(texts)} texts and {len(labels)} labels)"
        )

    # Nothing to tokenize: return well-formed empty tensors instead of crashing.
    if not texts:
        empty_ids = torch.empty((0, max_length), dtype=torch.long)
        return empty_ids, empty_ids.clone(), torch.empty(0, dtype=torch.long)

    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    return encoded['input_ids'], encoded['attention_mask'], torch.tensor(labels)

# Encode both splits; the label arrays are replaced by label tensors here
train_input_ids, train_attention_masks, train_labels = tokenize_and_encode(train_texts, train_labels)
test_input_ids, test_attention_masks, test_labels = tokenize_and_encode(test_texts, test_labels)

# Wrap the tensors in datasets, pick a sampling order, then build batched loaders
batch_size = 16  # small batches keep memory use low

train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)

train_sampler = RandomSampler(train_dataset)      # shuffled order for training
test_sampler = SequentialSampler(test_dataset)    # fixed order for evaluation

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, sampler=test_sampler)

# Pre-trained BERT with a classification head sized to `num_labels`
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    output_hidden_states=False,
    output_attentions=False,
    num_labels=num_labels,
)

# Prefer the GPU when one is present, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"\nUsing device: {device}")
model.to(device)

# Optimizer only -- no learning-rate scheduler is created in this cell
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

epochs = 3  # kept low to limit overfitting
total_steps = epochs * len(train_dataloader)  # number of batches across all epochs
# Training loop with progress tracking
def train_model(model, train_dataloader, epochs):
    """Fine-tune `model` for `epochs` passes over `train_dataloader`.

    Each batch must unpack into (input ids, attention mask, labels). The labels
    are passed to the model so that it returns a loss. Relies on the
    module-level `optimizer` and `device`. Gradients are clipped to a norm of
    1.0 before every optimizer step.

    Prints the batch loss every 10 steps and the mean loss after each epoch.
    Returns None.
    """
    model.train()

    for epoch in range(epochs):
        running_loss = 0

        for step, batch in enumerate(train_dataloader):
            # Move the three batch tensors onto the training device
            b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

            # Clear gradients left over from the previous step
            model.zero_grad()

            outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs.loss
            running_loss += loss.item()

            # Backpropagate, clip, then update the weights
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            if not step % 10:
                print(f"Epoch {epoch+1}, Step {step}, Loss: {loss.item():.4f}")

        avg_train_loss = running_loss / len(train_dataloader)
        print(f"\nEpoch {epoch+1}/{epochs}, Average Training Loss: {avg_train_loss:.4f}\n")

# Run the fine-tuning loop on the training split
train_model(model, train_dataloader, epochs)

# Enhanced evaluation function
def evaluate_model(model, test_dataloader):
    """Score `model` on `test_dataloader` and print accuracy plus a per-class report.

    Each batch must unpack into (input ids, attention mask, labels). Relies on
    the module-level `device` and `label_encoder` (the latter supplies the
    class names shown in the report).

    Returns:
        Tuple `(predicted_labels, true_labels)` of NumPy arrays, where the
        predictions are the arg-max over the model's logits.
    """
    model.eval()
    logit_chunks = []
    label_chunks = []

    # Inference only: no gradients are needed anywhere in this loop
    with torch.no_grad():
        for batch in test_dataloader:
            b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
            outputs = model(b_input_ids, attention_mask=b_input_mask)

            logit_chunks.append(outputs.logits.detach().cpu().numpy())
            label_chunks.append(b_labels.cpu().numpy())

    # Stitch the per-batch arrays back together before scoring
    all_logits = np.concatenate(logit_chunks, axis=0)
    predicted_labels = all_logits.argmax(axis=1)
    true_labels = np.concatenate(label_chunks, axis=0)

    accuracy = accuracy_score(true_labels, predicted_labels)
    print(f"\nModel Accuracy: {accuracy:.4f}")
    print("\nDetailed Classification Report:")
    print(classification_report(true_labels, predicted_labels, target_names=label_encoder.classes_))

    return predicted_labels, true_labels

# Evaluate the model on the held-out test split; keep predictions and ground truth for later use
predicted_labels, true_labels = evaluate_model(model, test_dataloader)

# Function to predict country for new quotes with confidence scores
def predict_country(new_quotes, model, tokenizer, label_encoder, max_length=128):
    """Predict the country for each quote, with confidence scores.

    Args:
        new_quotes: A single string or an iterable of strings to classify.
        model: Sequence-classification model whose output exposes `.logits`.
        tokenizer: Tokenizer matching `model`; called once on the whole batch.
        label_encoder: Fitted encoder providing `classes_` and
            `inverse_transform` to map class ids back to country names.
        max_length: Quotes are padded or truncated to this many token ids.

    Returns:
        A list with one dict per quote, containing the keys `quote`,
        `predicted_country`, `confidence` (highest softmax probability) and
        `country_probabilities` (class name -> probability). An empty input
        yields an empty list.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(new_quotes, str):
        new_quotes = [new_quotes]
    new_quotes = list(new_quotes)

    # Nothing to classify: skip tokenization and the forward pass entirely.
    if not new_quotes:
        return []

    model.eval()

    # Run on whatever device the model's weights live on, so this function does
    # not depend on a module-level `device` variable.
    target_device = next(model.parameters()).device

    # Tokenize and encode all quotes in one batched call
    encoded = tokenizer(
        new_quotes,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids = encoded['input_ids'].to(target_device)
    attention_masks = encoded['attention_mask'].to(target_device)

    # Make predictions
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_masks)

    logits = outputs.logits
    probabilities = torch.softmax(logits, dim=1).cpu().numpy()
    predictions = torch.argmax(logits, dim=1).cpu().numpy()
    predicted_countries = label_encoder.inverse_transform(predictions)
    confidence_scores = np.max(probabilities, axis=1)

    class_names = label_encoder.classes_
    results = []
    for quote, country, confidence, row in zip(
            new_quotes, predicted_countries, confidence_scores, probabilities):
        results.append({
            'quote': quote,
            'predicted_country': country,
            'confidence': float(confidence),
            'country_probabilities': {name: float(p) for name, p in zip(class_names, row)},
        })

    return results

# Example usage with new quotes
new_quotes = [
    "I prefer in-person counseling because it feels more personal.",
    "Online services are more convenient for my busy schedule.",
    "The cultural context is very important in mental health support."
]

predictions = predict_country(new_quotes, model, tokenizer, label_encoder)
print("\nSample Predictions:")
for pred in predictions:
    print(f"\nQuote: {pred['quote']}")
    print(f"Predicted Country: {pred['predicted_country']} (Confidence: {pred['confidence']:.2f})")
    print("Country Probabilities:")
    for country, prob in pred['country_probabilities'].items():
        print(f"  {country}: {prob:.4f}")

# Save the model for future use
output_dir = "./bert_country_classifier/"

# Record the class-id <-> country mapping in the model config before saving.
# Without this the mapping exists only in the in-memory `label_encoder`, and a
# reloaded checkpoint could not translate predicted ids back to country names.
model.config.id2label = {idx: str(name) for idx, name in enumerate(label_encoder.classes_)}
model.config.label2id = {name: idx for idx, name in model.config.id2label.items()}

model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"\nModel saved to {output_dir}")

Country distribution:
Country
Canada           155
United States    140
Name: count, dtype: int64

Number of unique countries: 2
Country classes: ['Canada' 'United States']


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Using device: cpu
Epoch 1, Step 0, Loss: 0.7936
Epoch 1, Step 10, Loss: 0.6441

Epoch 1/3, Average Training Loss: 0.6999

Epoch 2, Step 0, Loss: 0.6835
Epoch 2, Step 10, Loss: 0.7002

Epoch 2/3, Average Training Loss: 0.6209

Epoch 3, Step 0, Loss: 0.5334
Epoch 3, Step 10, Loss: 0.4485

Epoch 3/3, Average Training Loss: 0.4769


Model Accuracy: 0.7458

Detailed Classification Report:
               precision    recall  f1-score   support

       Canada       0.86      0.61      0.72        31
United States       0.68      0.89      0.77        28

     accuracy                           0.75        59
    macro avg       0.77      0.75      0.74        59
 weighted avg       0.77      0.75      0.74        59


Sample Predictions:

Quote: I prefer in-person counseling because it feels more personal.
Predicted Country: United States (Confidence: 0.75)
Country Probabilities:
  Canada: 0.2499
  United States: 0.7501

Quote: Online services are more convenient for my busy schedule.
Predic