In [17]:
# DO NOT RUN THIS FILE -- WAS USED IN COLAB TO CREATE CLASSIFIER MODEL!!!
# Mount Google Drive inside the Colab VM. The dataset CSV is read from, and the
# trained model/tokenizer are saved to, paths under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [18]:
# Check if GPU is running
import torch

# Prints True when CUDA is usable from this runtime, False otherwise.
cuda_available = torch.cuda.is_available()
print(cuda_available)


True


In [38]:
# Imports
import pandas as pd
import torch
import time
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

# Select the compute device: CUDA when a GPU is attached, CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Load the dataset
df_combined = pd.read_csv("/content/drive/MyDrive/qa_classifier/qa_dataset.csv")

# Step 1: Mean character length of the non-health rows (label == 0).
# This value is the cut-off used when health queries are truncated afterwards.
non_health_text = df_combined.loc[df_combined['label'] == 0, 'combined_text']
non_health_lengths = non_health_text.apply(len)
avg_non_health_length = non_health_lengths.mean()
print(f"Average length of non-health queries: {avg_non_health_length}")

# Step 2: Truncate health queries to the average length of non-health queries
# We will truncate all health-related queries to this average length
def truncate_text(text, length):
    """Return the leading ``int(length)`` characters of ``text``.

    Args:
        text: The string to shorten.
        length: Maximum number of characters to keep; a float is accepted and
            its fractional part is dropped via ``int()``.

    Returns:
        ``text`` unchanged when it is already short enough, otherwise its prefix.
    """
    max_chars = int(length)
    prefix = text[:max_chars]
    return prefix

# Apply truncation only to health queries (label = 1) and write the result back
health_mask = df_combined['label'] == 1
truncated_health_text = df_combined.loc[health_mask, 'combined_text'].apply(
    lambda x: truncate_text(x, avg_non_health_length)
)
df_combined.loc[health_mask, 'combined_text'] = truncated_health_text

# Step 3: Balance the dataset by downsampling
df_health = df_combined[health_mask]  # Health-related queries
df_non_health = df_combined[df_combined['label'] == 0]  # Non-health queries

# The smaller class determines how many rows are kept from each class
class_sizes = (len(df_health), len(df_non_health))
min_size = min(class_sizes)

# Draw the same number of rows from both classes with a fixed seed
df_health_balanced = df_health.sample(n=min_size, random_state=42)
df_non_health_balanced = df_non_health.sample(n=min_size, random_state=42)

Using device: cuda
Average length of non-health queries: 48.967737789203085


In [39]:
# Combine the balanced classes, then shuffle the rows with a fixed seed
balanced_parts = [df_health_balanced, df_non_health_balanced]
df_balanced = pd.concat(balanced_parts).sample(frac=1, random_state=42).reset_index(drop=True)

# Check the balance
print(df_balanced['label'].value_counts())

# Step 4: Split the data into features (X) and labels (y)
X, y = df_balanced['combined_text'], df_balanced['label']

# Step 5: Hold out 20% of the rows as the test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize both splits with identical settings
tokenizer_options = dict(truncation=True, padding=True, max_length=128)
train_encodings = tokenizer(list(X_train), **tokenizer_options)
test_encodings = tokenizer(list(X_test), **tokenizer_options)


label
1    7780
0    7780
Name: count, dtype: int64




In [40]:
# Dataset class
class BinaryClassificationDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: Mapping from field name to a sequence indexable by example
            position (each ``encodings[key][idx]`` is turned into a tensor).
        labels: Sequence of labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example as a dict of tensors, with the label under 'labels'."""
        item = {}
        for key, val in self.encodings.items():
            item[key] = torch.tensor(val[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Create the datasets; labels are converted from pandas Series to plain lists
train_labels = y_train.tolist()
test_labels = y_test.tolist()
train_dataset = BinaryClassificationDataset(train_encodings, train_labels)
test_dataset = BinaryClassificationDataset(test_encodings, test_labels)

In [41]:
from sklearn.metrics import accuracy_score

# Load pre-trained BERT with a 2-class sequence-classification head,
# then move it onto the selected device
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model = model.to(device)

# Evaluation metrics
def compute_metrics(eval_pred):
    """Compute accuracy from a ``(logits, labels)`` pair.

    Args:
        eval_pred: Two-element iterable unpacked as ``logits, labels``; the
            predicted class is the arg-max over the last axis of ``logits``.

    Returns:
        dict with a single key ``"accuracy"``.
    """
    logits, labels = eval_pred
    logits_tensor = torch.tensor(logits)
    # accuracy_score is fed a numpy array of predicted class indices
    predicted_classes = torch.argmax(logits_tensor, dim=-1).numpy()
    return {"accuracy": accuracy_score(labels, predicted_classes)}

# Define the training arguments with Mixed Precision (FP16)
# NOTE(review): newer transformers releases spell `evaluation_strategy` as
# `eval_strategy` -- confirm against the installed version before re-running.
training_config = dict(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',            # TensorBoard logs directory
    logging_steps=10,                # Log every 10 steps
    evaluation_strategy="steps",     # Evaluate during training ...
    eval_steps=100,                  # ... once every 100 steps
    fp16=True,                       # Enable Mixed Precision Training (FP16)
    save_total_limit=2,              # Save only the last 2 checkpoints to save space
    load_best_model_at_end=True,     # Load the best model when training is finished
)
training_args = TrainingArguments(**training_config)

# Initialize the Trainer
# EarlyStoppingCallback is imported here because the notebook's import cell only
# brings in BertTokenizer, BertForSequenceClassification, Trainer and
# TrainingArguments from transformers.
from transformers import EarlyStoppingCallback

# NOTE(review): eval_dataset is the held-out test split, so early stopping and
# best-checkpoint selection are driven by the same data that is later reported
# as "Test Accuracy" -- consider carving out a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,  # Pass the metric function here (the comma was missing, a SyntaxError)
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],  # Early stopping after 3 evaluation steps of no improvement
)

# Wall-clock timestamps are taken immediately before and after the training run
start_time = time.time()
trainer.train()
end_time = time.time()

# Break the elapsed seconds down into hours / minutes / seconds for the report
elapsed_time = end_time - start_time
hours = elapsed_time // 3600
rem = elapsed_time % 3600
minutes = rem // 60
seconds = rem % 60
print(f"Training completed in {int(hours)} hours, {int(minutes)} minutes, and {int(seconds)} seconds.")



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss,Validation Loss,Accuracy
100,0.1762,0.139809,0.982969
200,0.0086,0.007329,0.998393
300,0.0177,0.002255,0.999357
400,0.0003,0.000265,1.0
500,0.001,0.010344,0.997751
600,0.0021,0.048825,0.990039
700,0.0002,0.002295,0.999357
800,0.0001,0.021242,0.997429
900,0.0001,0.007016,0.998393
1000,0.0267,0.007999,0.998715


Training completed in 0 hours, 5 minutes, and 31 seconds.


In [42]:
# Run the Trainer's evaluation pass (on the eval_dataset it was built with)
# and report the accuracy produced by compute_metrics
eval_results = trainer.evaluate()
test_accuracy = eval_results['eval_accuracy']
print(f"Test Accuracy: {test_accuracy}")


Test Accuracy: 1.0


In [43]:
# Widen pandas' text display so long query strings are shown in full
# pd.set_option('display.max_rows', 30)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 5000)
pd.set_option('max_colwidth', 400)

# Dump each split, prefixed by its name
named_splits = (
    ('X_train', X_train),
    ('y_train', y_train),
    ('X_test', X_test),
    ('y_test', y_test),
)
for split_name, split_values in named_splits:
    print(split_name, split_values)

# Last expression of the cell: per-class counts of the test labels
y_test.value_counts()

X_train 4220                                Is early-onset glaucoma inherited ? Early-onset 
5589                                What are the treatments for autosomal recessive 
3547                                Is spinocerebellar ataxia type 3 inherited ? Thi
4981                               where does the movie road to perdition take place
14436                               How many people are affected by amyotrophic late
                                            ...                                     
5191                                   who wrote you must have been a beautiful baby
13418                                                what kind of vw jetta do i have
5390                                         where can a master at arms be stationed
860      which of the following was not one of the functions of the friedmans bureau
7270                                      how many books are in the one piece series
Name: combined_text, Length: 12448, dtype: object
y_train

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,1571
1,1541


In [44]:
# Save the model and the tokenizer side by side in the same Drive folder
save_dir = '/content/drive/MyDrive/qa_classifier'
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


('/content/drive/MyDrive/qa_classifier/tokenizer_config.json',
 '/content/drive/MyDrive/qa_classifier/special_tokens_map.json',
 '/content/drive/MyDrive/qa_classifier/vocab.txt',
 '/content/drive/MyDrive/qa_classifier/added_tokens.json')

In [45]:
# Use CUDA when it is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Move the model to the selected device (GPU/CPU)
model.to(device)

# Sample queries (you can change these to test the model)
# Health-related queries (label = 1)
health_queries = [
    "What are the side effects of taking aspirin?",
    "How can I manage my high blood pressure through diet?",
    "What are the symptoms of seasonal allergies?",
    "What is the best treatment for migraines?",
    "Can regular exercise help with controlling diabetes?",
    "What medications are typically prescribed for asthma?",
    "How often should I schedule a check-up with my doctor?",
    "What is the best way to lower cholesterol levels naturally?",
    "What is a balanced diet plan for someone with heart disease?",
    "How do I know if I need to see a specialist for my back pain?",
]

# Non-health-related queries (label = 0)
non_health_queries = [
    "What time is the next soccer game?",
    "How do I install software on my computer?",
    "What is the capital city of Australia?",
    "Who won the Oscar for best picture in 2020?",
    "Can you recommend some good books for summer reading?",
    "What is the weather forecast for tomorrow?",
    "How do I fix a flat tire on my bike?",
    "When is the next presidential election?",
    "What are the top 5 tourist destinations in Paris?",
    "How can I learn a new programming language?",
]

# Health queries come first, followed by the non-health ones
test_queries = health_queries + non_health_queries

# Tokenize the test queries into padded PyTorch tensors
encoded_queries = tokenizer(test_queries, padding=True, truncation=True, return_tensors="pt", max_length=128)

# Move input tensors to the same device as the model
inputs = {}
for key, value in encoded_queries.items():
    inputs[key] = value.to(device)

# Put the model in evaluation mode
model.eval()

# Forward pass with gradient tracking disabled
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits

# Convert logits to predicted labels (0 = non-health, 1 = health)
predictions = logits.argmax(dim=-1)

# Display results
for i, query in enumerate(test_queries):
    if predictions[i].item() == 1:
        label = "Health"
    else:
        label = "Non-Health"
    print(f"Query: {query}")
    print(f"Prediction: {label}\n")

Query: What are the side effects of taking aspirin?
Prediction: Health

Query: How can I manage my high blood pressure through diet?
Prediction: Health

Query: What are the symptoms of seasonal allergies?
Prediction: Health

Query: What is the best treatment for migraines?
Prediction: Health

Query: Can regular exercise help with controlling diabetes?
Prediction: Health

Query: What medications are typically prescribed for asthma?
Prediction: Health

Query: How often should I schedule a check-up with my doctor?
Prediction: Non-Health

Query: What is the best way to lower cholesterol levels naturally?
Prediction: Health

Query: What is a balanced diet plan for someone with heart disease?
Prediction: Health

Query: How do I know if I need to see a specialist for my back pain?
Prediction: Health

Query: What time is the next soccer game?
Prediction: Non-Health

Query: How do I install software on my computer?
Prediction: Non-Health

Query: What is the capital city of Australia?
Prediction