Iterating FULL:

In [None]:
import torch
import json
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM

# ---------------------------------------------------------------------------
# Zero-shot classification of every narrative against every coded variable
# with SmolLM2, followed by a per-variable accuracy report.
# ---------------------------------------------------------------------------

print("Is CUDA available?", torch.cuda.is_available())
print("CUDA device count:", torch.cuda.device_count())
# current_device()/get_device_name() raise on a machine without CUDA, so only
# query them when a GPU is present (the script falls back to the CPU below).
if torch.cuda.is_available():
    print("Current CUDA device:", torch.cuda.current_device())
    print("CUDA device name:", torch.cuda.get_device_name(torch.cuda.current_device()))

# Check if GPU is available and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-135M-Instruct")
model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM2-135M-Instruct").to(device)

# Load the JSON file
with open("data/feature_classification_input.json", "r") as f:
    json_data = json.load(f)

# Extract the "features" array from the "sections" in the JSON
variables = json_data["sections"]["features"]

# Load the CSV files
features_df = pd.read_csv('data/private_data/train_features.csv')
labels_df = pd.read_csv('data/private_data/train_labels.csv')

# 'NarrativeCME' is the column holding the free-text narratives.
# NOTE(review): rows of train_features.csv and train_labels.csv are assumed to
# be in the same order (same uid per row) - confirm before trusting accuracies.
target_texts = features_df['NarrativeCME'].astype(str)  # Use the entire column for predictions
true_labels = labels_df.drop(columns=['uid']).astype(int)  # Drop 'uid' column

# System message
system_message = (
    "You are a fine-tuned language model designed to assist with interpreting and coding variables related to "
    "mental health and behavioral data. Your task is to accurately classify input data according to predefined "
    "variables, using the provided instructions and definitions as guidelines. Make decisions objectively, follow "
    "the coding rules precisely, and avoid making inferences not supported by the given information."
)


def parse_binary_answer(generated_text):
    """Extract a 0/1 answer from the model's reply.

    The reply is first parsed as a number and rounded; if that fails, the
    first standalone "0" or "1" token (ignoring surrounding punctuation) is
    used. Returns None when no 0/1 answer can be found.
    """
    reply = generated_text.strip()
    try:
        value = int(round(float(reply)))
        if value in (0, 1):
            return value
    except (ValueError, OverflowError):
        pass  # Not a bare number; fall back to scanning the tokens.
    for token in reply.split():
        cleaned = token.strip(".,:;!?()[]{}\"'*")
        if cleaned in ("0", "1"):
            return int(cleaned)
    return None


# One list of predictions per variable, keyed by variable name, so the result
# becomes a DataFrame with one column per variable and one row per narrative.
# NOTE(review): accuracy is only computed for variable names that also appear
# as column names in train_labels.csv - confirm the names match.
predictions = {}

# Loop through each variable and all texts
for variable in variables:
    variable_name = variable["name"]
    definition = variable["definition"]
    instruction = variable["instruction"]

    variable_predictions = []

    # Create an input prompt for each text based on the variable's definition and instructions
    for text in target_texts:
        input_text = (
            f"System instruction: {system_message} "
            f"Variable: {variable_name}\nDefinition: {definition}\nInstruction: {instruction}\n\n"
            f"Context: {text}\nAnswer only 0 (as no) or 1 (as yes)."
        )

        # Tokenize a single prompt. No padding: a lone sequence needs none, and
        # right-padding to max_length would make the model continue after pads.
        # NOTE(review): truncation drops the END of long prompts, which is where
        # the "Answer only 0 ... or 1" instruction lives - consider shortening
        # the narrative instead.
        inputs = tokenizer(
            input_text,
            return_tensors="pt",
            max_length=512,
            truncation=True
        ).to(device)  # Move input tensors to the selected device

        # Greedy decoding keeps the evaluation reproducible from run to run.
        with torch.no_grad():
            outputs = model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_new_tokens=50,
                do_sample=False
            )

        # generate() returns prompt + continuation for decoder-only models;
        # keep only the newly generated tokens before parsing the answer.
        prompt_length = inputs["input_ids"].shape[1]
        decoded_output = tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        ).strip()

        # None marks replies that contain no usable 0/1 answer.
        variable_predictions.append(parse_binary_answer(decoded_output))

    predictions[variable_name] = variable_predictions

# Convert predictions to a DataFrame (rows: narratives, columns: variables)
predictions_df = pd.DataFrame(predictions)

# Save the results to a CSV file
predictions_df.to_csv('data/smollm_predictions_full.csv', index=False)

# Now compare predictions with true labels for accuracy
# Load the predictions and true labels CSV files
predictions_df = pd.read_csv('data/smollm_predictions_full.csv')
true_labels_df = pd.read_csv('data/private_data/train_labels.csv')

# Drop the 'uid' column from true_labels_df if it exists
true_labels_df = true_labels_df.drop(columns=['uid'], errors='ignore')

# Print shapes to diagnose
print("Predictions Shape:", predictions_df.shape)
print("True Labels Shape:", true_labels_df.shape)

# Check for matching indices
print("Predictions Indices:", predictions_df.index)
print("True Labels Indices:", true_labels_df.index)

# Calculate accuracies by comparing each column in predictions_df to the corresponding column in true_labels_df.
# Missing predictions (NaN after the CSV round trip) never equal a label, so they count as errors.
accuracies = {}
for column in predictions_df.columns:
    if column in true_labels_df.columns:
        # Perform comparison
        accuracies[column] = (predictions_df[column] == true_labels_df[column]).mean()

# Display the accuracies for each column
for column, accuracy in accuracies.items():
    print(f"Accuracy for '{column}': {accuracy:.4f}")

# Calculate and display the average accuracy across all columns
if accuracies:
    average_accuracy = sum(accuracies.values()) / len(accuracies)
    print(f"\nAverage Accuracy: {average_accuracy:.4f}")
else:
    # Avoid a ZeroDivisionError when no prediction column matches a label column.
    print("\nNo prediction columns match the label columns; accuracy was not computed.")


Is CUDA available? True
CUDA device count: 1
Current CUDA device: 0
CUDA device name: NVIDIA GeForce RTX 2060 SUPER


In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, AdamW
import pandas as pd
import json

# Pick the compute device: CUDA when torch can see a GPU, CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

# Pretrained checkpoint shared by the tokenizer and the causal language model.
model_name = "HuggingFaceTB/SmolLM2-135M-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model = model.to(device)  # place the weights on the selected device

# Read the variable definitions file.
with open("data/feature_classification_input.json", "r") as f:
    json_data = json.loads(f.read())

# The list of variables lives under sections -> features.
variables = json_data["sections"]["features"]

# Training narratives and their labels, loaded from the two CSV files.
features_df, labels_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/private_data/train_features.csv",
        "data/private_data/train_labels.csv",
    )
)
# Prepare the data
class MentalHealthDataset(Dataset):
    """Dataset pairing each narrative with one prompt covering every variable.

    Args:
        texts: Narratives, either a pandas Series or a plain sequence of str.
        variables: List of dicts, each providing "name", "definition" and
            "instruction" keys.
        labels: pandas DataFrame with one row per narrative; rows are matched
            to ``texts`` by position.

    Each item is a ``(combined_prompt, label_tensor)`` tuple, where
    ``label_tensor`` is the narrative's label row as a float32 tensor.
    """

    def __init__(self, texts, variables, labels):
        self.texts = texts
        self.variables = variables
        self.labels = labels

    def __len__(self):
        """Number of narratives in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the prompt string and label tensor for the narrative at position ``idx``."""
        # Index texts by POSITION, like the labels below. Plain Series[idx] is
        # label-based and would fail (or pick the wrong row) whenever the
        # Series index is not 0..n-1, e.g. after filtering or shuffling.
        if hasattr(self.texts, "iloc"):
            text = self.texts.iloc[idx]
        else:
            text = self.texts[idx]
        label = self.labels.iloc[idx].values

        # Build one detailed prompt per variable for this narrative.
        prompts = [
            (
                f"Variable: {variable['name']}\nDefinition: {variable['definition']}\n"
                f"Instruction: {variable['instruction']}\n\n"
                f"Context: {text}\nAnswer only 0 (as no) or 1 (as yes)."
            )
            for variable in self.variables
        ]

        # Combine prompts into a single string (for simplicity in this example).
        # NOTE(review): with many variables the joined prompt is likely far
        # longer than the 512-token truncation applied in the training loop,
        # so later variables may never be seen - consider one item per
        # (text, variable) pair.
        combined_prompt = " ".join(prompts)
        return combined_prompt, torch.tensor(label, dtype=torch.float32)

# Create the dataset and dataloader
target_texts = features_df['NarrativeCME'].astype(str)
true_labels = labels_df.drop(columns=['uid']).astype(int)
dataset = MentalHealthDataset(target_texts, variables, true_labels)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

# Batched tokenization needs a pad token; fall back to EOS if none is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Set up the optimizer. torch.optim.AdamW replaces the deprecated AdamW class
# from transformers (that imported name is no longer referenced here).
# weight_decay and eps are spelled out rather than left to library defaults,
# which differ between the two implementations.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Training loop
# NOTE(review): the objective below is plain next-token prediction over the
# prompt text. The 0/1 class labels yielded by the dataset are never part of
# the loss, so this does not teach the model the answers - confirm whether the
# target answer should be appended to each prompt.
model.train()
num_epochs = 3
for epoch in range(num_epochs):
    total_loss = 0.0
    for prompts, class_labels in dataloader:
        # Tokenize the prompts, padding only up to the longest prompt in the
        # batch (capped at 512 tokens by truncation).
        inputs = tokenizer(
            list(prompts),
            return_tensors="pt",
            max_length=512,
            truncation=True,
            padding="longest"
        ).to(device)

        # Language-modelling targets: the input ids, with padded positions set
        # to -100 so that they are ignored by the loss.
        lm_labels = inputs["input_ids"].clone()
        lm_labels[inputs["attention_mask"] == 0] = -100

        # Forward pass
        outputs = model(**inputs, labels=lm_labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # max(..., 1) avoids a ZeroDivisionError when the dataloader is empty.
    avg_loss = total_loss / max(len(dataloader), 1)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}")

# Save the fine-tuned model and tokenizer
model.save_pretrained("fine_tuned_model_with_json")
tokenizer.save_pretrained("fine_tuned_model_with_json")

print("Model fine-tuned and saved locally in 'fine_tuned_model_with_json' directory!")
