# Run general classifier for 'study type' classification of articles

- For further information, please refer to https://pypi.org/project/general-classifier/#4-evaluate-prompt-performance
- Load dataset and install necessary libraries
- Prepare dataset, i.e., drop irrelevant columns and merge "PaperTitle" and "Abstract"
- Set up general classifier (gc), load categories and define prompt
- Define LLM to use
- Run gc on dataset 
- Evaluate performance
- Merge with initial dataset


## 1) Set up libraries and datasets

In [1]:
# Install libraries
!pip install guidance
!pip install pydantic --upgrade
!pip install openai
!pip install general-classifier
print("Success!")

Success!


In [2]:
# Import libraries
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import time
import re
from tqdm import tqdm
import csv
import general_classifier
from general_classifier import gc
import csv
import transformers
import re
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSeq2SeqLM, TextStreamer
import torch
import requests
import sentencepiece as spm
import time
import json
import torch
import ast
import openai
from openai import OpenAI

print("Success!")

2025-03-20 21:12:09.335889: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-20 21:12:09.350675: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742501529.367407   31422 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742501529.372421   31422 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-20 21:12:09.389455: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

Success!


In [None]:
# Set the working directory and file paths
input_directory = "INPUT_DIRECTORY"
output_directory = "OUTPUT_DIRECTORY"
classifier_directory = "CLASSIFIER_DIRECTORY"

# Change the working directory
os.chdir(classifier_directory)
print("Current Working Directory:", os.getcwd())

In [4]:
# Load datasets

# Load topics and categories
os.chdir(input_directory)
print("Current Working Directory:", os.getcwd())
topics_and_categories = pd.read_csv("studydesign.csv")
categories = topics_and_categories.iloc[:, 0].dropna().tolist()
print("--> Topics and categories loaded!")

# Load dataset to run classifier
os.chdir(output_directory)
print("\nCurrent Working Directory:", os.getcwd())

cancer_df = pd.read_csv("binary_cancer_matrix_filtered.csv")
len_cancer_df=len(cancer_df)
print(f" --> Total rows in cancer dataset: {len_cancer_df:,}")

cancer_df = cancer_df[['PaperId', 'PaperTitle', 'Abstract']].copy()
#print(cancer_df)

print("\n\nSuccess!")
os.chdir(classifier_directory)
print("Current Working Directory:", os.getcwd())

Current Working Directory: /data/JH/marie/TrendyVariants/Input
--> Topics and categories loaded!


## 2) Set up general calssifier

In [5]:
# Setup gc
gc.setModel(newModel="meta-llama/Llama-3.3-70B-Instruct", newModelType="DeepInfra", newInferenceType="cloud", api_key="API_KEY")
print("Success!")

Success!


In [6]:
gc.removeAllTopics()
gc.add_topic(
    topic_name="Study Type",
    categories=categories  # Use extracted categories
)
# Display all defined topics and their categories
gc.show_topics_and_categories()

All topics have been removed, counters reset, and related data cleared.
Topic 1 (ID=A): Study Type
  Prompt: INSTRUCTION: You are a helpful classifier. You select the correct of the possible categories for classifying a piece of text. The topic of the classification is '[TOPIC]'. The allowed categories are '[CATEGORIES]'. QUESTION: The text to be classified is '[TEXT]'. ANSWER: The correct category for this text is '
    1. Clinical study (ID=a)
    2. Case report study (ID=b)
    3. In vitro study (ID=c)
    4. In silico study (ID=d)
    5. In vivo/Animal study (ID=e)
    6. Behavioral study (ID=f)
    7. Observational/RWE study (ID=g)
    8. Systematic review study (ID=h)


In [7]:
gc.setPrompt(
    topicId="A", 
    newPrompt=(
        "Prompt: INSTRUCTION: You are a helpful classifier. You are given the abstract of a "
        "scientific, biomedical publication and you have to select the correct of the possible categories. "
        "The topic of the classification is '[TOPIC]'. The allowed categories are '[CATEGORIES]'. "
        "QUESTION: The abstract to be classified is '[TEXT]'. "
        'ANSWER: The correct category for this abstract is "".'
    )
)

Prompt for topic ID A updated.


In [8]:
# Show prompt and topic with associated categories
gc.show_topics_and_categories()

Topic 1 (ID=A): Study Type
  Prompt: Prompt: INSTRUCTION: You are a helpful classifier. You are given the abstract of a scientific, biomedical publication and you have to select the correct of the possible categories. The topic of the classification is '[TOPIC]'. The allowed categories are '[CATEGORIES]'. QUESTION: The abstract to be classified is '[TEXT]'. ANSWER: The correct category for this abstract is "".
    1. Clinical study (ID=a)
    2. Case report study (ID=b)
    3. In vitro study (ID=c)
    4. In silico study (ID=d)
    5. In vivo/Animal study (ID=e)
    6. Behavioral study (ID=f)
    7. Observational/RWE study (ID=g)
    8. Systematic review study (ID=h)


## 3) Run general classifier

In [None]:
##### Create a article subset, remove everything except the column to classify and the PaperID

# Create the new column by combining "PaperTitle" and "Abstract"
subset_df["PaperTitle_and_Abstracts"] = subset_df["PaperTitle"].astype(str) + " " + subset_df["Abstract"].astype(str)

# Keep only the required columns
subset_df_gccolumn = subset_df[["PaperId", "PaperTitle_and_Abstracts"]].copy()

# Save the new dataset as CSV
csv_path = "subset_df_gccolumn.csv"
subset_df_gccolumn.to_csv(csv_path, index=False)

# Print confirmation and show first rows
print(f"File saved: {csv_path}")
print(subset_df_gccolumn.head(20))
print("Length of dataset:",len(subset_df_gccolumn))

In [None]:
# Run classification on the subset CSV file
os.chdir(output_directory)

# Get start timestamp
start_time = time.time()

gc.classify_table(dataset="subset_df_gccolumn", withEvaluation=False, constrainedOutput=True)
# withEvaluation=False if not done manually
print("Success!")

# Get end timestamp
end_time = time.time()

# Calculate runtime duration
runtime_duration = end_time - start_time

# Generate output file name
output_filename = f"Study_design_classification_runtime_subset_data.txt"

os.chdir(output_directory)
# Save timestamp information to file
with open(output_filename, "w") as f:
    f.write(f"Start Time: {time.ctime(start_time)}\n")
    f.write(f"End Time: {time.ctime(end_time)}\n")
    f.write(f"Total Runtime (seconds): {runtime_duration:.4f}\n")

print(f"Runtime saved in {output_filename}")

## 4) Batch running of general calssifier

In [None]:
subset_df=cancer_df.copy()
print(f"Length of cancer dataset to process in batches: {len(subset_df):,}")

In [None]:
# Create a new column by combining "PaperTitle" and "Abstract"

subset_df["PaperTitle_and_Abstracts"] = subset_df["PaperTitle"].astype(str) + " " + subset_df["Abstract"].astype(str)
subset_df_gccolumn = subset_df[["PaperId", "PaperTitle_and_Abstracts"]].copy()

# Change the working directory
classifier_directory = "/data/JH/marie/TrendyVariants/Output/gc_batch_files"
os.chdir(classifier_directory)
print("Current Working Directory:", os.getcwd())

# Save the full newly created dataset as CSV
csv_path = "subset_df_gccolumn.csv"
subset_df_gccolumn.to_csv(csv_path, index=False)

# Print confirmation and show first rows
print(f"File saved: {classifier_directory}")
print(subset_df_gccolumn.head(20))
print("\n\nLength of dataset:", len(subset_df_gccolumn))

# ---- Dynamically Create Batches ----
# Define batch size
batch_size = 50000
total_batches = (len(subset_df_gccolumn) + batch_size - 1) // batch_size
print(f"\n\nTotal number of batches: {total_batches}")


# Split dataset into batches and save them
for batch_num in range(total_batches):
    start_idx = batch_num * batch_size
    end_idx = min(start_idx + batch_size, len(subset_df_gccolumn))

    # Extract batch
    batch_df = subset_df_gccolumn.iloc[start_idx:end_idx]

    # Save batch as a separate CSV
    batch_filename = os.path.join(classifier_directory, f"subset_df_gccolumn_batch_{batch_num+1}.csv")
    batch_df.to_csv(batch_filename, index=False)

    # Print confirmation
    print(f"Batch {batch_num+1}/{total_batches} saved: {batch_filename} | Articles in batch: {len(batch_df)}")

In [13]:
# Batch processing

# Manually define the current batch to process!
n = 5  # Define the current batch number manually!!!!
# if new running
lastprocessedbatch=4
total_batches=5
batch_size = 50000

# Check if n and last processed batch are the same
if n == lastprocessedbatch:
    print("\n !!! UPDATE BATCH NUMBER WARNING !!!")
    print(f"The current batch number ({n}) is the same as the last processed batch ({lastprocessedbatch}).")
    print("Please ensure you are not reprocessing the same batch unless intended.\n")

print(f"Batch to be processed next: {n}")
print(f"Batches already processed: {lastprocessedbatch:,} / {total_batches:,} batches")
print(f"Batches to process: {total_batches - n}")
print(f"Batch size: {batch_size:,} articles")
print(f"Total batches of dataset: {total_batches:,}")

Batch to be processed next: 5
Batches already processed: 4 / 5 batches
Batches to process: 0
Batch size: 50,000 articles
Total batches of dataset: 5


In [None]:
print(f"Processing Batch {n}/{total_batches}...")

# Run classification on batch data
os.chdir(classifier_directory)
start_time = time.time()

gc.classify_table(dataset=f"subset_df_gccolumn_batch_{n}", withEvaluation=False, constrainedOutput=True) #Need to define the CSV!
# withEvaluation=False if not done manually
print("\n\nSuccess!")
print("File saved as subset_df_gccolumn_batch_n_(restuls)!")

# Get end timestamp
end_time = time.time()
runtime_duration = end_time - start_time


##############

# Generate output file name
output_filename = f"Study_design_classification_runtime_subset_data_batch_{n}.txt"
lastprocessedbatch=n

# Convert runtime to minutes and hours
runtime_minutes = runtime_duration / 60
runtime_hours = runtime_duration / 3600


# Print runtime information to console
print(f"\n ######### Batch {n} Runtime Summary: ########")
print(f"- Start Time: {time.ctime(start_time)}")
print(f"- End Time: {time.ctime(end_time)}")
print(f"- Total Runtime: {runtime_duration:.4f} seconds | {runtime_minutes:.2f} minutes | {runtime_hours:.2f} hours")

# Save timestamp information to file
with open(output_filename, "w") as f:
    f.write(f"Start Time: {time.ctime(start_time)}\n")
    f.write(f"End Time: {time.ctime(end_time)}\n")
    f.write(f"Total Runtime (seconds): {runtime_duration:.4f}\n")
    f.write(f"Total Runtime (minutes): {runtime_minutes:.2f}\n")
    f.write(f"Total Runtime (hours): {runtime_hours:.2f}\n")

print(f"\nRuntime saved in {output_filename}\n")

Processing Batch 5/5...
Processed 100/39078 rows (0.26%)


In [None]:
# Create total runtime calculation

# Initialize total runtime accumulator
total_cumulative_seconds = 0

# Loop through all batch files up to total_batches
for n in range(1, total_batches + 1):
    filename = f"Study_design_classification_runtime_subset_data_batch_{n}.txt"
    file_path = os.path.join(classifier_directory, filename)

    # Check if the file exists, otherwise skip
    if not os.path.exists(file_path):
        print(f"Skipping missing file: {filename}")
        continue

    # Read the file and extract runtime in seconds
    with open(file_path, "r") as f:
        for line in f:
            if "Total Runtime (seconds):" in line:
                runtime_seconds = float(line.strip().split(":")[1])
                total_cumulative_seconds += runtime_seconds
                break  # No need to read further

# Convert total runtime to minutes and hours
total_cumulative_minutes = total_cumulative_seconds / 60
total_cumulative_hours = total_cumulative_seconds / 3600

# Print results
print(f"\nTotal cumulative runtime of general calssifier for study categorization:")
print(f"Seconds: {total_cumulative_seconds:.4f}")
print(f"Minutes: {total_cumulative_minutes:.2f}")
print(f"Hours: {total_cumulative_hours:.2f}")

# ==============================

## 4) Merge batch dataframes

In [None]:
print("Number of total batches:", total_batches)
os.chdir(classifier_directory)

for n in range(1, total_batches + 1):
    input_filename = f"subset_df_gccolumn_batch_{n}_(result).csv"
    output_filename = f"fixed_output_batch_{n}.csv"

    # Check if the file exists, if not, skip it
    if not os.path.exists(input_filename):
        print(f"Batch {n}: File not found. Skipping...")
        continue

    try:
        # Read the file while forcing a semicolon separator
        df = pd.read_csv(input_filename, sep=";", engine="python")

        # Save it back properly as a CSV with commas as delimiters
        df.to_csv(output_filename, index=False, sep=",", quoting=1)

        # Read again, fix delimiters, rename columns, and save in one go
        df.rename(columns={df.columns[0]: "PaperId,PaperTitle_and_Abstracts", 
                           df.columns[1]: "Study_design"}, inplace=True)
        df.to_csv(output_filename, index=False, sep=",", quoting=1)

    except Exception as e:
        print(f"Batch {n}: Error processing file - {e}")

print("\nProcessing complete for all available batches!")

# Print first 5 rows of Batch 1 for verification
file_1 = "fixed_output_batch_1.csv"
if os.path.exists(file_1):
    print(pd.read_csv(file_1).head(5))
else:
    print("\nBatch 1 is not available yet.")

In [None]:
# Merge the batches

# List to store DataFrames
merged_df_list = []

for n in range(1, total_batches + 1):
    file_name = f"fixed_output_batch_{n}.csv"
    
    if os.path.exists(file_name):  # Check if file exists before merging
        df = pd.read_csv(file_name)
        merged_df_list.append(df)
        print(f"Batch {n} added to merge with {len(df)} rows.")
    else:
        print(f"Batch {n} is missing. Skipping...")

# Merge all available DataFrames
final_merged_gc_df = pd.concat(merged_df_list, ignore_index=True)
final_merged_gc_df.to_csv("final_merged_gc_output.csv", index=False, sep=",", quoting=1)
print(f"\nFinal merged dataset saved as 'final_merged_gc_output.csv' with {len(final_merged_df)} total rows.")
print(final_merged_gc_df)

In [None]:
### Split classifier output columns
os.chdir(classifier_directory)
merged_df = pd.read_csv("final_merged_gc_output.csv")
print("**Merged batches of GC dataset:**")
print(merged_df)

# Split PaperId from PaperTitle_and_Abstracts
merged_df[['PaperId', 'PaperTitle_and_Abstracts']] = merged_df['PaperId,PaperTitle_and_Abstracts'].str.split(',', n=1, expand=True)
merged_df['PaperId'] = merged_df['PaperId'].str.strip('"').astype(int)


# Drop unnecessary columns
merged_df.drop(columns=['PaperId,PaperTitle_and_Abstracts', 'PaperTitle_and_Abstracts'], inplace=True)
print("\n\n**Merged batches of GC dataset after splitting and dropping PaperTitle and Abstract columns:''")
print(merged_df)
# Convert PaperId in article_df to integer (to ensure proper merging)
article_df['PaperId'] = article_df['PaperId'].astype(int)

print("\n\n-->Success!")

# Save the final merged gc batch file

gclength = len(merged_df)
output_filename = f"corrected_classification_gc_output_{gclength}.csv"
merged_df.to_csv(output_filename, index=False)

# Print the dataset and its length
print(merged_df)
print(f"\nLength of merged gc dataset: {gclength}")
print(f"Final file saved as: {output_filename}")

In [None]:
# Load  full dataset for merging
os.chdir(output_directory)
print("\nCurrent Working Directory:", os.getcwd())
cancer_df = pd.read_csv("binary_cancer_matrix_filtered.csv")
len_cancer_df=len(cancer_df)
print(f" --> Total rows in cancer dataset: {len_cancer_df:,}")
cancer_df = cancer_df[['PaperId', 'PaperTitle', 'Abstract']].copy()
print(f" --> Reconfirmed: Total rows in cancer dataset: {len_cancer_df:,}")
print("--> Full cancer dataset loaded!")
os.chdir(classifier_directory)
print("\nCurrent Working Directory:", os.getcwd())

In [None]:
### Perform the final merge
os.chdir(classifier_directory)
print("\nCurrent Working Directory:", os.getcwd())
# Start timing
start_time = time.time()

# Initialize progress bar
total_rows = len(merged_df)
with tqdm(total=total_rows, desc="Merging Data", unit="rows") as pbar:
    full_merged_df = merged_df.merge(cancer_df, on="PaperId", how="left", indicator=True)
    pbar.update(total_rows)  # Update progress bar after merging

# End timing and calculate estimated duration
end_time = time.time()
execution_time = end_time - start_time

# Count matches
successful_matches = full_merged_df['_merge'].value_counts().get('both', 0)
failed_matches = full_merged_df['_merge'].value_counts().get('left_only', 0)

# Drop the merge indicator column
full_merged_df.drop(columns=['_merge'], inplace=True)

# Save the final merged file
full_merged_df.to_csv("full_merged_output_after_classification.csv", index=False)

# Save the final merged file
full_length = len(full_merged_df)
output_filename = f"final_gc_classificaton_output_{full_length}.csv"
full_merged_df.to_csv(output_filename, index=False)

# Print match statistics and estimated execution time
print(f"Total Entries in merged_df: {len(merged_df)}")
print(f"Successful Matches: {successful_matches}")
print(f"Unmatched Entries: {failed_matches}")
print(f"Execution Time: {execution_time:.2f} seconds")
print(f"Final file saved as: {output_filename}")
print(f"Total rows in final dataset: {full_length}")