In [None]:
import faiss
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
import csv
import time
import re
from tqdm import tqdm
 
# Read the dataset CSV into a DataFrame and preview its first rows.
file_path = '/content/final_gemma9b_150datapoints.csv'
df = pd.read_csv(file_path)
display(df.head())

# Print every distinct value found in the 'actual_status' column.
status_column = df['actual_status']
unique_status_values = status_column.unique().tolist()
print(unique_status_values)

# Remove the 'predicted_status' column, then preview the frame again.
df = df.drop('predicted_status', axis=1)
display(df.head())
 
# Maximum number of rows kept for each 'actual_status' class.
target_samples = 30

# Build a balanced dataset: every class with more than `target_samples` rows
# is randomly undersampled (fixed seed for reproducibility); smaller classes
# are kept in full.  Groups are sampled one by one and concatenated instead of
# going through `groupby(...).apply(...)`, because operating on the grouping
# column inside `apply` is deprecated in recent pandas and would eventually
# drop 'actual_status' from the result.
sampled_groups = [
    group.sample(n=target_samples, random_state=42)
    if len(group) > target_samples
    else group
    for _, group in df.groupby('actual_status')
]

# Renumber the rows 0..n-1.  With no groups at all, fall back to an empty
# frame with the same columns rather than letting pd.concat raise.
undersampled_df = (
    pd.concat(sampled_groups) if sampled_groups else df.iloc[:0]
).reset_index(drop=True)

# Per-class row counts of the balanced dataset
print(undersampled_df['actual_status'].value_counts())

# Preview of the first rows and the total number of rows
display(undersampled_df.head())
print(len(undersampled_df))
 
 
from transformers import pipeline
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
 
 
 
# Checkpoint id handed to `from_pretrained` for both the model and tokenizer.
READER_MODEL_NAME = "google/gemma-2-9b-it"
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Optional 4-bit quantisation, currently disabled.
# NOTE(review): if this is re-enabled, the explicit `.to(device)` below
# probably has to go, since quantised models are placed at load time — confirm.
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16,
# )
model = AutoModelForCausalLM.from_pretrained(
    READER_MODEL_NAME
    #, quantization_config=bnb_config
).to(device)
tokenizer = AutoTokenizer.from_pretrained(READER_MODEL_NAME)

# Text-generation pipeline used to answer every query.  It returns only the
# newly generated text (`return_full_text=False`), capped at 500 new tokens.
READER_LLM = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    do_sample=True,
    temperature=0.2,
    repetition_penalty=1.1,
    return_full_text=False,
    max_new_tokens=500,
    # Reuse the device chosen above; a hard-coded GPU index 0 would fail on
    # CPU-only machines and contradict the fallback.
    device=device,
)
csv_file_path = "/content/phase1_answers.csv"
txt_file_path = "/content/phase1_answers.txt"

# Finds the "**Diagnosed Mental Disorder**" header and captures the label
# that follows it.  The colon may sit inside or outside the bold markers and
# the label itself may be wrapped in "**".  Inside the captured label only
# spaces/tabs and hyphens may separate words, so the capture stays on one line
# instead of running into the following paragraph.
_DISORDER_PATTERN = re.compile(
    r"\*\*Diagnosed Mental Disorder:?\*\*[:\-]*\s*(?:\*\*)?\s*(\w+(?:[ \t]*\-*\w+)*)"
)


def _build_prompt(question):
    """Return the chat-templated prompt string for a single user statement."""
    messages = [
        {
            "role": "user",
            "content": f"""You are an AI assistant specialized in diagnosing mental disorders in humans.
              Using the information contained in the context, answer the question comprehensively.
 
              The **Diagnosed Mental Disorder** must be exactly one from the list below:
              [Depression, Suicidal, Anxiety, Stress, Bi-Polar, Personality Disorder]
 
              ---
 
              Question: {question}""",
        },
    ]
    # Only the user turn is passed: `add_generation_prompt=True` already opens
    # the assistant turn, so an extra empty assistant message would put a
    # blank reply in front of the real one.
    return tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )


def _extract_disorder(answer):
    """Return the label following the disorder header, or "Unknown"."""
    found = _DISORDER_PATTERN.search(answer)
    return found.group(1).strip() if found else "Unknown"


start_time = time.time()
with open(txt_file_path, 'w', encoding='utf-8') as txt_file, \
     open(csv_file_path, mode='w', newline='', encoding='utf-8') as csv_file:

    # CSV output: one header row, then one row per query.
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(["User Query", "Predicted Disorder", "Full Response"])

    user_queries = undersampled_df["statement"].tolist()

    for sample_question in tqdm(user_queries, desc="Processing Queries"):
        prompt_text = _build_prompt(sample_question)

        # The templated text already carries the special tokens, so the
        # pipeline is told not to add them again when it tokenizes the prompt.
        output = READER_LLM(prompt_text, add_special_tokens=False)[0]["generated_text"]

        predicted_disorder = _extract_disorder(output)

        # Human-readable log: query, full answer, separator line.
        txt_file.write(f"Query: {sample_question}\n")
        txt_file.write(f"Answer:\n{output}\n")
        txt_file.write(f"{'='*50}\n\n")

        # Structured log: query, extracted label, full answer.
        csv_writer.writerow([sample_question, predicted_disorder, output])

end_time = time.time()
elapsed_time = end_time - start_time

print("\n✅ All user queries processed successfully and answers saved to 'phase1_answers.txt' & 'phase1_answers.csv' ")
print(f"⏱️ Total processing time: {elapsed_time:.2f} seconds")