In [None]:
import pandas as pd
import re

# Define file names and their reasoning status
file_configs = {
    "generated_Few_Shot_DeepMental_responses.csv": True,   # Has <think>
    "generated_Fine_Tune_DeepMental_responses.csv": True,  # Has <think>
    "generated_Fine_Tune_Traditional_responses.csv": False  # No <think>
}

# Initialize empty list for combined data
combined_data = []

# Function to extract response after "### Response:"
def extract_response(text):
    match = re.search(r"### Response:\s*(.*)", text, re.DOTALL)
    return match.group(1).strip() if match else text.strip()

# Function to remove <think>...</think>
def remove_think_tags(text):
    return re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()

# Process each file
for file, has_think in file_configs.items():
    try:
        df = pd.read_csv(file)
        
        # Extract response text
        df["clean_response"] = df["response"].apply(extract_response)
        
        if has_think:
            # Remove <think> for the no-reasoning version
            df["response_without_think"] = df["clean_response"].apply(remove_think_tags)
        else:
            # If the file does not contain <think>, keep responses the same
            df["response_without_think"] = df["clean_response"]
        
        # Add a column to track which model generated the response
        df["model"] = file.replace(".csv", "")  # Use filename as model identifier

        # Append data to combined list
        combined_data.extend(df[["instruction", "input", "clean_response", "response_without_think", "model"]].values.tolist())

    except FileNotFoundError:
        print(f"⚠️ Warning: File {file} not found. Skipping.")

# Create a single DataFrame with all responses
df_combined = pd.DataFrame(combined_data, columns=["instruction", "input", "response_with_think", "response_without_think", "model"])

# Save the final combined file
df_combined.to_csv("final_combined_responses.csv", index=False)

print("✅ Final combined response file saved as 'final_combined_responses.csv'!")


In [3]:
from datasets import load_dataset
import pandas as pd

# Load the dataset from Hugging Face
DATASET_ID = "ShenLab/MentalChat16K"
dataset = load_dataset(DATASET_ID, split="train")  # Assuming 'train' split

# Convert to DataFrame
df_expected = pd.DataFrame(dataset)

# Select relevant columns: Assuming 'input' and 'response' are the correct column names
if "input" in df_expected.columns and "response" in df_expected.columns:
    df_expected = df_expected[["input", "response"]]  # 'response' is considered the expected output
else:
    print("⚠️ Warning: Column names might be different in the dataset. Check and update accordingly.")

# Save expected responses
csv_filename = "expected_responses.csv"
df_expected.to_csv(csv_filename, index=False)

print(f"✅ Expected responses file saved as '{csv_filename}'!")

# Display a sample of expected responses
print(df_expected.head())


✅ Expected responses file saved as 'expected_responses.csv'!
                                         instruction  \
0  You are a helpful mental health counselling as...   
1  You are a helpful mental health counselling as...   
2  You are a helpful mental health counselling as...   
3  You are a helpful mental health counselling as...   
4  You are a helpful mental health counselling as...   

                                               input  \
0  I've been struggling with my mental health for...   
1  I've been feeling overwhelmed with my caregivi...   
2  I've been feeling constantly anxious and unabl...   
3  My mom has Alzheimer's, and I've been her prim...   
4  I've tried setting boundaries, but it feels li...   

                                              output  
0  I understand that you've been dealing with a s...  
1  Your situation is complex, and it's important ...  
2  I can see that you're dealing with a great dea...  
3  I'm sorry to hear that your siblings' dema

In [7]:
# Define file paths
expected_responses_file = "expected_responses.csv"
generated_responses_file = "final_combined_responses.csv"

# Load the expected responses file
df_expected = pd.read_csv(expected_responses_file)

# Load the generated responses file
df_generated = pd.read_csv(generated_responses_file)

# Ensure only one match per input by removing duplicates from df_expected
df_expected = df_expected.drop_duplicates(subset=["input"])

# Merge the expected responses with generated responses based on 'input'
df_merged = df_generated.merge(df_expected[["input", "output"]], on="input", how="left")

# Rename the expected response column for clarity
df_merged.rename(columns={"response": "expected_response"}, inplace=True)

# Save the final file with expected responses included
df_merged.to_csv("final_combined_responses_with_expected.csv", index=False)

# Display the final merged DataFrame


print("✅ Final file with expected responses saved as 'final_combined_responses_with_expected.csv'!")


✅ Final file with expected responses saved as 'final_combined_responses_with_expected.csv'!
