# Data Preprocessing

We will preprocess each raw dataset and merge the results into a single CSV file with two columns: "Patient" (the question) and "Doctor" (the answer).

In [None]:
''' The first dataset contains Description, Patient, and Doctor columns holding questions and answers. We drop the Description column and replace the greetings
"Hi Doctor" and "Hello Doctor" with a plain "Hi" or "Hello". Stray encoding artifacts such as Â are removed as well.
The result is then written to a new CSV that holds our integrated data.'''

import pandas as pd
import os
import re

# NOTE(review): the other cells read raw data from MedicBot\MedicBot\raw_dataset;
# confirm that this dataset really lives one level higher.
INPUT_CSV_PATH = r"MedicBot\raw_dataset\ai-medical-chatbot.csv"
OUTPUT_CSV_PATH = r"MedicBot\MedicBot\Dataset\Data_Preprocessed.csv"   # Output file

try:
    df = pd.read_csv(INPUT_CSV_PATH)
except Exception as e:
    # Raise instead of calling exit(1): exit() is an interactive-shell helper and
    # does not stop a notebook run with a normal, inspectable traceback.
    raise RuntimeError(f"Error loading CSV: {e}") from e

# Drop the Description column if it is present. errors='ignore' keeps this a
# no-op when the column is missing, and avoiding inplace=True makes the cell
# safe to re-run.
df = df.drop(columns=['Description'], errors='ignore')

def clean_greeting(text):
    """Strip encoding artifacts and shorten doctor-directed greetings.

    Removes every "Â" and "�" character, then rewrites "Hi Doctor" to "Hi"
    and "Hello Doctor" to "Hello". Greeting matches are case-insensitive and
    allow any amount of whitespace between the two words; the replacement is
    always the capitalised greeting.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # (pattern, replacement) pairs, applied in order.
    greeting_rules = (
        (r"\bHi\s+Doctor\b", "Hi"),
        (r"\bHello\s+Doctor\b", "Hello"),
    )

    # Drop the stray mojibake characters first.
    cleaned = re.sub(r"[Â�]", "", text)

    for pattern, replacement in greeting_rules:
        cleaned = re.sub(pattern, replacement, cleaned, flags=re.IGNORECASE)

    return cleaned

# Fail with a real exception rather than exit(1), so a notebook run stops with
# a visible traceback.
if 'Patient' not in df.columns:
    raise ValueError("Error: 'Patient' column not found.")

# Clean the patient text but leave missing values missing: a plain astype(str)
# would turn NaN into the literal string "nan" and write it out as if it were
# a real question.
patient = df['Patient']
df['Patient'] = patient.where(patient.isna(), patient.astype(str).apply(clean_greeting))

# Make sure the destination folder exists before writing.
output_dir = os.path.dirname(OUTPUT_CSV_PATH)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Write UTF-8 explicitly, matching the later cells that append to this file.
df.to_csv(OUTPUT_CSV_PATH, index=False, encoding='utf-8')
print(f"Cleaned data saved to '{OUTPUT_CSV_PATH}'")


Cleaned data saved to 'C:\Users\jayac\OneDrive\Desktop\MedicBot\MedicBot\Dataset\Data_Preprocessed.csv'


In [None]:
''' For the next dataset, we select the question and answer columns (short_question, short_answer) and append them to the preprocessed dataset. The remaining files are handled the same way.'''

import pandas as pd

# Paths to the shared preprocessed file and the raw dataset merged into it
OUTPUT_CSV_PATH = r"MedicBot\MedicBot\Dataset\Data_Preprocessed.csv"   # Output file
DATASET_CSV = r"MedicBot\MedicBot\raw_dataset\validation_data_chatbot.csv"  # Update this with actual path if needed

# Load main preprocessed dataset
df_main = pd.read_csv(OUTPUT_CSV_PATH)

# Load only the question/answer columns of the second dataset
df_new = pd.read_csv(DATASET_CSV, usecols=["short_question", "short_answer"])

# Rename by column NAME, not by position: read_csv returns the usecols columns
# in file order, so a positional rename would silently swap Patient and Doctor
# if the file stores short_answer before short_question.
df_new = df_new.rename(
    columns={"short_question": "Patient", "short_answer": "Doctor"}
)[["Patient", "Doctor"]]

# Append to the existing dataset.
# NOTE: this cell is not idempotent; re-running it appends the same rows again.
df_main = pd.concat([df_main, df_new], ignore_index=True)

# Save back to same file
df_main.to_csv(OUTPUT_CSV_PATH, index=False, encoding='utf-8')

print(f"Successfully appended {len(df_new)} rows to '{OUTPUT_CSV_PATH}'")


Successfully appended 11901 rows to 'C:\Users\jayac\OneDrive\Desktop\MedicBot\MedicBot\Dataset\Data_Preprocessed.csv'


In [None]:
''' The next dataset is a JSON file. For each intent we pick one random question from its "patterns" list
and pair it with the first answer from its "responses" list, then append the pair to the preprocessed dataset.'''

import json
import csv
import random
import pandas as pd

# Path to the existing preprocessed CSV. This must be the same file the other
# cells read and write (MedicBot\MedicBot\Dataset), otherwise the JSON rows end
# up in a different file.
OUTPUT_CSV_PATH = r"MedicBot\MedicBot\Dataset\Data_Preprocessed.csv"   # Output file

# Path to the JSON intents file
json_file = r"MedicBot\MedicBot\raw_dataset\intents.json"

# Load the existing CSV into a DataFrame
df = pd.read_csv(OUTPUT_CSV_PATH)

# Load the JSON dataset
with open(json_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

# A dedicated, seeded generator makes the "random" question selection
# reproducible across runs without touching the global random state.
rng = random.Random(42)

# One row per intent: a randomly chosen pattern paired with the first response.
# Intents with an empty patterns or responses list are skipped.
new_rows = [
    {
        "Patient": rng.choice(intent["patterns"]).strip(),
        "Doctor": intent["responses"][0].strip(),
    }
    for intent in data["intents"]
    if intent["patterns"] and intent["responses"]
]

# Convert to DataFrame and concatenate. Explicit columns keep the schema intact
# even when no intent qualified.
# NOTE: this cell is not idempotent; re-running it appends the rows again.
new_df = pd.DataFrame(new_rows, columns=["Patient", "Doctor"])
df = pd.concat([df, new_df], ignore_index=True)

# Save updated CSV (overwriting)
df.to_csv(OUTPUT_CSV_PATH, index=False, encoding='utf-8')
print("Data successfully appended from JSON to CSV.")


Data successfully appended from JSON to CSV.


In [None]:
import pandas as pd

# The preprocessing cells write this file as UTF-8, so it has to be read back
# as UTF-8: decoding it as ISO-8859-1 turns every non-ASCII character into
# mojibake (e.g. stray "Â"). encoding_errors='replace' keeps the read tolerant
# of any invalid bytes instead of raising.
csv_path = r"MedicBot\MedicBot\Dataset\Data_Preprocessed.csv"
df = pd.read_csv(csv_path, encoding='utf-8', encoding_errors='replace')

# Dataset summary
print("🔍 Dataset Overview")
print("-" * 40)
print(f"Shape (rows, columns): {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print("\nColumn Types:")
print(df.dtypes)

# Unique values in each column
print("\n🔢 Unique Value Counts:")
for col in df.columns:
    print(f"{col}: {df[col].nunique()} unique values")

# Show sample rows
print("\n🧾 First 5 Rows:")
print(df.head())

# Check for nulls
print("\n❓ Missing Values:")
print(df.isnull().sum())


🔍 Dataset Overview
----------------------------------------
Shape (rows, columns): (318464, 2)
Columns: ['Patient', 'Doctor']

Column Types:
Patient    object
Doctor     object
dtype: object

🔢 Unique Value Counts:
Patient: 269488 unique values
Doctor: 272191 unique values

🧾 First 5 Rows:
                                             Patient  \
0  Hi,I am just wondering what is abutting and ab...   
1  Hi, I am a 22-year-old female who was diagnose...   
2  Hi! I used to have clear skin but since I move...   
3  Hello,I am having an uncomfortable feeling in ...   
4  Hello,Before two years had sex with a call gir...   

                                              Doctor  
0  Hi. I have gone through your query with dilige...  
1  Hi. You have really done well with the hypothy...  
2  Hi there Acne has multifactorial etiology. Onl...  
3  Hello. The popping and discomfort what you fel...  
4  Hello. The HIV test uses a finger prick blood ...  

❓ Missing Values:
Patient    0
Doctor    

In [None]:
# Data Cleaning - Remove duplicates
import pandas as pd

# Read the file with the encoding it was written in (UTF-8). Reading it as
# ISO-8859-1 and then saving as UTF-8 below would permanently double-encode
# every non-ASCII character. encoding_errors='replace' keeps the read tolerant
# of any invalid bytes instead of raising.
csv_path = r"MedicBot\MedicBot\Dataset\Data_Preprocessed.csv"
df = pd.read_csv(csv_path, encoding='utf-8', encoding_errors='replace')

# Drop duplicate Doctor responses, keeping the first occurrence.
# Rows are compared on the Doctor column only, so different questions that
# share an identical answer collapse to the first such row.
df_unique = df.drop_duplicates(subset='Doctor', keep='first')

# Save the cleaned DataFrame back to the same file (overwrite)
df_unique.to_csv(csv_path, index=False, encoding='utf-8')

# Summary after cleaning
print("✅ Duplicates based on 'Doctor' column removed.")
print(f"Remaining rows: {df_unique.shape[0]}")
print(f"Columns: {df_unique.columns.tolist()}")


✅ Duplicates based on 'Doctor' column removed.
Remaining rows: 272191
Columns: ['Patient', 'Doctor']


In [1]:
#converting the dataset to LLaMA 3 format in a json file
import pandas as pd
import json
from tqdm import tqdm

# Input and output locations. The CSV path is the same relative path the
# preprocessing cells use, instead of a user-specific absolute path.
DATASET_CSV_PATH = r"MedicBot\MedicBot\Dataset\Data_Preprocessed.csv"
OUTPUT_JSON_PATH = 'medical_dataset_llama3_format.json'

# Load the preprocessed question/answer pairs
df = pd.read_csv(DATASET_CSV_PATH)

# Rows with a missing question or answer would otherwise be formatted as the
# literal text "nan" and end up as junk conversations.
df = df.dropna(subset=['Patient', 'Doctor'])

# Initialize conversation list
conversations = []

# System prompt shared by every conversation
system_prompt = {
    "role": "system",
    "content": "You are a helpful medical AI assistant specialized in providing accurate medical information and advice."
}

# Build one conversation per row. Zipping the two columns avoids the per-row
# Series construction that iterrows() performs.
# NOTE(review): the LLaMA 3 header/eot tokens are embedded in the message
# content while the messages also carry explicit roles; if a chat template is
# applied later, these markers may be duplicated. Confirm against the
# training code.
for question, response in tqdm(zip(df['Patient'], df['Doctor']), total=len(df)):
    conversation = {
        "messages": [
            system_prompt,
            {
                "role": "user",
                "content": f"<|start_header_id|>user<|end_header_id|>\n\n{question}<|eot_id|>"
            },
            {
                "role": "assistant",
                "content": f"<|start_header_id|>assistant<|end_header_id|>\n\n{response}<|eot_id|>"
            }
        ]
    }
    conversations.append(conversation)

# Save to JSON file. An explicit encoding keeps the output independent of the
# platform's default.
with open(OUTPUT_JSON_PATH, 'w', encoding='utf-8') as f:
    json.dump(conversations, f, indent=2)

# Report the file that was actually written (the old message named a different file).
print(f"Conversion complete! Saved {len(conversations)} conversations to {OUTPUT_JSON_PATH}")


100%|██████████| 272191/272191 [00:46<00:00, 5840.27it/s]


Conversion complete! Saved 272191 conversations to medical_chatbot_llama3_format.json
