In [1]:
import pandas as pd
from pathlib import Path

In [2]:
csv_paths = [file for file in Path("./train").glob("*.csv")]
csv_paths

[PosixPath('train/angst-silver_train.csv'),
 PosixPath('train/mhqa-b-subset_train.csv'),
 PosixPath('train/medmcqa_train.csv'),
 PosixPath('train/mhqa-b-all-labels.csv')]

# ANGST DATASET CLEANING

In [3]:
idx = 0
csv_path = csv_paths[idx]
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0.1,Unnamed: 0,text,depression_label,anxiety_label,multilabel_clf_label,multiclass_clf_label,id
0,0,"i woke up very early, 2 am. i just came out of...",1,0,"[1, 0]",Depression,4JEVyZ
1,1,ive been trying my damndest to improve myself ...,1,0,"[1, 0]",Depression,3jkFBr
2,2,he lives on the other side of the country and ...,1,0,"[1, 0]",Depression,4DZQ7p
3,3,i work in retail and lately ive had some shit ...,0,1,"[0, 1]",Anxiety,FSr2Vx
4,4,"i had my first one in november 2020, another 2...",0,1,"[0, 1]",Anxiety,42FfcW


In [4]:
df.drop(columns=['depression_label', 'anxiety_label', 'multilabel_clf_label', 'Unnamed: 0'], inplace=True)
df.head()

Unnamed: 0,text,multiclass_clf_label,id
0,"i woke up very early, 2 am. i just came out of...",Depression,4JEVyZ
1,ive been trying my damndest to improve myself ...,Depression,3jkFBr
2,he lives on the other side of the country and ...,Depression,4DZQ7p
3,i work in retail and lately ive had some shit ...,Anxiety,FSr2Vx
4,"i had my first one in november 2020, another 2...",Anxiety,42FfcW


In [5]:
df['multiclass_clf_label'].value_counts()

multiclass_clf_label
Normal                             3349
Depression                         2705
Anxiety                            1048
Comorbid (Depression + Anxiety)     565
Name: count, dtype: int64

In [6]:
df.loc[:, 'option1'] = "Depression"
df.loc[:, 'option2'] = "Anxiety"
df.loc[:, 'option3'] = "Comorbid (Depression + Anxiety)"
df.loc[:, 'option4'] = "Normal"

In [9]:
df.rename(columns={'multiclass_clf_label': 'answer'}, inplace=True)
df.head()

Unnamed: 0,text,answer,id,option1,option2,option3,option4
0,"i woke up very early, 2 am. i just came out of...",Depression,4JEVyZ,Depression,Anxiety,Comorbid (Depression + Anxiety),Normal
1,ive been trying my damndest to improve myself ...,Depression,3jkFBr,Depression,Anxiety,Comorbid (Depression + Anxiety),Normal
2,he lives on the other side of the country and ...,Depression,4DZQ7p,Depression,Anxiety,Comorbid (Depression + Anxiety),Normal
3,i work in retail and lately ive had some shit ...,Anxiety,FSr2Vx,Depression,Anxiety,Comorbid (Depression + Anxiety),Normal
4,"i had my first one in november 2020, another 2...",Anxiety,42FfcW,Depression,Anxiety,Comorbid (Depression + Anxiety),Normal


In [10]:
def correct_option_number_func(row):
    options = [row['option1'], row['option2'], row['option3'], row['option4']]
    return options.index(row['answer']) + 1

df['correct_option_number'] = df.apply(correct_option_number_func, axis=1)
df.head()

Unnamed: 0,text,answer,id,option1,option2,option3,option4,correct_option_number
0,"i woke up very early, 2 am. i just came out of...",Depression,4JEVyZ,Depression,Anxiety,Comorbid (Depression + Anxiety),Normal,1
1,ive been trying my damndest to improve myself ...,Depression,3jkFBr,Depression,Anxiety,Comorbid (Depression + Anxiety),Normal,1
2,he lives on the other side of the country and ...,Depression,4DZQ7p,Depression,Anxiety,Comorbid (Depression + Anxiety),Normal,1
3,i work in retail and lately ive had some shit ...,Anxiety,FSr2Vx,Depression,Anxiety,Comorbid (Depression + Anxiety),Normal,2
4,"i had my first one in november 2020, another 2...",Anxiety,42FfcW,Depression,Anxiety,Comorbid (Depression + Anxiety),Normal,2


In [11]:
df['question'] = "Classify whether the given text is Depression, Anxiety or Comorbid (Depression + Anxiety) or Normal\n\n" + df['text']     

In [12]:
df.head()

Unnamed: 0,text,answer,id,option1,option2,option3,option4,correct_option_number,question
0,"i woke up very early, 2 am. i just came out of...",Depression,4JEVyZ,Depression,Anxiety,Comorbid (Depression + Anxiety),Normal,1,"Classify whether the given text is Depression,..."
1,ive been trying my damndest to improve myself ...,Depression,3jkFBr,Depression,Anxiety,Comorbid (Depression + Anxiety),Normal,1,"Classify whether the given text is Depression,..."
2,he lives on the other side of the country and ...,Depression,4DZQ7p,Depression,Anxiety,Comorbid (Depression + Anxiety),Normal,1,"Classify whether the given text is Depression,..."
3,i work in retail and lately ive had some shit ...,Anxiety,FSr2Vx,Depression,Anxiety,Comorbid (Depression + Anxiety),Normal,2,"Classify whether the given text is Depression,..."
4,"i had my first one in november 2020, another 2...",Anxiety,42FfcW,Depression,Anxiety,Comorbid (Depression + Anxiety),Normal,2,"Classify whether the given text is Depression,..."


In [16]:
df = df[['id', 'question', 'option1', 'option2', 'option3', 'option4', 'answer', 'correct_option_number']]
df.head()

Unnamed: 0,id,question,option1,option2,option3,option4,answer,correct_option_number
0,4JEVyZ,"Classify whether the given text is Depression,...",Depression,Anxiety,Comorbid (Depression + Anxiety),Normal,Depression,1
1,3jkFBr,"Classify whether the given text is Depression,...",Depression,Anxiety,Comorbid (Depression + Anxiety),Normal,Depression,1
2,4DZQ7p,"Classify whether the given text is Depression,...",Depression,Anxiety,Comorbid (Depression + Anxiety),Normal,Depression,1
3,FSr2Vx,"Classify whether the given text is Depression,...",Depression,Anxiety,Comorbid (Depression + Anxiety),Normal,Anxiety,2
4,42FfcW,"Classify whether the given text is Depression,...",Depression,Anxiety,Comorbid (Depression + Anxiety),Normal,Anxiety,2


In [18]:
df.to_csv(csv_path.parent / f"{csv_path.stem}_cleaned.csv", index=False)

# MEDMCQA DATASET CLEANING

In [30]:
csv_path = csv_paths[2]
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,id,question,opa,opb,opc,opd,cop,choice_type,exp,subject_name,topic_name
0,53f79833-21b0-4336-8ef4-404c687ec807,Hypomimia is ?,Decreased ability to copy,Decreased execution,Deficit of expression by gesture,Deficit of fluent speech,2,single,Ans. C. Deficit of expression by gestureHypomi...,Psychiatry,
1,b72c9d20-5b2a-4353-ab60-fa1a899fde6b,Symptomatic treatment is not required in withd...,Cannabis,Morphine,Alcohol,Cocaine,0,single,Since cannabis causes very mild withdrawal sym...,Psychiatry,Substance Related and Addictive Disorders
2,8ae09b49-665e-45e3-ac3d-869ff1a96a44,"""Castration anxiety"" is seen in which phase of...",Oral,Anal,Phallic,Genital,2,multi,Phallic phase (3-5years): Male child develops ...,Psychiatry,
3,4eafd9d8-b2ea-4f44-ac7f-a12103f61945,"In Profound MR, IQ is",50-69,35-49,20-34,< 20,3,single,IQ range for categoriesICD-10DSM-IVMild50-6950...,Psychiatry,Child psychiatry
4,77e67056-21d4-4dad-ba62-f785e6ce4e38,Postponing paying attention of conscious impul...,Sublimation,Suppression,Humor,Anticipation,1,single,Suppression is pushing of unwanted feelings in...,Psychiatry,


In [31]:
df.drop(columns=['choice_type'], inplace=True)
df.head()

Unnamed: 0,id,question,opa,opb,opc,opd,cop,exp,subject_name,topic_name
0,53f79833-21b0-4336-8ef4-404c687ec807,Hypomimia is ?,Decreased ability to copy,Decreased execution,Deficit of expression by gesture,Deficit of fluent speech,2,Ans. C. Deficit of expression by gestureHypomi...,Psychiatry,
1,b72c9d20-5b2a-4353-ab60-fa1a899fde6b,Symptomatic treatment is not required in withd...,Cannabis,Morphine,Alcohol,Cocaine,0,Since cannabis causes very mild withdrawal sym...,Psychiatry,Substance Related and Addictive Disorders
2,8ae09b49-665e-45e3-ac3d-869ff1a96a44,"""Castration anxiety"" is seen in which phase of...",Oral,Anal,Phallic,Genital,2,Phallic phase (3-5years): Male child develops ...,Psychiatry,
3,4eafd9d8-b2ea-4f44-ac7f-a12103f61945,"In Profound MR, IQ is",50-69,35-49,20-34,< 20,3,IQ range for categoriesICD-10DSM-IVMild50-6950...,Psychiatry,Child psychiatry
4,77e67056-21d4-4dad-ba62-f785e6ce4e38,Postponing paying attention of conscious impul...,Sublimation,Suppression,Humor,Anticipation,1,Suppression is pushing of unwanted feelings in...,Psychiatry,


In [32]:
df.rename(columns={"exp": "explanation", "opa": "option1", "opb": "option2", "opc": "option3", "opd": "option4", "cop": "correct_option_number", "subject_name": "subject", "topic_name": "topic"}, inplace=True)
df.head()

Unnamed: 0,id,question,option1,option2,option3,option4,correct_option_number,explanation,subject,topic
0,53f79833-21b0-4336-8ef4-404c687ec807,Hypomimia is ?,Decreased ability to copy,Decreased execution,Deficit of expression by gesture,Deficit of fluent speech,2,Ans. C. Deficit of expression by gestureHypomi...,Psychiatry,
1,b72c9d20-5b2a-4353-ab60-fa1a899fde6b,Symptomatic treatment is not required in withd...,Cannabis,Morphine,Alcohol,Cocaine,0,Since cannabis causes very mild withdrawal sym...,Psychiatry,Substance Related and Addictive Disorders
2,8ae09b49-665e-45e3-ac3d-869ff1a96a44,"""Castration anxiety"" is seen in which phase of...",Oral,Anal,Phallic,Genital,2,Phallic phase (3-5years): Male child develops ...,Psychiatry,
3,4eafd9d8-b2ea-4f44-ac7f-a12103f61945,"In Profound MR, IQ is",50-69,35-49,20-34,< 20,3,IQ range for categoriesICD-10DSM-IVMild50-6950...,Psychiatry,Child psychiatry
4,77e67056-21d4-4dad-ba62-f785e6ce4e38,Postponing paying attention of conscious impul...,Sublimation,Suppression,Humor,Anticipation,1,Suppression is pushing of unwanted feelings in...,Psychiatry,


In [35]:
df['correct_option_number'] = df['correct_option_number'].apply(lambda x: x+1)
df.head()

Unnamed: 0,id,question,option1,option2,option3,option4,correct_option_number,explanation,subject,topic
0,53f79833-21b0-4336-8ef4-404c687ec807,Hypomimia is ?,Decreased ability to copy,Decreased execution,Deficit of expression by gesture,Deficit of fluent speech,3,Ans. C. Deficit of expression by gestureHypomi...,Psychiatry,
1,b72c9d20-5b2a-4353-ab60-fa1a899fde6b,Symptomatic treatment is not required in withd...,Cannabis,Morphine,Alcohol,Cocaine,1,Since cannabis causes very mild withdrawal sym...,Psychiatry,Substance Related and Addictive Disorders
2,8ae09b49-665e-45e3-ac3d-869ff1a96a44,"""Castration anxiety"" is seen in which phase of...",Oral,Anal,Phallic,Genital,3,Phallic phase (3-5years): Male child develops ...,Psychiatry,
3,4eafd9d8-b2ea-4f44-ac7f-a12103f61945,"In Profound MR, IQ is",50-69,35-49,20-34,< 20,4,IQ range for categoriesICD-10DSM-IVMild50-6950...,Psychiatry,Child psychiatry
4,77e67056-21d4-4dad-ba62-f785e6ce4e38,Postponing paying attention of conscious impul...,Sublimation,Suppression,Humor,Anticipation,2,Suppression is pushing of unwanted feelings in...,Psychiatry,


In [36]:
df['answer'] = df.apply(lambda x: x['option' + str(x['correct_option_number'])], axis=1)
df.head()

Unnamed: 0,id,question,option1,option2,option3,option4,correct_option_number,explanation,subject,topic,answer
0,53f79833-21b0-4336-8ef4-404c687ec807,Hypomimia is ?,Decreased ability to copy,Decreased execution,Deficit of expression by gesture,Deficit of fluent speech,3,Ans. C. Deficit of expression by gestureHypomi...,Psychiatry,,Deficit of expression by gesture
1,b72c9d20-5b2a-4353-ab60-fa1a899fde6b,Symptomatic treatment is not required in withd...,Cannabis,Morphine,Alcohol,Cocaine,1,Since cannabis causes very mild withdrawal sym...,Psychiatry,Substance Related and Addictive Disorders,Cannabis
2,8ae09b49-665e-45e3-ac3d-869ff1a96a44,"""Castration anxiety"" is seen in which phase of...",Oral,Anal,Phallic,Genital,3,Phallic phase (3-5years): Male child develops ...,Psychiatry,,Phallic
3,4eafd9d8-b2ea-4f44-ac7f-a12103f61945,"In Profound MR, IQ is",50-69,35-49,20-34,< 20,4,IQ range for categoriesICD-10DSM-IVMild50-6950...,Psychiatry,Child psychiatry,< 20
4,77e67056-21d4-4dad-ba62-f785e6ce4e38,Postponing paying attention of conscious impul...,Sublimation,Suppression,Humor,Anticipation,2,Suppression is pushing of unwanted feelings in...,Psychiatry,,Suppression


In [37]:
df.to_csv(csv_path.parent / f"{csv_path.stem}_cleaned.csv", index=False)

# CLEAN MHQA DATASET

In [39]:
df = pd.read_csv("/home/sracha/proper_kg_project/base_llm_train_eval/dataset/train/mhqa-b-all-labels.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49734 entries, 0 to 49733
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     49734 non-null  int64  
 1   topic                  49734 non-null  object 
 2   type                   49734 non-null  object 
 3   question               49734 non-null  object 
 4   option1                49734 non-null  object 
 5   option2                49734 non-null  object 
 6   option3                49734 non-null  object 
 7   option4                49734 non-null  object 
 8   correct_option         0 non-null      float64
 9   correct_option_number  49734 non-null  float64
 10  correct_answer         49734 non-null  object 
 11  qid                    49734 non-null  int64  
 12  output_id              49734 non-null  int64  
 13  output_question        49734 non-null  object 
 14  output_decision        49734 non-null  object 
 15  ou

In [42]:
df.drop(columns=['correct_option', 'output_id', 'output_question', 'output_decision', 'output_explanation', "qid"], inplace=True, errors='ignore')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49734 entries, 0 to 49733
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     49734 non-null  int64  
 1   topic                  49734 non-null  object 
 2   type                   49734 non-null  object 
 3   question               49734 non-null  object 
 4   option1                49734 non-null  object 
 5   option2                49734 non-null  object 
 6   option3                49734 non-null  object 
 7   option4                49734 non-null  object 
 8   correct_option_number  49734 non-null  float64
 9   correct_answer         49734 non-null  object 
dtypes: float64(1), int64(1), object(8)
memory usage: 3.8+ MB


In [43]:
df.rename(columns={"correct_answer": "answer"}, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49734 entries, 0 to 49733
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     49734 non-null  int64  
 1   topic                  49734 non-null  object 
 2   type                   49734 non-null  object 
 3   question               49734 non-null  object 
 4   option1                49734 non-null  object 
 5   option2                49734 non-null  object 
 6   option3                49734 non-null  object 
 7   option4                49734 non-null  object 
 8   correct_option_number  49734 non-null  float64
 9   answer                 49734 non-null  object 
dtypes: float64(1), int64(1), object(8)
memory usage: 3.8+ MB


In [45]:
df.to_csv("./train/mhqa-b-all-labels_cleaned.csv", index=False)