In [1]:
import ast
import json

import pandas as pd

In [4]:
# Load the merged records; lines=True parses each line of the file as one JSON object.
df = pd.read_json(path_or_buf="./merged.jsonl", lines=True)

In [2]:
def convert_df_to_nested_structure(df):
    """
    Convert DataFrame to the desired nested JSON structure.

    Rows are grouped by the 'text' column. Each group becomes one entry
    ``{"text": ..., "label": ..., "symptom_phrases": [...]}`` where
    ``symptom_phrases`` holds one dict per row of the group and ``label`` is
    taken from the first row of the group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'text', 'label', 'phrase', 'symptom', 'analysis',
        'symptom_phrase_label' and 'usefulness_triplets_docs', plus the two
        retrieval columns under either the '*_rag_results' or the
        '*_rag_docs' name ('*_rag_results' wins when both exist).

    Returns
    -------
    list of dict
        One entry per distinct 'text' value.

    Raises
    ------
    KeyError
        If a required column is missing from ``df``.

    Notes
    -----
    ``groupby`` is called with its defaults, so entries come back in sorted
    key order and rows whose 'text' is missing are not included.
    Requires ``ast`` and ``json`` to be imported at the top of the notebook.
    """

    def safe_parse_json(value):
        """Best-effort decode of a cell that may hold a stringified literal or JSON."""
        # Already-structured values pass through untouched.
        if isinstance(value, (list, dict)):
            return value
        if value is None:
            return None
        # pd.isna is only a yes/no answer for scalars; on tuples/arrays it
        # returns an array whose truth value is ambiguous.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return None
        if not isinstance(value, str):
            return value
        if value == '':
            return None
        # Python-literal syntax first (single quotes, None/True/False), then JSON.
        # Only parse failures are caught: a bare `except` here used to hide the
        # NameError from missing imports and silently returned the raw string.
        try:
            return ast.literal_eval(value)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            pass
        try:
            return json.loads(value)
        except (ValueError, RecursionError):
            # Not parseable either way: keep the original text.
            return value

    def first_available(row, *candidates):
        """Return the value of the first candidate column present in ``row``."""
        for name in candidates:
            if name in row.index:
                return row[name]
        raise KeyError(candidates[0])

    def text_or_blank(row, column):
        """Return the cell, or "" when it is missing."""
        cell = row[column]
        return cell if pd.notna(cell) else ""

    result = []

    for text, group in df.groupby('text'):
        symptom_phrases = [
            {
                "phrase": text_or_blank(row, 'phrase'),
                "symptom": text_or_blank(row, 'symptom'),
                "analysis": text_or_blank(row, 'analysis'),
                "symptom_phrase_label": safe_parse_json(row['symptom_phrase_label']),
                # The merged file exposes these as '*_rag_docs'; accept both spellings
                # while keeping the output keys stable.
                "faiss_rag_results": safe_parse_json(
                    first_available(row, 'faiss_rag_results', 'faiss_rag_docs')
                ),
                "bm25_rag_results": safe_parse_json(
                    first_available(row, 'bm25_rag_results', 'bm25_rag_docs')
                ),
                "usefulness_triplets_docs": safe_parse_json(row['usefulness_triplets_docs']),
            }
            for _, row in group.iterrows()
        ]

        result.append({
            "text": text,
            # Assumes the label is the same for all rows sharing a text.
            "label": group['label'].iloc[0],
            "symptom_phrases": symptom_phrases,
        })

    return result

In [3]:
def convert_df_to_flat_structure(df):
    """
    Alternative: Convert to structure where each row becomes a separate entry
    with single symptom_phrase instead of grouping.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'text', 'label', 'phrase', 'symptom', 'analysis',
        'symptom_phrase_label' and 'usefulness_triplets_docs', plus the two
        retrieval columns under either the '*_rag_results' or the
        '*_rag_docs' name ('*_rag_results' wins when both exist).

    Returns
    -------
    list of dict
        One ``{"text", "label", "symptom_phrases"}`` entry per row, in row
        order. Here ``symptom_phrases`` is a single dict, not a list.
        Missing 'text', 'label', 'phrase', 'symptom' and 'analysis' cells
        become "".

    Raises
    ------
    KeyError
        If a required column is missing from ``df``.

    Notes
    -----
    Requires ``ast`` and ``json`` to be imported at the top of the notebook.
    """

    def safe_parse_json(value):
        """Best-effort decode of a cell that may hold a stringified literal or JSON."""
        # Already-structured values pass through untouched.
        if isinstance(value, (list, dict)):
            return value
        if value is None:
            return None
        # pd.isna is only a yes/no answer for scalars; on tuples/arrays it
        # returns an array whose truth value is ambiguous.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return None
        if not isinstance(value, str):
            return value
        if value == '':
            return None
        # Python-literal syntax first (single quotes, None/True/False), then JSON.
        # Only parse failures are caught: a bare `except` here used to hide the
        # NameError from missing imports and silently returned the raw string.
        try:
            return ast.literal_eval(value)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            pass
        try:
            return json.loads(value)
        except (ValueError, RecursionError):
            # Not parseable either way: keep the original text.
            return value

    def first_available(row, *candidates):
        """Return the value of the first candidate column present in ``row``."""
        for name in candidates:
            if name in row.index:
                return row[name]
        raise KeyError(candidates[0])

    def text_or_blank(row, column):
        """Return the cell, or "" when it is missing."""
        cell = row[column]
        return cell if pd.notna(cell) else ""

    return [
        {
            "text": text_or_blank(row, 'text'),
            "label": text_or_blank(row, 'label'),
            "symptom_phrases": {
                "phrase": text_or_blank(row, 'phrase'),
                "symptom": text_or_blank(row, 'symptom'),
                "analysis": text_or_blank(row, 'analysis'),
                "symptom_phrase_label": safe_parse_json(row['symptom_phrase_label']),
                # The merged file exposes these as '*_rag_docs'; accept both spellings
                # while keeping the output keys stable.
                "faiss_rag_results": safe_parse_json(
                    first_available(row, 'faiss_rag_results', 'faiss_rag_docs')
                ),
                "bm25_rag_results": safe_parse_json(
                    first_available(row, 'bm25_rag_results', 'bm25_rag_docs')
                ),
                "usefulness_triplets_docs": safe_parse_json(row['usefulness_triplets_docs']),
            },
        }
        for _, row in df.iterrows()
    ]

In [6]:
def agg_func(x):
    """Aggregate one group's column by collecting its values via ``x.tolist()``."""
    collected = x.tolist()
    return collected

# Preview: one row per distinct text, every other column collapsed by agg_func.
(
    df
    .groupby('text')
    .agg(agg_func)
    .reset_index()
    .head()
)

Unnamed: 0,text,label,phrase,symptom,analysis,faiss_rag_docs,bm25_rag_docs,usefulness_triplets_docs,symptom_phrase_label
0,18m 57 smoker 154 pounds i dont know if this s...,"[Anxiety, Anxiety, Anxiety, Anxiety, Anxiety]","[i deal with anxiety, i deal with anxiety, int...","[Generalized anxiety, Generalized anxiety, Hea...",[The patient acknowledges experiencing anxiety...,"[[{'content': 'arise. Second, the worries asso...","[[{'content': 'disorder are more pervasive, pr...",[[{'triplet': '(Head: 'Generalized Anxiety Dis...,"[{'phrase': 'i deal with anxiety', 'label': 'A..."
1,aphobes always talk about finding the right pe...,"[Normal, Normal, Normal, Normal, Normal, Normal]",[its sad that aphobes dont know these people e...,"[Feeling marginalized, Desire for validation, ...",[The patient feels a sense of sadness and frus...,[[{'content': 'failing to meet the cultural an...,[[{'content': 'are critical and dichotomous. ...,"[[{'triplet': '(Head: 'Sex Discrimination', De...",[{'phrase': 'its sad that aphobes dont know th...
2,blazblue chronophantasma extend CUR2 deus ex m...,"[Normal, Normal, Normal, Normal, Normal, Norma...","[I feel disconnected from reality., I spend ev...","[Depersonalization, Chronic loneliness and anx...",[The patient expresses a feeling of detachment...,[[{'content': 'the experience. Derealization i...,[[{'content': 'disorders and sexual dysfunctio...,"[[{'triplet': '(Head: 'Depersonalization', Def...",[{'phrase': 'I feel disconnected from reality....
3,"dear e, im sorry that you cant see me since il...","[Normal, Normal, Normal, Normal, Normal, Norma...","[i want to spend my life waking up to you..., ...","[Longing for connection, Fear of loss, Fear of...",[The desire to share life with someone indicat...,[[{'content': 'SE Axis Symptom Patterns in t...,[[{'content': 'Subjective Experience—SE Axis ...,[[{'triplet': '(Head: 'Someone to Have a Good ...,[{'phrase': 'i want to spend my life waking up...
4,edit 2 i am done! i think i actually managed t...,"[Normal, Normal, Normal, Normal, Normal, Norma...",[please wait outside store when done and ill e...,"[Social anxiety, Shock or surprise, Financial ...",[This phrase suggests an awareness of social d...,[[{'content': 'of social situations because of...,[[{'content': 'individuals with social anxiety...,"[[{'triplet': '(Head: 'Social Anxiety', Defini...",[{'phrase': 'please wait outside store when do...


In [9]:
# Count the rows whose symptom_phrase_label is missing.
df.loc[:, 'symptom_phrase_label'].isna().sum()

np.int64(279)