In [6]:
import os
import pandas as pd

#### Read JSONL embedding data

In [73]:
# Input location, resolved against the current working directory.
folder_name = "alldata/note"
file_name = "semantic_OPENAI_summaries.jsonl"
file_path = os.path.join(os.getcwd(), folder_name, file_name)

# One JSON record per line; the raw frame still carries the nested response payload.
df_jsonl = pd.read_json(file_path, lines=True)

# Pull the first embedding vector out of each nested response into its own column.
df_jsonl["embedding"] = df_jsonl["response"].apply(
    lambda payload: payload["body"]["data"][0]["embedding"]
)

# Spread each vector over one column per dimension (0, 1, 2, ...) so the
# result can be written to CSV as plain numeric columns.
df_emb_expanded = pd.DataFrame(df_jsonl["embedding"].tolist())

# Put the expanded columns next to the raw columns, then tidy up in one chain:
#   1. custom_id becomes note_id
#   2. the "embedding-original-" prefix is stripped from note_id
#   3. the raw columns that are no longer needed are dropped
df_merged = (
    pd.concat([df_jsonl, df_emb_expanded], axis="columns")
    .rename(columns={"custom_id": "note_id"})
    .assign(
        note_id=lambda frame: frame["note_id"].str.replace(
            "embedding-original-", "", regex=False
        )
    )
    .drop(columns=["id", "response", "error", "embedding"])
)

In [74]:
# Preview the first row: note_id followed by one column per embedding dimension.
df_merged.head(1)

Unnamed: 0,note_id,0,1,2,3,4,5,6,7,8,...,1526,1527,1528,1529,1530,1531,1532,1533,1534,1535
0,10202247-DS-15,-0.000184,-0.006476,0.034641,-0.023113,-0.025192,0.027593,-0.018605,-0.008514,-0.047314,...,0.015311,-0.005122,0.030984,-0.014487,0.006891,-0.018339,-0.048486,0.001417,-0.010921,-0.03492


#### Read original data

In [75]:
#read
# Read the original discharge table from the same folder as the embeddings.
# Later cells use its note_id, subject_id and charttime columns.
# NOTE: file_name and file_path are reused here and now point at discharge.csv.
file_name = "discharge.csv"
file_path = os.path.join(os.getcwd(), folder_name, file_name)
df = pd.read_csv(file_path)

In [76]:
#create Y label
# Build the binary label Y.
# Parse charttime as datetimes and order rows chronologically within each subject.
df = (
    df.assign(charttime=pd.to_datetime(df['charttime']))
      .sort_values(by=['subject_id', 'charttime'])
)

# Y = 1 when the same subject has a following row with a non-null charttime,
# 0 for the subject's last row.
df['Y'] = (
    df.groupby('subject_id')['charttime']
      .shift(-1)
      .notna()
      .astype(int)
)

#### Merge the two on note_id to get Y labels

In [77]:
# Attach the Y label to every embedding row.
# The left join keeps all embedding rows even when no label is found (Y is NaN then).
# validate="many_to_one" raises pandas.errors.MergeError if note_id is duplicated
# in the label table, instead of silently duplicating embedding rows.
df_merged_final = df_merged.merge(
    df[["note_id", "Y"]],
    on="note_id",
    how="left",
    validate="many_to_one",
)

In [78]:
# Show every row with at least one missing value (e.g. a note_id that found
# no label in the left join). An empty result means nothing is missing.
df_merged_final.loc[df_merged_final.isna().any(axis="columns")]

Unnamed: 0,note_id,0,1,2,3,4,5,6,7,8,...,1527,1528,1529,1530,1531,1532,1533,1534,1535,Y


In [79]:
# Remove the identifier so only the embedding columns and Y remain.
# Reassigning (rather than inplace=True) together with errors="ignore" keeps this
# cell idempotent: re-running it after note_id is already gone is a no-op
# instead of a KeyError.
df_merged_final = df_merged_final.drop(columns=["note_id"], errors="ignore")

In [80]:
# Preview the first row: embedding columns followed by the Y label.
df_merged_final.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1527,1528,1529,1530,1531,1532,1533,1534,1535,Y
0,-0.000184,-0.006476,0.034641,-0.023113,-0.025192,0.027593,-0.018605,-0.008514,-0.047314,-0.016581,...,-0.005122,0.030984,-0.014487,0.006891,-0.018339,-0.048486,0.001417,-0.010921,-0.03492,1


In [81]:
#Define the output file path
# Destination CSV, written next to the input files.
output_file_path = os.path.join(
    os.getcwd(),
    folder_name,
    "OPENAI_merged_500_summary_embedding.csv",
)

# Write embeddings + label; the row index is not written as a column.
df_merged_final.to_csv(path_or_buf=output_file_path, index=False)

#### Check that the CSV can be read back in and contains no null values

In [82]:
# Read the CSV that was just written so the following cells can sanity-check it.
df_jsonl_check = pd.read_csv(output_file_path)

In [83]:
# Preview the first row of the reloaded file to compare against the frame that was saved.
df_jsonl_check.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1527,1528,1529,1530,1531,1532,1533,1534,1535,Y
0,-0.000184,-0.006476,0.034641,-0.023113,-0.025192,0.027593,-0.018605,-0.008514,-0.047314,-0.016581,...,-0.005122,0.030984,-0.014487,0.006891,-0.018339,-0.048486,0.001417,-0.010921,-0.03492,1


In [84]:
# Count missing values per column of the reloaded CSV.
# NOTE(review): the displayed Series is truncated (only the first and last few
# columns are shown), so this output alone does not show every column's count.
df_jsonl_check.isna().sum()

0       0
1       0
2       0
3       0
4       0
       ..
1532    0
1533    0
1534    0
1535    0
Y       0
Length: 1537, dtype: int64

In [85]:
# Print how many rows carry each Y label value in the reloaded CSV.
print(df_jsonl_check['Y'].value_counts())

Y
1    268
0    232
Name: count, dtype: int64
