# Data Transformation

## Imports 

In [16]:
import numpy as np
import pandas as pd
import json
import os
import nltk
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import BertTokenizer, BertModel
import torch

## Load Data

In [17]:
# List of guideline files
guideline_files = [
    'guideline_usp.jsonl',
    'guideline_rch.jsonl',
    'guideline_nice.jsonl',
    'guideline_mayo.jsonl',
    'guideline_idsa.jsonl',
    'guideline_gc.jsonl',
    'guideline_cma.jsonl',
    'guideline_cdc_diseases.jsonl',
    'guideline_aafp.jsonl'
]

dfs = []
# Data Path
DATA_PATH = '../data/'
PATIENTS_FOLDER = DATA_PATH+'structured_patients/'

# Loop through each guideline file and load into a DataFrame
for file in guideline_files:
    path = os.path.join(PATIENTS_FOLDER, file)
    df_temp = pd.read_json(path, lines=True)
    dfs.append(df_temp)

# Concatenate all DataFrames into a single DataFrame
df = pd.concat(dfs, ignore_index=True)

display(df.head())

Unnamed: 0,condition_name,guideline,patient_nb,structure
0,Iron Deficiency Anemia in Young Children,guideline_usp.jsonl_82.json,1,"{\n ""symptoms"": [\n {\n ""..."
1,Iron Deficiency Anemia in Young Children,guideline_usp.jsonl_82.json,2,"{\n ""symptoms"": [\n {\n ""..."
2,Iron Deficiency Anemia in Young Children,guideline_usp.jsonl_82.json,3,"{\n ""symptoms"": [\n {\n ""..."
3,Latent Tuberculosis Infection in Adults,guideline_usp.jsonl_5.json,1,"{\n ""symptoms"": [\n {\n ""..."
4,Latent Tuberculosis Infection in Adults,guideline_usp.jsonl_5.json,2,"{\n ""symptoms"": [\n {\n ""..."


## Preprocessing

In [18]:
def convert_to_dict(x):
    '''Convert the structured_patient column from string to dictionary'''
    try:
        return json.loads(x)
    except (json.JSONDecodeError, TypeError):
        return {}

# Handle data type
df['structure'] = df['structure'].apply(convert_to_dict)

# Dataframe of patients symptoms
symptoms_df = pd.json_normalize(df['structure'].apply(lambda x: x.get('symptoms', [])))
symptom_names_df = symptoms_df.applymap(lambda x: x.get('name of the symptom') if x is not None else None)

In [4]:
print(df)

                                     condition_name  \
0          Iron Deficiency Anemia in Young Children   
1          Iron Deficiency Anemia in Young Children   
2          Iron Deficiency Anemia in Young Children   
3           Latent Tuberculosis Infection in Adults   
4           Latent Tuberculosis Infection in Adults   
...                                             ...   
3938                                   Hypertension   
3939                                   Hypertension   
3940  Chronic pain, Acute pain, Opioid use disorder   
3941  Chronic pain, Acute pain, Opioid use disorder   
3942  Chronic pain, Acute pain, Opioid use disorder   

                         guideline  patient_nb  \
0      guideline_usp.jsonl_82.json           1   
1      guideline_usp.jsonl_82.json           2   
2      guideline_usp.jsonl_82.json           3   
3       guideline_usp.jsonl_5.json           1   
4       guideline_usp.jsonl_5.json           2   
...                            ...     

In [5]:
symptom_names_df[0:7]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,iron deficiency anemia,,,,,,,,,,,,,,,,,
1,irritability,fatigue,,,,,,,,,,,,,,,,
2,abdominal pain,palpitations,,,,,,,,,,,,,,,,
3,positive TB skin test reaction,,,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,,,
5,none,,,,,,,,,,,,,,,,,
6,severe headache,changes in vision,swelling,,,,,,,,,,,,,,,


In [19]:
# Replace rows with 'None' values with explicit 'None'
symptom_names_df = symptom_names_df.apply(lambda row: None if row[0] in ['None', 'none'] else row, axis=1)

In [20]:
# Remove Nan values
no_symptoms = symptom_names_df.index[symptom_names_df[0].isna()].tolist()
symptom_names_df = symptom_names_df.drop(no_symptoms)
df['condition_name'] = df['condition_name'].drop(no_symptoms)

In [21]:
# Replace all empty elements of symptom_names_df with None
symptom_names_df = symptom_names_df.applymap(lambda x: None if x == '' else x)

# Remove rows with only None values
symptom_names_df = symptom_names_df.dropna(how='all')

In [9]:
symptom_names_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,iron deficiency anemia,,,,,,,,,,,,,,,,,
1,irritability,fatigue,,,,,,,,,,,,,,,,
2,abdominal pain,palpitations,,,,,,,,,,,,,,,,
3,positive TB skin test reaction,,,,,,,,,,,,,,,,,
6,severe headache,changes in vision,swelling,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3937,headaches,dizziness,feeling of pulsations in neck,elevated blood pressure,,,,,,,,,,,,,,
3938,palpitations,chest discomfort,,,,,,,,,,,,,,,,
3940,chronic pain,functional impairment,opioid-related harms,,,,,,,,,,,,,,,
3941,severe pain,gastrointestinal issues,cognitive impairment,possible overdose,,,,,,,,,,,,,,


In [22]:
X = symptom_names_df.copy()
X = X.reset_index(drop=True)

y = df['condition_name'].loc[symptom_names_df.index].copy()
y = y.reset_index(drop=True)

In [23]:
# Get the indices where y is empty
empty_y = y.index[y == ''].tolist()

# Drop theses indices from X and y
X = X.drop(empty_y).reset_index(drop=True)
y = y.drop(empty_y).reset_index(drop=True)

In [24]:
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,iron deficiency anemia,,,,,,,,,,,,,,,,,
1,irritability,fatigue,,,,,,,,,,,,,,,,
2,abdominal pain,palpitations,,,,,,,,,,,,,,,,
3,positive TB skin test reaction,,,,,,,,,,,,,,,,,
4,severe headache,changes in vision,swelling,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3871,headaches,dizziness,feeling of pulsations in neck,elevated blood pressure,,,,,,,,,,,,,,
3872,palpitations,chest discomfort,,,,,,,,,,,,,,,,
3873,chronic pain,functional impairment,opioid-related harms,,,,,,,,,,,,,,,
3874,severe pain,gastrointestinal issues,cognitive impairment,possible overdose,,,,,,,,,,,,,,


In [13]:
y

0            Iron Deficiency Anemia in Young Children
1            Iron Deficiency Anemia in Young Children
2            Iron Deficiency Anemia in Young Children
3             Latent Tuberculosis Infection in Adults
4                                        Preeclampsia
                            ...                      
3871                                     Hypertension
3872                                     Hypertension
3873    Chronic pain, Acute pain, Opioid use disorder
3874    Chronic pain, Acute pain, Opioid use disorder
3875    Chronic pain, Acute pain, Opioid use disorder
Name: condition_name, Length: 3876, dtype: object

## One Hot Encoding

In [14]:
# One-hot encoding for each symptom
X_ohe = pd.get_dummies(X.stack().dropna()).groupby(level=0).max()
X_ohe = X_ohe.astype(int)
X_ohe

Unnamed: 0,40 mmHg drop in systolic blood pressure,AML,Abdominal discomfort,Abdominal pain,Abdominal tenderness,Academic difficulties,Academic underachievement,Acanthosis nigricans,Aching pain,Achromic patch on the skin,...,yellowing of the skin and eyes,yellowing of the skin and sclera,yellowing of whites of eyes,yellowish discoloration in eyes,yellowish discoloration of skin and eyes,yellowish discoloration of the skin,yellowish tinge of the skin and sclera,yellowish tinge to skin,yellowish vaginal discharge,yellowish-green nipple discharge
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3871,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3872,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3873,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3874,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Word2Vec

In [15]:
# Combine all symptoms into a single column
combined_symptoms = X.apply(lambda row: ' '.join(filter(lambda x: x is not None and x != 'None', map(str, row))), axis=1)

# Tokenization and standardization
combined_symptoms = combined_symptoms.apply(lambda x: str(x).lower())
combined_symptoms = combined_symptoms.apply(lambda x: ''.join(e for e in str(x) if e.isalnum() or e.isspace()))

# Display the preprocessed data
print(combined_symptoms)

0                                  iron deficiency anemia
1                                    irritability fatigue
2                             abdominal pain palpitations
3                          positive tb skin test reaction
4              severe headache changes in vision swelling
                              ...                        
3871    headaches dizziness feeling of pulsations in n...
3872                        palpitations chest discomfort
3873    chronic pain functional impairment opioidrelat...
3874    severe pain gastrointestinal issues cognitive ...
3875                                         chronic pain
Length: 3876, dtype: object


In [16]:
# Tokenize the text
tokenized_text = combined_symptoms.apply(word_tokenize)

####################stopwords removal#######################
# Download the stopwords from NLTK
nltk.download('stopwords')

# Set of English stopwords
stop_words = set(stopwords.words('english'))

# Function to remove stopwords
def remove_stopwords(tokens):
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return filtered_tokens

# Apply the function to remove stopwords
tokenized_text = tokenized_text.apply(remove_stopwords)

# Train Word2Vec model
model = Word2Vec(sentences=tokenized_text, vector_size=200, window=5, min_count=1, workers=4)

# Access the word embeddings
word_embeddings = model.wv

# Convert text to feature vectors using word embeddings
def text_to_feature_vector(text, word_embeddings):
    words = word_tokenize(text)
    vectors = [word_embeddings[word] for word in words if word in word_embeddings]
    return np.mean(vectors, axis=0) if vectors else np.zeros(word_embeddings.vector_size)


# Apply the function to convert symptoms to feature vectors
symptoms_word2vec = combined_symptoms.apply(lambda x: text_to_feature_vector(x, word_embeddings))

X_w2v = pd.DataFrame(symptoms_word2vec.to_list(), index=symptoms_word2vec.index)
X_w2v

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/arthurchansel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.007914,-0.038824,-0.013107,0.037840,0.072438,-0.039965,0.055321,0.082795,-0.042764,0.016386,...,0.053186,-0.061435,-0.036243,-0.010091,0.036202,-0.002124,0.053166,-0.045095,-0.005099,-0.035915
1,0.028948,-0.143550,-0.046417,0.141125,0.255049,-0.150867,0.204053,0.300005,-0.163435,0.056143,...,0.200596,-0.220944,-0.150846,-0.039119,0.140379,-0.013511,0.182753,-0.177423,-0.025001,-0.142211
2,0.020734,-0.143971,-0.049396,0.143212,0.262119,-0.154341,0.213036,0.309495,-0.164868,0.062704,...,0.203167,-0.220386,-0.149755,-0.047000,0.137380,-0.010750,0.192081,-0.184437,-0.029765,-0.143411
3,0.012730,-0.057274,-0.017434,0.056724,0.108166,-0.063334,0.085319,0.124860,-0.071402,0.027667,...,0.082728,-0.094956,-0.060044,-0.016092,0.060665,-0.002881,0.080780,-0.073268,-0.009099,-0.058128
4,0.030565,-0.169944,-0.050216,0.174220,0.314410,-0.182736,0.247906,0.364312,-0.201429,0.070583,...,0.243449,-0.271055,-0.172826,-0.048753,0.167176,-0.012166,0.233756,-0.217399,-0.025908,-0.169888
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3871,0.021835,-0.144796,-0.045166,0.145312,0.266501,-0.155578,0.210560,0.310733,-0.165151,0.060206,...,0.207533,-0.229764,-0.146352,-0.040722,0.144495,-0.010811,0.198750,-0.184365,-0.023138,-0.144436
3872,0.020613,-0.121998,-0.039162,0.118191,0.230239,-0.135446,0.184306,0.264170,-0.138914,0.048066,...,0.172886,-0.193750,-0.132286,-0.039715,0.121561,-0.010995,0.166850,-0.153390,-0.020744,-0.122422
3873,0.014229,-0.083787,-0.027027,0.083283,0.151221,-0.093032,0.125585,0.178250,-0.096975,0.035420,...,0.119455,-0.132433,-0.084299,-0.024085,0.077839,-0.005952,0.113255,-0.108370,-0.015357,-0.081908
3874,0.017168,-0.097415,-0.028890,0.097644,0.174519,-0.103333,0.141316,0.204182,-0.110519,0.038421,...,0.136387,-0.150042,-0.097105,-0.028185,0.091663,-0.007126,0.130433,-0.122824,-0.015389,-0.093704


## Doc2Vec

In [17]:
# Tokenize the text
tokenized_text = combined_symptoms.apply(word_tokenize)

# Prepare the dataset for Doc2Vec
tagged_data = [TaggedDocument(words=symptoms, tags=[str(i)]) for i, symptoms in enumerate(tokenized_text)]



# Train a Doc2Vec model
doc2vec_model = Doc2Vec(vector_size=50, window=5, min_count=1, workers=4, epochs=40)
doc2vec_model.build_vocab(tagged_data)
doc2vec_model.train(tagged_data, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

# Convert symptoms to feature vectors using Doc2Vec
feature_vectors = [doc2vec_model.infer_vector(symptoms.words) for symptoms in tagged_data]

# Convert the feature vectors into a dataframe
list_of_features = [feature.tolist() for feature in feature_vectors]

X_d2v = pd.DataFrame(list_of_features)
X_d2v

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,0.059654,0.166322,-0.155803,0.028233,-0.083151,-0.134607,0.159397,0.108817,-0.089852,0.035782,...,0.149854,0.147210,-0.039331,0.076343,0.184032,0.011053,-0.097097,-0.342628,0.106245,0.090196
1,-0.039197,-0.006991,-0.115984,0.022150,-0.062383,-0.251436,0.047698,0.163146,-0.217702,0.067027,...,0.230371,-0.048394,-0.192849,-0.032412,0.294978,0.054465,0.075637,-0.138314,0.066259,0.052611
2,-0.098409,-0.192951,-0.035510,-0.000142,-0.048260,-0.249686,0.163066,0.011561,-0.081177,-0.113157,...,0.143283,0.054830,0.029495,-0.040082,0.119279,-0.001446,-0.248228,-0.022338,-0.091619,0.110683
3,-0.015665,0.105465,0.198895,-0.014339,-0.174346,-0.293034,-0.005320,0.265567,-0.207461,0.033286,...,0.023645,0.157465,-0.049882,0.159868,0.157368,-0.178376,-0.026096,-0.083043,0.079601,0.053569
4,-0.104144,0.032309,-0.122320,-0.321388,-0.090922,0.092601,0.078183,0.007270,0.235384,0.046313,...,0.119282,-0.182751,0.036518,-0.061702,-0.012342,-0.198286,-0.061394,0.088188,0.255809,-0.008151
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3871,-0.175744,0.016087,0.129401,-0.065084,0.301755,-0.045501,-0.126218,-0.016366,-0.153796,0.078488,...,0.215780,-0.335957,0.241955,-0.192733,0.192866,0.057499,-0.271619,-0.464643,0.020604,0.298291
3872,-0.083521,0.010837,-0.034444,-0.027945,-0.061901,-0.168469,0.110047,0.098739,-0.160136,-0.069998,...,0.106231,-0.008066,-0.008724,0.022671,0.107202,-0.054390,-0.134756,-0.162137,-0.026947,0.084071
3873,-0.063315,0.008894,-0.003987,-0.017385,0.135673,0.053559,-0.115076,0.227158,-0.451999,-0.070806,...,0.029149,0.103744,0.157664,-0.184330,-0.033649,-0.092828,-0.148279,-0.052481,0.088602,0.076878
3874,0.071920,0.027897,-0.162408,-0.101583,-0.021769,-0.018941,0.330979,0.149825,-0.476354,0.017650,...,0.406828,0.073205,0.173386,-0.053687,0.105157,0.233431,0.031279,-0.173129,0.060786,0.073236


## TF-IDF

In [18]:
# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=50)  # You can adjust 'max_features' as needed

# Fit and transform the vectorizer to the text data
tfidf_vectors = tfidf_vectorizer.fit_transform(combined_symptoms)

# Convert the TF-IDF vectors into a DataFrame
X_tfidf = pd.DataFrame(tfidf_vectors.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
X_tfidf

Unnamed: 0,abdominal,and,back,bleeding,blood,breath,breathing,chest,chronic,cough,...,skin,swelling,tenderness,the,to,urinary,vision,vomiting,weakness,weight
0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
1,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
2,0.856993,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
3,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,...,1.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
4,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.432583,0.0,0.0,0.0,0.0,0.539725,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3871,0.000000,0.0,0.0,0.0,0.468134,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
3872,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.695685,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
3873,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.898917,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
3874,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0


## BERT

In [19]:
# Load pre-trained model tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def multiple_embed_bert(embed_input):
    veclist = []
    try:
        # Tokenize input and prepare it as input for BERT
        encoded_input = tokenizer(embed_input, padding=True, truncation=True, return_tensors='pt')
        
        # Move encoded input to the same device as the model
        encoded_input = {k: v.to(model.device) for k, v in encoded_input.items()}

        # Get BERT embeddings
        with torch.no_grad():
            outputs = model(**encoded_input)

        # Extract embeddings from the last hidden state
        embeddings = outputs.last_hidden_state.mean(dim=1)
        
        # Convert embeddings to a list and append to veclist
        veclist.extend(embeddings.cpu().numpy())
        
    except Exception as e:
        print(f"Embeddings failure {e}")
        raise

    return veclist

# combined_symptoms to list
input = combined_symptoms.tolist()


bert_embeddings = multiple_embed_bert(input)

X_bert = pd.DataFrame(bert_embeddings)
X_bert

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.208517,-0.000013,-0.711176,-0.342346,0.070252,0.226370,0.400741,0.509997,-0.145034,-0.571860,...,-0.221595,-0.068366,0.051252,0.290674,0.360097,-0.198913,-0.366278,-0.149350,-0.463429,-0.110735
1,-0.253408,0.129405,-0.091569,-0.146756,-0.402022,0.340467,0.092752,0.270518,0.145255,-0.249335,...,0.013241,-0.254359,0.076998,-0.179103,0.002547,-0.028798,0.088169,-0.164830,-0.238511,0.005973
2,-0.255549,-0.127399,-0.446685,-0.382410,-0.011856,0.263003,0.199510,0.486213,-0.635665,-0.708975,...,-0.076980,-0.263508,0.556844,0.346942,0.085227,-0.254693,-0.175688,-0.269500,-0.744538,0.078658
3,-0.192197,-0.335615,-0.280847,-0.335639,0.210119,0.013806,0.144440,0.045824,0.132843,-0.513960,...,-0.002159,0.152777,0.038304,0.285491,0.128834,-0.341475,-0.287444,-0.133267,-0.130229,0.112779
4,-0.227414,0.225162,-0.030806,-0.432816,0.013661,0.202633,0.483947,0.266946,-0.363907,-0.643445,...,-0.157287,-0.238253,0.444412,-0.061278,0.019615,-0.055614,-0.111165,-0.298461,-0.579876,-0.060037
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3871,-0.118092,0.144413,0.395407,-0.307600,-0.071481,-0.116478,0.383154,0.187873,-0.460239,-0.387271,...,-0.374695,-0.291777,0.395598,-0.094615,0.067760,0.085135,-0.193184,-0.506321,-0.534404,-0.103453
3872,-0.289342,-0.105608,-0.348439,-0.464894,-0.210744,0.349161,0.169738,0.265315,-0.493098,-0.755392,...,0.129054,-0.127034,0.451355,0.170924,-0.087790,0.035983,-0.202898,-0.210246,-0.657496,0.171659
3873,-0.233892,0.396716,0.159856,-0.211278,0.038456,0.179054,0.165806,-0.011207,-0.170786,-0.282124,...,-0.071575,-0.358311,-0.023438,0.223124,0.152403,-0.032920,-0.420965,-0.564719,-0.555064,0.108437
3874,-0.176244,0.356798,-0.040982,-0.334752,0.233743,0.233907,0.187721,0.299728,-0.183411,-0.378041,...,-0.240871,-0.396020,0.088551,0.074995,0.035620,0.226789,-0.183014,-0.619641,-0.597328,-0.000918


## Save to csv

In [20]:
X.to_csv('TransformedData/X.csv', index=False)
y.to_csv('TransformedData/y.csv', index=False)
X_ohe.to_csv('TransformedData/X_ohe.csv', index=False)
X_w2v.to_csv('TransformedData/X_w2v.csv', index=False)
X_d2v.to_csv('TransformedData/X_d2v.csv', index=False)
X_tfidf.to_csv('TransformedData/X_tfidf.csv', index=False)
X_bert.to_csv('TransformedData/X_bert.csv', index=False)

# Test Data Augmentation

We didn't use it since the new generated words were not accurate

In [25]:
DA_df = pd.DataFrame(columns=['Symptoms', 'Condition'])
tokenized_symptoms = X.apply(lambda row: [symptom for symptom in row if symptom is not None], axis=1)
DA_df['Symptoms'] = tokenized_symptoms
DA_df['Condition'] = y
DA_df['Symptoms'] = DA_df['Symptoms'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)
DA_df

Unnamed: 0,Symptoms,Condition
0,iron deficiency anemia,Iron Deficiency Anemia in Young Children
1,"irritability, fatigue",Iron Deficiency Anemia in Young Children
2,"abdominal pain, palpitations",Iron Deficiency Anemia in Young Children
3,positive TB skin test reaction,Latent Tuberculosis Infection in Adults
4,"severe headache, changes in vision, swelling",Preeclampsia
...,...,...
3871,"headaches, dizziness, feeling of pulsations in...",Hypertension
3872,"palpitations, chest discomfort",Hypertension
3873,"chronic pain, functional impairment, opioid-re...","Chronic pain, Acute pain, Opioid use disorder"
3874,"severe pain, gastrointestinal issues, cognitiv...","Chronic pain, Acute pain, Opioid use disorder"


In [26]:
import nlpaug.augmenter.word as naw

# Define the augmenter
aug = naw.SynonymAug(aug_src='wordnet')
#aug = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="insert")

def augment_symptom_list(symptom_list, num_augmented_rows=1):
    augmented_rows = []
    for _ in range(num_augmented_rows):
        #augmented_symptoms = [aug.augment(symptom) for symptom in symptom_list]
        augmented_symptoms = [aug.augment(symptom_list)]
        augmented_rows.append(augmented_symptoms)
    return augmented_rows

# Apply the augmentation and expand the dataset
augmented_df = pd.DataFrame(columns=DA_df.columns)
for index, row in DA_df[0:10].iterrows():
    # Original row
    augmented_df = augmented_df.append(row, ignore_index=True)
    # Augmented rows
    augmented_symptom_lists = augment_symptom_list(row['Symptoms'])[0]
    #for aug_symptoms in augmented_symptom_lists:
    new_row = row.copy()
    #new_row['Symptoms'] = aug_symptoms
    new_row['Symptoms'] = augmented_symptom_lists[0][0]
    augmented_df = augmented_df.append(new_row, ignore_index=True)

augmented_df

  augmented_df = augmented_df.append(row, ignore_index=True)
  augmented_df = augmented_df.append(new_row, ignore_index=True)


Unnamed: 0,Symptoms,Condition
0,iron deficiency anemia,Iron Deficiency Anemia in Young Children
1,iron deficiency genus anemia,Iron Deficiency Anemia in Young Children
2,"irritability, fatigue",Iron Deficiency Anemia in Young Children
3,"snappishness, fatigue",Iron Deficiency Anemia in Young Children
4,"abdominal pain, palpitations",Iron Deficiency Anemia in Young Children
5,"abdominal pain sensation, trembling",Iron Deficiency Anemia in Young Children
6,positive TB skin test reaction,Latent Tuberculosis Infection in Adults
7,positive TB tegument test chemical reaction,Latent Tuberculosis Infection in Adults
8,"severe headache, changes in vision, swelling",Preeclampsia
9,"severe headache, change in visual sense, swell",Preeclampsia
