In [7]:
import os
import string

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

In [2]:
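'''
Load the raw PHEE dataset, run every sentence through the clinical BERT NER model,
and save the results to CSV files with the following columns:
Sentence_ID: sentence counter, incremented at every blank line of the raw file
Org_Sentence: original sentence (the version in PHEE)
Org_Tag: original tags (the version in PHEE)
Sentence: sentence as tokenized by the BERT tokenizer (word pieces merged back into words)
Tag: tags predicted by the clinical BERT model
'''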
# Dataset splits to process; preprocess() reads data/raw/phee/ace/<name>.txt for each.
files = ['train', 'dev', 'test']
# Run inference on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer and token-classification model are loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("samrawal/bert-base-uncased_clinical-ner")
model = AutoModelForTokenClassification.from_pretrained("samrawal/bert-base-uncased_clinical-ner")
# Move the model to the selected device; process_sentence() moves its inputs there too.
model.to(device)

def process_sentence(sentence):
    """Tag one sentence with the clinical NER model and merge word pieces.

    The sentence is encoded with the module-level ``tokenizer`` and run
    through the module-level ``model`` on ``device``.  For every position the
    highest-scoring label is taken; the first and last positions (the special
    tokens added by the tokenizer) are discarded.

    Pieces that start with ``##`` are glued onto the preceding token, and the
    merged word keeps the label predicted for its first piece.

    Args:
        sentence: Raw sentence text.

    Returns:
        A ``(tokens, labels)`` tuple of two equally long lists.
    """
    encoded = tokenizer(
        sentence,
        return_tensors="pt",
        padding=True,
        truncation=True,
        add_special_tokens=True,
    ).to(device)

    with torch.no_grad():
        logits = model(**encoded).logits

    # Best label id per position for the single sentence in the batch,
    # with the [CLS] / [SEP] positions stripped from both ends.
    label_ids = logits.argmax(dim=2)[0][1:-1].cpu().tolist()
    id2label = model.config.id2label
    piece_labels = [id2label[label_id] for label_id in label_ids]

    merged_tokens, merged_labels = [], []
    pieces = tokenizer.tokenize(sentence)
    # zip() stops at the shorter sequence, so any pieces beyond the
    # (possibly truncated) model input are silently ignored.
    for piece, piece_label in zip(pieces, piece_labels):
        if not piece.startswith('##'):
            merged_tokens.append(piece)
            merged_labels.append(piece_label)
        else:
            # Continuation piece: extend the current word, keep its first label.
            merged_tokens[-1] += piece[2:]

    return merged_tokens, merged_labels

def drop_duplicates(df):
    """Remove ambiguous and repeated sentences from a sentence-level frame.

    A row is *ambiguous* when its ``Org_Sentence`` occurs more than once in
    ``df`` while its exact ``(Org_Sentence, Org_Tag)`` pair occurs only once.
    Ambiguous rows are removed first; the remaining exact duplicates are then
    collapsed to their first occurrence.

    Note that a sentence that appears twice with tag sequence A and once with
    tag sequence B loses only the B row under this rule; one A row survives.

    A short report (initial size, ambiguous rows, total rows removed) is
    printed.  The input frame is not modified.

    Args:
        df: DataFrame with at least the columns ``Org_Sentence`` and
            ``Org_Tag``.

    Returns:
        A new DataFrame holding the surviving rows with their original index
        labels.
    """
    initial_count = len(df)

    # Rows whose (sentence, tag) pair occurs more than once.
    repeated_pair = df.duplicated(subset=['Org_Sentence', 'Org_Tag'], keep=False)
    # Rows whose sentence occurs more than once, whatever the tag.
    repeated_sentence = df.duplicated(subset=['Org_Sentence'], keep=False)

    # Same sentence seen several times, but this particular tagging only once.
    ambiguous = repeated_sentence & ~repeated_pair
    drop_count = int(ambiguous.sum())

    # Filter with the boolean mask instead of df.drop(<labels>): a label-based
    # drop would also remove unrelated rows that happen to share an index
    # label when the index is not unique.
    df = df[~ambiguous]

    # Collapse the remaining exact (sentence, tag) duplicates to one row each.
    df = df.drop_duplicates(subset=['Org_Sentence', 'Org_Tag'])

    samples_removed = initial_count - len(df)

    print(f"Initial count of samples: {initial_count}")
    print(f"Number of ambiguous samples (duplicated 'Org_Sentence' but not 'Org_Tag'): {drop_count}")
    print(f"Total samples removed: {samples_removed}")

    return df


def preprocess(file_name):
    """Build, tag, de-duplicate and save the sentence table for one split.

    Reads ``data/raw/phee/ace/<file_name>.txt``, where every non-blank line
    holds a ``word tag`` pair and a blank line ends the current sentence.
    Words and tags are joined per sentence, each sentence is passed to
    ``process_sentence``, the result is filtered with ``drop_duplicates`` and
    written to ``data/processed/phee/ace/<file_name>.csv``.

    Args:
        file_name: Split name without extension (e.g. ``'train'``).

    Returns:
        The DataFrame that was written, with columns ``Sentence_ID``,
        ``Org_Sentence``, ``Org_Tag``, ``Sentence`` and ``Tag``.

    Raises:
        ValueError: If a non-blank line does not consist of exactly two
            whitespace-separated fields.
    """
    source_path = f'data/raw/phee/ace/{file_name}.txt'
    target_path = f'data/processed/phee/ace/{file_name}.csv'

    words, tags, sentence_ids = [], [], []
    sentence_id = 0

    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding (to_csv below writes UTF-8 by default).
    with open(source_path, 'r', encoding='utf-8') as file:
        for line_no, line in enumerate(file, start=1):
            fields = line.split()
            if not fields:  # blank line: end of the current sentence
                sentence_id += 1
                continue
            if len(fields) != 2:
                raise ValueError(
                    f"{source_path}:{line_no}: expected 'word tag', got {line.rstrip()!r}"
                )
            word, tag = fields
            words.append(word)
            tags.append(tag)
            sentence_ids.append(sentence_id)

    # One row per token, then collapse to one row per sentence.
    df = pd.DataFrame({'Sentence_ID': sentence_ids, 'Org_Sentence': words, 'Org_Tag': tags})
    df_s = df.groupby('Sentence_ID').agg({
        'Org_Sentence': lambda x: ' '.join(x),
        'Org_Tag': lambda x: ' '.join(x)
        }).reset_index()

    # Model predictions: process_sentence returns (tokens, labels) per sentence.
    results = [process_sentence(sentence) for sentence in df_s['Org_Sentence']]
    df_s['Sentence'] = [' '.join(tokens) for tokens, _ in results]
    df_s['Tag'] = [' '.join(labels) for _, labels in results]

    df_s = drop_duplicates(df_s)

    # Create the output directory if needed so the save cannot fail on a
    # missing folder after all sentences have already been processed.
    os.makedirs(os.path.dirname(target_path), exist_ok=True)
    df_s.to_csv(target_path, index=False)

    return df_s

# Run the pipeline for every split listed in `files`. Each call writes
# data/processed/phee/ace/<split>.csv and prints the de-duplication report;
# the returned DataFrame is discarded here.
for file_name in files:
    preprocess(file_name)



Initial count of samples: 3006
Number of ambiguous samples (duplicated 'Org_Sentence' but not 'Org_Tag'): 212
Total samples removed: 213
Initial count of samples: 1003
Number of ambiguous samples (duplicated 'Org_Sentence' but not 'Org_Tag'): 85
Total samples removed: 85
Initial count of samples: 968
Number of ambiguous samples (duplicated 'Org_Sentence' but not 'Org_Tag'): 0
Total samples removed: 0


In [3]:
# def update_t_capital(row):
#     new_tags = []
#     for tag, org_tag in zip(row['Tag'].split(), row['Org_Tokenized_Tag'].split()):
#         if tag == 'I-test':
#             new_tags.append('I-Test')
#         else:
#             new_tags.append(org_tag)
#     return ' '.join(new_tags)

# files = ['train', 'dev', 'test']
# for file_name in files:
#     df = pd.read_csv(f'data/processed/phee/ace/{file_name}.csv')
#     # df_unequal contains the sentences that become different after bert tokenizer (e.g. org-> I' m , bert-> I ' m)
#     df_unequal = df[(df['Tag'].str.split().apply(lambda x: len(x)) != df['Org_Tag'].str.split().apply(lambda x: len(x)))]
#     df_unequal.reset_index(drop=True, inplace=True)

#     for i in range(len(df_unequal)):
#         j = 0
#         k = 0
#         while j < len(df_unequal['Sentence'][i].split()):
#             if df_unequal['Sentence'][i].split()[j].lower() != df_unequal['Org_Sentence'][i].split()[k].lower():
            
#                 # word in tokenized sentence
#                 word = df_unequal['Sentence'][i].split()[j] 
#                 while(word.lower() != df_unequal['Org_Sentence'][i].split()[k].lower()):
                    
#                     # print('word: ', word, 'word2 ', df_unequal['Org_Sentence'][i].split()[k].lower())
#                     word = word + df_unequal['Sentence'][i].split()[j+1]
#                     str_arr = df_unequal['Org_Tag'][i].split()
#                     str_arr.insert(k+1,  df_unequal['Org_Tag'][i].split()[k])
#                     df_unequal.loc[i, 'Org_Tag'] = ' '.join(str_arr)
#                     j = j + 1
#             k = k + 1
#             j = j + 1
            
#     # create a copy of df['Org_Tag'] column
#     df['Org_Tokenized_Tag'] = df['Org_Tag'] 

#     update_values = df['Sentence_ID'].map(df_unequal.set_index('Sentence_ID')['Org_Tag'])
#     df['Org_Tokenized_Tag'] = update_values.combine_first(df['Org_Tokenized_Tag'])  

#     # I-test -> I-Test
#     df['Org_Tokenized_Tag'] = df.apply(update_t_capital, axis=1)

#     # save to csv
#     df.to_csv(f'data/processed/phee/ace/{file_name}_w_test_tag.csv', index=False)
    

In [28]:
# # There is a new mapping; see new_mapping.ipynb.
# treatment_tags = [
#     "I-Treatment.Drug",
#     "I-Treatment.Combination.Drug",
#     "I-Treatment",
#     "I-Treatment.Route",
#     "I-Treatment.Dosage",
#     "I-Combination.Drug",
#     "I-Treatment.Duration",
#     "I-Dosage",
#     "I-Freq",
#     "I-Drug",
#     "I-Treatment.Time_elapsed",
#     "I-Treatment.Freq"
# ]

# # Map to the new tag
# treatment_mapping = {tag: "I-Treatment" for tag in treatment_tags}

# problem_tags = [
#     "I-Effect",
#     "I-Treat_Disorder",
#     "I-Treatment.Treat_Disorder",
#     "I-Subject.Sub_Disorder",
#     "I-Sub_Disorder"
# ]

# # Map to the new tag
# problem_mapping = {tag: "I-Problem" for tag in problem_tags}

# background_tags = [
#     "I-Subject.Age",
#     "I-Subject.Gender",
#     "I-Subject.Race",
#     "I-Subject",
#     "I-Race",
#     "I-Gender"
# ]

# # Map to the new tag
# background_mapping = {tag: "I-Background" for tag in background_tags}

# other_tags = [
#     "I-Duration",
#     "I-Time_elapsed"
# ]

# # Map to the new tag
# other_mapping = {tag: "I-Other" for tag in other_tags}

# o_tags = [
#     "I-Subject.Population",
#     "I-Potential_therapeutic_event.Trigger",
#     "I-Adverse_event.Trigger",
#     "I-Route",
#     "I-Population",
#     "O"
# ]

# # Map to the new tag
# o_mapping = {tag: "O" for tag in o_tags}

# test_tags = [
#     "I-Test"
# ]

# test_mapping = {tag: "I-Test" for tag in test_tags}

# # Combine all mappings
# all_mappings = {**treatment_mapping, **problem_mapping, **background_mapping, **other_mapping, **o_mapping, **test_mapping}

# def map_tags(tag_string):
#     return ' '.join(all_mappings.get(tag, tag) for tag in tag_string.split())

# # List of words to relabel as "O"
# words_to_relabel = ["a", "an", "and", "the", "with", 'for', 'nor', 'but', 'or', 'yet']

# # Function to change the label of specific words to "O"
# def relabel_words(row):
#     words = row['sentence'].split()
#     tags = row['tag'].split()
#     new_tags = []

#     for word, tag in zip(words, tags):
#         if word.lower() in words_to_relabel or word in string.punctuation:
#             new_tags.append('O')
#         else:
#             new_tags.append(tag)
    
#     return ' '.join(new_tags)

# files = ['train', 'dev', 'test']
# for file_name in files:
#     df_w_t = pd.read_csv(f'data/processed/phee/ace/{file_name}_w_test_tag.csv')
#     df_w_t["Med_Tag"] = df_w_t["Org_Tokenized_Tag"].apply(map_tags)
#     df_w_t = df_w_t[['Sentence', 'Med_Tag']]
#     df_w_t.rename(columns={"Sentence": "sentence", "Med_Tag": "tag"}, inplace=True)

#     # relabel punctuation and conjunctions as "O"
#     df_w_t['tag'] = df_w_t.apply(relabel_words, axis=1)

#     # save to csv
#     df_w_t.to_csv(f'data/processed/phee/ace/{file_name}_w_test_tag_mapped.csv', index=False)


In [32]:
# Spot-check a single row of the mapped training split.
# NOTE(review): in this notebook, train_w_test_tag_mapped.csv is only written by
# the mapping cell that is currently commented out, so this read relies on a
# file left over from an earlier run — confirm it exists on a fresh checkout.
train_df = pd.read_csv('data/processed/phee/ace/train_w_test_tag_mapped.csv')
# NOTE(review): 2789 is a hard-coded row position; why this row was chosen is not recorded.
train_df.iloc[2789]

sentence    a 53 - year - old man developed lower leg edem...
tag         O I-Background O I-Background O I-Background I...
Name: 2789, dtype: object

In [24]:
# NOT USED: the same steps are done in trainer_clinical_bert.ipynb
# # 60 train, 20 dev, 20 test
# train_df = pd.read_csv('data/processed/phee/ace/train_w_test_tag_mapped.csv')
# dev_df = pd.read_csv('data/processed/phee/ace/dev_w_test_tag_mapped.csv')
# test_df = pd.read_csv('data/processed/phee/ace/test_w_test_tag_mapped.csv')


# train_df = train_df[['Sentence', 'Med_Tag']]
# train_df.rename(columns={"Sentence": "sentence", "Med_Tag": "tag"}, inplace=True)
# dev_df = dev_df[['Sentence', 'Med_Tag']]
# dev_df.rename(columns={"Sentence": "sentence", "Med_Tag": "tag"}, inplace=True)
# test_df = test_df[['Sentence', 'Med_Tag']]
# test_df.rename(columns={"Sentence": "sentence", "Med_Tag": "tag"}, inplace=True)

# train_df['sentence'] = train_df['sentence'].apply(lambda x: x.split())
# train_df['tag'] = train_df['tag'].apply(lambda x: x.split())
# dev_df['sentence'] = dev_df['sentence'].apply(lambda x: x.split())
# dev_df['tag'] = dev_df['tag'].apply(lambda x: x.split())
# test_df['sentence'] = test_df['sentence'].apply(lambda x: x.split())
# test_df['tag'] = test_df['tag'].apply(lambda x: x.split())

# # save to csv
# train_df.to_csv('data/processed/phee/ace/final_train.csv', index=False)
# dev_df.to_csv('data/processed/phee/ace/final_dev.csv', index=False)
# test_df.to_csv('data/processed/phee/ace/final_test.csv', index=False)