In [None]:
"""
Error analysis module
Input: the outputs of classification models on the dataframe. 
Specifically: result_dict = {"predictions": all_predictions, "references": all_references, "filenames": all_filenames, 'input_ids': all_input_ids} where:
all_predictions is the binary predictions for the file, 0 or 1
all_references is the binary references (ground truth) for the file, 0 or 1
all_filenames is the name of the folder (discharge summary, etc.)
input_ids is the tokenized, encoded version of the original sentences (must be decoded here)
Functions:
- error by folder: retrieve number of errors per folder 
- error length: retrieve average sentence length for correct, incorrect classifications:
- error by words: retrieve most common words present in incorrect/correct sentences 
- error by model: for result_dict from each model, which errors are present in all models?
- error by drug: given list of all drug strings, which drugs are most common in errors? 
""";


In [None]:
import pickle
def import_data(path):
    """Load and return the object pickled in the file at ``path``.

    NOTE: unpickling can execute arbitrary code, so this should only be
    pointed at result files that come from a trusted source.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def export_data(data, path):
    """Pickle ``data`` into the file ``path + ".pkl"``.

    Parameters
    ----------
    data : object
        Any picklable object.
    path : str
        Target path WITHOUT the extension; ".pkl" is appended.

    An existing file is overwritten. The file is opened once inside a
    ``with`` block so the handle is closed even if pickling fails.
    """
    with open(path + ".pkl", "wb") as handle:
        pickle.dump(data, handle)
    
# Load the pickled test-set outputs of the three classifiers.
T5_result_dict = import_data('PT_T5_Classifier_results_dict_TEST_SET_v1.pkl')
BERT_result_dict = import_data('PT_BERT_Classifier_results_dict_TEST_SET_v1.pkl')
SVM_result_dict = import_data('SVM_predictions.pkl')

# Sanity check: convert the stored row numbers to plain ints and show the
# largest one for each transformer model.
T5_int_list = [int(row_number.numpy()) for row_number in T5_result_dict['row_numbers']]
print("biggest value in T5 (should match number of vals in test set")
print(max(T5_int_list))

BERT_int_list = [int(row_number.numpy()) for row_number in BERT_result_dict['row_numbers']]
print("biggest value in BERT (should match number of vals in test set")
print(max(BERT_int_list))

In [None]:
def get_correct_incorrect(results_dict):
    """Split the decoded sentences by whether they were classified correctly.

    Parameters
    ----------
    results_dict : dict
        Must provide equally long sequences under the keys
        "decoded_sentences", "predictions" and "references".

    Returns
    -------
    dict
        ``{"correct": DataFrame, "incorrect": DataFrame}``. The first frame has
        a single column 'correct', the second a single column 'incorrect';
        both keep the input order and have a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If the three sequences do not have the same length.
    """
    import pandas as pd

    sentences = list(results_dict["decoded_sentences"])
    predictions = list(results_dict["predictions"])
    references = list(results_dict["references"])
    if not (len(sentences) == len(predictions) == len(references)):
        raise ValueError(
            "decoded_sentences, predictions and references must have the same length"
        )

    # Collect into plain lists and build each frame once: growing a DataFrame
    # row by row is quadratic, and DataFrame.append no longer exists in pandas 2.
    correct_sentences = []
    incorrect_sentences = []
    for sentence, prediction, reference in zip(sentences, predictions, references):
        if prediction == reference:
            correct_sentences.append(sentence)
        else:
            incorrect_sentences.append(sentence)

    # dtype=object keeps the column type stable even when a bucket is empty.
    correct_df = pd.DataFrame({'correct': pd.Series(correct_sentences, dtype=object)})
    incorrect_df = pd.DataFrame({'incorrect': pd.Series(incorrect_sentences, dtype=object)})

    return {"correct": correct_df, "incorrect": incorrect_df}

"""
result_dict = import_data('PT_T5_Classifier_results_dict.pkl')
dict1 = get_correct_incorrect(result_dict)
print(dct)
# Specify the path and filename for the Excel file
import csv
with open('T5_corect_incorrect.csv', 'w') as output:
    writer = csv.writer(output)
    for key, value in dict1.items():
        writer.writerow([key, value])

        
result_dict = import_data('PT_BERT_Classifier_results_dict.pkl')
dict1 = get_correct_incorrect(result_dict)
print(dct)
# Specify the path and filename for the Excel file
with open('BERT_corect_incorrect.csv', 'w') as o:
    writer = csv.writer(o, quoting=csv.QUOTE_NONNUMERIC)
    for key, value in dict1.items():
        writer.writerow([key, value])
""";

In [None]:
def error_by_folder(results_dict):
    """Count correct and incorrect predictions per source folder/file.

    Parameters
    ----------
    results_dict : dict
        Must provide equally long sequences under the keys "filenames",
        "predictions" and "references".

    Returns
    -------
    pandas.DataFrame
        One row per distinct filename with the columns 'filename',
        'incorrect_predictions' and 'correct_predictions'.
    """
    import pandas as pd

    # Use the argument that was passed in (the previous version silently read
    # a global named `result_dict` instead).
    df = pd.DataFrame({
        'filename': results_dict["filenames"],
        'prediction': results_dict["predictions"],
        'reference': results_dict["references"]
    })

    # A prediction is correct when it equals its reference label.
    df['correct'] = (df['prediction'] == df['reference'])

    # size() counts all rows per file, sum() counts the True (= correct) ones.
    # Deriving both columns this way keeps them present even if a run has
    # only correct or only incorrect predictions.
    per_file = df.groupby('filename')['correct']
    correct_counts = per_file.sum().astype(int)
    summary_df = pd.DataFrame({
        'incorrect_predictions': per_file.size() - correct_counts,
        'correct_predictions': correct_counts,
    }).reset_index()

    return summary_df


In [None]:
def error_by_length_char(result_dict):
    """Average character length of incorrectly vs. correctly classified sentences.

    Parameters
    ----------
    result_dict : dict
        Must provide the keys 'decoded_sentences', 'predictions' and
        'references'; entry i of each belongs to the same sentence.

    Returns
    -------
    dict
        ``{"average_incorrect_length_char": float,
           "average_correct_length_char": float}``.
        A bucket without any sentence yields ``float('nan')`` instead of
        raising ZeroDivisionError.
    """
    correct_lengths = []
    incorrect_lengths = []

    for i, sentence in enumerate(result_dict['decoded_sentences']):
        if result_dict['predictions'][i] == result_dict['references'][i]:
            correct_lengths.append(len(sentence))
        else:
            incorrect_lengths.append(len(sentence))

    def average(lengths):
        # NaN signals "no sentences in this bucket".
        return sum(lengths) / len(lengths) if lengths else float('nan')

    return {
        "average_incorrect_length_char": average(incorrect_lengths),
        "average_correct_length_char": average(correct_lengths),
    }
        
# Print the average character lengths for both transformer models.
for model_name, model_results in (("BERT", BERT_result_dict), ("T5", T5_result_dict)):
    print(model_name)
    print(error_by_length_char(model_results))

In [None]:
import nltk
from collections import Counter

def process_sentence(sentence, stopwords):
    """Return the lower-cased, punctuation-stripped words of ``sentence``.

    The sentence is split on whitespace; leading/trailing '.', ',', '?' and
    '!' are removed from every token, and tokens found in ``stopwords`` are
    dropped. Order and duplicates are kept.
    """
    kept = []
    for token in sentence.split():
        normalized = token.strip(".,?!").lower()
        if normalized not in stopwords:
            kept.append(normalized)
    return kept

def analyze_sentences(correct_sentences, incorrect_sentences, stopwords=[]):
    """Count word frequencies separately for correct and incorrect sentences.

    Each entry of the two inputs is an iterable of strings that is joined
    with single spaces before being passed to ``process_sentence``.

    Returns a dict with the keys 'correct' and 'incorrect'; each value is a
    list of ``(word, count)`` pairs ordered from most to least frequent.
    """
    def tally(sentences):
        # One Counter per group, fed sentence by sentence.
        counts = Counter()
        for parts in sentences:
            counts.update(process_sentence(' '.join(parts), stopwords))
        return counts.most_common()

    correct_ranking = tally(correct_sentences)
    incorrect_ranking = tally(incorrect_sentences)
    return {
        'correct': correct_ranking,
        'incorrect': incorrect_ranking
    }


def get_common_words(result_dict):
    """Display and print the most frequent words of correct/incorrect sentences.

    The sentences are split with ``get_correct_incorrect``, both frames are
    displayed, and the first 31 entries of each word ranking (English NLTK
    stopwords plus a few extra tokens removed) are printed.
    Nothing is returned.
    """
    from nltk.corpus import stopwords as nltk_stopwords

    split_frames = get_correct_incorrect(result_dict)
    correct_frame = split_frames['correct']
    incorrect_frame = split_frames['incorrect']
    display(correct_frame)
    display(incorrect_frame)

    # .values.tolist() yields one single-element list per row, which is the
    # shape analyze_sentences joins back into a sentence.
    correct_sentences = correct_frame.values.tolist()
    incorrect_sentences = incorrect_frame.values.tolist()

    # English stopwords plus extra tokens that should be ignored as well.
    ignored_words = nltk_stopwords.words('english') + ['-', 'ml', 'po', 'mg', 'pt']

    result = analyze_sentences(correct_sentences, incorrect_sentences, ignored_words)
    print("Most common words in correct sentences:", result['correct'][0:31])
    print("\nMost common words in incorrect sentences:", result['incorrect'][0:31])

# The stopword corpus must be available before get_common_words is called.
nltk.download('stopwords')

# get_common_words prints its findings itself; its return value is printed too.
for model_name, model_results in (("BERT", BERT_result_dict), ("T5", T5_result_dict)):
    print(model_name)
    print(get_common_words(model_results))

In [None]:
import pandas as pd
def get_common_incorrect_words(T5_result_dict, BERT_result_dict):
    """Write the misclassified sentences of T5 and BERT side by side to Excel.

    Both error frames are displayed, their column is renamed to
    'T5_incorrect' / 'BERT_incorrect', and the two frames are concatenated
    column-wise into 'actual_output.xlsx' (without the index).
    Nothing is returned.
    """
    t5_errors = get_correct_incorrect(T5_result_dict)['incorrect']
    display(t5_errors)
    print(type(t5_errors))
    t5_column = t5_errors.rename(columns={'incorrect': 'T5_incorrect'})

    bert_errors = get_correct_incorrect(BERT_result_dict)['incorrect']
    display(bert_errors)
    bert_column = bert_errors.rename(columns={'incorrect': 'BERT_incorrect'})

    print(type(bert_column))

    # axis=1 aligns the two frames on their row index, not on sentence text.
    side_by_side = pd.concat([t5_column, bert_column], axis=1)
    side_by_side.to_excel('actual_output.xlsx', index=False)

    
# Write the T5 and BERT misclassifications side by side to 'actual_output.xlsx'.
get_common_incorrect_words(T5_result_dict, BERT_result_dict)

In [None]:
#IMPROVEMENT FROM SVM
#Get the sentences that were misclassified by SVM AND NOT (BERT, T5)
# Module-level accumulator: SVM_difference() appends to it and a later cell reads it.
SVM_only_wrong = []

from tqdm import tqdm
def SVM_difference(SVM_result_dict, T5_result_dict, BERT_result_dict):
    """Collect the sentences that the SVM got wrong but T5 and BERT got right.

    Parameters
    ----------
    SVM_result_dict : dict
        Provides 'predictions' and 'ground_truth', indexed by original row.
    T5_result_dict : dict
        Provides 'row_numbers', 'decoded_sentences' and 'predictions'.
    BERT_result_dict : dict
        Provides 'row_numbers' and 'predictions'.

    Returns
    -------
    list
        The module-level ``SVM_only_wrong`` list after the matching sentences
        have been appended to it. The full list is also written to
        'errors_ONLY_SVM.txt', each sentence followed by a blank line.
    """
    print(T5_result_dict.keys())
    print(len(T5_result_dict['row_numbers']))

    print(SVM_result_dict.keys())

    # The three result sets are NOT stored in the same order: position i of the
    # SVM lists corresponds to the T5/BERT entry whose 'row_numbers' value is i.
    # Build row -> position maps once instead of rescanning both lists for every
    # row. A later duplicate row number overwrites an earlier one, which matches
    # the "last match wins" outcome of a full scan.
    t5_position = {int(row): pos for pos, row in enumerate(T5_result_dict['row_numbers'])}
    bert_position = {int(row): pos for pos, row in enumerate(BERT_result_dict['row_numbers'])}

    #list of sentences that SVM got wrong, T5 AND BERT got right
    for i in tqdm(range(len(SVM_result_dict['ground_truth']))):
        SVM_result = SVM_result_dict['predictions'][i]
        ground_truth = SVM_result_dict['ground_truth'][i]

        # Defaults used when a row is missing from the T5/BERT results.
        T5_result = 0
        BERT_result = 0
        sentence = ''

        j = t5_position.get(i)
        if j is not None:
            sentence = T5_result_dict['decoded_sentences'][j]
            T5_result = T5_result_dict['predictions'][j]
        k = bert_position.get(i)
        if k is not None:
            BERT_result = BERT_result_dict['predictions'][k]

        #If SVM got wrong
        if not SVM_result == ground_truth:
            #If T5 and BERT got right
            if T5_result == BERT_result and T5_result == ground_truth:
                SVM_only_wrong.append(sentence)

    # Written as UTF-8 because the notebook reads this file back with
    # encoding='utf-8'.
    with open('errors_ONLY_SVM.txt', 'w', encoding='utf-8') as f:
        for line in SVM_only_wrong:
            f.write(f"{line}\n\n")
    return SVM_only_wrong
        
# Run the comparison; as the cell's last expression, the returned list is also echoed as output.
SVM_difference(SVM_result_dict, T5_result_dict, BERT_result_dict)

In [None]:
#HARDEST SENTENCES
#Get the sentences that were misclassified by SVM AND BERT AND T5
# Module-level accumulators filled by hard_sentences() and read by later cells.
SVM_correct = []  # sentences the SVM classified correctly
SVM_wrong = []  # {'sentence', 'ground_truth'} dicts for sentences the SVM misclassified
correct = []  # sentences classified correctly by SVM, T5 and BERT
all_wrong = []  # {'sentence', 'ground_truth'} dicts that hard_sentences() records as wrong for all models
from tqdm import tqdm
def hard_sentences(SVM_predictions, T5_result_dict, BERT_result_dict):
    """Find the sentences that SVM, T5 and BERT all misclassified.

    Parameters
    ----------
    SVM_predictions : dict
        SVM results with the keys 'predictions' and 'ground_truth', indexed
        by original row.
    T5_result_dict : dict
        Provides 'row_numbers', 'decoded_sentences' and 'predictions'.
    BERT_result_dict : dict
        Provides 'row_numbers' and 'predictions'.

    Returns
    -------
    list
        Sentences misclassified by all three models; also written to
        'errors_all.txt', each followed by a blank line.

    Side effects
    ------------
    Appends to the module-level lists ``all_wrong``, ``correct``,
    ``SVM_correct`` and ``SVM_wrong``.
    """
    print(T5_result_dict.keys())
    print(len(T5_result_dict['row_numbers']))

    # Use the argument (the previous version read the global SVM_result_dict).
    print(SVM_predictions.keys())

    # Position i of the SVM lists corresponds to the T5/BERT entry whose
    # 'row_numbers' value is i. Build row -> position maps once; a later
    # duplicate overwrites an earlier one (same as "last match wins").
    t5_position = {int(row): pos for pos, row in enumerate(T5_result_dict['row_numbers'])}
    bert_position = {int(row): pos for pos, row in enumerate(BERT_result_dict['row_numbers'])}

    #list of sentences that SVM AND T5 AND BERT got wrong
    wrong = []
    for i in tqdm(range(len(SVM_predictions['ground_truth']))):
        SVM_result = SVM_predictions['predictions'][i]
        ground_truth = SVM_predictions['ground_truth'][i]

        # Defaults used when a row is missing from the T5/BERT results.
        T5_result = 0
        BERT_result = 0
        sentence = ''

        j = t5_position.get(i)
        if j is not None:
            sentence = T5_result_dict['decoded_sentences'][j]
            T5_result = T5_result_dict['predictions'][j]
        k = bert_position.get(i)
        if k is not None:
            BERT_result = BERT_result_dict['predictions'][k]

        svm_is_right = bool(SVM_result == ground_truth)
        t5_is_right = bool(T5_result == ground_truth)
        bert_is_right = bool(BERT_result == ground_truth)

        # All three wrong. (Requiring T5 != BERT here, as before, would on
        # binary labels select rows where BERT was actually right.)
        if not (svm_is_right or t5_is_right or bert_is_right):
            wrong.append(sentence)
            all_wrong.append({'sentence':sentence, 'ground_truth':ground_truth})
        if svm_is_right and t5_is_right and bert_is_right:
            correct.append(sentence)
        if svm_is_right:
            SVM_correct.append(sentence)
        else:
            SVM_wrong.append({'sentence':sentence, 'ground_truth':ground_truth})

    # Written as UTF-8 because the notebook reads this file back with
    # encoding='utf-8'.
    with open('errors_all.txt', 'w', encoding='utf-8') as f:
        for line in wrong:
            f.write(f"{line}\n\n")
    return wrong
        
# Fill the module-level accumulators; as the cell's last expression, the returned list is also echoed as output.
hard_sentences(SVM_result_dict, T5_result_dict, BERT_result_dict)

In [None]:
#Get SVM misclass nature
# Print every SVM-misclassified sentence whose first two space-separated
# tokens are "Past" and "Medical/Surgical", together with its true label.
for i in SVM_wrong:
    sentence = i['sentence']
    # Slicing (instead of indexing [1]) avoids an IndexError on one-word sentences.
    if sentence.split(' ')[:2] == ['Past', 'Medical/Surgical']:
        print(sentence)
        print(i['ground_truth'])

# all_wrong is meant to hold sentences that every model misclassified, so the
# models' shared prediction is the opposite of the ground truth.
all_false_positives = 0
all_false_negatives = 0
for i in all_wrong:
    #correct answer was 0 (noADE), all got 1 -> false positive
    if(i['ground_truth'] == 0):
        all_false_positives +=1
    #correct answer was 1, all got 0 -> false negative
    if(i['ground_truth'] == 1):
        all_false_negatives +=1
print('all false positives:', all_false_positives)
print('all false negatives:', all_false_negatives)
print('number of sentences incorrectly classified by all models:',len(all_wrong))

In [None]:
#Goal: determine how BERT/T5 improves on SVM. Show diff. between SVM correct/incorrect
#get avg length of correct / incorrect sentence 
#get avg word length of correct / incorrect sentence 

# SVM_wrong holds {'sentence', 'ground_truth'} dicts, so the length has to be
# taken from the sentence text (len() of the dict itself is always 2).
SVM_wrong_count = 0
for i in SVM_wrong:
    SVM_wrong_count += len(i['sentence'])
print('avg sentence length (in chars) of incorrectly classified SVM sentence')
# NaN instead of ZeroDivisionError when the list is empty.
print(SVM_wrong_count / len(SVM_wrong) if SVM_wrong else float('nan'))


# SVM_correct holds the plain sentence strings.
SVM_correct_count = 0
for i in SVM_correct:
    SVM_correct_count += len(i)
print('avg sentence length (in chars) of correctly classified SVM sentence')
print(SVM_correct_count / len(SVM_correct) if SVM_correct else float('nan'))

print("BERT")
print(error_by_length_char(BERT_result_dict))

print("T5")
print(error_by_length_char(T5_result_dict))


In [None]:
def _mean(values):
    """Arithmetic mean of ``values``; NaN when there are none."""
    values = list(values)
    return sum(values) / len(values) if values else float('nan')


def _mean_char_length(sentences):
    """Mean number of characters per sentence."""
    return _mean(len(sentence) for sentence in sentences)


def _mean_word_length(sentences):
    """Mean over sentences of each sentence's average word length.

    Sentences without any whitespace-separated word are skipped.
    """
    per_sentence = []
    for sentence in sentences:
        word_list = sentence.split()
        if word_list:
            per_sentence.append(sum(map(len, word_list)) / len(word_list))
    return _mean(per_sentence)


# The statistics are computed from the in-memory lists rather than from
# 'errors_all.txt' / 'errors_ONLY_SVM.txt': those files contain a blank
# separator line after every sentence and a trailing newline on every line,
# which inflated both the line count and the measured lengths.
all_wrong_sentences = [entry['sentence'] for entry in all_wrong]
print("average length of incorrectly classified sentneces by all models")
print(_mean_char_length(all_wrong_sentences))
print('number of incorrectly classified sentences by all models')
print(len(all_wrong_sentences))

print('\naverage length of correctly classified sentences by all models')
print(_mean_char_length(correct))
print('number of correctly classified sentences by all models')
print(len(correct))

# A true mean replaces the former "add, then halve" running update, which
# weighted the last sentences far more heavily than the first ones.
avg_word_length_SVM_correct = _mean_word_length(SVM_correct)
print('\navg word length in correctly classified SVM sentences')
print(avg_word_length_SVM_correct)
print('number of correctly classified SVM sentences:')
print(len(SVM_correct))

# Computed over SVM_only_wrong, i.e. sentences that ONLY the SVM got wrong.
avg_word_length_SVM_incorrect = _mean_word_length(SVM_only_wrong)
print('\navg word length in incorrectly classified SVM sentences')
print(avg_word_length_SVM_incorrect)
print('number of sentences only incorrectly classified by SVM')
print(len(SVM_only_wrong))

print("\naverage character length of incorrectly classified SVM sentences")
print(_mean_char_length(SVM_only_wrong))

avg_length_SVM_correct = _mean_char_length(SVM_correct)
print("average character length of correctly classified sentence by SVM")
print(avg_length_SVM_correct)


In [None]:

import pandas as pd
import hashlib

def get_uncommon_errors_between_models(model_outputs: list[dict]):
    """Outer-merge the misclassified sentences of all given models.

    Each entry of ``model_outputs`` is passed to ``get_correct_incorrect``;
    the resulting 'incorrect' columns are merged on the sentence text with
    ``how='outer'``, so the result contains every sentence that at least one
    model got wrong. The merged frame is displayed and returned.
    """
    combined = pd.DataFrame({'incorrect': []})
    for position, results_dict in enumerate(model_outputs):
        model_errors = get_correct_incorrect(results_dict)['incorrect']

        current = pd.DataFrame()
        current['incorrect'] = model_errors['incorrect']

        if position:
            combined = pd.merge(combined, current, on='incorrect', how='outer')
        else:
            # The first model seeds the result.
            combined = current

    display(combined)
    return combined

import pandas as pd

def get_most_common_errors_between_models(model_outputs: list[dict]):
    """Inner-merge the misclassified sentences of all given models.

    Each entry of ``model_outputs`` is passed to ``get_correct_incorrect``;
    the resulting 'incorrect' columns are merged on the sentence text with
    ``how='inner'``, so only sentences that every model got wrong remain.
    For the model at position n a file 'results1_n.xlsx' is written as well.
    The merged frame is displayed and returned.
    """
    shared = pd.DataFrame({'incorrect': []})
    for position, results_dict in enumerate(model_outputs):
        correctness_df = get_correct_incorrect(results_dict)

        # NOTE(review): this wraps the dict of two DataFrames into a one-row
        # frame before exporting it - confirm this is the intended Excel layout.
        export_frame = pd.DataFrame.from_dict([correctness_df])
        export_frame.to_excel(f'results1_{position}.xlsx', index=False)

        current = pd.DataFrame()
        current['incorrect'] = correctness_df['incorrect']['incorrect']

        if position:
            shared = pd.merge(shared, current, on='incorrect', how='inner')
        else:
            # The first model seeds the result.
            shared = current

    display(shared)
    return shared

# NOTE(review): this rebinds T5_result_dict / BERT_result_dict to different pickle
# files than the '..._TEST_SET_v1.pkl' ones loaded at the top of the notebook, so
# every cell executed after this point works on these results instead.
T5_result_dict = import_data('PT_T5_Classifier_results_dict.pkl')
BERT_result_dict = import_data('PT_BERT_Classifier_results_dict.pkl')

# Inner merge: sentences both models got wrong. Outer merge: sentences at least one model got wrong.
common_incorrect_df = get_most_common_errors_between_models([T5_result_dict, BERT_result_dict])
uncommon_incorrect_df = get_uncommon_errors_between_models([T5_result_dict, BERT_result_dict])

print(len(common_incorrect_df))
print(len(uncommon_incorrect_df))

#35export_data(incorrect_df, 'common_errors_BERT_T5')


length = 0
count = 0
"""
for i in common_incorrect_df['incorrect']:
    print(i)
    length += len(i)
    #print(length)
    print('\n')
    count += 1
    
print("avg length: ", length/count)
"""

In [None]:
# Rows whose sentence text already appeared in an earlier row of the merged frame.
repeated_mask = uncommon_incorrect_df.duplicated('incorrect')
duplicate_strings = uncommon_incorrect_df[repeated_mask]

# Show the repeated rows.
print("Rows with duplicate strings:")
print(duplicate_strings)


In [None]:
# Show the misclassified sentences of each model for visual comparison.
display(get_correct_incorrect(T5_result_dict)['incorrect'])

display(get_correct_incorrect(BERT_result_dict)['incorrect'])

In [None]:
# Raise the column width limit so long sentences are printed in full.
pd.options.display.max_colwidth = 1000
# Print one misclassified row per model; the positions 4 and 1 are hard-coded.
print(get_correct_incorrect(T5_result_dict)['incorrect'].iloc[4])
print(get_correct_incorrect(BERT_result_dict)['incorrect'].iloc[1])

#print(get_correct_incorrect(BERT_result_dict)['incorrect',1])

The difference is due to tokenization: we encode each sentence, classify it, and then decode it again to perform the error analysis; during encoding, some characters appear to be tokenized differently.

In [None]:
#Ensemble approach: if 2/ 3 models agree, then do 2/3 models. Else, don't.
from tqdm import tqdm
def ensemble_approach(SVM_result_dict, T5_result_dict, BERT_result_dict, min_votes=2):
    """Combine SVM, T5 and BERT by voting on the positive class.

    Parameters
    ----------
    SVM_result_dict : dict
        Provides 'predictions' and 'ground_truth', indexed by original row.
    T5_result_dict, BERT_result_dict : dict
        Provide 'row_numbers' and 'predictions'.
    min_votes : int, default 2
        Number of models that must predict 1 for the ensemble to predict 1.
        The default 2 is a majority vote (2 out of 3), as the notebook
        describes; ``min_votes=1`` reproduces the former behaviour where a
        single positive prediction was enough.

    Returns
    -------
    list of int
        One 0/1 prediction per SVM row, in SVM row order.
    """
    print(SVM_result_dict.keys())
    print(T5_result_dict.keys())
    print(BERT_result_dict.keys())

    print(len(SVM_result_dict['predictions']))
    print(len(T5_result_dict['predictions']))
    print(len(BERT_result_dict['predictions']))

    # Position i of the SVM lists corresponds to the T5/BERT entry whose
    # 'row_numbers' value is i. Build row -> position maps once; a later
    # duplicate overwrites an earlier one (same as "last match wins").
    t5_position = {int(row): pos for pos, row in enumerate(T5_result_dict['row_numbers'])}
    bert_position = {int(row): pos for pos, row in enumerate(BERT_result_dict['row_numbers'])}

    majority_agreement_predictions = []
    for i in tqdm(range(len(SVM_result_dict['ground_truth']))):
        SVM_result = SVM_result_dict['predictions'][i]

        # A row missing from the T5/BERT results counts as a 0 prediction.
        T5_result = 0
        BERT_result = 0
        j = t5_position.get(i)
        if j is not None:
            T5_result = T5_result_dict['predictions'][j]
        k = bert_position.get(i)
        if k is not None:
            BERT_result = BERT_result_dict['predictions'][k]

        positive_votes = sum(
            1 for result in (SVM_result, T5_result, BERT_result) if result == 1
        )
        majority_agreement_predictions.append(1 if positive_votes >= min_votes else 0)

    return majority_agreement_predictions

# Keep the ensemble predictions in a global so the next cell can score them.
majority_agreement_predictions = ensemble_approach(SVM_result_dict, T5_result_dict, BERT_result_dict)


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Score the ensemble predictions against the ground-truth labels stored with the SVM results (3 decimal places).
classification_report_result = classification_report(SVM_result_dict['ground_truth'], majority_agreement_predictions, digits=3)
print(classification_report_result)