In [None]:
# List the PDF reports stored in raw_pdf/
# Convert each PDF to a text file in raw_text/
# Strip the conclusion from each text and save the result in processed/
# Create a CSV gathering the filename, path, text and conclusion of every report
# Annotate each report with the pigeon widget to add its diagnosis label

In [None]:
# Replace spaces in the PDF filenames with underscores
import os

folder_path = "../data/raw_pdf"

for filename in os.listdir(folder_path):
    if " " not in filename:
        continue  # nothing to rename
    new_filename = filename.replace(" ", "_")
    source_path = os.path.join(folder_path, filename)
    target_path = os.path.join(folder_path, new_filename)
    # os.rename silently replaces an existing file on POSIX (and raises on
    # Windows), so never rename onto a name that is already taken.
    if os.path.exists(target_path):
        print(f"Skipping {filename!r}: {new_filename!r} already exists")
        continue
    os.rename(source_path, target_path)


In [None]:
# List all PDF files
# import glob
# import sys
# sys.path.append("../")
# from src import TextReport
# pdf_files = glob.glob('../data/raw_pdf/*.pdf')

# for pdf_file in pdf_files:
#     with open(pdf_file, 'rb') as f:
#         # convert pdf to text
#         pdf_object = TextReport(f, lang="fra")
#         pdf_text = pdf_object.pdf_to_text()
#         pdf_text = pdf_text.replace("\n"," ")
#         # dump the text to a file
#         new_file_name = pdf_file.replace('raw_pdf', 'raw_text')
#         new_file_name = new_file_name.replace('.pdf', '.txt')
#         with open(new_file_name, 'w') as final_f:
#             final_f.write(pdf_text)

In [None]:
import glob
import sys
sys.path.append("../")
from src import TextReport
def split_conclusion(text, marker="conclusion"):
    """Split a report into its body and its conclusion.

    The marker is searched case-insensitively and the split happens at its
    first occurrence; the marker itself belongs to the conclusion part.

    Args:
        text: Full text of the report.
        marker: Word that introduces the conclusion section.

    Returns:
        A ``(body, conclusion)`` tuple. When the marker is absent, ``body`` is
        the whole text and ``conclusion`` is an empty string.
    """
    position = text.lower().find(marker.lower())
    if position == -1:
        # No conclusion found: slicing with -1 would return the last character
        # of the report, so return an explicit empty conclusion instead.
        return text, ""
    return text[:position], text[position:]


text_files = glob.glob('../data/raw_text/*.txt')

all_path = text_files
all_file_names = [text_file.split('/')[-1] for text_file in text_files]
all_text = []
all_conclusion = []

for text_file in text_files:
    # Load the raw text of the report
    with open(text_file, "r") as file:
        text = file.read()

    # Separate the report body from everything starting at "conclusion"
    text_no_conclu, conclusion = split_conclusion(text)
    all_text.append(text_no_conclu)
    all_conclusion.append(conclusion)

    # Save the text WITHOUT its conclusion in processed/, through a single
    # file handle that the context manager closes.
    new_file_name = text_file.replace('raw_text', 'processed')
    with open(new_file_name, "w") as file:
        file.write(text_no_conclu)

In [None]:
# A conclusion of at most 10 characters is treated as a failed detection.
missing_conclusions = [
    position
    for position, conclusion in enumerate(all_conclusion)
    if len(conclusion) <= 10
]
for index in missing_conclusions:
    failed_name = all_file_names[index]
    print(f"Failed detecting conclusions in report index {index} with filename {failed_name} ")
print("You should try to check it by hand !")

In [None]:
def find_duplicates(lst):
    """
    Group the positions of equal items in ``lst`` and keep the repeated ones.

    Returns a list of ``(item, indices)`` tuples, one per item that occurs more
    than once, in order of first occurrence. ``indices`` holds every position
    of that item in ``lst``, in ascending order. Items must be hashable.
    """
    positions_by_item = {}
    for position, element in enumerate(lst):
        positions_by_item.setdefault(element, []).append(position)

    return [
        (element, positions)
        for element, positions in positions_by_item.items()
        if len(positions) > 1
    ]

duplicates = find_duplicates(all_text)
print("Duplicated reports: ")
for item, indices in duplicates:
    # Name every file sharing this text, not only the first two of them.
    repeated_files = " and ".join(all_file_names[index] for index in indices)
    print(f"Item is repeated at indices {repeated_files}")

In [None]:
# Load previous annotations and resume
import pigeon as pg
# `display` lives in the public IPython.display module; importing it from
# IPython.core.display is deprecated.
from IPython.display import display, HTML

import pandas as pd
import os
import numpy as np

# Path of the CSV holding the dataset together with its annotations.
processed_df = "../data/text_dataset.csv"
cols = ['filename', 'diag', 'path', 'text', "conclusion"]

if os.path.isfile(processed_df):
    # A previous session already created the CSV: reload it to resume.
    df = pd.read_csv(processed_df)
else:
    # First run: build the dataset from the lists collected earlier, with an
    # empty "diag" column to be filled by annotation, and persist it.
    data = {"filename": all_file_names,
            "diag": [np.nan] * len(all_file_names),
            "path": all_path,
            "text": all_text,
            "conclusion": all_conclusion}
    df = pd.DataFrame(data, columns=cols)
    df.to_csv(processed_df, index=False)

# Labels are strings, but a column holding only NaN is float64; storing a
# string in such a column relies on upcasting that recent pandas deprecates.
df["diag"] = df["diag"].astype(object)

# Explicit copies: the unlabelled rows are assigned into when annotations are
# saved, which would otherwise trigger SettingWithCopyWarning.
df_diag = df[df.diag.notna()].copy()
df_no_diag = df[df.diag.isna()].copy()


def annotate_text(data):
    """Build the HTML snippet shown to the annotator for one report.

    Args:
        data: Sequence whose first three entries are the conclusion, the
            filename and the full text of the report, in that order.

    Returns:
        An HTML string with the filename, the conclusion and the full text,
        each in its own paragraph.
    """
    from html import escape  # local import keeps the function self-contained

    # Escape every field so characters such as "<" or "&" found in a report
    # are shown literally instead of being parsed as markup. str() lets
    # non-string values (e.g. a missing conclusion stored as NaN) through.
    conclusion = escape(str(data[0]))
    filename = escape(str(data[1]))
    full_text = escape(str(data[2]))
    return (
        f"<p>Filename: {filename}</p>"
        f"<p>Conclusion: {conclusion}</p>"
        f"<br/><p> Full text: {full_text}</p>"
    )

# One (conclusion, filename, text) tuple per report still lacking a diagnosis.
merged_list = list(
    zip(
        df_no_diag["conclusion"].to_list(),
        df_no_diag["filename"].to_list(),
        df_no_diag["text"].to_list(),
    )
)


def _show_report(data):
    """Render one report tuple as HTML in the cell output."""
    display(HTML(annotate_text(data)))


# Launch the pigeon widget; reports are presented in dataframe order and
# cannot be skipped.
annotations = pg.annotate(
    merged_list,
    options=["Nemaline Myopathy","Core Myopathy","Centronuclear Myopathy","NON-CM-OTHER", "UNCLEAR"],
    shuffle=False,
    include_skip=False,
    display_fn=_show_report,
)

In [None]:
# Save Current Annotations
# Look the "diag" column up by name instead of assuming it sits at position 1.
diag_position = df_no_diag.columns.get_loc("diag")
# Work on an object-dtype copy: labels are strings, and writing them into a
# float64 (all-NaN) slice of `df` triggers SettingWithCopyWarning and relies
# on upcasting that recent pandas deprecates.
df_annotated = df_no_diag.astype({"diag": object})
for index, value in enumerate(annotations):
    # value[1] is the label chosen for the index-th example; this assumes the
    # annotations follow the row order of df_no_diag (no shuffling, no skips).
    df_annotated.iloc[index, diag_position] = value[1]
df_final = pd.concat([df_diag, df_annotated])
df_final.to_csv(processed_df, index=False)

In [None]:
# Reload the saved dataset and count the reports carrying each diagnosis label.
df = pd.read_csv(processed_df)
df.diag.value_counts()

In [None]:
# import deepl 
# import pandas as pd
# import deepl
# from dotenv import load_dotenv
# import os
# load_dotenv() 

# translator = deepl.Translator(os.getenv("DEEPL_KEY"))
# df_final["translated_text"] = df_final["text"].apply(lambda x: translator.translate_text(x, target_lang="EN-US").text)
# df_final.to_csv('../data/text_dataset_translate.csv', index=False)