In [None]:
import os
from langchain_community.document_loaders import DirectoryLoader
from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
# Sentence-tokenizer data used by sent_tokenize. Recent NLTK releases (3.9 and
# later) load the 'punkt_tab' resource instead of the pickled 'punkt' models,
# so fetch both to keep the notebook runnable on either side of that change.
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
def create_documents_from_dataframe(df):
    """Build one ``Document`` per row of ``df``.

    The document text is the row's ``impression`` followed by its
    ``findings``, separated by a blank line. The row's ``study`` value is
    stored in the document metadata under the ``"study"`` key.

    Missing sections (NaN/None) are treated as empty rather than being
    rendered as the literal text "nan", and an empty section contributes
    neither text nor a separator.

    Args:
        df: DataFrame with ``study``, ``impression`` and ``findings`` columns.

    Returns:
        A list of ``Document`` objects, one per row, in row order.
    """
    documents = []
    for _, row in df.iterrows():
        # str(NaN) is "nan", which would otherwise be tokenized as real text,
        # so missing values are filtered out before stringifying.
        sections = [
            str(row[column])
            for column in ('impression', 'findings')
            if not pd.isna(row[column])
        ]
        note = '\n\n'.join(section for section in sections if section)
        documents.append(Document(page_content=note, metadata={"study": row['study']}))
    return documents

In [None]:
def split_documents_by_sentences(documents):
    """Tokenize every document's text into sentences.

    Args:
        documents: iterable of objects exposing a ``page_content`` string.

    Returns:
        A list parallel to ``documents``; each entry is the list of sentence
        strings that ``nltk.sent_tokenize`` produced for that document.
    """
    return [nltk.sent_tokenize(document.page_content) for document in documents]

In [None]:
def analyze_sentences(split_documents):
    """Summarize how many sentences each document contains.

    Args:
        split_documents: sequence of per-document sentence lists.

    Returns:
        A ``(mean, std, max, min)`` tuple of the per-document sentence counts.
        The standard deviation uses NumPy's default (population, ddof=0).
    """
    counts = np.asarray([len(sentences) for sentences in split_documents])
    return counts.mean(), counts.std(), counts.max(), counts.min()

In [None]:
def split_sentences_evenly(sentences, num_chunks):
    """Group ``sentences`` into ``num_chunks`` space-joined strings.

    When there are fewer sentences than chunks, every sentence goes into the
    first chunk and the remaining chunks are empty strings. Otherwise the
    sentences are dealt out in order into contiguous chunks whose sizes
    differ by at most one, with the larger chunks coming first.

    Args:
        sentences: list of sentence strings.
        num_chunks: number of chunk strings to return.

    Returns:
        A list of chunk strings.
    """
    total = len(sentences)

    # Too few sentences to spread out: keep them together and pad with blanks.
    if total < num_chunks:
        padding = [''] * (num_chunks - 1)
        return [' '.join(sentences)] + padding

    base, extra = divmod(total, num_chunks)

    pieces = []
    cursor = 0
    for position in range(num_chunks):
        # The first `extra` chunks absorb the leftover sentences, one each.
        width = base + 1 if position < extra else base
        pieces.append(' '.join(sentences[cursor:cursor + width]))
        cursor += width
    return pieces

def chunk_df(df, num_chunks=6):
    """Build a table of sentence chunks for every non-empty note in ``df``.

    Each note is the row's ``findings`` followed by its ``impression``
    (missing values treated as empty), separated by a blank line and stripped
    of surrounding whitespace. Rows whose combined note is empty are dropped.
    The remaining notes are sentence-tokenized and the sentences distributed
    over ``num_chunks`` chunks with ``split_sentences_evenly``.

    The input frame is not modified; the combined text is computed on a
    separate Series and the result is an independent copy.

    Args:
        df: DataFrame with ``study``, ``findings`` and ``impression`` columns.
        num_chunks: number of ``chunk_<n>`` columns to produce (default 6).

    Returns:
        A new DataFrame with columns ``study``, ``combined`` and
        ``chunk_1`` ... ``chunk_<num_chunks>``, keeping the original index of
        the retained rows.
    """
    # Combine the "findings" and "impression" columns without touching `df`.
    combined = (df['findings'].fillna('') + '\n\n' + df['impression'].fillna('')).str.strip()

    new_df = df[['study']].assign(combined=combined)
    # Explicit copy so the chunk columns are added to an owned frame, not a view.
    new_df = new_df.loc[new_df['combined'] != ''].copy()

    # One list of chunk strings per retained row, in row order.
    chunks_list = [
        split_sentences_evenly(sent_tokenize(str(text)), num_chunks)
        for text in new_df['combined']
    ]

    # Add new columns for each sentence chunk
    for i in range(num_chunks):
        new_df[f'chunk_{i + 1}'] = [chunks[i] if i < len(chunks) else '' for chunks in chunks_list]

    return new_df

First, we load the CSV containing the findings and impressions into a pandas DataFrame, which we then convert to documents and split into sentences. To build each note, we combine the findings and impression columns.

In [None]:
# Load the sectioned notes CSV into a DataFrame. The functions in this notebook
# read its 'study', 'impression' and 'findings' columns.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
sectioned_notes_path = "/opt/gpudata/mimic-cxr/mimic_cxr_sectioned.csv"
sectioned_notes_df = pd.read_csv(sectioned_notes_path)

In [None]:
# Preview the loaded notes table as plain text.
print(sectioned_notes_df)

In [None]:
# Wrap every row as a Document (note text plus the study id in metadata).
sectioned_notes_docs = create_documents_from_dataframe(sectioned_notes_df)

We can also compute summary statistics for the number of sentences per note: the mean, standard deviation, maximum, and minimum.

In [None]:
# Tokenize each note into sentences, then summarize the per-note sentence counts.
split_sectioned_notes = split_documents_by_sentences(sectioned_notes_docs)
# analyze_sentences returns (mean, std, max, min), unpacked in that order.
sectioned_avg_sentences, sectioned_std_sentences, sectioned_max_sentences, sectioned_min_sentences = analyze_sentences(split_sectioned_notes)
print("sectioned average: ", sectioned_avg_sentences, "\nsectioned stdv: ", sectioned_std_sentences, "\nsectioned max: ", sectioned_max_sentences, "\nsectioned min: ", sectioned_min_sentences)

In [None]:
# Build the sentence-chunk table: one row per non-empty note, with the note's
# sentences spread over the chunk_1 ... chunk_6 columns.
split_sentence_chunk_notes_df = chunk_df(sectioned_notes_df)

In [None]:
# Persist the sentence-chunk table as CSV (row index omitted).
# NOTE(review): absolute path inside a user's home directory — adjust when running elsewhere.
split_sentence_chunk_notes_path = "/home/imadejski/ctds-search-model/data/mimic/mimic_cxr_sentence_chunk_sectioned.csv"
split_sentence_chunk_notes_df.to_csv(split_sentence_chunk_notes_path, index=False)

In [None]:
# Preview the sentence-chunk table as plain text.
print(split_sentence_chunk_notes_df)

In [None]:
def _even_spans(total, parts):
    """Cut ``range(total)`` into ``parts`` contiguous ``(start, end)`` spans whose sizes differ by at most one."""
    base, extra = divmod(total, parts)
    spans = []
    start = 0
    for i in range(parts):
        # The first `extra` spans take one leftover item each.
        end = start + base + (1 if i < extra else 0)
        spans.append((start, end))
        start = end
    return spans


def split_into_chunks_with_overlap(text, num_chunks=6, overlap=5):
    """Split ``text`` into ``num_chunks`` overlapping chunk strings.

    If the text has more sentences than chunks, the sentences are divided
    into contiguous groups of near-equal size that together cover every
    sentence. Each chunk except the last also includes the first sentence of
    the next group, and each chunk except the first is prefixed with the last
    ``overlap`` words of the previous chunk.

    Otherwise (sentence count <= ``num_chunks``) the text is split on
    whitespace into near-equal word groups that together cover every word.
    Each group is extended backwards by up to ``overlap`` words. Groups that
    receive no words of their own (fewer words than chunks) are empty strings.

    Args:
        text: the note text to split.
        num_chunks: number of chunks to return; must be at least 1.
        overlap: number of words shared with the previous chunk; must be >= 0.

    Returns:
        A list of exactly ``num_chunks`` strings.

    Raises:
        ValueError: if ``num_chunks`` < 1 or ``overlap`` < 0.
    """
    if num_chunks < 1:
        raise ValueError("num_chunks must be at least 1")
    if overlap < 0:
        raise ValueError("overlap must be non-negative")

    sentences = nltk.sent_tokenize(text)
    total_sentences = len(sentences)
    chunks = []

    if total_sentences <= num_chunks:
        # Not enough sentences for sentence-level chunks: split on words instead.
        words = text.split()
        for start, end in _even_spans(len(words), num_chunks):
            if start == end:
                # No words of its own, so there is nothing to attach overlap to.
                chunks.append("")
            else:
                chunks.append(" ".join(words[max(0, start - overlap):end]))
        return chunks

    # Normal chunking based on sentences with word overlap
    last = num_chunks - 1
    for i, (start, end) in enumerate(_even_spans(total_sentences, num_chunks)):
        if i < last:
            end += 1  # Ensure overlap of one sentence with the next chunk
        chunk_text = " ".join(sentences[start:end])

        # Guard overlap == 0: a [-0:] slice would select the whole previous chunk.
        if i > 0 and overlap > 0:
            prev_chunk_words = chunks[-1].split()
            chunk_text = " ".join(prev_chunk_words[-overlap:]) + " " + chunk_text

        chunks.append(chunk_text)

    return chunks

def transform_dataframe(df, output_csv_path, num_chunks=6, overlap=5):
    """Chunk every note in ``df`` and save the result as CSV.

    For each row, the note is the ``impression`` followed by the
    ``findings``, separated by a blank line. Missing sections (NaN/None) are
    treated as empty rather than rendered as the literal text "nan". The note
    is split with ``split_into_chunks_with_overlap`` and each returned chunk
    is stored in its own ``chunk<n>`` column (1-based).

    Args:
        df: DataFrame with ``study``, ``impression`` and ``findings`` columns.
        output_csv_path: destination path for the CSV (written without index).
        num_chunks: number of chunks requested per note (default 6).
        overlap: word overlap passed to the splitter (default 5).

    Returns:
        The transformed DataFrame with columns ``study``, ``full_note`` and
        one ``chunk<n>`` column per chunk.
    """
    # Create an empty list to collect rows
    rows = []

    for _, row in df.iterrows():
        # Skip missing sections so str(NaN) == "nan" never enters the note text.
        sections = [
            str(row[column])
            for column in ('impression', 'findings')
            if not pd.isna(row[column])
        ]
        full_note = "\n\n".join(section for section in sections if section)
        chunks = split_into_chunks_with_overlap(full_note, num_chunks=num_chunks, overlap=overlap)

        # Create a dictionary for the row with dynamic chunk columns
        row_dict = {'study': row['study'], 'full_note': full_note}
        for i, chunk in enumerate(chunks, start=1):
            row_dict[f'chunk{i}'] = chunk

        rows.append(row_dict)

    # Convert the list of rows into a dataframe
    transformed_df = pd.DataFrame(rows)

    # Save the dataframe to a CSV file
    transformed_df.to_csv(output_csv_path, index=False)

    return transformed_df