In [1]:
import re
import pandas as pd

def clean_clinical_note(text):
    """
    Normalize the whitespace of a single clinical note.

    Runs of horizontal whitespace (spaces, tabs, carriage returns, ...) are
    collapsed to a single space, runs of line breaks (including blank or
    whitespace-only lines) are collapsed to a single newline, and leading /
    trailing whitespace is removed. Single line breaks are preserved so the
    line structure of the note survives cleaning.

    Parameters:
    text (str): The input clinical note text

    Returns:
    str: Cleaned text with normalized spacing

    Raises:
    TypeError: If ``text`` is not a string
    """
    if not isinstance(text, str):
        raise TypeError("Input must be a string")

    # Collapse whitespace *other than* newlines. A plain r'\s+' here would
    # also swallow every newline, flattening the note to one line and making
    # the newline handling below unreachable.
    text = re.sub(r'[^\S\n]+', ' ', text)

    # Collapse each run of newlines (together with the single spaces left
    # around or between them) into one newline; this also drops blank lines.
    text = re.sub(r' ?\n[ \n]*', '\n', text)

    # Strip leading/trailing whitespace
    return text.strip()

def clean_notes_column(df, column_name='note_text'):
    """
    Clean an entire column of clinical notes using pandas vectorized operations.

    For every non-null value in the column, runs of horizontal whitespace are
    collapsed to a single space, runs of line breaks (including blank or
    whitespace-only lines) are collapsed to a single newline, and leading /
    trailing whitespace is stripped. Null entries are left as null.

    Parameters:
    df (pandas.DataFrame): DataFrame containing the notes
    column_name (str): Name of the column containing notes, defaults to 'note_text'

    Returns:
    pandas.DataFrame: Copy of ``df`` with the notes column cleaned; the input
        DataFrame is not modified

    Raises:
    ValueError: If ``column_name`` is not a column of ``df``
    """
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in DataFrame")

    # Create a copy to avoid modifying the original DataFrame
    df_cleaned = df.copy()

    df_cleaned[column_name] = (
        df_cleaned[column_name]
        # Collapse whitespace other than newlines. A plain r'\s+' would also
        # consume the newlines and make the next step a no-op.
        .str.replace(r'[^\S\n]+', ' ', regex=True)
        # Collapse newline runs (and the spaces around them) to one newline.
        .str.replace(r' ?\n[ \n]*', '\n', regex=True)
        # Remove leading/trailing whitespace
        .str.strip()
    )

    return df_cleaned

def aggregate_notes(df, group_by=('person_id',), column_name='note_text'):
    """
    Combine all notes of each group into a single newline-joined text.

    Rows are grouped by ``group_by`` (by default only ``person_id``) and the
    non-null values of ``column_name`` within each group are joined with
    ``'\\n'`` in their original row order. A group whose notes are all null
    yields an empty string.

    Parameters:
    df (pandas.DataFrame): DataFrame containing one note per row
    group_by (str or sequence of str): Column(s) to group on, defaults to
        ('person_id',)
    column_name (str): Name of the column containing notes, defaults to 'note_text'

    Returns:
    pandas.DataFrame: One row per group, with the grouping column(s) followed
        by ``column_name`` holding the joined text

    Raises:
    KeyError: If a grouping column or ``column_name`` is missing from ``df``
    """
    keys = [group_by] if isinstance(group_by, str) else list(group_by)

    def _join_notes(notes):
        # dropna() skips missing notes; str() keeps the join from raising
        # TypeError when a cell holds a non-string value (e.g. a number).
        return '\n'.join(str(note) for note in notes.dropna())

    return df.groupby(keys)[column_name].agg(_join_notes).reset_index()

In [4]:
def main(input_path="../data/suicide_attempt/sa_chart_20241127.csv",
         output_path="../data/suicide_attempt/sa_chart_grouped_20241127.csv"):
    """
    Read the chart CSV, merge notes per person, clean them, and save the result.

    The notes are aggregated with ``aggregate_notes`` and then whitespace-
    normalized with ``clean_notes_column`` before being written out without
    the DataFrame index.

    Parameters:
    input_path (str): CSV file to read; must contain the columns used by
        ``aggregate_notes`` (``person_id`` and ``note_text``)
    output_path (str): Destination CSV file; overwritten if it exists
    """
    df = pd.read_csv(input_path)

    # Earlier variant, kept for reference: read a different chart export and
    # exclude persons already present in the cleaned chart file.
    # df = pd.read_csv("../data/suicide_attempt/FDA_Chart_20241115.csv")
    # chart = pd.read_csv("../data/suicide_attempt/FDA_Chart_Cleaned.csv")
    # df_filtered = df[~df['person_id'].isin(chart['person_id'])]

    df_aggregated = aggregate_notes(df)
    df_cleaned = clean_notes_column(df_aggregated)
    df_cleaned.to_csv(output_path, index=False)


if __name__ == "__main__":
    main()

In [2]:
# Load the raw "strong" and "weak" evidence texts.
with open('../data/suicide_attempt/strong_evidence_long.txt', 'r') as file:
    strong_evidence = file.read()

with open('../data/suicide_attempt/weak_evidence_long.txt', 'r') as file:
    weak_evidence = file.read()

# Normalize whitespace; clean_clinical_note is defined in an earlier cell.
strong_evidence_cleaned = clean_clinical_note(strong_evidence)
weak_evidence_cleaned = clean_clinical_note(weak_evidence)

# Write with mode 'w' (truncate) rather than 'a' (append): appending would add
# another copy of the cleaned text to the file every time this cell is re-run.
with open('../data/suicide_attempt/strong_evidence_long_cleaned.txt', 'w') as file:
    file.write(strong_evidence_cleaned)

with open('../data/suicide_attempt/weak_evidence_long_cleaned.txt', 'w') as file:
    file.write(weak_evidence_cleaned)