In [1]:
import pandas as pd
import re

In [10]:
# Load enriched_BST_data_v2.csv into the notebook-wide DataFrame `df`.
# The path is relative to the kernel's working directory.
df = pd.read_csv('enriched_BST_data_v2.csv')

In [None]:
# Preview the first rows of the loaded frame. `head` must be called: the bare
# attribute only displays the bound-method repr, not the data.
df.head()

## Split different parts of the report into separate rows

In [8]:
def split_reports(df, text_data):
    """Split each report into its numbered parts, one output row per part.

    The text in column ``text_data`` is split wherever whitespace is followed
    by an integer, a period and more whitespace (e.g. ``" 2. "``). Every
    non-blank piece becomes its own row, and the values of all other columns
    are copied from the source row.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    text_data : str
        Name of the column holding the report text.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index whose first column is
        ``text_data``, followed by the remaining columns in their original
        order. Rows whose text is missing (not a ``str``) or contains only
        whitespace produce no output rows. If nothing survives, an empty
        frame with those same columns is returned.
    """
    other_cols = [col for col in df.columns if col != text_data]
    frames = []

    for _, row in df.iterrows():
        text = row[text_data]
        # NaN/None cannot be split; treat it like blank text (no rows emitted).
        if not isinstance(text, str):
            continue

        reports = [part for part in re.split(r"\s+\d+\.\s+", text) if part.strip()]
        if not reports:
            continue

        reports_df = pd.DataFrame({text_data: reports})
        for col in other_cols:
            # A scalar assignment broadcasts the source value to every part.
            reports_df[col] = row[col]
        frames.append(reports_df)

    if not frames:
        return pd.DataFrame(columns=[text_data, *other_cols])

    # Concatenate once at the end: DataFrame.append was removed in pandas 2.0
    # and re-copied the accumulated frame on every iteration.
    return pd.concat(frames, ignore_index=True)


In [None]:
# Replace `df` with the split version: one row per numbered report part.
# NOTE: this rebinds `df`, so the frame as read from the CSV is no longer
# reachable under that name after this cell runs.
df = split_reports(df, 'text_data')

In [10]:
# Keep one row per distinct `text_data` value (pandas keeps the first
# occurrence by default). The index is not reset here.
df = df.drop_duplicates(subset='text_data')

## Determine Method (Touch Prep vs FNA)

In [11]:
def determine_method(row, text_column):
    """Classify a row as "Touch Prep" or "FNA" from a free-text column.

    Parameters
    ----------
    row : mapping or pandas.Series
        Row to inspect; must support ``row[text_column]``.
    text_column : str
        Name of the column holding the description text.

    Returns
    -------
    str or None
        ``"Touch Prep"`` if the text contains "touch prep"; otherwise
        ``"FNA"`` if it contains "aspirate" or "aspiration"; otherwise
        ``None``. Matching is case-insensitive. Missing values (anything
        that is not a ``str``, e.g. NaN) yield ``None`` instead of raising.
    """
    description = row[text_column]
    # re.search raises TypeError on NaN/None, which pandas uses for blanks.
    if not isinstance(description, str):
        return None

    # "touch prep" is tested first, so it wins when both phrases appear.
    if re.search(r'touch prep', description, flags=re.IGNORECASE):
        return "Touch Prep"
    if re.search(r'aspirate|aspiration', description, flags=re.IGNORECASE):
        return "FNA"
    return None

In [12]:
# Derive the Method column row by row from the `part_description` text.
df['Method'] = df.apply(determine_method, axis=1, args=('part_description',))

## Extract Adequacy Statement and SIND

In [None]:
# def extract_adequacy_statement(row, text_data):
#     match = re.search(r'Statement of Adequacy:(.*?\.)', row[text_data], flags=re.IGNORECASE)
#     if match:
#         return ''.join(match.groups(default='')).strip()
#     else:
#         return None
    
# #def extract_sind_statement(row, text_data):
#     match = re.search(r'(Although .*?\.)', row[text_data], flags=re.IGNORECASE)
#     if match:
#         return ''.join(match.groups(default='')).strip()

In [13]:
def extract_adequacy_statement(row, text_data):
    """Pull the adequacy sentence out of a row's report text.

    Two case-insensitive patterns are tried in order and the first hit wins:

    1. a sentence starting with "Although " and running to the next period;
    2. the text following "Statement of Adequacy:" up to the next period.

    Parameters
    ----------
    row : mapping or pandas.Series
        Row to inspect; must support ``row[text_data]``.
    text_data : str
        Name of the column holding the report text.

    Returns
    -------
    str or None
        The captured text with surrounding whitespace stripped, or ``None``
        when neither pattern matches.
    """
    report_text = row[text_data]

    # Order matters: an "Although ..." sentence takes priority over the
    # labelled "Statement of Adequacy:" sentence.
    for pattern in (r'(Although .*?\.)', r'Statement of Adequacy:(.*?\.)'):
        found = re.search(pattern, report_text, flags=re.IGNORECASE)
        if found:
            return found.group(1).strip()

    return None

In [14]:
# Derive the AdequacyStatement column row by row from the `text_data` text.
df['AdequacyStatement'] = df.apply(extract_adequacy_statement, axis=1, args=('text_data',))


In [65]:
#new_df['AdequacyStatement'] = new_df.apply(lambda row: extract_adequacy_statement(row, 'text_data'), axis=1)

In [66]:
#new_df['AdequacyStatement'] = new_df.apply(lambda row: extract_sind_statement(row, 'text_data'), axis=1)

## Count Number of Passes

In [15]:
def count_attempts(row, text_column):
    """Count adequate and inadequate results in the immediate assessment text.

    Only the text that follows "Immediate adequacy assessment" is examined,
    and only up to the end of that line (the capture does not span newlines).
    All matching here is case-sensitive.

    Parameters
    ----------
    row : mapping or pandas.Series
        Row to inspect; must support ``row[text_column]``.
    text_column : str
        Name of the column holding the report text.

    Returns
    -------
    pandas.Series
        Three values, in order:

        * number of standalone "Adequate" words (the "Adequate" inside
          "Not Adequate" is not counted);
        * number of "Inadequate" plus "Not Adequate" occurrences;
        * ``True`` if "Inadequate" or "Not Adequate" appears after the first
          standalone "Adequate", else ``False``.

        ``[0, 0, False]`` is returned when the marker phrase is absent or the
        value is not a string.
    """
    report_text = row[text_column]
    match = (
        re.search(r'Immediate adequacy assessment(.*)', report_text)
        if isinstance(report_text, str)
        else None
    )
    if not match:
        return pd.Series([0, 0, False])

    text = match.group(1)

    # The lookbehind skips "Not Adequate", which would otherwise be counted as
    # both an adequate and an inadequate result.
    adequate_hits = list(re.finditer(r'(?<!Not )\bAdequate\b', text))
    adequate_count = len(adequate_hits)

    not_adequate_count = len(re.findall(r'\bNot Adequate\b', text))
    inadequate_count = len(re.findall(r'\bInadequate\b', text)) + not_adequate_count

    continue_after_adequate = False
    if adequate_hits:
        # Anchor on the first genuine "Adequate", not one inside "Not Adequate".
        tail = text[adequate_hits[0].start():]
        continue_after_adequate = 'Inadequate' in tail or 'Not Adequate' in tail

    return pd.Series([adequate_count, inadequate_count, continue_after_adequate])

In [16]:
# count_attempts returns a three-item Series per row; apply expands those into
# a three-column frame that is assigned to the new columns in order.
df[['AdequateCount', 'InadequateCount', 'ContinueAfterAdequate']] = df.apply(
    count_attempts, axis=1, args=('text_data',)
)

## Extract Final Diagnosis Statement

In [17]:
def find_diagnosis(text):
    """Return the "Positive/Negative for malignant cells" sentence in ``text``.

    Searches case-insensitively for either phrase followed directly by a
    period or comma, then captures everything up to the next period (or the
    end of the text, newlines included).

    NOTE(review): a later cell defines another ``find_diagnosis``; when the
    notebook is run top to bottom that definition replaces this one.

    Parameters
    ----------
    text : str
        Report text to search.

    Returns
    -------
    str or None
        The matched phrase, followed by the captured remainder and a closing
        period. The period or comma that directly followed the phrase is not
        included. ``None`` when neither phrase is found.
    """
    hit = re.search(
        r'(?i)(Positive for malignant cells|Negative for malignant cells)[\.,](.*?)(\.|$)',
        text,
        flags=re.DOTALL | re.IGNORECASE,
    )
    if hit is None:
        return None

    verdict, remainder = hit.group(1, 2)
    return ''.join((verdict, remainder, '.'))

In [13]:
# Ordered (regex, label) rules for find_diagnosis; the first rule that matches
# wins. Every pattern is matched case-insensitively.
_DIAGNOSIS_RULES = tuple(
    (re.compile(pattern, flags=re.IGNORECASE), label)
    for pattern, label in (
        (r'Positive for malignant cells', 'Positive for malignant cells'),
        (r'Negative for malignant cells', 'Negative for malignant cells'),
        # Without the lookbehind this rule also fires inside
        # "No malignant cells detected" and labels a negative report positive.
        (r'(?<!no )malignant cells detected', 'Positive for malignant cells'),
        (r'No malignant cells detected', 'Negative for malignant cells'),
        (r'No malignant cells identified', 'Negative for malignant cells'),
        (r'No malignant cells seen', 'Negative for malignant cells'),
        (r'neoplastic cells present', 'Positive for neoplastic cells'),
        (r'plasma cell neoplasm', 'Positive for neoplastic cells'),
        (r'suspect adenocarcinoma', 'Suspicious for carcinoma'),
        (r'suspect carcinoma', 'Suspicious for carcinoma'),
        (r'suspicious for carcinoma', 'Suspicious for carcinoma'),
        (r'suspect lymphoma', 'Suspicious for lymphoma'),
        (r'suspicious for lymphoma', 'Suspicious for lymphoma'),
        (r'rare spindle cells present', 'Suspicious for sarcoma'),
        (r'suspect sarcoma', 'Suspicious for sarcoma'),
        # Original (misspelled) key kept in case the source data contains it;
        # the correctly spelled phrase is added alongside.
        (r'suspicect chondrosarcoma', 'Suspicious for sarcoma'),
        (r'suspect chondrosarcoma', 'Suspicious for sarcoma'),
        (r'spindle cell proliferation', 'Suspicious for sarcoma'),
        (r'suspicious cells present', 'Suspicious cells present'),
        (r'atypical cells', 'Atypical'),
        # Escaped so it matches a literal "atypical." rather than "atypical"
        # followed by any character.
        (r'atypical\.', 'Atypical'),
        (r'atypical spindle cells', 'Atypical'),
        (r'atypical epithelial', 'Atypical'),
    )
)


def find_diagnosis(text):
    """Map free-text report wording to a normalized diagnosis label.

    The rules in ``_DIAGNOSIS_RULES`` are tried in order and the label of the
    first phrase found anywhere in ``text`` (case-insensitive) is returned.

    NOTE(review): apart from "malignant cells detected", the rules do not look
    for a preceding negation, so e.g. "no atypical cells" still yields
    "Atypical" — confirm whether that is acceptable for this dataset.

    Parameters
    ----------
    text : str
        Report text to classify. Missing values (anything that is not a
        ``str``, e.g. NaN) are accepted and yield ``None``.

    Returns
    -------
    str or None
        The normalized label, or ``None`` when no rule matches.
    """
    if not isinstance(text, str):
        return None

    for regex, label in _DIAGNOSIS_RULES:
        if regex.search(text):
            return label

    return None


In [14]:
# Label every report with find_diagnosis and store the result in `cyto_diagnosis`.
# NOTE(review): this reads the `cyto_report` column, whereas the other
# extraction cells read `text_data` — confirm that column exists in the data.
df['cyto_diagnosis'] = df['cyto_report'].apply(find_diagnosis)

In [15]:
# Write the current frame to enriched_BST_data_v4.csv (working directory),
# leaving the DataFrame index out of the file.
df.to_csv('enriched_BST_data_v4.csv', index=False)

## Extract Concurrent Case Number

In [19]:
# Capture the first case-number-like token that follows the word "concurrent"
# (case-insensitive) on the same line: an H or S, two digits, a hyphen, then
# one or more digits. Rows without such a token get NaN in `concurrent`.
concurrent_pattern = r'(?i)concurrent.*?([HS]\d{2}-\d+)'
#flow_pattern = r'([F]\d{2}-\d+)'
df['concurrent'] = df['text_data'].str.extract(concurrent_pattern)
#lym_df['FLOW'] = lym_df['text_data_final'].str.extract(flow_pattern)

In [20]:
# Write the work-in-progress frame to BST_WIP.csv (working directory),
# leaving the DataFrame index out of the file.
df.to_csv('BST_WIP.csv', index=False)