## Importing Libraries

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Load Datasets

In [11]:
# Load the ACI-Bench challenge files: the train/validation tables and their metadata.
# The shared directory is named once so that pointing the notebook at a different
# folder means editing a single line instead of four hard-coded paths.
ACI_CHALLENGE_DIR = '../../clinical_visit_note_summarization_corpus/data/aci-bench/challenge_data'

train_df = pd.read_csv(f'{ACI_CHALLENGE_DIR}/train.csv')
train_metadata_df = pd.read_csv(f'{ACI_CHALLENGE_DIR}/train_metadata.csv')
valid_df = pd.read_csv(f'{ACI_CHALLENGE_DIR}/valid.csv')
valid_metadata_df = pd.read_csv(f'{ACI_CHALLENGE_DIR}/valid_metadata.csv')

## Basic Data Analysis

In [41]:
# Preview the top rows of the training table, then of its metadata table.
# sep="\n" puts each heading on its own line above the frame it describes.
print("Train Data:", train_df.head(), sep="\n")
print("\nTrain Metadata:", train_metadata_df.head(), sep="\n")


Train Data:
      dataset encounter_id                                           dialogue  \
0  virtassist       D2N001  [doctor] hi , martha . how are you ?\n[patient...   
1  virtassist       D2N002  [doctor] hi , andrew , how are you ?\n[patient...   
2  virtassist       D2N003  [doctor] hi , john . how are you ?\n[patient] ...   
3  virtassist       D2N004  [doctor] hi , james , how are you ?\n[patient]...   
4  virtassist       D2N005  [doctor] hey , ms. hill . nice to see you .\n[...   

                                                note  
0  CHIEF COMPLAINT\n\nAnnual exam.\n\nHISTORY OF ...  
1  CHIEF COMPLAINT\n\nJoint pain.\n\nHISTORY OF P...  
2  CHIEF COMPLAINT\n\nBack pain.\n\nHISTORY OF PR...  
3  CHIEF COMPLAINT\n\nBack pain.\n\nHISTORY OF PR...  
4  CC:\n\nRight middle finger pain.\n\nHPI:\n\nMs...  

Train Metadata:
      dataset encounter_id     id doctor_name patient_gender  patient_age  \
0  virtassist       D2N001  VA049         NaN         female         50.0   


In [42]:
# Data Overview
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line after
# each report.
print("Train Data Info:")
train_df.info()
print("\nTrain Metadata Info:")
train_metadata_df.info()


Train Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67 entries, 0 to 66
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   dataset       67 non-null     object
 1   encounter_id  67 non-null     object
 2   dialogue      67 non-null     object
 3   note          67 non-null     object
dtypes: object(4)
memory usage: 2.2+ KB
None

Train Metadata Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67 entries, 0 to 66
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   dataset             67 non-null     object 
 1   encounter_id        67 non-null     object 
 2   id                  67 non-null     object 
 3   doctor_name         7 non-null      object 
 4   patient_gender      65 non-null     object 
 5   patient_age         54 non-null     float64
 6   patient_firstname   59 non-null     object 
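 7   patient_familyname  47 non-null     object 
 8   cc                  67 non-null     object 
 9   2nd_complaints      43 non-null     object 
dtypes: float64(1), object(9)
memory usage: 5.4+ KB
None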

In [43]:
# Count the null entries per column, first for the training table and then
# for its metadata table.
for frame in (train_df, train_metadata_df):
    print(frame.isnull().sum())


dataset         0
encounter_id    0
dialogue        0
note            0
dtype: int64
dataset                0
encounter_id           0
id                     0
doctor_name           60
patient_gender         2
patient_age           13
patient_firstname      8
patient_familyname    20
cc                     0
2nd_complaints        24
dtype: int64


In [44]:
# Summary statistics for every column of the training table
train_summary = train_df.describe()
print(train_summary)


       dataset encounter_id  \
count       67           67   
unique       3           67   
top        aci       D2N001   
freq        35            1   

                                                 dialogue  \
count                                                  67   
unique                                                 67   
top     [doctor] hi , martha . how are you ?\n[patient...   
freq                                                    1   

                                                     note  
count                                                  67  
unique                                                 67  
top     CHIEF COMPLAINT\n\nAnnual exam.\n\nHISTORY OF ...  
freq                                                    1  


In [45]:
# Compare the (rows, columns) shape of the training and validation tables
print("Train vs Validation:")
for label, split_df in (("Train Shape:", train_df), ("Validation Shape:", valid_df)):
    print(label, split_df.shape)


Train vs Validation:
Train Shape: (67, 4)
Validation Shape: (20, 4)


In [46]:
# List the fields available in the training metadata table
metadata_columns = train_metadata_df.columns
print("Metadata columns:", metadata_columns)


Metadata columns: Index(['dataset', 'encounter_id', 'id', 'doctor_name', 'patient_gender',
       'patient_age', 'patient_firstname', 'patient_familyname', 'cc',
       '2nd_complaints'],
      dtype='object')


In [47]:
# Feature Engineering
# Bucket patient ages into coarse groups. The age field is `patient_age` in the
# metadata table; `train_df` has no `age` column, so checking only
# `train_df['age']` made this cell a silent no-op. The original check is kept so
# a frame that does carry `age` is still handled.
AGE_BINS = [0, 18, 35, 60, 100]
AGE_LABELS = ['Child', 'Young Adult', 'Adult', 'Senior']

for frame, age_col in ((train_df, 'age'), (train_metadata_df, 'patient_age')):
    if age_col in frame.columns:
        # pd.cut uses right-closed intervals by default: (0, 18], (18, 35], ...
        # Missing ages stay NaN in the new column.
        frame['age_group'] = pd.cut(frame[age_col], bins=AGE_BINS, labels=AGE_LABELS)


In [48]:
# Count fully duplicated rows in the training table
duplicate_row_count = train_df.duplicated().sum()
print("Duplicates in Train Data:", duplicate_row_count)

Duplicates in Train Data: 0


## Advanced Data Analysis

In [49]:
# Average length of dialogue vs note in train and val data
def add_length_columns(df):
    """Add whitespace-token counts for the `dialogue` and `note` columns.

    Writes `dialogue_length` and `note_length` onto `df` in place (re-running
    simply overwrites them) and returns the same frame for convenience.
    """
    for text_col in ('dialogue', 'note'):
        # .str.split() with no pattern splits on runs of whitespace, like str.split()
        df[f'{text_col}_length'] = df[text_col].str.split().str.len()
    return df


# One loop handles both splits, so the train and validation logic cannot drift apart.
for split_name, split_df in (("Train", train_df), ("Validation", valid_df)):
    add_length_columns(split_df)
    print(f"Average Dialogue Length in {split_name} Data:", split_df['dialogue_length'].mean())
    print(f"Average Note Length in {split_name} Data:", split_df['note_length'].mean())

Average Dialogue Length in Train Data: 1301.2238805970148
Average Note Length in Train Data: 420.8358208955224
Average Dialogue Length in Validation Data: 1221.45
Average Note Length in Validation Data: 430.85


In [50]:
# Longest dialogue and longest note in each split, read from the
# `dialogue_length` / `note_length` columns that must already exist on both frames.
max_length_report = (
    ("Max Dialogue Length in Train Data:", train_df, 'dialogue_length'),
    ("Max Note Length in Train Data:", train_df, 'note_length'),
    ("Max Dialogue Length in Validation Data:", valid_df, 'dialogue_length'),
    ("Max Note Length in Validation Data:", valid_df, 'note_length'),
)
for label, frame, length_col in max_length_report:
    print(label, frame[length_col].max())

Max Dialogue Length in Train Data: 3050
Max Note Length in Train Data: 884
Max Dialogue Length in Validation Data: 1789
Max Note Length in Validation Data: 829
