In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS

In [3]:
df = pd.read_csv('/vol/miltank/users/doanb/ADLM_SS2025/ROCO-dataset/radiologytraindata.csv')
#df.columns # Index(['id', 'name', 'caption'], dtype='object')
df.head() 

Unnamed: 0,id,name,caption
0,ROCO_00002,PMC4083729_AMHSR-4-14-g002.jpg,Computed tomography scan in axial view showin...
1,ROCO_00003,PMC2837471_IJD2009-150251.001.jpg,Bacterial contamination occurred after comple...
2,ROCO_00004,PMC2505281_11999_2007_30_Fig6_HTML.jpg,The patient had residual paralysis of the han...
3,ROCO_00005,PMC3745845_IJD2013-683423.005.jpg,Panoramic radiograph after immediate loading.\n
4,ROCO_00007,PMC4917066_amjcaserep-17-301-g001.jpg,Plain abdomen x-ray: Multiple air levels at t...


# Abdominal

In [None]:
# Filter only entries with 'caption' containing 'abdomen' or 'abdominal'
df_abdominal = df[df['caption'].str.contains('abdomen|abdominal', case=False, na=False)]
print(f"Number of entries with 'abdomen' or 'abdominal': {len(df_abdominal)}")
df_abdominal.head()
# save the column 'name' to a new csv file 
#df_abdominal[['name']].to_csv('/vol/miltank/users/doanb/ADLM_SS2025/ROCO-dataset/radiology_abdominal_names.csv', index=False)


Number of entries with 'abdomen' or 'abdominal': 5089


Unnamed: 0,id,name,caption
4,ROCO_00007,PMC4917066_amjcaserep-17-301-g001.jpg,Plain abdomen x-ray: Multiple air levels at t...
23,ROCO_00030,PMC362879_1471-2466-4-1-1.jpg,Abdominal CT scan showing pancreatic pseudocy...
24,ROCO_00032,PMC4495506_IJU-31-259-g004.jpg,Axial contrast-enhanced computed tomography i...
41,ROCO_00050,PMC4239440_PAMJ-18-217-g001.jpg,Free air beneath the diaphragm at abdominal x...
70,ROCO_00088,PMC4641558_10.1177_2054270415611833-fig2.jpg,Sagittal CT angiogram of abdomen demonstratin...


In [11]:
not_include_terms = [
    "ct", "computed", "tomography","MRI", "MR", "magnetic", "resonance", "X-ray", "radiograph", "PET", 
    "scan", "scans", "image", "images", "figure", "figures", 
    "shows", "showing", "seen", "demonstrating", "presenting", "demonstrates",
]
default_english_stop_words = CountVectorizer(stop_words='english').get_stop_words()
custom_stop_words = list(set(default_english_stop_words).union(term.lower() for term in not_include_terms))

# Extract common bigrams and trigrams
vectorizer = CountVectorizer(ngram_range=(3, 10),  stop_words=custom_stop_words, min_df=5) 
X = vectorizer.fit_transform(df_abdominal['caption'])
# Get the feature names (bigrams and trigrams)
feature_names = vectorizer.get_feature_names_out()
# Convert to a DataFrame for better visualization
df_bigrams_trigrams = pd.DataFrame(X.toarray(), columns=feature_names)
# Sum the occurrences of each bigram and trigram
bigram_trigram_counts = df_bigrams_trigrams.sum().sort_values(ascending=False)

# save to an excel file
#output_file = '/vol/miltank/users/doanb/ADLM_SS2025/ngocs_use_cases/n_grams_abdominal.xlsx'
#bigram_trigram_counts.to_excel(output_file, index=True, header=['Count'])

# make bigram_trigram_counts to a DataFrame
df_bigrams_trigrams_counts = pd.DataFrame(bigram_trigram_counts).reset_index()
df_bigrams_trigrams_counts.columns = ['n_gram', 'count']



In [34]:
df_filtered = df_bigrams_trigrams_counts[df_bigrams_trigrams_counts['n_gram'].str.contains('abdominal wall', case=False, na=False)]
df_filtered

Unnamed: 0,n_gram,count
2,anterior abdominal wall,73
80,lateral abdominal wall,12
118,abdominal wall mass,10
137,left abdominal wall,9
186,right abdominal wall,8
190,abdominal wall arrow,8
192,abdominal wall muscles,8
202,abdominal wall defect,7
363,lower abdominal wall,6
547,left lateral abdominal wall,5


In [None]:
# Filter all captions containing the term and save to an excel file
df_term = df_abdominal[df_abdominal['caption'].str.contains("anterior abdominal wall", case=False, na=False) 
                       #& ( 
                            #df_abdominal['caption'].str.contains('hepatom', case=False, na=False) |
                            #df_abdominal['caption'].str.contains('shrink', case=False, na=False) |
                            #df_abdominal['caption'].str.contains('tumor', case=False, na=False) |
                            #df_abdominal['caption'].str.contains('lesion', case=False, na=False) |
                            #df_abdominal['caption'].str.contains('anterior abdominal wall', case=False, na=False) )
                        ]
df_term.to_excel('/vol/miltank/users/doanb/ADLM_SS2025/ngocs_use_cases/filtered_abdominal_term.xlsx', index=False)
# save the column 'name' to a new csv file 
df_term[['name']].to_csv('/vol/miltank/users/doanb/ADLM_SS2025/ngocs_use_cases/filtered_abdominal_term_names.csv', index=False)

In [55]:
# Get unique scanning modalities from the 'caption' column
unique_modalities = df_abdominal['caption'].str.extract(r'\b(CT|computed tomography|MRI|MR|T1-weighted|T2-weighted|fMRI|magnetic resonance|X-ray|radiograph|PET|positron emission tomography)\b', expand=False).dropna().unique()
print("Unique scanning modalities found in the captions:")
for modality in unique_modalities:
    #print(modality) with count of occurrences
    count = df_abdominal['caption'].str.contains(modality, case=False, na=False).sum()
    print(f"{modality}: {count} occurrences")

# how many entries are remain after filtering out all entries with the modalities listed above?
df_abdominal_filtered = df_abdominal[~df_abdominal['caption'].str.contains('|'.join(unique_modalities), case=False, na=False)]
print(f"Number of entries remaining after filtering out known modalities: {len(df_abdominal_filtered)}")



Unique scanning modalities found in the captions:
CT: 3035 occurrences
computed tomography: 1051 occurrences
radiograph: 298 occurrences
X-ray: 355 occurrences
MRI: 149 occurrences
PET: 39 occurrences
MR: 176 occurrences
T1-weighted: 15 occurrences
magnetic resonance: 97 occurrences
T2-weighted: 30 occurrences
positron emission tomography: 18 occurrences
Number of entries remaining after filtering out known modalities: 748


In [27]:
# Filter all captions containing mri related terms
df_abdominal_mri = df_abdominal[df_abdominal['caption'].str.contains('MRI|MR|T1-weighted|T2-weighted|fMRI|magnetic resonance', case=False, na=False)]
print(f"Number of MRI with abdominal {len(df_abdominal_mri)} -> too littel") 
# save to a new excel file
# df_abdominal_mri.to_excel('/vol/miltank/users/doanb/ADLM_SS2025/ngocs_use_cases/df_abdominal_mri.xlsx', index=False)

Number of MRI with abdominal 261 -> too littel


In [25]:
# Filter all captions containing ct related terms
df_abdominal_ct = df_abdominal[df_abdominal['caption'].str.contains('CT|computed tomography', case=False, na=False)]
print(f"Number of CT with abdominal {len(df_abdominal_ct)}")
# save to a new excel file
df_abdominal_ct.to_excel('/vol/miltank/users/doanb/ADLM_SS2025/ngocs_use_cases/df_abdominal_ct.xlsx', index=False)

Number of CT with abdominal 3691


In [26]:
# Filter all captions containing xray related terms
df_abdominal_xray = df_abdominal[df_abdominal['caption'].str.contains('X-ray|radiograph', case=False, na=False)]
print(f"Number of X-ray with abdominal {len(df_abdominal_xray)}")
# save to a new excel file
df_abdominal_xray.to_excel('/vol/miltank/users/doanb/ADLM_SS2025/ngocs_use_cases/df_abdominal_xray.xlsx', index=False)

Number of X-ray with abdominal 647


# Chest

In [27]:
# Filter only entries with 'caption' containing 'chest' and "mediasti"
df_chest = df[df['caption'].str.contains('chest', case=False, na=False) 
              & df['caption'].str.contains('cardiac|cardiomegaly', case=False, na=False) # mediasti/ lung / cardiac|cardiomegaly
              #& df['caption'].str.contains('enlarge', case=False, na=False)
            ]
print(f"Number of entries: {len(df_chest)}")

# save the column 'name' to a new csv file 
#df_chest[['name']].to_csv('/vol/miltank/users/doanb/ADLM_SS2025/ngocs_use_cases/filtered_chest_names.csv', index=False)

#df_chest.to_excel('/vol/miltank/users/doanb/ADLM_SS2025/ngocs_use_cases/filtered_chest.xlsx', index=False)

df_chest.head()



Number of entries: 288


Unnamed: 0,id,name,caption
175,ROCO_00222,PMC2700481_ATM-04-75-g016.jpg,Radiograph of chest and abdomen of a neonate ...
193,ROCO_00245,PMC5050014_pone.0163490.g001.jpg,Method for determining Cardiothoracic Ratio.D...
550,ROCO_00698,PMC3034699_1479-5876-9-12-1.jpg,A full range of cardiac support technology. T...
676,ROCO_00851,PMC4974773_13104_2016_2181_Fig3_HTML.jpg,Chest radiograph showing cardiomegaly and bil...
1466,ROCO_01838,PMC3669303_pone.0064603.g003.jpg,Postero-anterior chest radiograph of the prob...


In [15]:
# print out the caption of the image with name 'PMC529310_1479-5876-2-33-1.jpg'
print(df[df['name'] == 'PMC529310_1479-5876-2-33-1.jpg']['caption'].values[0])

 Chest CT before treatment (27-Sep-2001) show that conglomeration of a size of 5.5 × 4.2 cm at the left lower hilus pulmonis, large amount of accumulation of fluid in the left thoracic cavity, enlarged lymph nodes in the mediastinum.

