#**Overview:**

This notebook load the suggested split csv file and merges the split column to the previously created pneumonia dataset. This column is then used to have a consistent data split across all notebooks

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from google.colab import drive

In [2]:
drive.mount('/content/drive')
split = pd.read_csv('/content/drive/My Drive/Dissertation/mimic-cxr-2.0.0-split.csv')

Mounted at /content/drive


In [3]:
split['split'].value_counts()

Unnamed: 0_level_0,count
split,Unnamed: 1_level_1
train,368960
test,5159
validate,2991


In [4]:
# Group the DataFrame by 'study_id' and count the unique values in the 'split' column for each group
split_counts = split.groupby('study_id')['split'].value_counts()
s = split_counts.value_counts().sort_index()
xrays = {'1':s[1],'2':s[2],'3':s[3],'4':s[4],'>=5':sum(s[4:])}
print('number of x-rays per study',xrays)

number of x-rays per study {'1': 102675, '2': 103481, '3': 19442, '4': 2097, '>=5': 140}


In [5]:
visit_counts = split.groupby('subject_id')['study_id'].nunique()
v = visit_counts.value_counts().sort_index()
visits = {'1':v[1],'2':v[2],'3':v[3],'4':v[4],'5 or more':sum(v[5:])}
# Extract data for plotting
visits_keys = list(visits.keys())
visits_values = list(visits.values())

chart = pd.DataFrame(visits.items(), columns = ['Visits', 'Count'])
chart.head()

Unnamed: 0,Visits,Count
0,1,32695
1,2,10945
2,3,5670
3,4,3573
4,5 or more,10080


In [None]:
value = 1
split_sorted = split.sort_values('study_id')
for i in range(len(split)-1):
  if split_sorted['study_id'][i] == split_sorted['study_id'][i+1]:
    if split_sorted['split'][i] != split_sorted['split'][i+1]:
      print("same study but different split",i)
    else:
      value += 1
  else:
    value += 1
if value == len(split):
  print("All studies have consistent split values.")

All studies have consistent split values.


In [None]:
df = pd.read_csv('/content/drive/My Drive/Dissertation/Images/pneumonia_1519_PAAP.csv')
df_split = pd.merge(df, split, on='dicom_id')

# funcdtion to run a few check to see if the two dataframes were merged correctly
def column_test(df,col1,col2):
  n = len(df)
  m = sum((df[col1]==df[col2]))
  return m==n

column_test(df_split,'subject_id_x','subject_id')

In [None]:
df_split = df_split.drop(['study_id_y','subject_id'],axis=1)
df_split = df_split.rename(columns={"subject_is_x":"subject_id","Pneumonia":"pneumonia"})
df_split

Unnamed: 0,dicom_id,subject_id_x,study_id_x,ViewPosition,img_path,pneumonia,split
0,043f2b1c-1b8b0a20-c9e5ec5d-02ac7d4a-35000b4c,15000170,56450978,PA,files/p15/p15000170/s56450978/043f2b1c-1b8b0a2...,0.0,train
1,39ee0432-150f8ee9-e65abf9a-15bc5beb-80fbf3f6,15000393,51634677,PA,files/p15/p15000393/s51634677/39ee0432-150f8ee...,0.0,train
2,80eeb158-92ef7719-b43ae606-fb2745cf-99680d44,15000393,51634677,PA,files/p15/p15000393/s51634677/80eeb158-92ef771...,0.0,train
3,8a2da5f5-09ea301d-768e059c-5f053a34-2d3b3057,15000393,52929930,PA,files/p15/p15000393/s52929930/8a2da5f5-09ea301...,1.0,train
4,b08efb71-38c915e9-3d9d7df0-d783d4d6-1317bf59,15000393,54674484,PA,files/p15/p15000393/s54674484/b08efb71-38c915e...,0.0,train
...,...,...,...,...,...,...,...
30679,14c4f70b-51110089-a731e968-fc1e017e-dd4c536b,19997473,57809462,AP,files/p19/p19997473/s57809462/14c4f70b-5111008...,-1.0,train
30680,a29987d8-abd13298-7a067b12-620f9fdb-103ecf53,19998330,54053771,AP,files/p19/p19998330/s54053771/a29987d8-abd1329...,1.0,train
30681,518011e2-346dbd44-3e738335-c5006bf8-d69f6b68,19998770,51149538,AP,files/p19/p19998770/s51149538/518011e2-346dbd4...,0.0,train
30682,1427ad57-5bf4f3e6-90be02f3-d1760987-99d7f2ce,19998843,56350227,AP,files/p19/p19998843/s56350227/1427ad57-5bf4f3e...,1.0,train


In [None]:
df_split.to_csv('/content/drive/My Drive/Dissertation/Images/pneumonia_1519_PAAP.csv',index=False)

In [None]:
pd.read_csv('/content/drive/My Drive/Dissertation/Images/pneumonia_1519_PAAP.csv')['split'].value_counts()

Unnamed: 0_level_0,count
split,Unnamed: 1_level_1
train,29943
test,520
validate,221
