# Create train/validation/test split labels for future training

In [1]:
import pandas as pd

# Read the metadata table from the preprocessing output folder.
df = pd.read_csv(
    "/home/nckusoc/Documents/Devin-Journal/manifest-1721718306896/LIDC_IDRI_Preprocessing/data_std_10/Meta/meta_info.csv"
)

# Per-row patient folder name: "LIDC-IDRI-" followed by the patient id
# zero-padded to four digits (e.g. 1 -> "LIDC-IDRI-0001").
patient_dirs = [f"LIDC-IDRI-{int(pid):04}" for pid in df["patient_id"]]

# Prefix both file-name columns with "<patient folder>/" so each entry becomes
# a path relative to the dataset root. No helper column is added to the frame,
# so nothing has to be dropped afterwards.
for column in ("original_image", "mask_image"):
    df[column] = [
        f"{folder}/{file_name}" for folder, file_name in zip(patient_dirs, df[column])
    ]

# Write the updated table into the same Meta folder, without the row index.
df.to_csv(
    "/home/nckusoc/Documents/Devin-Journal/manifest-1721718306896/LIDC_IDRI_Preprocessing/data_std_10/Meta/meta_info_std.csv",
    index=False,
)

print(df)

       patient_id nodule_no  slice_no                   original_image  \
0               1       [0]        86  LIDC-IDRI-0001/0001_NI_slice086   
1               1       [0]        87  LIDC-IDRI-0001/0001_NI_slice087   
2               1       [0]        88  LIDC-IDRI-0001/0001_NI_slice088   
3               1       [0]        89  LIDC-IDRI-0001/0001_NI_slice089   
4               1       [0]        90  LIDC-IDRI-0001/0001_NI_slice090   
...           ...       ...       ...                              ...   
19254        1011       [3]       109  LIDC-IDRI-1011/1011_NI_slice109   
19255        1012       [0]        60  LIDC-IDRI-1012/1012_NI_slice060   
19256        1012       [0]        61  LIDC-IDRI-1012/1012_NI_slice061   
19257        1012       [0]        62  LIDC-IDRI-1012/1012_NI_slice062   
19258        1012       [0]        63  LIDC-IDRI-1012/1012_NI_slice063   

                            mask_image malignancy is_cancer  is_clean  
0      LIDC-IDRI-0001/0001_MA_slice086 

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
# Earlier relative location, kept for reference:
# meta = pd.read_csv('../data/Meta/meta_info.csv')
# NOTE(review): this reads from .../data_std/Meta/, while the first cell of this
# notebook writes meta_info_std.csv under .../data_std_10/Meta/ - confirm which
# directory is intended.
meta_csv_path = "/home/nckusoc/Documents/Devin-Journal/manifest-1721718306896/LIDC_IDRI_Preprocessing/data_std/Meta/meta_info_std.csv"
meta = pd.read_csv(meta_csv_path)

In [4]:
# Preview the first five rows of the loaded metadata.
meta.head(n=5)

Unnamed: 0,patient_id,nodule_no,slice_no,original_image,mask_image,malignancy,is_cancer,is_clean
0,1,[0],86,LIDC-IDRI-0001/0001_NI_slice086,LIDC-IDRI-0001/0001_MA_slice086,[5],[True],False
1,1,[0],87,LIDC-IDRI-0001/0001_NI_slice087,LIDC-IDRI-0001/0001_MA_slice087,[5],[True],False
2,1,[0],88,LIDC-IDRI-0001/0001_NI_slice088,LIDC-IDRI-0001/0001_MA_slice088,[5],[True],False
3,1,[0],89,LIDC-IDRI-0001/0001_NI_slice089,LIDC-IDRI-0001/0001_MA_slice089,[5],[True],False
4,1,[0],90,LIDC-IDRI-0001/0001_NI_slice090,LIDC-IDRI-0001/0001_MA_slice090,[5],[True],False


In [5]:
# File-name tags: NI = Nodule Image, MA = Mask Original, CN = Clean Nodule, CM = Clean Mask

In [6]:
def is_nodule(row):
    """Return True when an image path refers to a nodule image (tag ``NI``).

    The file name is expected to look like ``<patient>_<tag>_slice<NNN>``,
    optionally preceded by a directory such as ``LIDC-IDRI-0001/``. The tag is
    read from the second underscore-separated field of the file name instead of
    from fixed character positions, so the check does not depend on the length
    of the directory prefix or of the patient id.

    Parameters
    ----------
    row : str
        Value of the ``original_image`` column, e.g.
        ``"LIDC-IDRI-0001/0001_NI_slice086"``.

    Returns
    -------
    bool
        True if the tag is ``NI``; False for any other tag or when the file
        name has no tag field.
    """
    # Keep only the part after the last "/" (the whole string if there is none).
    file_name = row.rsplit('/', 1)[-1]
    fields = file_name.split('_')
    return len(fields) > 1 and fields[1] == 'NI'

In [7]:
# Flag each row by applying is_nodule to its original_image path.
meta['is_nodule'] = meta['original_image'].apply(is_nodule)

In [8]:
# Split the table into clean rows (is_nodule False) and nodule rows
# (is_nodule True). reset_index() without drop=True keeps the previous row
# labels in a new 'index' column.
nodule_flag = meta['is_nodule']
clean_meta = meta.loc[nodule_flag == False].reset_index()
meta = meta.loc[nodule_flag == True].reset_index()

In [9]:
# Preview the first two clean rows.
clean_meta.head(n=2)

Unnamed: 0,index,patient_id,nodule_no,slice_no,original_image,mask_image,malignancy,is_cancer,is_clean,is_nodule
0,366,28,0,0,LIDC-IDRI-0028/0028_CN_slice000,LIDC-IDRI-0028/0028_CM_slice000,0,False,True,False
1,367,28,1,1,LIDC-IDRI-0028/0028_CN_slice001,LIDC-IDRI-0028/0028_CM_slice001,0,False,True,False


In [10]:
# Preview the first two nodule rows.
meta.head(n=2)

Unnamed: 0,index,patient_id,nodule_no,slice_no,original_image,mask_image,malignancy,is_cancer,is_clean,is_nodule
0,0,1,[0],86,LIDC-IDRI-0001/0001_NI_slice086,LIDC-IDRI-0001/0001_MA_slice086,[5],[True],False,True
1,1,1,[0],87,LIDC-IDRI-0001/0001_NI_slice087,LIDC-IDRI-0001/0001_MA_slice087,[5],[True],False,True


In [11]:
def is_train(row,train,val,test):
    """Return the split label for one patient id.

    Parameters
    ----------
    row : hashable
        Patient id to classify.
    train, val : container
        Collections of patient ids assigned to the training and validation
        splits; membership is checked with ``in``, training first.
    test : container
        Accepted for symmetry but never inspected: any id that is in neither
        ``train`` nor ``val`` is labelled as test.

    Returns
    -------
    str
        ``'Train'``, ``'Validation'`` or ``'Test'``.
    """
    for label, members in (('Train', train), ('Validation', val)):
        if row in members:
            return label
    # Fallback: everything not claimed above is treated as test data.
    return 'Test'

In [12]:
# Sorted unique patient ids of each table, as plain lists.
clean_patient_id = [*np.unique(clean_meta['patient_id'])]
meta_patient_id = [*np.unique(meta['patient_id'])]

In [13]:
def create_label_segmentation(meta, test_size=0.1, val_size=0.1, random_state=42):
    """Add a ``data_split`` column (Train / Validation / Test), split by patient.

    The unique patient ids are split rather than the rows, so every row of a
    given patient receives the same label. The split is done in two steps:
    ``test_size`` of the patients are held out as test, then ``val_size`` of
    the *remaining* patients are held out as validation. With the defaults
    this is roughly 81% / 9% / 10%.

    Parameters
    ----------
    meta : pandas.DataFrame
        Table with a ``patient_id`` column. It is modified in place (the
        ``data_split`` column is added or overwritten) and also returned.
    test_size : float or int, optional
        Passed to ``train_test_split`` for the first split. Default 0.1.
    val_size : float or int, optional
        Passed to ``train_test_split`` for the second split. Default 0.1.
    random_state : int or None, optional
        Seed passed to both ``train_test_split`` calls so that re-running the
        notebook yields the same assignment. Pass ``None`` for an unseeded
        split.

    Returns
    -------
    pandas.DataFrame
        The same ``meta`` object, with the ``data_split`` column set.
    """
    patient_id = list(np.unique(meta['patient_id']))
    train_patient, test_patient = train_test_split(
        patient_id, test_size=test_size, random_state=random_state
    )
    train_patient, val_patient = train_test_split(
        train_patient, test_size=val_size, random_state=random_state
    )
    # Report the number of patients in each split.
    print(len(train_patient),len(val_patient),len(test_patient))

    # Sets make the per-row membership checks constant-time instead of a list
    # scan for every row.
    train_ids, val_ids, test_ids = set(train_patient), set(val_patient), set(test_patient)
    meta['data_split'] = meta['patient_id'].apply(
        lambda row: is_train(row, train_ids, val_ids, test_ids)
    )

    return meta

In [14]:
# The nodule table (meta) and the clean table (clean_meta) are split
# independently: each call draws its own train/validation/test split from the
# patient ids of the table it is given, and prints the number of patients in
# the train, validation and test groups (in that order).
meta = create_label_segmentation(meta)
clean_meta = create_label_segmentation(clean_meta)

697 78 87
108 13 14


In [15]:
# Preview the first five nodule rows with their data_split label.
meta.head(n=5)

Unnamed: 0,index,patient_id,nodule_no,slice_no,original_image,mask_image,malignancy,is_cancer,is_clean,is_nodule,data_split
0,0,1,[0],86,LIDC-IDRI-0001/0001_NI_slice086,LIDC-IDRI-0001/0001_MA_slice086,[5],[True],False,True,Train
1,1,1,[0],87,LIDC-IDRI-0001/0001_NI_slice087,LIDC-IDRI-0001/0001_MA_slice087,[5],[True],False,True,Train
2,2,1,[0],88,LIDC-IDRI-0001/0001_NI_slice088,LIDC-IDRI-0001/0001_MA_slice088,[5],[True],False,True,Train
3,3,1,[0],89,LIDC-IDRI-0001/0001_NI_slice089,LIDC-IDRI-0001/0001_MA_slice089,[5],[True],False,True,Train
4,4,1,[0],90,LIDC-IDRI-0001/0001_NI_slice090,LIDC-IDRI-0001/0001_MA_slice090,[5],[True],False,True,Train


In [17]:
# Clean Meta only stores meta information of patients without nodules.

In [None]:
# Destinations for the labelled tables.
meta_out_path = "/home/nckusoc/Documents/Devin-Journal/manifest-1721718306896/LIDC_IDRI_Segmentation/meta_csv/meta_std_811.csv"
clean_meta_out_path = "/home/nckusoc/Documents/Devin-Journal/manifest-1721718306896/LIDC_IDRI_Segmentation/meta_csv/clean_meta_std_811.csv"

# to_csv is called with its defaults, so the row index is written as the first
# (unnamed) column of each file.
meta.to_csv(meta_out_path)
clean_meta.to_csv(clean_meta_out_path)