In [2]:
# Importing Libraries for Data Manipulation and Visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Configuring plot styles
sns.set_style(style = 'whitegrid')
print("Libraries imported and plot styles configured.")

Libraries imported and plot styles configured.


In [3]:
# Loading the feature set which i created in the previous notebook
features_df = pd.read_csv('../Results/radiomics_features.csv')

print("Successfully loaded the radiomics_features.csv file.")
print(f"The dataset contains {features_df.shape[0]} patients and {features_df.shape[1]} columns.")

Successfully loaded the radiomics_features.csv file.
The dataset contains 98 patients and 130 columns.


In [4]:
# Displaying the first 5 rows of the dataset

print("First 5 rows of the dataset:")
display(features_df.head())

#Getting a concise summary of the dataframe, including data types and non-null values
print("\nDataFrame Info:")
features_df.info()

First 5 rows of the dataset:


Unnamed: 0,PatientID,diagnostics_Versions_PyRadiomics,diagnostics_Versions_Numpy,diagnostics_Versions_SimpleITK,diagnostics_Versions_PyWavelet,diagnostics_Versions_Python,diagnostics_Configuration_Settings,diagnostics_Configuration_EnabledImageTypes,diagnostics_Image-original_Hash,diagnostics_Image-original_Dimensionality,...,original_glszm_SmallAreaHighGrayLevelEmphasis,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength
0,HCC_002,v3.0.1,2.2.6,2.5.2,1.8.0,3.10.16,"{'minimumROIDimensions': 2, 'minimumROISize': ...",{'Original': {}},fd7992bb639fb9faf665ba819bbd34498377504f,3D,...,1099.814921,0.00237,7.583554,0.266796,2054.359117,0.239685,0.00085,5433.018957,0.060803,5.615451
1,HCC_003,v3.0.1,2.2.6,2.5.2,1.8.0,3.10.16,"{'minimumROIDimensions': 2, 'minimumROISize': ...",{'Original': {}},25aeb8d1ec7731ede2658379e04e15b3befd6497,3D,...,769.353318,0.023946,7.485852,0.122287,4584.584627,0.76989,0.000457,3697.112955,0.269812,2.431489
2,HCC_004,v3.0.1,2.2.6,2.5.2,1.8.0,3.10.16,"{'minimumROIDimensions': 2, 'minimumROISize': ...",{'Original': {}},3b109c7a737933c2ffe0dd9f7ff6ba61aa1617af,3D,...,982.905104,0.005598,8.184989,0.147813,71788.316172,6.842687,3.1e-05,18299.857816,0.066867,0.68951
3,HCC_005,v3.0.1,2.2.6,2.5.2,1.8.0,3.10.16,"{'minimumROIDimensions': 2, 'minimumROISize': ...",{'Original': {}},b3321933601c857f30931e33f848083b44d67fbe,3D,...,1077.363047,0.001134,7.141551,0.115749,222863.08801,10.95586,1.7e-05,3817.840499,0.015906,0.087675
4,HCC_006,v3.0.1,2.2.6,2.5.2,1.8.0,3.10.16,"{'minimumROIDimensions': 2, 'minimumROISize': ...",{'Original': {}},a0a1238368bdb1bbc105ea0be520016c306efe99,3D,...,1291.330916,0.002892,7.605537,0.123402,78501.197013,6.98302,2.2e-05,5438.366684,0.028212,0.189437



DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98 entries, 0 to 97
Columns: 130 entries, PatientID to original_ngtdm_Strength
dtypes: float64(110), int64(2), object(18)
memory usage: 99.7+ KB


In [5]:
# Separating the actual radiomics features from the diagnostic labels added by pyradiomics library which starts with 'diagnostics_'
# selecting the columns that do not start with 'diagnostics_'
feature_columns = [col for col in features_df.columns if not col.startswith('diagonastics_')]

#creating a new DataFrame with only the PatientID and the feature columns
clean_df = features_df[feature_columns]

print (f"Removed the diagnostic columns. New shape of the DataFrame is {clean_df.shape}.")


Removed the diagnostic columns. New shape of the DataFrame is (98, 130).


In [7]:
# checking for missing values in the DataFrame
missing_values = clean_df.isnull().sum()
missing_columns = missing_values[missing_values > 0]
print(f"Found {len(missing_columns)} columns with missing values.")

# For each column that has missing values, i'll fill the missing values with the median of that column
for column in missing_columns.index:
    median_val = clean_df[col].median()
    clean_df[col].fillna(median_val, inplace=True)
print(f"Filled missing values using the median of each column.")

#final checking to confirm that there are no missing values left
print(f"\n Missing values remaining: {clean_df.isnull().sum().sum()}")

Found 0 columns with missing values.
Filled missing values using the median of each column.

 Missing values remaining: 0
