# Exploratory data analysis
The goal of this file currently is to explore pandas and visualize some data

In [3]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset
df = pd.read_excel("./data/DummyData_Extended.xlsx")

In [4]:
# Display the first few rows of the dataset
print("\nFirst 5 rows of the dataset:")
df.head()


First 5 rows of the dataset:


Unnamed: 0,Record ID,Age,Male,White,Date of Procedure,Extremity,Artery affected,BMI,Tobacco Use,Diabetes,...,AA % Aggregation,AA % Inhibition,CK R (min),CK K (min),CK angle (deg),CK MA (mm),CRT MA (mm),CKH R (min),CFF MA (mm),CFF FLEV (mg/dL)
0,9,71,0,0,,Left,Femoral,30.87815,3,1,...,56.778741,17.638901,3.386632,1.855569,60.311369,45.146627,35.864732,2.283344,36.500137,10.589905
1,65,49,0,0,,Right,Femoral,31.000611,2,1,...,56.808946,19.05058,3.056822,1.700825,59.664139,43.348083,36.106428,2.47137,36.193072,9.896531
2,24,71,0,1,,Right,Popliteal,28.088584,2,1,...,54.634166,17.637,3.296529,1.893613,59.223128,43.758536,35.266328,2.4224,34.217574,11.060189
3,48,67,1,1,,Right,Tibial,26.354188,2,0,...,57.468173,17.860942,3.132582,1.635525,58.09874,43.168904,36.230352,2.418887,34.493469,9.439621
4,74,69,0,0,,Right,Tibial,29.386684,3,1,...,55.636498,17.951208,3.250705,1.62263,60.851424,43.89646,36.121585,2.213384,36.127036,9.641269


In [5]:
# Display basic information about the dataset
print("Dataset Information:")
df.info()

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 70 columns):
 #   Column                                                      Non-Null Count  Dtype  
---  ------                                                      --------------  -----  
 0   Record ID                                                   150 non-null    int64  
 1   Age                                                         150 non-null    int64  
 2   Male                                                        150 non-null    int64  
 3   White                                                       150 non-null    int64  
 4   Date of Procedure                                           0 non-null      float64
 5   Extremity                                                   150 non-null    object 
 6   Artery affected                                             150 non-null    object 
 7   BMI                                                         150 non-

Categorical data encoding

In [12]:
# Dummy encoding
df_encoded = pd.get_dummies(df, columns=['Artery affected',"Extremity",'Anticoagulation','Intervention classification'], 
                            prefix=['Artery affected','Extremity','Anticoagulation','Intervention classification'])
df_encoded


Unnamed: 0,Record ID,Age,Male,White,Date of Procedure,BMI,Tobacco Use,Diabetes,HbA1c Baseline,Hypertension,...,CK MA (mm),CRT MA (mm),CKH R (min),CFF MA (mm),CFF FLEV (mg/dL),Artery affected_Femoral,Artery affected_Popliteal,Artery affected_Tibial,Extremity_Left,Extremity_Right
0,9,71,0,0,,30.878150,3,1,8,0,...,45.146627,35.864732,2.283344,36.500137,10.589905,True,False,False,True,False
1,65,49,0,0,,31.000611,2,1,7,1,...,43.348083,36.106428,2.471370,36.193072,9.896531,True,False,False,False,True
2,24,71,0,1,,28.088584,2,1,8,1,...,43.758536,35.266328,2.422400,34.217574,11.060189,False,True,False,False,True
3,48,67,1,1,,26.354188,2,0,5,1,...,43.168904,36.230352,2.418887,34.493469,9.439621,False,False,True,False,True
4,74,69,0,0,,29.386684,3,1,6,0,...,43.896460,36.121585,2.213384,36.127036,9.641269,False,False,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,1,70,0,1,,28.406442,3,1,6,1,...,44.121772,34.859000,2.531415,35.672805,10.462963,False,False,True,False,True
146,6,63,0,0,,32.165881,3,0,8,1,...,44.775951,35.418618,2.639576,35.563274,11.062118,True,False,False,False,True
147,20,57,0,0,,30.530348,3,1,6,0,...,44.220127,35.831759,2.291857,36.065639,10.313870,True,False,False,True,False
148,4,56,0,0,,31.480867,3,1,5,0,...,45.061634,36.193071,2.613724,36.625659,11.384205,False,False,True,True,False


In [None]:
output_file = './data/Preprocessed_Data.xlsx'
df.to_excel(output_file, index=False)

In [11]:
# Convert the values in the specified column to boolean
df['Male'] = df['Male'].astype(bool)
df['White'] = df['White'].astype(bool)
df['Diabetes'] = df['Diabetes'].astype(bool)
df['Tobacco Use'] = df['Tobacco Use'].astype(bool)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 70 columns):
 #   Column                                                      Non-Null Count  Dtype         
---  ------                                                      --------------  -----         
 0   Record ID                                                   10 non-null     int64         
 1   Age                                                         10 non-null     int64         
 2   Male                                                        10 non-null     bool          
 3   White                                                       10 non-null     bool          
 4   Date of Procedure                                           10 non-null     datetime64[ns]
 5   Extremity                                                   10 non-null     object        
 6   Artery affected                                             10 non-null     object        
 7   BMI                          

In [12]:
# Summary statistics of numeric columns
print("\nSummary Statistics:")
df.describe()


Summary Statistics:


Unnamed: 0,Record ID,Age,Date of Procedure,BMI,HbA1c Baseline,Hypertension,CKD,Coronary Artery Disease,History of MI,Clotting Disorder,...,AA % Aggregation,AA % Inhibition,CK R (min),CK K (min),CK angle (deg),CK MA (mm),CRT MA (mm),CKH R (min),CFF MA (mm),CFF FLEV (mg/dL)
count,10.0,10.0,10,10.0,10.0,10.0,10.0,10.0,10.0,10.0,...,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,5.5,58.0,2023-07-27 14:24:00,29.24,7.0,0.6,0.4,0.5,0.3,0.3,...,56.02,18.11,3.16,1.75,58.92,43.63,35.6,2.51,35.27,10.39
min,1.0,45.0,2023-03-15 00:00:00,25.8,5.9,0.0,0.0,0.0,0.0,0.0,...,54.1,17.4,3.0,1.6,57.2,42.1,34.6,2.2,33.7,9.3
25%,3.25,51.25,2023-05-23 06:00:00,27.4,6.55,0.0,0.0,0.0,0.0,0.0,...,54.975,17.825,3.025,1.7,57.8,42.625,34.975,2.4,34.6,9.85
50%,5.5,58.5,2023-07-23 12:00:00,28.9,6.9,1.0,0.0,0.5,0.0,0.0,...,56.05,18.05,3.15,1.75,58.8,43.75,35.75,2.5,35.25,10.4
75%,7.75,62.75,2023-09-28 06:00:00,31.4,7.425,1.0,1.0,1.0,0.75,0.75,...,56.95,18.375,3.275,1.8,59.9,44.55,36.15,2.675,35.975,10.95
max,10.0,72.0,2023-12-17 00:00:00,32.4,8.3,1.0,1.0,1.0,1.0,1.0,...,58.2,19.2,3.4,1.9,61.2,45.4,36.5,2.8,36.8,11.5
std,3.02765,8.717798,,2.341984,0.746845,0.516398,0.516398,0.527046,0.483046,0.483046,...,1.296834,0.506513,0.142984,0.108012,1.326482,1.154749,0.708676,0.191195,0.977582,0.740045


In [13]:
# Check for missing values
print("\nMissing Values:")
print(df.isnull().sum())


Missing Values:
Record ID            0
Age                  0
Male                 0
White                0
Date of Procedure    0
                    ..
CK MA (mm)           0
CRT MA (mm)          0
CKH R (min)          0
CFF MA (mm)          0
CFF FLEV (mg/dL)     0
Length: 70, dtype: int64
