In [24]:
import pandas as pd
import numpy as np

In [25]:
# File path from the uploaded file
file_path = 'hemodat.csv'


In [26]:
# Load the data
data = pd.read_csv(file_path)


In [27]:
# Specify the desired column order
new_order = [
    'Category', 'Age', 'Sex', 'ALB', 'ALP', 'ALT', 'AST', 'BIL', 
    'CHE', 'CHOL', 'CREA', 'GGT', 'PROT', 'Unnamed: 0'
]
# Reorder the columns
data_reordered = data[new_order]
#Save the rearranged DataFrame to a new CSV file
output_path = 'hemodat_reordered.csv'
data_reordered.to_csv(output_path, index=False)
print(f"Reordered file saved to {output_path}")

Reordered file saved to hemodat_reordered.csv


In [28]:
print(data_reordered)


          Category  Age Sex   ALB    ALP    ALT    AST   BIL    CHE  CHOL  \
0    0=Blood Donor   32   m  38.5   52.5    7.7   22.1   7.5   6.93  3.23   
1    0=Blood Donor   32   m  38.5   70.3   18.0   24.7   3.9  11.17  4.80   
2    0=Blood Donor   32   m  46.9   74.7   36.2   52.6   6.1   8.84  5.20   
3    0=Blood Donor   32   m  43.2   52.0   30.6   22.6  18.9   7.33  4.74   
4    0=Blood Donor   32   m  39.2   74.1   32.6   24.8   9.6   9.15  4.32   
..             ...  ...  ..   ...    ...    ...    ...   ...    ...   ...   
610    3=Cirrhosis   62   f  32.0  416.6    5.9  110.3  50.0   5.57  6.30   
611    3=Cirrhosis   64   f  24.0  102.8    2.9   44.4  20.0   1.54  3.02   
612    3=Cirrhosis   64   f  29.0   87.3    3.5   99.0  48.0   1.66  3.63   
613    3=Cirrhosis   46   f  33.0    NaN   39.0   62.0  20.0   3.56  4.20   
614    3=Cirrhosis   59   f  36.0    NaN  100.0   80.0  12.0   9.07  5.30   

      CREA    GGT  PROT  Unnamed: 0  
0    106.0   12.1  69.0           1  

In [29]:
print(data_reordered.head())


        Category  Age Sex   ALB   ALP   ALT   AST   BIL    CHE  CHOL   CREA  \
0  0=Blood Donor   32   m  38.5  52.5   7.7  22.1   7.5   6.93  3.23  106.0   
1  0=Blood Donor   32   m  38.5  70.3  18.0  24.7   3.9  11.17  4.80   74.0   
2  0=Blood Donor   32   m  46.9  74.7  36.2  52.6   6.1   8.84  5.20   86.0   
3  0=Blood Donor   32   m  43.2  52.0  30.6  22.6  18.9   7.33  4.74   80.0   
4  0=Blood Donor   32   m  39.2  74.1  32.6  24.8   9.6   9.15  4.32   76.0   

    GGT  PROT  Unnamed: 0  
0  12.1  69.0           1  
1  15.6  76.5           2  
2  33.2  79.3           3  
3  33.8  75.7           4  
4  29.9  68.7           5  


In [30]:
# Exploratory Data Analysis
print("Dataset Information:")
data_reordered.info()

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 615 entries, 0 to 614
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Category    615 non-null    object 
 1   Age         615 non-null    int64  
 2   Sex         615 non-null    object 
 3   ALB         614 non-null    float64
 4   ALP         597 non-null    float64
 5   ALT         614 non-null    float64
 6   AST         615 non-null    float64
 7   BIL         615 non-null    float64
 8   CHE         615 non-null    float64
 9   CHOL        605 non-null    float64
 10  CREA        615 non-null    float64
 11  GGT         615 non-null    float64
 12  PROT        614 non-null    float64
 13  Unnamed: 0  615 non-null    int64  
dtypes: float64(10), int64(2), object(2)
memory usage: 67.4+ KB


In [31]:
print("\nSummary Statistics:")
print(data_reordered.describe(include='all'))


Summary Statistics:
             Category         Age  Sex         ALB         ALP         ALT  \
count             615  615.000000  615  614.000000  597.000000  614.000000   
unique              5         NaN    2         NaN         NaN         NaN   
top     0=Blood Donor         NaN    m         NaN         NaN         NaN   
freq              533         NaN  377         NaN         NaN         NaN   
mean              NaN   47.408130  NaN   41.620195   68.283920   28.450814   
std               NaN   10.055105  NaN    5.780629   26.028315   25.469689   
min               NaN   19.000000  NaN   14.900000   11.300000    0.900000   
25%               NaN   39.000000  NaN   38.800000   52.500000   16.400000   
50%               NaN   47.000000  NaN   41.950000   66.200000   23.000000   
75%               NaN   54.000000  NaN   45.200000   80.100000   33.075000   
max               NaN   77.000000  NaN   82.200000  416.600000  325.300000   

               AST         BIL         CHE

In [32]:
print("\nChecking for missing values:")
missing_values = data_reordered.isnull().sum()
print(missing_values)


Checking for missing values:
Category       0
Age            0
Sex            0
ALB            1
ALP           18
ALT            1
AST            0
BIL            0
CHE            0
CHOL          10
CREA           0
GGT            0
PROT           1
Unnamed: 0     0
dtype: int64


In [34]:

# Preprocessing and Imputation
# Fill missing numerical values with the mean
numerical_columns = data_reordered.select_dtypes(include=[np.number]).columns
data_reordered[numerical_columns] = data_reordered[numerical_columns].apply(
    lambda x: x.fillna(x.mean())
)

In [35]:
# Fill missing categorical values with the mode
categorical_columns = data_reordered.select_dtypes(include=['object']).columns
data_reordered[categorical_columns] = data_reordered[categorical_columns].apply(
    lambda x: x.fillna(x.mode()[0])
)

In [36]:
print("\nAfter Imputation, Checking for Missing Values:")
print(data_reordered.isnull().sum())



After Imputation, Checking for Missing Values:
Category      0
Age           0
Sex           0
ALB           0
ALP           0
ALT           0
AST           0
BIL           0
CHE           0
CHOL          0
CREA          0
GGT           0
PROT          0
Unnamed: 0    0
dtype: int64


In [37]:
# Estimate the accuracy of imputation
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [39]:
# Evaluate imputation for numerical features
for col in numerical_columns:
    original_data = data_reordered[col].dropna()
    if len(original_data) > 0:
        train, test = train_test_split(original_data, test_size=0.2, random_state=42)
        imputer = SimpleImputer(strategy='mean')
        imputer.fit(train.values.reshape(-1, 1))
        imputed_test = imputer.transform(test.values.reshape(-1, 1))
        mae = mean_absolute_error(test, imputed_test)
        print(f"Feature: {col}, Mean Absolute Error of Imputation: {mae}")

# Save the preprocessed and imputed data
output_path = 'hemodat_preprocessed.csv'
data_reordered.to_csv(output_path, index=False)

print(f"Preprocessed file saved to {output_path}")


Feature: Age, Mean Absolute Error of Imputation: 0.0
Feature: ALB, Mean Absolute Error of Imputation: 0.0
Feature: ALP, Mean Absolute Error of Imputation: 0.0
Feature: ALT, Mean Absolute Error of Imputation: 0.0
Feature: AST, Mean Absolute Error of Imputation: 0.0
Feature: BIL, Mean Absolute Error of Imputation: 0.0
Feature: CHE, Mean Absolute Error of Imputation: 0.0
Feature: CHOL, Mean Absolute Error of Imputation: 0.0
Feature: CREA, Mean Absolute Error of Imputation: 0.0
Feature: GGT, Mean Absolute Error of Imputation: 0.0
Feature: PROT, Mean Absolute Error of Imputation: 0.0
Feature: Unnamed: 0, Mean Absolute Error of Imputation: 0.0
Preprocessed file saved to hemodat_preprocessed.csv
