In [127]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [128]:
#read csv
column_names = ['age', 'blood pressure', 'specific gravity', 'albumin', 'sugar', 'red blood cells', 'pus cell', 'pus cell clumps', 'bacteria', 'blood glucose random', 'blood urea', 'serum creatinine', 'sodium',
       'potassium', 'hemoglobin', 'packed cell volume', 'white blood cell count', 'red blood cell count', 'hypertension', 'diabetes mellitus', 'coronary artery disease', 'appetite', 'pedal edema', 'anemia', 'class']
kidney = pd.read_excel(f"C:\\Users\\lala7\\Desktop\\研究所\\碩一上\\製造數據科學\\HW\\MDS_Assignment02\\MDS_Assignment2_kidney.xlsx", sheet_name = 'chronic_kidney_disease',  names = column_names) 
kidney

Unnamed: 0,age,blood pressure,specific gravity,albumin,sugar,red blood cells,pus cell,pus cell clumps,bacteria,blood glucose random,...,packed cell volume,white blood cell count,red blood cell count,hypertension,diabetes mellitus,coronary artery disease,appetite,pedal edema,anemia,class
0,7,50,1.02,4,0,?,normal,notpresent,notpresent,?,...,38,6000,?,no,no,no,good,no,no,ckd
1,62,80,1.01,2,3,normal,normal,notpresent,notpresent,423,...,31,7500,?,no,yes,no,poor,no,yes,ckd
2,48,70,1.005,4,0,normal,abnormal,present,notpresent,117,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
3,51,80,1.01,2,0,normal,normal,notpresent,notpresent,106,...,35,7300,4.6,no,no,no,good,no,no,ckd
4,60,90,1.015,3,0,?,?,notpresent,notpresent,74,...,39,7800,4.4,yes,yes,no,good,yes,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,55,80,1.02,0,0,normal,normal,notpresent,notpresent,140,...,47,6700,4.9,no,no,no,good,no,no,notckd
395,42,70,1.025,0,0,normal,normal,notpresent,notpresent,75,...,54,7800,6.2,no,no,no,good,no,no,notckd
396,12,80,1.02,0,0,normal,normal,notpresent,notpresent,100,...,49,6600,5.4,no,no,no,good,no,no,notckd
397,17,60,1.025,0,0,normal,normal,notpresent,notpresent,114,...,51,7200,5.9,no,no,no,good,no,no,notckd


In [129]:
column_names = ['age', 'blood pressure', 'specific gravity', 'albumin', 'sugar', 'red blood cells', 'pus cell', 'pus cell clumps', 'bacteria', 'blood glucose random', 'blood urea', 'serum creatinine', 'sodium',
       'potassium', 'hemoglobin', 'packed cell volume', 'white blood cell count', 'red blood cell count', 'hypertension', 'diabetes mellitus', 'coronary artery disease', 'appetite', 'pedal edema', 'anemia', 'class']
kidney.columns = column_names

處理並查看各欄缺失值個數

In [130]:
kidney.replace("?", np.nan, inplace=True) # kidney = kidney.replace("?", pd.NA)
kidney = kidney.map(lambda x: np.nan if x == '' else x)

kidney
kidney.isnull().sum().sort_values(ascending=False)

red blood cells            151
red blood cell count       134
white blood cell count     109
potassium                   87
sodium                      86
packed cell volume          72
pus cell                    65
hemoglobin                  52
sugar                       49
specific gravity            47
albumin                     46
blood glucose random        44
blood urea                  19
serum creatinine            17
coronary artery disease     15
appetite                    14
pedal edema                 14
anemia                      14
diabetes mellitus           13
class                       13
blood pressure              12
age                          9
hypertension                 8
bacteria                     4
pus cell clumps              4
dtype: int64

查看各欄變異程度（資訊量）

In [131]:
numerical_col = ['age', 'blood pressure', 'blood glucose random', 'blood urea', 'serum creatinine', 'sodium', 'potassium', 'hemoglobin', 'packed cell volume', 'white blood cell count', 'red blood cell count']
for i in numerical_col:
    print(i)
    print(kidney[i].var(skipna=True))

age
295.5256476171643
blood pressure
187.69463523048296
blood glucose random
6301.2754038354415
blood urea
2556.068860991529
serum creatinine
33.03781267606602
sodium
108.34211927582533
potassium
10.2010238890263
hemoglobin
8.483744565307923
packed cell volume
81.18479953471794
white blood cell count
8728801.932943564
red blood cell count
1.0222361349342481


資料前處理 - 填補遺漏值、轉dummy

In [132]:
# 二元類別變數轉0/1
boolean_mapping_1 = {'yes': 1, 'no': 0}
boolean_mapping_2 = {'normal': 1, 'abnormal': 0}
boolean_mapping_3 = {'present': 1, 'notpresent': 0}
boolean_mapping_4 = {'good': 1, 'poor': 0}
boolean_mapping_5 = {'ckd': 1, 'notckd': 0}

for i in ['hypertension', 'diabetes mellitus', 'coronary artery disease', 'pedal edema', 'anemia']:
    # kidney[i] = kidney[i].replace(boolean_mapping_1)
    kidney[i] = kidney[i].apply(lambda x: str(x).strip().lower() if pd.notna(x) else '')
    kidney[i] = kidney[i].replace(boolean_mapping_1)

for i in ['red blood cells', 'pus cell']:
    kidney[i] = kidney[i].replace(boolean_mapping_2)
for i in ['pus cell clumps', 'bacteria']:
    kidney[i] = kidney[i].replace(boolean_mapping_3)

kidney['appetite'] = kidney['appetite'].replace(boolean_mapping_4)
kidney['class'] = kidney['class'].replace(boolean_mapping_5)

kidney

Unnamed: 0,age,blood pressure,specific gravity,albumin,sugar,red blood cells,pus cell,pus cell clumps,bacteria,blood glucose random,...,packed cell volume,white blood cell count,red blood cell count,hypertension,diabetes mellitus,coronary artery disease,appetite,pedal edema,anemia,class
0,7.0,50.0,1.020,4.0,0.0,,1.0,0.0,0.0,,...,38.0,6000.0,,0,0,0,1.0,0,0,1.0
1,62.0,80.0,1.010,2.0,3.0,1.0,1.0,0.0,0.0,423.0,...,31.0,7500.0,,0,1,0,0.0,0,1,1.0
2,48.0,70.0,1.005,4.0,0.0,1.0,0.0,1.0,0.0,117.0,...,32.0,6700.0,3.9,1,0,0,0.0,1,1,1.0
3,51.0,80.0,1.010,2.0,0.0,1.0,1.0,0.0,0.0,106.0,...,35.0,7300.0,4.6,0,0,0,1.0,0,0,1.0
4,60.0,90.0,1.015,3.0,0.0,,,0.0,0.0,74.0,...,39.0,7800.0,4.4,1,1,0,1.0,1,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,55.0,80.0,1.020,0.0,0.0,1.0,1.0,0.0,0.0,140.0,...,47.0,6700.0,4.9,0,0,0,1.0,0,0,0.0
395,42.0,70.0,1.025,0.0,0.0,1.0,1.0,0.0,0.0,75.0,...,54.0,7800.0,6.2,0,0,0,1.0,0,0,0.0
396,12.0,80.0,1.020,0.0,0.0,1.0,1.0,0.0,0.0,100.0,...,49.0,6600.0,5.4,0,0,0,1.0,0,0,0.0
397,17.0,60.0,1.025,0.0,0.0,1.0,1.0,0.0,0.0,114.0,...,51.0,7200.0,5.9,0,0,0,1.0,0,0,0.0


In [133]:
#多元變數get dummies
dummy_sg = pd.get_dummies(kidney['specific gravity'], prefix='sp', prefix_sep='_', drop_first=True)
dummy_alb = pd.get_dummies(kidney['albumin'], prefix='alb', prefix_sep='_', drop_first=True)
dummy_sugar = pd.get_dummies(kidney['sugar'], prefix='sugar', prefix_sep='_', drop_first=True)
kidney_input = pd.concat([kidney, dummy_sg, dummy_alb, dummy_sugar], axis=1)
kidney_input = kidney_input.drop(columns=['specific gravity', 'albumin', 'sugar'])
kidney_input


Unnamed: 0,age,blood pressure,red blood cells,pus cell,pus cell clumps,bacteria,blood glucose random,blood urea,serum creatinine,sodium,...,alb_1.0,alb_2.0,alb_3.0,alb_4.0,alb_5.0,sugar_1.0,sugar_2.0,sugar_3.0,sugar_4.0,sugar_5.0
0,7.0,50.0,,1.0,0.0,0.0,,18.0,0.8,,...,False,False,False,True,False,False,False,False,False,False
1,62.0,80.0,1.0,1.0,0.0,0.0,423.0,53.0,1.8,,...,False,True,False,False,False,False,False,True,False,False
2,48.0,70.0,1.0,0.0,1.0,0.0,117.0,56.0,3.8,111.0,...,False,False,False,True,False,False,False,False,False,False
3,51.0,80.0,1.0,1.0,0.0,0.0,106.0,26.0,1.4,,...,False,True,False,False,False,False,False,False,False,False
4,60.0,90.0,,,0.0,0.0,74.0,25.0,1.1,142.0,...,False,False,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,55.0,80.0,1.0,1.0,0.0,0.0,140.0,49.0,0.5,150.0,...,False,False,False,False,False,False,False,False,False,False
395,42.0,70.0,1.0,1.0,0.0,0.0,75.0,31.0,1.2,141.0,...,False,False,False,False,False,False,False,False,False,False
396,12.0,80.0,1.0,1.0,0.0,0.0,100.0,26.0,0.6,137.0,...,False,False,False,False,False,False,False,False,False,False
397,17.0,60.0,1.0,1.0,0.0,0.0,114.0,50.0,1.0,135.0,...,False,False,False,False,False,False,False,False,False,False


In [134]:
# 原始資料集填補遺漏值
kidney_input = kidney_input.map(lambda x: np.nan if x == '' else x)
k = 3
imputer = KNNImputer(n_neighbors=k)
imputed_data = imputer.fit_transform(kidney_input)
kidney_imputed = pd.DataFrame(imputed_data, columns=kidney_input.columns.tolist())
kidney_imputed

Unnamed: 0,age,blood pressure,red blood cells,pus cell,pus cell clumps,bacteria,blood glucose random,blood urea,serum creatinine,sodium,...,alb_1.0,alb_2.0,alb_3.0,alb_4.0,alb_5.0,sugar_1.0,sugar_2.0,sugar_3.0,sugar_4.0,sugar_5.0
0,7.0,50.0,0.333333,1.000000,0.0,0.0,113.666667,18.0,0.8,137.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,62.0,80.0,1.000000,1.000000,0.0,0.0,423.000000,53.0,1.8,132.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,48.0,70.0,1.000000,0.000000,1.0,0.0,117.000000,56.0,3.8,111.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,51.0,80.0,1.000000,1.000000,0.0,0.0,106.000000,26.0,1.4,138.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,60.0,90.0,0.333333,0.333333,0.0,0.0,74.000000,25.0,1.1,142.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,55.0,80.0,1.000000,1.000000,0.0,0.0,140.000000,49.0,0.5,150.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
395,42.0,70.0,1.000000,1.000000,0.0,0.0,75.000000,31.0,1.2,141.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
396,12.0,80.0,1.000000,1.000000,0.0,0.0,100.000000,26.0,0.6,137.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
397,17.0,60.0,1.000000,1.000000,0.0,0.0,114.000000,50.0,1.0,135.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [135]:
#把類別變數轉回0/1
binary_col = ['hypertension', 'diabetes mellitus', 'coronary artery disease', 'pedal edema', 'anemia', 'red blood cells', 'pus cell', 'pus cell clumps', 'bacteria', 'appetite', 'class']
for i in binary_col:
    kidney_imputed[i] = kidney_imputed[i].apply(lambda x:1 if x > 0.5 else 0)
kidney_imputed

Unnamed: 0,age,blood pressure,red blood cells,pus cell,pus cell clumps,bacteria,blood glucose random,blood urea,serum creatinine,sodium,...,alb_1.0,alb_2.0,alb_3.0,alb_4.0,alb_5.0,sugar_1.0,sugar_2.0,sugar_3.0,sugar_4.0,sugar_5.0
0,7.0,50.0,0,1,0,0,113.666667,18.0,0.8,137.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,62.0,80.0,1,1,0,0,423.000000,53.0,1.8,132.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,48.0,70.0,1,0,1,0,117.000000,56.0,3.8,111.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,51.0,80.0,1,1,0,0,106.000000,26.0,1.4,138.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,60.0,90.0,0,0,0,0,74.000000,25.0,1.1,142.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,55.0,80.0,1,1,0,0,140.000000,49.0,0.5,150.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
395,42.0,70.0,1,1,0,0,75.000000,31.0,1.2,141.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
396,12.0,80.0,1,1,0,0,100.000000,26.0,0.6,137.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
397,17.0,60.0,1,1,0,0,114.000000,50.0,1.0,135.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [136]:
#查看相關係數矩陣
correlation_matrix = kidney_imputed.corr()
highly_correlated = []

for i in range(len(correlation_matrix.columns)):
    for j in range(i+1, len(correlation_matrix.columns)):
        if abs(correlation_matrix.iloc[i, j]) > 0.7:
            highly_correlated.append((correlation_matrix.columns[i], correlation_matrix.columns[j]))
for var1, var2 in highly_correlated:
    print(f"{var1}, {var2}:{correlation_matrix.loc[var1, var2].round(2)}")

hemoglobin, packed cell volume:0.87
hemoglobin, class:-0.71
packed cell volume, red blood cell count:0.71


In [137]:
#查看共變異矩陣
df_ckd = kidney_imputed[kidney_imputed['class'] == True]
df_notckd = kidney_imputed[kidney_imputed['class'] == False]

cov_matrix_ckd = df_ckd.cov() 
cov_matrix_notckd = df_notckd.cov()  

print(cov_matrix_ckd)
print(cov_matrix_notckd)

                                 age  blood pressure  red blood cells  \
age                       312.452598       51.707715        -0.685729   
blood pressure             51.707715      229.815673        -1.396707   
red blood cells            -0.685729       -1.396707         0.249690   
pus cell                   -0.667679       -0.794153         0.014529   
pus cell clumps             0.946356       -0.076727         0.014268   
bacteria                    0.067139        0.308759         0.007281   
blood glucose random      286.000731      116.519652        -5.189342   
blood urea                121.964867      104.320665         0.504867   
serum creatinine            9.741698        8.653981         0.043106   
sodium                     -9.759587        1.962520         0.306384   
potassium                   4.931545        3.783796         0.164278   
hemoglobin                 -0.248516       -5.069327         0.115269   
packed cell volume         -6.523279      -19.61846

# 切分資料

In [138]:
#切訓練/測試資料
features = kidney_imputed.columns.tolist()
features.remove('class')
X = kidney_imputed[features]
y = kidney_imputed['class'].astype(bool)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
y

0       True
1       True
2       True
3       True
4       True
       ...  
394    False
395    False
396    False
397    False
398    False
Name: class, Length: 399, dtype: bool

# 羅吉斯回歸

In [139]:
X = sm.add_constant(X_train)  
logit = sm.Logit(y_train, X)
logit_result = logit.fit(maxiter=3)
logit_result.summary()

         Current function value: 0.069913
         Iterations: 3




0,1,2,3
Dep. Variable:,class,No. Observations:,319.0
Model:,Logit,Df Residuals:,283.0
Method:,MLE,Df Model:,35.0
Date:,"Wed, 25 Oct 2023",Pseudo R-squ.:,0.8951
Time:,21:46:38,Log-Likelihood:,-22.302
converged:,False,LL-Null:,-212.69
Covariance Type:,nonrobust,LLR p-value:,1.0899999999999999e-59

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,12.1951,10.003,1.219,0.223,-7.411,31.801
age,-0.0124,0.019,-0.646,0.518,-0.050,0.025
blood pressure,0.0061,0.027,0.224,0.823,-0.047,0.060
red blood cells,-1.0014,0.760,-1.318,0.188,-2.491,0.488
pus cell,0.5310,0.951,0.558,0.577,-1.333,2.395
pus cell clumps,-0.2071,1.692,-0.122,0.903,-3.523,3.109
bacteria,-0.2521,2.174,-0.116,0.908,-4.512,4.008
blood glucose random,0.0026,0.007,0.394,0.693,-0.010,0.016
blood urea,-0.0111,0.012,-0.909,0.363,-0.035,0.013


In [140]:
# 記錄各迴歸係數跟它的p-value，並依p-value排序
p_values_df = pd.DataFrame({'P-value': logit_result.pvalues.round(3)})
sorted_p_values_df = p_values_df.sort_values(by='P-value')
print(sorted_p_values_df)

                         P-value
sp_1.025                   0.007
sp_1.02                    0.022
alb_1.0                    0.080
red blood cells            0.188
hypertension               0.196
alb_2.0                    0.210
const                      0.223
alb_4.0                    0.229
hemoglobin                 0.265
packed cell volume         0.280
pedal edema                0.306
alb_3.0                    0.314
appetite                   0.321
diabetes mellitus          0.322
blood urea                 0.363
coronary artery disease    0.410
sugar_2.0                  0.459
age                        0.518
pus cell                   0.577
red blood cell count       0.634
sugar_4.0                  0.671
anemia                     0.690
blood glucose random       0.693
sugar_3.0                  0.693
potassium                  0.710
serum creatinine           0.763
sodium                     0.768
white blood cell count     0.779
sp_1.015                   0.803
sugar_5.0 

In [141]:
# 建立羅吉斯回歸模型
logit = LogisticRegression(max_iter=50000)
logit_result = logit.fit(X_train, y_train)

# 在測試資料上進行預測
y_pred = logit.predict(X_test)

# 評估模型表現
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.975


# 線性判別分析模型

In [142]:
# 建立線性判别分析模型
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)

# 在測試資料上進行預測
y_pred = lda.predict(X_test)

# 評估模型表現
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9375


# 二次判別分析模型

In [143]:
# 建立二次判别分析模型
qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train, y_train)

# 在測試資料上進行預測
y_pred = qda.predict(X_test)

# 評估模型表現
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9375


