# 缺失值填补

## 1、随机删除iris中的一些属性值（随机删除总量的5%）


In [1]:
import numpy as np
from sklearn.datasets import load_iris
import pandas as pd

# Load the Iris data set and work on a COPY of its feature matrix, so the
# NaN values written below do not overwrite the array stored in `iris.data`.
iris = load_iris()
iris_array = iris.data.copy()

# Fix the global NumPy seed so the same cells are removed on every run.
np.random.seed(42)

# Shape and total number of cells of the feature matrix.
num_rows, num_cols = iris_array.shape
num_values = num_rows * num_cols

# Number of values to remove in each column: 5% of all cells, split evenly
# over the columns and truncated to an integer.
num_values_to_remove = int(num_values * 0.05 / num_cols)

# For every feature column, pick distinct row positions at random and
# replace the values found there with NaN.
for i in range(num_cols):
    indices_to_remove = np.random.choice(num_rows, size=num_values_to_remove, replace=False)
    iris_array[indices_to_remove, i] = np.nan

# Wrap the modified array in a DataFrame that carries the feature names.
modified_iris_df = pd.DataFrame(iris_array, columns=iris.feature_names)

# Show the modified data together with the per-column count of missing values.
print(modified_iris_df, modified_iris_df.isna().sum())



     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                  5.1               3.5                1.4               0.2
1                  4.9               3.0                1.4               0.2
2                  4.7               3.2                1.3               0.2
3                  4.6               3.1                1.5               0.2
4                  5.0               3.6                1.4               0.2
..                 ...               ...                ...               ...
145                6.7               3.0                5.2               2.3
146                6.3               2.5                5.0               1.9
147                6.5               3.0                5.2               2.0
148                6.2               3.4                5.4               2.3
149                5.9               NaN                5.1               1.8

[150 rows x 4 columns] sepal length (cm)    7
sepal width (cm) 

## 2、采用均值、中位数、前值进行缺失值填补(填补后进行z-score标准化)

- 均值填补

In [2]:
# Replace every missing entry with the mean of its own column, then attach
# the class label as an extra 'species' column.
mean_values = modified_iris_df.mean()
iris_df_filled_mean = modified_iris_df.fillna(value=mean_values)
iris_df_filled_mean['species'] = iris.target_names[iris.target]

# z-score standardization of the four feature columns, using the population
# standard deviation (ddof=0).
feature_block = iris_df_filled_mean.iloc[:, 0:4]
mean = feature_block.mean(axis=0)
std = feature_block.std(axis=0, ddof=0)
iris_df_filled_mean.iloc[:, 0:4] = feature_block.sub(mean, axis=1).div(std, axis=1)

# Show the imputed and standardized data set.
print(iris_df_filled_mean)

     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0            -0.901670          1.028792          -1.368290         -1.300653   
1            -1.149868         -0.127790          -1.368290         -1.300653   
2            -1.398067          0.334843          -1.426386         -1.300653   
3            -1.522166          0.103526          -1.310195         -1.300653   
4            -1.025769          1.260108          -1.368290         -1.300653   
..                 ...               ...                ...               ...   
145           1.083914         -0.127790           0.839337          1.523542   
146           0.587518         -1.284372           0.723146          0.985600   
147           0.835716         -0.127790           0.839337          1.120085   
148           0.463419          0.797475           0.955528          1.523542   
149           0.091122          0.000000           0.781242          0.851114   

       species  
0       se

- 中位数填补

In [3]:
# Replace every missing entry with the median of its own column, then attach
# the class label as an extra 'species' column.
median_values = modified_iris_df.median()
iris_df_filled_median = modified_iris_df.fillna(value=median_values)
iris_df_filled_median['species'] = iris.target_names[iris.target]

# z-score standardization of the four feature columns, using the population
# standard deviation (ddof=0).
feature_block = iris_df_filled_median.iloc[:, 0:4]
mean = feature_block.mean(axis=0)
std = feature_block.std(axis=0, ddof=0)
iris_df_filled_median.iloc[:, 0:4] = feature_block.sub(mean, axis=1).div(std, axis=1)

# Show the imputed and standardized data set.
print(iris_df_filled_median)

     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0            -0.900110          1.034380          -1.381466         -1.308063   
1            -1.148302         -0.121782          -1.381466         -1.308063   
2            -1.396494          0.340682          -1.439381         -1.308063   
3            -1.520590          0.109450          -1.323551         -1.308063   
4            -1.024206          1.265612          -1.381466         -1.308063   
..                 ...               ...                ...               ...   
145           1.085426         -0.121782           0.819304          1.514128   
146           0.589042         -1.277945           0.703474          0.976568   
147           0.837234         -0.121782           0.819304          1.110958   
148           0.464946          0.803147           0.935134          1.514128   
149           0.092658         -0.121782           0.761389          0.842178   

       species  
0       se

- 前值填补

In [4]:
# Fill every missing entry with the previous value in the same column.
# `DataFrame.ffill()` replaces the deprecated `fillna(method='ffill')`.
# A missing value in the very first row has no predecessor, so it is
# back-filled from the next available value to guarantee no NaN remains.
iris_df_filled_ffill = modified_iris_df.ffill().bfill()
iris_df_filled_ffill['species'] = iris.target_names[iris.target]

# z-score standardization of the four feature columns (population std, as
# np.std uses ddof=0).
mean = np.mean(iris_df_filled_ffill.iloc[:, 0:4], axis=0)
std = np.std(iris_df_filled_ffill.iloc[:, 0:4], axis=0)
iris_df_filled_ffill.iloc[:, 0:4] = (iris_df_filled_ffill.iloc[:, 0:4] - mean) / std

# Show the imputed and standardized data set.
print(iris_df_filled_ffill)

     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0            -0.892250          0.963467          -1.340254         -1.307945   
1            -1.132317         -0.149939          -1.340254         -1.307945   
2            -1.372384          0.295424          -1.397448         -1.307945   
3            -1.492418          0.072743          -1.283060         -1.307945   
4            -1.012283          1.186148          -1.340254         -1.307945   
..                 ...               ...                ...               ...   
145           1.028288         -0.149939           0.833131          1.409751   
146           0.548154         -1.263344           0.718742          0.892094   
147           0.788221         -0.149939           0.833131          1.021508   
148           0.428120          0.740786           0.947520          1.409751   
149           0.068019          0.740786           0.775937          0.762680   

       species  
0       se

# 结果比较实验

## 随机选择80%数据训练，20%数据测试，设置种子可重复实验

In [5]:
from sklearn.model_selection import train_test_split
# Load Iris again and build the baseline frame: the four feature columns
# plus a 'species' column holding the class name of every sample.
iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(
    species=iris.target_names[iris.target]
)

def prepare_data(X):
    """Split one dataset into an 80% training part and a 20% test part.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature columns plus a 'species' column with the class labels.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``; the feature frames no longer
        contain the 'species' column.
    """
    # Kept from the original: resets NumPy's global RNG. The split itself is
    # already made reproducible by `random_state` below.
    np.random.seed(42)

    # Take the labels from the dataset being split rather than from a global
    # frame, so features and labels always come from the same rows.
    features = X.drop('species', axis=1)
    labels = X['species']

    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=123)
    return X_train, X_test, y_train, y_test

# Datasets to compare, in order: the complete baseline, then the versions
# imputed with the mean, the median and the previous value.
X_list = [
    iris_df,
    iris_df_filled_mean,
    iris_df_filled_median,
    iris_df_filled_ffill,
]

## 使用KNN分类测试比较有缺失值和无缺失值的数据训练和测试结果，测试结果需要列表比较 accuracy，precision，recall和f1-score


In [6]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Train and evaluate a KNN classifier on every dataset in `X_list` and
# collect accuracy, weighted precision, weighted recall and weighted F1.
dataset_names = ['iris_df', 'iris_df_filled_mean', 'iris_df_filled_median', 'iris_df_filled_ffill']
k = 6  # number of neighbours, the same for every dataset

# One dict per dataset. Building the table from complete rows avoids chained
# assignment (`result.Accuracy[i] = ...`), which pandas warns about and which
# does not update `result` when Copy-on-Write is enabled.
rows = []
for name, X in zip(dataset_names, X_list):
    X_train, X_test, y_train, y_test = prepare_data(X)

    # Fit on the training part and predict the held-out test part.
    iris_knn = KNeighborsClassifier(n_neighbors=k)
    iris_knn.fit(X_train, y_train)
    iris_y_pred = iris_knn.predict(X_test)

    # 'weighted' averages the per-class scores by class support.
    rows.append({
        'Dataset': name,
        'Accuracy': accuracy_score(y_test, iris_y_pred),
        'Precision': precision_score(y_test, iris_y_pred, average='weighted'),
        'Recall': recall_score(y_test, iris_y_pred, average='weighted'),
        'F1_score': f1_score(y_test, iris_y_pred, average='weighted'),
    })

result = pd.DataFrame(rows, columns=['Dataset', 'Accuracy', 'Precision', 'Recall', 'F1_score'])

print(result)

                 Dataset  Accuracy Precision    Recall  F1_score
0                iris_df       0.9   0.90619       0.9  0.901465
1    iris_df_filled_mean  0.833333  0.865278  0.833333  0.836842
2  iris_df_filled_median  0.833333  0.865278  0.833333  0.836842
3   iris_df_filled_ffill       0.9   0.90619       0.9  0.901465
