In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import numpy as np

from sklearn.preprocessing import scale
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer

import os

from geomloss import SamplesLoss

from imputers import OTimputer, RRimputer

from utils import *
from data_loaders import dataset_loader
from softimpute import softimpute, cv_softimpute

import logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logging.debug("test")

torch.set_default_tensor_type('torch.DoubleTensor')

In [2]:
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# 读取 CSV 文件
df = pd.read_csv("processed_data.csv")
df

Unnamed: 0,Id,Ni,Mn,Co,Li,dapant,dopant_ratio,tem,holding_time
0,1,95.0,0.0,4,101,0,1,800,10
1,2,95.0,0.0,4,101,24,1,800,10
2,3,95.0,0.0,4,101,21,1,800,10
3,4,95.0,0.0,4,101,17,1,800,10
4,5,95.0,0.0,4,101,16,1,800,10
...,...,...,...,...,...,...,...,...,...
395,75,92.0,4.0,4,101,0,1,750,10
396,76,92.0,4.0,4,101,0,3,750,10
397,77,92.0,4.0,4,101,0,5,750,10
398,78,81.5,0.0,15,105,0,3.5,750,15


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             400 non-null    int64  
 1   Ni             400 non-null    float64
 2   Mn             400 non-null    float64
 3   Co             400 non-null    object 
 4   Li             400 non-null    int64  
 5   dapant         400 non-null    int64  
 6   dopant_ratio   400 non-null    object 
 7   tem            400 non-null    int64  
 8    holding_time  400 non-null    object 
dtypes: float64(2), int64(4), object(3)
memory usage: 28.2+ KB


In [4]:

# 1. 去除 Id 列
df = df.drop("Id", axis=1)
data = df.to_numpy()
print(data[:5])

[[95.0 0.0 '4' 101 0 '1' 800 '10']
 [95.0 0.0 '4' 101 24 '1' 800 '10']
 [95.0 0.0 '4' 101 21 '1' 800 '10']
 [95.0 0.0 '4' 101 17 '1' 800 '10']
 [95.0 0.0 '4' 101 16 '1' 800 '10']]


In [5]:
# 检查每列的数据类型和唯一值
for col in df.columns:
    print(f"\n列名: {col}")
    print("唯一值:", df[col].unique()[:5])  # 显示前5个唯一值
    print("数据类型:", df[col].dtype)

# 识别数值型列
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
print("\n数值型列:", numeric_cols.tolist())


列名: Ni
唯一值: [95.  94.  99.  98.5 98. ]
数据类型: float64

列名: Mn
唯一值: [ 0.    2.   10.    1.82  3.3 ]
数据类型: float64

列名: Co
唯一值: ['4' '3' '0' '10' '2.9']
数据类型: object

列名: Li
唯一值: [101 110 103 105 104]
数据类型: int64

列名: dapant
唯一值: [ 0 24 21 17 16]
数据类型: int64

列名: dopant_ratio
唯一值: ['1' '1.5' '2' '3' '4']
数据类型: object

列名: tem
唯一值: [800 700 750 770 790]
数据类型: int64

列名:  holding_time
唯一值: ['10' '30' '60' '15' '24']
数据类型: object

数值型列: ['Ni', 'Mn', 'Li', 'dapant', 'tem']


In [6]:
import re

# 定义函数来提取数字
def extract_number(x):
    if isinstance(x, (int, float)):
        return float(x)
    # 提取字符串中的数字
    numbers = re.findall(r'\d+', str(x))
    return float(numbers[0]) if numbers else 0

# 对每一列应用转换
df = df.applymap(extract_number)

# 转换为numpy数组
data = df.to_numpy()

print("转换后的数据预览：")
print(data[:5])

转换后的数据预览：
[[ 95.   0.   4. 101.   0.   1. 800.  10.]
 [ 95.   0.   4. 101.  24.   1. 800.  10.]
 [ 95.   0.   4. 101.  21.   1. 800.  10.]
 [ 95.   0.   4. 101.  17.   1. 800.  10.]
 [ 95.   0.   4. 101.  16.   1. 800.  10.]]


 ## 保存

In [7]:
# 对每一列应用转换
df = df.applymap(extract_number)

# 转换为numpy数组
data = df.to_numpy()

# 将数据保存为CSV文件
import pandas as pd
pd.DataFrame(data).to_csv('processed_data1.csv', index=False)
print("数据已保存到 processed_data1.csv")

print("转换后的数据预览：")
print(data[:5])

数据已保存到 processed_data1.csv
转换后的数据预览：
[[ 95.   0.   4. 101.   0.   1. 800.  10.]
 [ 95.   0.   4. 101.  24.   1. 800.  10.]
 [ 95.   0.   4. 101.  21.   1. 800.  10.]
 [ 95.   0.   4. 101.  17.   1. 800.  10.]
 [ 95.   0.   4. 101.  16.   1. 800.  10.]]


In [None]:
#ground_truth = scale(data) # "wine" can be replaced with any of the datasets
                                             # supported by dataset_loader (see data_loaders.py)
#ground_truth = data
#X_true = torch.from_numpy(ground_truth)

In [8]:
from data_scaler import DataScaler
scaler = DataScaler()
ground_truth = scaler.fit_transform(data)

X_true = torch.from_numpy(ground_truth)

## MISS DATA GENERATION

In [9]:
np.random.seed(42)

p = 0.3

mask = np.random.rand(*ground_truth.shape) < p #True for missing values, false for others
x_miss = np.copy(ground_truth)

x_miss[mask] = np.nan
X_miss = torch.from_numpy(x_miss)

 ## Hyperparameter

In [10]:
n, d = X_miss.shape
batchsize = 128 # If the batch size is larger than half the dataset's size,
                # it will be redefined in the imputation methods.
lr = 1e-2
epsilon = pick_epsilon(X_miss) # Set the regularization parameter as a multiple of the median distance, as per the paper.

 # sinkhorn algorithm

In [11]:
sk_imputer = OTimputer(eps=epsilon, batchsize=batchsize, lr=lr, niter=2000)

In [12]:
sk_imp, sk_maes, sk_rmses = sk_imputer.fit_transform(X_miss, verbose=True, report_interval=500, X_true=X_true)

INFO:root:batchsize = 128, epsilon = 0.1659
INFO:root:Iteration 0:	 Loss: 1.3389	 Validation MAE: 0.7291	RMSE: 1.0513
INFO:root:Iteration 500:	 Loss: 0.8614	 Validation MAE: 0.5603	RMSE: 0.9898
INFO:root:Iteration 1000:	 Loss: 0.6857	 Validation MAE: 0.5700	RMSE: 0.9999
INFO:root:Iteration 1500:	 Loss: 0.9164	 Validation MAE: 0.5679	RMSE: 1.0031


In [29]:
print(sk_maes.size)

2000


In [27]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
# 转换为DataFrame
restored_data = scaler.inverse_transform(sk_imp.detach_().numpy())
#MAE = mean_absolute_error(data, restored_data.detach().numpy())

mae = mean_absolute_error(data, restored_data)

mse = mean_squared_error(data, restored_data)
rmse = np.sqrt(mse)

print("MAE:", mae)
print("RMSE:", rmse)

MAE: 2.813477974126047
RMSE: 18.152475443448097


In [None]:
df_imp = pd.DataFrame(restored_data)  # 先detach再转换为numpy数组

# 保存为CSV文件
df_imp.to_csv('imputed_data1.csv', index=False)
print("数据已保存到 imputed_data1.csv")

数据已保存到 imputed_data.csv


## MICE

In [30]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
# 设置MICE插补器
imputer = IterativeImputer(random_state=42, max_iter=10)

# 对缺失值进行插补
x_imputed = imputer.fit_transform(x_miss)

restored_data = scaler.inverse_transform(x_imputed)
MAE = mean_absolute_error(data, restored_data)
mse = mean_squared_error(data, restored_data)
rmse = np.sqrt(mse)
#MAE = mean_absolute_error(sk_imp.detach().numpy(), x_imputed)
print(MAE)
print(rmse)


3.7354394864939366
21.336747406738393




In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import numpy as np

# 设置MICE插补器
imputer = IterativeImputer(random_state=42, max_iter=10)

# 对缺失值进行插补
x_imputed = imputer.fit_transform(x_miss)

# 计算 MAE 和 RMSE
mask = np.isnan(x_miss)  # 获取缺失值的位置
mae = np.mean(np.abs(x_imputed[mask] - ground_truth[mask]))  # 计算MAE
rmse = np.sqrt(np.mean((x_imputed[mask] - ground_truth[mask])**2))  # 计算RMSE

print("验证 MAE:", mae)
print("验证 RMSE:", rmse)

## Liner round-bin

In [None]:
#Create the imputation models
d_ = d - 1
models = {}

for i in range(d):
    models[i] = nn.Linear(d_, 1)

#Create the imputer
lin_rr_imputer = RRimputer(models, eps=epsilon, lr=lr)

In [None]:
lin_imp, lin_maes, lin_rmses = lin_rr_imputer.fit_transform(X_miss, verbose=True, X_true=X_true)

 ## MLP round-bin

In [None]:
#Create the imputation models
d_ = d - 1
models = {}

for i in range(d):
    models[i] = nn.Sequential(nn.Linear(d_, 2 * d_),
                              nn.ReLU(),
                              nn.Linear(2 * d_, d_),
                              nn.ReLU(),
                              nn.Linear(d_, 1))

#Create the imputer
mlp_rr_imputer = RRimputer(models, eps=epsilon, lr=lr)

In [None]:
mlp_imp, mlp_maes, mlp_rmses = mlp_rr_imputer.fit_transform(X_miss, verbose=True, X_true=X_true)


## column miss data

In [None]:
np.random.seed(43)

n_samples, n_features = ground_truth.shape
total_missing = int(0.3 * n_samples * n_features)

# 随机打乱列的顺序并选择足够数量的列
cols = np.random.permutation(n_features)
selected_cols = []
candidate_count = 0

# 选择列直到候选元素足够覆盖总缺失数
for col in cols:
    selected_cols.append(col)
    candidate_count += n_samples
    if candidate_count >= total_missing:
        break

# 生成所有候选位置的索引
rows = np.arange(n_samples)
candidate_rows, candidate_cols = np.meshgrid(rows, selected_cols)
candidate_rows = candidate_rows.ravel()
candidate_cols = candidate_cols.ravel()

# 随机打乱并选择指定数量的缺失位置
indices = np.random.permutation(len(candidate_rows))[:total_missing]
missing_rows = candidate_rows[indices]
missing_cols = candidate_cols[indices]

# 创建缺失数据
x_miss = np.copy(ground_truth)
x_miss[missing_rows, missing_cols] = np.nan

X_miss = torch.from_numpy(x_miss)

In [None]:
# 设置MICE插补器
imputer = IterativeImputer(random_state=42, max_iter=10)

# 对缺失值进行插补
x_imputed = imputer.fit_transform(x_miss)

# 计算 MAE 和 RMSE
mask = np.isnan(x_miss)  # 获取缺失值的位置
mae = np.mean(np.abs(x_imputed[mask] - ground_truth[mask]))  # 计算MAE
rmse = np.sqrt(np.mean((x_imputed[mask] - ground_truth[mask])**2))  # 计算RMSE
print("验证 MAE:", mae)
print("验证 RMSE:", rmse)

In [None]:
sk_imputer = OTimputer(eps=epsilon, batchsize=batchsize, lr=lr, niter=2000)

In [None]:
sk_imp, sk_maes, sk_rmses = sk_imputer.fit_transform(X_miss, verbose=True, report_interval=500, X_true=X_true)
#sinkhorn

In [None]:
#Create the imputation models
d_ = d - 1
models = {}

for i in range(d):
    models[i] = nn.Linear(d_, 1)

#Create the imputer
lin_rr_imputer = RRimputer(models, eps=epsilon, lr=lr)
lin_imp, lin_maes, lin_rmses = lin_rr_imputer.fit_transform(X_miss, verbose=True, X_true=X_true)