In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import numpy as np

from sklearn.preprocessing import scale
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

import os

from geomloss import SamplesLoss

from imputers import OTimputer, RRimputer

from data_scaler import DataScaler

from utils import *
from data_loaders import dataset_loader
from softimpute import softimpute, cv_softimpute

import logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logging.debug("test")

torch.set_default_tensor_type('torch.DoubleTensor')

In [2]:
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# 读取 CSV 文件
df = pd.read_csv("processed_data.csv")
df

Unnamed: 0,Id,Ni,Mn,Co,Li,dapant,dopant_ratio,tem,holding_time
0,1,95.0,0.0,4,101,0,1,800,10
1,2,95.0,0.0,4,101,24,1,800,10
2,3,95.0,0.0,4,101,21,1,800,10
3,4,95.0,0.0,4,101,17,1,800,10
4,5,95.0,0.0,4,101,16,1,800,10
...,...,...,...,...,...,...,...,...,...
395,75,92.0,4.0,4,101,0,1,750,10
396,76,92.0,4.0,4,101,0,3,750,10
397,77,92.0,4.0,4,101,0,5,750,10
398,78,81.5,0.0,15,105,0,3.5,750,15


In [3]:
# 1. 去除 Id 列
df = df.drop("Id", axis=1)
data = df.to_numpy()
print(data[:5])

[[95.0 0.0 '4' 101 0 '1' 800 '10']
 [95.0 0.0 '4' 101 24 '1' 800 '10']
 [95.0 0.0 '4' 101 21 '1' 800 '10']
 [95.0 0.0 '4' 101 17 '1' 800 '10']
 [95.0 0.0 '4' 101 16 '1' 800 '10']]


In [4]:
# 检查每列的数据类型和唯一值
for col in df.columns:
    print(f"\n列名: {col}")
    print("唯一值:", df[col].unique()[:5])  # 显示前5个唯一值
    print("数据类型:", df[col].dtype)

# 识别数值型列
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
print("\n数值型列:", numeric_cols.tolist())


列名: Ni
唯一值: [95.  94.  99.  98.5 98. ]
数据类型: float64

列名: Mn
唯一值: [ 0.    2.   10.    1.82  3.3 ]
数据类型: float64

列名: Co
唯一值: ['4' '3' '0' '10' '2.9']
数据类型: object

列名: Li
唯一值: [101 110 103 105 104]
数据类型: int64

列名: dapant
唯一值: [ 0 24 21 17 16]
数据类型: int64

列名: dopant_ratio
唯一值: ['1' '1.5' '2' '3' '4']
数据类型: object

列名: tem
唯一值: [800 700 750 770 790]
数据类型: int64

列名:  holding_time
唯一值: ['10' '30' '60' '15' '24']
数据类型: object

数值型列: ['Ni', 'Mn', 'Li', 'dapant', 'tem']


In [5]:
import re

# 定义函数来提取数字
def extract_number(x):
    if isinstance(x, (int, float)):
        return float(x)
    # 提取字符串中的数字
    numbers = re.findall(r'\d+', str(x))
    return float(numbers[0]) if numbers else 0

# 对每一列应用转换
df = df.applymap(extract_number)

# 转换为numpy数组
data = df.to_numpy()

print("转换后的数据预览：")
print(data[:5])

转换后的数据预览：
[[ 95.   0.   4. 101.   0.   1. 800.  10.]
 [ 95.   0.   4. 101.  24.   1. 800.  10.]
 [ 95.   0.   4. 101.  21.   1. 800.  10.]
 [ 95.   0.   4. 101.  17.   1. 800.  10.]
 [ 95.   0.   4. 101.  16.   1. 800.  10.]]


In [6]:
#ground_truth = scale(data) # "wine" can be replaced with any of the dataset
                                   # supported by dataset_loader (see data_loaders.py)
ground_truth = data
X_true = torch.from_numpy(ground_truth)

## test by sinkhorn (10 kinds of missing data)

In [8]:
np.random.seed(42)
n_experiments = 10
all_maes = []
all_rmses = []
for exp in range(n_experiments):
    # 生成随机缺失数据
    scaler = DataScaler()
    ground_truth = scaler.fit_transform(data)
    X_true = torch.from_numpy(ground_truth)

    
    mask = np.random.rand(*ground_truth.shape) < 0.3
    x_miss = np.copy(ground_truth)
    x_miss[mask] = np.nan
    X_miss = torch.from_numpy(x_miss)
    
    # 使用sk方法进行插补
    n, d = X_miss.shape
    batchsize = 128 # If the batch size is larger than half the dataset's size,
                # it will be redefined in the imputation methods.
    lr = 1e-2
    epsilon = pick_epsilon(X_miss) 
    sk_imputer = OTimputer(eps=epsilon, batchsize=batchsize, lr=lr, niter=2000)
    sk_imp, sk_maes, sk_rmses = sk_imputer.fit_transform(X_miss, verbose=False, report_interval=500, X_true=X_true)

    restored_data = scaler.inverse_transform(sk_imp.detach_().numpy())
#MAE = mean_absolute_error(data, restored_data.detach().numpy())

    mae = mean_absolute_error(data, restored_data)

    mse = mean_squared_error(data, restored_data)
    rmse = np.sqrt(mse)
    
    # 记录最后一次迭代的评估指标
    all_maes.append(mae)
    all_rmses.append(rmse)

# 计算平均指标
mean_mae = np.mean(all_maes)
mean_rmse = np.mean(all_rmses)

print(f"10次实验的平均 MAE: {mean_mae:.4f}")
print(f"10次实验的平均 RMSE: {mean_rmse:.4f}")

10次实验的平均 MAE: 2.3044
10次实验的平均 RMSE: 13.7485


In [11]:
np.random.seed(42)
n_experiments = 10
all_maes = []
all_rmses = []

# 创建MICE插补器
imputer = IterativeImputer(random_state=42, max_iter=10)

for exp in range(n_experiments):
    # 生成随机缺失数据
    mask = np.random.rand(*ground_truth.shape) < 0.3
    x_miss = np.copy(ground_truth)
    x_miss[mask] = np.nan
    
    # 使用MICE进行插补
    x_imputed = imputer.fit_transform(x_miss)
    
    # 计算MAE和RMSE
    restored_data = scaler.inverse_transform(x_imputed)
    mae = mean_absolute_error(data, restored_data)
    mse = mean_squared_error(data, restored_data)
    rmse = np.sqrt(mse)
    #diff = x_imputed[mask] - ground_truth[mask]
    #mae = np.mean(np.abs(diff))
    #rmse = np.sqrt(np.mean(np.square(diff)))
    
    all_maes.append(mae)
    all_rmses.append(rmse)

# 计算平均指标
mean_mae = np.mean(all_maes)
mean_rmse = np.mean(all_rmses)

print(f"10次实验的平均 MAE: {mean_mae:.4f}")
print(f"10次实验的平均 RMSE: {mean_rmse:.4f}")



10次实验的平均 MAE: 2.9815
10次实验的平均 RMSE: 15.8683




In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
# 设置MICE插补器
imputer = IterativeImputer(random_state=42, max_iter=10)

# 对缺失值进行插补
x_imputed = imputer.fit_transform(x_miss)

restored_data = scaler.inverse_transform(x_imputed)
MAE = mean_absolute_error(data, restored_data)
mse = mean_squared_error(data, restored_data)
rmse = np.sqrt(mse)
#MAE = mean_absolute_error(sk_imp.detach().numpy(), x_imputed)
print(MAE)
print(rmse)
