In [1]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import pyarrow.parquet as pq
from IPython.display import clear_output
import os
from sklearn.model_selection import train_test_split
import gc
parquet_train_clean=pq.ParquetFile("D:/kaggle/DRW/train.parquet")
feature=np.array(np.matrix(pd.read_excel("D:/kaggle/DRW/feature.xlsx")).T[1,:])[0]
parquet_test_clean=pq.ParquetFile("D:/kaggle/DRW/test.parquet")
test=parquet_test_clean.read().to_pandas()
data=parquet_train_clean.read().to_pandas()
X1=data.loc[:,data.columns!='label']
X=data.loc[:,feature]
test1=test.loc[:,data.columns!='label']
test2=data.loc[:,feature]
y=data['label']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=116)
x_train = tf.cast(x_train, tf.float32)
x_test = tf.cast(x_test, tf.float32)
y_train = tf.cast(y_train, tf.float32)
y_test = tf.cast(y_test, tf.float32)
gc.collect()
train_db = tf.data.Dataset.from_tensor_slices((x_train,y_train)).batch(1024)
test_db = tf.data.Dataset.from_tensor_slices((x_test,y_test)).batch(1024)
print(len(test2))
del x_train, x_test, y_train, y_test, X,X1, y
class Precision6(tf.keras.metrics.Metric):
    def __init__(self, name='6digit_precision', **kwargs):
        super().__init__(name=name, **kwargs)
        self.correct = self.add_weight(name='correct', initializer='zeros')
        self.total = self.add_weight(name='total', initializer='zeros')
    
    def update_state(self, y_true, y_pred, sample_weight=None):
        # 比较4位小数精度
        matches = tf.abs(y_true - y_pred) < 1e-4
        self.correct.assign_add(tf.reduce_sum(tf.cast(matches, tf.float32)))
        self.total.assign_add(tf.cast(tf.size(y_true), tf.float32))
    
    def result(self):
        return self.correct / self.total
    
    def reset_states(self):
        self.correct.assign(0.)
        self.total.assign(0.)
# 可视化回调类
class TrainingVisualizer(tf.keras.callbacks.Callback):
    def __init__(self, plot_interval=100):
        super().__init__()
        self.plot_interval = plot_interval
        self.batch_losses = []
        self.batch_mae = []
        self.batch_precision6 = []
        self.val_losses = []
        self.val_mae = []
        self.val_precision6 = []
        self.batch_count = 0
    
    def on_train_begin(self, logs=None):
        plt.figure(figsize=(15, 10))
    
    def on_batch_end(self, batch, logs=None):
        self.batch_count += 1
        
        # 记录训练指标
        self.batch_losses.append(logs.get('loss'))
        self.batch_mae.append(logs.get('mae'))
        self.batch_precision6.append(logs.get('6digit_precision'))
        
        # 定期更新图表
        if self.batch_count % self.plot_interval == 0:
            self.update_plots()
    
    def on_epoch_end(self, epoch, logs=None):
        # 记录验证指标
        self.val_losses.append(logs.get('val_loss'))
        self.val_mae.append(logs.get('val_mae'))
        self.val_precision6.append(logs.get('val_6digit_precision'))
        
        # 更新图表
        self.update_plots()
    
    def update_plots(self):
        clear_output(wait=True)
        plt.figure(figsize=(15, 10))
        
        # 损失曲线
        plt.subplot(2, 2, 1)
        plt.plot(self.batch_losses, label='Train Loss')
        if self.val_losses:
            val_x = [i * len(self.batch_losses)/len(self.val_losses) for i in range(len(self.val_losses))]
            plt.plot(val_x, self.val_losses, 'ro-', label='Validation Loss')
        plt.title('Loss per Batch')
        plt.xlabel('Batch')
        plt.ylabel('Loss')
        plt.legend()
        plt.grid(True)
        
        # MAE曲线
        plt.subplot(2, 2, 2)
        plt.plot(self.batch_mae, label='Train MAE')
        if self.val_mae:
            val_x = [i * len(self.batch_mae)/len(self.val_mae) for i in range(len(self.val_mae))]
            plt.plot(val_x, self.val_mae, 'ro-', label='Validation MAE')
        plt.title('MAE per Batch')
        plt.xlabel('Batch')
        plt.ylabel('MAE')
        plt.legend()
        plt.grid(True)
        
        # Precision6曲线
        plt.subplot(2, 2, 3)
        plt.plot(self.batch_precision6, label='Train Precision6')
        if self.val_precision6:
            val_x = [i * len(self.batch_precision6)/len(self.val_precision6) for i in range(len(self.val_precision6))]
            plt.plot(val_x, self.val_precision6, 'ro-', label='Validation Precision6')
        plt.title('6-digit Precision per Batch')
        plt.xlabel('Batch')
        plt.ylabel('Precision')
        plt.legend()
        plt.grid(True)
        
        # 学习率曲线
        if hasattr(self.model.optimizer, 'learning_rate'):
            lr = self.model.optimizer.learning_rate
            if callable(lr):
                current_lr = lr(self.model.optimizer.iterations).numpy()
            else:
                current_lr = lr.numpy()
            plt.subplot(2, 2, 4)
            plt.axhline(y=current_lr, color='r', linestyle='-')
            plt.text(0.5, current_lr, f'Current LR: {current_lr:.7f}', 
                     verticalalignment='bottom', horizontalalignment='center')
            plt.title('Current Learning Rate')
            plt.ylim(0, max(0.001, current_lr * 2))
        
        plt.tight_layout()
        plt.show()
# 创建模型
model = tf.keras.models.Sequential([
    # 第一层
    tf.keras.layers.Dense(
        units=2048,
        input_shape=(270,),
        activation='relu',
        kernel_regularizer=tf.keras.regularizers.l1_l2(l1=1e-5, l2=1e-4)
        ),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.6),
    
    # 第二层
    tf.keras.layers.Dense(
        units=1024,
        activation='relu',
        kernel_regularizer=tf.keras.regularizers.l1_l2(l1=1e-5, l2=1e-4)
        ),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.5),
    
    # 第三层
    tf.keras.layers.Dense(
        units=512,
        activation='relu',
        kernel_regularizer=tf.keras.regularizers.l1_l2(l1=1e-5, l2=1e-4)
        ),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.3),
    
    # 第四层
    tf.keras.layers.Dense(
        units=256,
        activation='relu',
        kernel_regularizer=tf.keras.regularizers.l1_l2(l1=1e-5, l2=1e-4)
        ),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.2),
    
    tf.keras.layers.Dense(1, activation='linear') 
])


lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.0005,
    decay_steps=10000,
    decay_rate=0.9,
    staircase=True
)

optimizer = tf.keras.optimizers.Adam(
    learning_rate=lr_schedule,
    beta_1=0.9,
    beta_2=0.999
)

model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=[
        tf.keras.metrics.MeanAbsoluteError(name='mae'),
        tf.keras.metrics.RootMeanSquaredError(name='rmse'),
        Precision6(name='6digit_precision')
    ]
)
# 创建保存模型的目录
os.makedirs('model_checkpoints', exist_ok=True)

# 1. 模型保存点回调
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='model_checkpoints/best_model.h5',
    monitor='val_6digit_precision',
    verbose=1,  # 显示保存信息
    save_best_only=True,
    save_weights_only=False,  # 保存整个模型（结构+权重）
    mode='max',  # 因为精度是越大越好
    save_freq='epoch'  # 每个epoch结束时检查
)

# 2. 早停回调
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_6digit_precision',
    patience=20,
    mode='max',
    restore_best_weights=True
)

'''
# 3. 学习率调整回调
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_6digit_precision',
    factor=0.5,
    patience=5,
    min_lr=1e-7,
    verbose=1,  # 显示学习率变化
    mode='max'
)


# 4. TensorBoard日志回调
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir='./logs',
    histogram_freq=1,  # 每1个epoch记录一次直方图
    update_freq='batch',  # 每个batch更新一次标量
    profile_batch=0  # 禁用性能分析
)
'''



# 创建可视化回调实例
visualizer = TrainingVisualizer(plot_interval=50)

# 整合所有回调
callbacks = [
    early_stopping,
    checkpoint_callback,
    visualizer,
]
history = model.fit(
    train_db,
    epochs=100,
 #   validation_data=test_db,
  #  callbacks=callbacks
    )

#model.summary()
pd.DataFrame({'predict':model.predict(test2).reshape(1,-1)[0]}).to_excel("D:/kaggle/DRW/result.xlsx")

525886
Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m411/411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 41ms/step - 6digit_precision: 0.0866 - loss: 2.8180 - mae: 0.8883 - rmse: 1.2980
Epoch 2/100
[1m411/411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 42ms/step - 6digit_precision: 0.1139 - loss: 2.0399 - mae: 0.6455 - rmse: 0.9773
Epoch 3/100
[1m411/411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 47ms/step - 6digit_precision: 0.1075 - loss: 1.7384 - mae: 0.5781 - rmse: 0.8444
Epoch 4/100
[1m411/411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 46ms/step - 6digit_precision: 0.1035 - loss: 1.4902 - mae: 0.5176 - rmse: 0.7342
Epoch 5/100
[1m411/411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 46ms/step - 6digit_precision: 0.1015 - loss: 1.3074 - mae: 0.4735 - rmse: 0.6641
Epoch 6/100
[1m411/411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 48ms/step - 6digit_precision: 0.0982 - loss: 1.1537 - mae: 0.4407 - rmse: 0.6150
Epoch 7/100
[1m411/411[0m [32m━━━━━━━━━━━━━━━━━━━

PermissionError: [Errno 13] Permission denied: 'D:/kaggle/DRW/result.xlsx'