<h2>导入包</h2>

In [None]:
import logging
import os
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, date

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import xgboost as xgb
from astral import LocationInfo
from astral.sun import sunrise, sunset, dawn, noon, dusk
from colorama import Fore, Style
from lightgbm import Dataset
from pandas import Series, DataFrame
from scipy.optimize import minimize
from sklearn.base import clone
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from xgboost import DMatrix

# Notebook-wide logging: INFO level, each record prefixed with a timestamp.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Debug switch; load_ts reads it to restrict how many series folders are loaded.
is_debug = False

<h2>读取数据</h2>

In [None]:
# CFG
n_splits = 5
seed = 308
# root_dir = '/kaggle/input/child-mind-institute-problematic-internet-use/'
# Local dataset root. A raw string is used so that no backslash is parsed as an
# escape sequence (the previous literal for test_ts_dir contained the invalid
# escape "\C", which newer Python versions warn about).
data_root = r'D:\projects\Kaggle\CMI24\dataset\child-mind-institute-problematic-internet-use'
example_dir = data_root + '\\sample_submission.csv'

# meta_data:
train_meta_dir = data_root + '\\train.csv'
test_meta_dir = data_root + '\\test.csv'

# time_series:
train_ts_dir = data_root + '\\series_train.parquet'
test_ts_dir = data_root + '\\series_test.parquet'

### meta_data

### utils

In [None]:
def printcolor(text):
    """Print ``text`` in yellow and reset the terminal style afterwards."""
    colored = Fore.YELLOW + text
    print(colored + Style.RESET_ALL)

def process_file(folder_path, folder_name):
    """Load one ``part-0.parquet`` file and derive time features from it.

    Args:
        folder_path: Directory that contains ``part-0.parquet``.
        folder_name: Directory name of the form ``id=<value>``; everything
            after the first ``=`` is stored in the ``id`` column.

    Returns:
        A DataFrame whose first two columns are ``time_of_day_hours`` and
        ``day_time``, followed by the remaining columns (``step`` and the raw
        ``time_of_day`` are removed). Returns ``None`` when anything fails;
        the error is printed so the caller can skip the file.
    """
    try:
        file_path = os.path.join(folder_path, "part-0.parquet")
        df = pd.read_parquet(file_path)

        # maxsplit=1 keeps an id that itself contains '=' intact.
        df['id'] = folder_name.split('=', 1)[1]

        # Drop the step column when present.
        if 'step' in df.columns:
            df = df.drop(columns=['step'])

        # time_of_day is in nanoseconds; convert it to hours.
        df['time_of_day_hours'] = df['time_of_day'] / 1e9 / 3600
        # Fractional day count: relative day plus the elapsed part of the day.
        df['day_time'] = df['relative_date_PCIAT'] + (df['time_of_day_hours'] / 24)
        # Relative day modulo 7.
        # NOTE(review): this is only a real weekday if relative day 0 falls on
        # the start of a week - confirm against the data definition.
        df['day_of_week'] = (df['relative_date_PCIAT'] % 7).astype(int)
        # Vectorized flag (values 5 and 6) instead of a per-row Python lambda.
        df['is_weekend'] = (df['day_of_week'] >= 5).astype('int64')

        # Put the time features first and leave out the raw time_of_day column.
        leading = ['time_of_day_hours', 'day_time']
        excluded = set(leading) | {'time_of_day'}
        df = df[leading + [col for col in df.columns if col not in excluded]]

        return df
    except Exception as e:
        print(f"Error processing file {folder_path}: {e}")
        return None

def load_ts(root_dir):
    """Load every ``id=*`` sub-folder of ``root_dir`` into one DataFrame.

    Each folder is handed to ``process_file`` on a thread pool; folders for
    which it returns ``None`` are skipped. When the module-level ``is_debug``
    flag is set, only the first five folders are read.

    Args:
        root_dir: Directory containing one ``id=<value>`` sub-folder per series.

    Returns:
        The per-folder frames concatenated in sorted folder order with a fresh
        integer index.

    Raises:
        ValueError: If no folder could be loaded.
    """
    # os.listdir returns entries in arbitrary order; sorting makes both the
    # debug subset and the final row order reproducible between runs.
    folder_names = sorted(
        name for name in os.listdir(root_dir)
        if name.startswith("id=") and os.path.isdir(os.path.join(root_dir, name))
    )

    # Debug mode: restrict to the first five folders (matches the message below).
    if is_debug:
        folder_names = folder_names[:5]
        printcolor("Debug Mode: Only loaded 5 training data files")

    frames_by_name = {}
    with ThreadPoolExecutor() as executor:
        # Submit one task per folder and remember which folder each future is for.
        futures = {
            executor.submit(process_file, os.path.join(root_dir, name), name): name
            for name in folder_names
        }

        # tqdm shows progress as tasks finish, in completion order.
        for future in tqdm(as_completed(futures), total=len(futures), desc=f"Processing files in {root_dir}"):
            result = future.result()
            if result is not None:
                frames_by_name[futures[future]] = result

    if not frames_by_name:
        raise ValueError(f"No time-series data could be loaded from {root_dir}")

    # Concatenate in folder order rather than completion order so that the
    # result does not depend on thread scheduling.
    ordered_frames = [frames_by_name[name] for name in folder_names if name in frames_by_name]
    return pd.concat(ordered_frames, ignore_index=True)

def plot_features_over_time(df, time_column='day_time'):
    """Plot every feature column of ``df`` against ``time_column``, one subplot per feature.

    Args:
        df: Frame holding ``time_column`` plus the feature columns; an ``id``
            column, if present, is not plotted.
        time_column: Name of the column used for the shared x axis.

    Returns:
        None. The figure is shown with ``plt.show()``; nothing is drawn when
        there is no feature column to plot.
    """
    features = [col for col in df.columns if col not in (time_column, 'id')]
    if not features:
        # plt.subplots cannot create a grid with zero rows.
        return

    num_features = len(features)
    # squeeze=False always yields a 2-D array of Axes; without it a single
    # feature returns a bare Axes object and indexing it fails.
    fig, axes_grid = plt.subplots(
        nrows=num_features, ncols=1, figsize=(10, num_features * 3),
        sharex=True, squeeze=False,
    )
    axes = axes_grid[:, 0]
    fig.suptitle("Features Over Time", fontsize=16)

    for ax, feature in zip(axes, features):
        ax.plot(df[time_column], df[feature], label=feature)
        ax.set_ylabel(feature)
        ax.legend(loc='upper right')

    axes[-1].set_xlabel(time_column)
    # Leave room at the top for the suptitle.
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()

def feature_engineering(df):
    """Drop the ``Season`` columns and append pairwise product/ratio features.

    Args:
        df: Frame containing the ``Physical-*``, ``Basic_Demos-Age``,
            ``PreInt_EduHx-computerinternet_hoursday`` and ``BIA-BIA_*``
            columns referenced in the table below.

    Returns:
        A new DataFrame (the input is not modified) without any column whose
        name contains ``Season`` and with the derived columns appended in
        table order. Ratios are plain divisions, so a zero denominator gives
        ``inf`` or ``NaN``.
    """
    # (new column, left operand, operator, right operand)
    derived_features = (
        ('BMI_Age', 'Physical-BMI', '*', 'Basic_Demos-Age'),
        ('Internet_Hours_Age', 'PreInt_EduHx-computerinternet_hoursday', '*', 'Basic_Demos-Age'),
        ('BMI_Internet_Hours', 'Physical-BMI', '*', 'PreInt_EduHx-computerinternet_hoursday'),
        ('BFP_BMI', 'BIA-BIA_Fat', '/', 'BIA-BIA_BMI'),
        ('FFMI_BFP', 'BIA-BIA_FFMI', '/', 'BIA-BIA_Fat'),
        ('FMI_BFP', 'BIA-BIA_FMI', '/', 'BIA-BIA_Fat'),
        ('LST_TBW', 'BIA-BIA_LST', '/', 'BIA-BIA_TBW'),
        ('BFP_BMR', 'BIA-BIA_Fat', '*', 'BIA-BIA_BMR'),
        ('BFP_DEE', 'BIA-BIA_Fat', '*', 'BIA-BIA_DEE'),
        ('BMR_Weight', 'BIA-BIA_BMR', '/', 'Physical-Weight'),
        ('DEE_Weight', 'BIA-BIA_DEE', '/', 'Physical-Weight'),
        ('SMM_Height', 'BIA-BIA_SMM', '/', 'Physical-Height'),
        ('Muscle_to_Fat', 'BIA-BIA_SMM', '/', 'BIA-BIA_FMI'),
        ('Hydration_Status', 'BIA-BIA_TBW', '/', 'Physical-Weight'),
        ('ICW_TBW', 'BIA-BIA_ICW', '/', 'BIA-BIA_TBW'),
    )

    result = df.drop(columns=[name for name in df.columns if 'Season' in name])
    for target, left, operator_symbol, right in derived_features:
        if operator_symbol == '*':
            result[target] = result[left] * result[right]
        else:
            result[target] = result[left] / result[right]

    return result


class AutoEncoder(nn.Module):
    """Fully connected autoencoder with optional hidden layers.

    The encoder maps ``input_dim`` -> hidden widths -> ``encoding_dim`` and ends
    with a ReLU; the decoder mirrors the hidden widths in reverse order and ends
    with a Sigmoid. Every hidden layer is Linear -> BatchNorm1d -> LeakyReLU.

    Args:
        input_dim: Number of input (and reconstructed) features.
        encoding_dim: Width of the bottleneck representation.
        hidden_dims: Optional sequence of hidden-layer widths for the encoder;
            ``None`` or an empty sequence gives a single Linear layer per side.
    """

    def __init__(self, input_dim, encoding_dim, hidden_dims=None):
        super().__init__()
        widths = list(hidden_dims or [])

        # Encoder: hidden stack first, then the projection onto the bottleneck.
        self.encoder = nn.Sequential(
            *self._hidden_stack(input_dim, widths),
            nn.Linear(widths[-1] if widths else input_dim, encoding_dim),
            nn.ReLU(),
        )

        # Decoder: the same widths walked backwards, then back to input_dim.
        # NOTE(review): the Sigmoid bounds reconstructions to (0, 1), while
        # perform_autoencoder in this notebook feeds standardized inputs that
        # can fall outside that range - confirm the activation is intended.
        self.decoder = nn.Sequential(
            *self._hidden_stack(encoding_dim, widths[::-1]),
            nn.Linear(widths[0] if widths else encoding_dim, input_dim),
            nn.Sigmoid(),
        )

    @staticmethod
    def _hidden_stack(in_features, widths):
        """Return the Linear/BatchNorm1d/LeakyReLU layers for ``widths``, in order."""
        layers = []
        for width in widths:
            layers += [nn.Linear(in_features, width), nn.BatchNorm1d(width), nn.LeakyReLU()]
            in_features = width
        return layers

    def forward(self, x):
        """Encode ``x`` and return its reconstruction."""
        return self.decoder(self.encoder(x))

def perform_autoencoder(df, encoding_dim=50, hidden_dims=[128, 64], epochs=300, batch_size=64, patience=15):
    """Fit an ``AutoEncoder`` on ``df`` and return the encoded rows.

    A fresh ``StandardScaler`` and a fresh network are fitted on every call,
    so encodings from two different calls come from two different models.

    Args:
        df: Numeric 2-D data (rows = samples); it is standardized before training.
        encoding_dim: Width of the bottleneck, i.e. number of output columns.
        hidden_dims: Hidden-layer widths passed to ``AutoEncoder`` (never mutated here).
        epochs: Maximum number of passes over the data.
        batch_size: Number of consecutive rows per optimisation step.
        patience: Training stops after this many epochs without a lower mean loss.

    Returns:
        DataFrame with one row per input row and columns ``Enc_1`` ...
        ``Enc_<encoding_dim>``, indexed 0..n-1.
    """
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df)
    data_tensor = torch.as_tensor(df_scaled, dtype=torch.float32)

    input_dim = data_tensor.shape[1]
    autoencoder = AutoEncoder(input_dim, encoding_dim, hidden_dims)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(autoencoder.parameters(), lr=0.001)

    # Early-stopping bookkeeping and a snapshot of the best weights seen so far.
    best_loss = np.inf
    patience_counter = 0
    best_model_state = None

    for epoch in range(epochs):
        epoch_loss = 0.0
        n_batches = 0
        for start in range(0, len(data_tensor), batch_size):
            batch = data_tensor[start:start + batch_size]
            # AutoEncoder adds BatchNorm1d layers when hidden_dims is non-empty,
            # and BatchNorm1d cannot compute batch statistics from a single row
            # in training mode, so a trailing one-row batch is skipped.
            if hidden_dims and batch.shape[0] < 2:
                continue
            optimizer.zero_grad()
            reconstructed = autoencoder(batch)
            loss = criterion(reconstructed, batch)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
            n_batches += 1

        # Mean over the batches actually processed (not len / batch_size,
        # which is fractional when the last batch is short).
        epoch_loss /= max(n_batches, 1)

        if (epoch + 1) % 10 == 0 or epoch == 0:
            print(f'Epoch [{epoch + 1}/{epochs}], Loss: {epoch_loss:.4f}')

        if epoch_loss < best_loss:
            best_loss = epoch_loss
            patience_counter = 0
            # state_dict() returns references to the live parameter tensors;
            # clone them so later optimisation steps cannot overwrite the snapshot.
            best_model_state = {
                name: tensor.detach().clone()
                for name, tensor in autoencoder.state_dict().items()
            }
        else:
            patience_counter += 1

        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch + 1}")
            break

    # Restore the best weights (none are stored if the loss never improved).
    if best_model_state is not None:
        autoencoder.load_state_dict(best_model_state)

    # Inference mode: BatchNorm uses its running statistics and is not updated.
    autoencoder.eval()
    with torch.no_grad():
        encoded_data = autoencoder.encoder(data_tensor).numpy()

    df_encoded = pd.DataFrame(encoded_data, columns=[f'Enc_{i + 1}' for i in range(encoded_data.shape[1])])

    return df_encoded




# Load the per-id time-series folders for train and test.
train_ts = load_ts(train_ts_dir)  
test_ts = load_ts(test_ts_dir)

# Load the tabular metadata.
train_meta = pd.read_csv(train_meta_dir)
test_meta = pd.read_csv(test_meta_dir)

# The autoencoder only sees the feature columns, not the id.
df_train = train_ts.drop('id', axis=1)
df_test = test_ts.drop('id', axis=1)

# NOTE(review): perform_autoencoder fits a new scaler and a new network on every
# call, so the train and test encodings come from two independently trained
# models - the Enc_* columns are presumably not comparable between the two
# frames; confirm this is intended.
train_ts_encoded = perform_autoencoder(df_train, encoding_dim=60, epochs=100, batch_size=32)
test_ts_encoded = perform_autoencoder(df_test, encoding_dim=60, epochs=100, batch_size=32)

# Remember the encoded column names, then re-attach the ids (aligned on the row index).
time_series_cols = train_ts_encoded.columns.tolist()
train_ts_encoded["id"]=train_ts["id"]
test_ts_encoded['id']=test_ts["id"]

# NOTE(review): the encoded frames have one row per row of train_ts/test_ts, so
# this left merge repeats a metadata row for every encoded row sharing its id -
# confirm that this row multiplication is intended.
train = pd.merge(train_meta, train_ts_encoded, how="left", on='id')
test = pd.merge(test_meta, test_ts_encoded, how="left", on='id')
# Print the dtype of every time-series column.
column_types = train_ts.dtypes
print(column_types)


In [None]:
# Print the dtype of every column of the loaded training time series.
column_types = train_ts.dtypes
print(column_types)

### 滑窗

In [None]:

window_size = 10
# Source columns for the rolling statistics ('X', 'Y', 'Z' are assumed to be
# the main time-series signals).
columns_to_drop = ['X', 'Y', 'Z']
rolling_stats = ('mean', 'std', 'max', 'min', 'sum', 'median')

# Roll within each id so that a window never mixes rows of two different series.
grouped_by_id = train_ts.groupby('id', sort=False)
stat_columns = {}
# Iterate over the column names themselves (looping over [columns_to_drop]
# would yield the whole list as a single item).
for col in columns_to_drop:
    windows = grouped_by_id[col].rolling(window=window_size)
    for stat in rolling_stats:
        # The grouped result is indexed by (id, original row label); dropping the
        # id level lets it align with the rows of train_ts.
        stat_columns[f'{col}_{stat}'] = getattr(windows, stat)().droplevel(0)

stat_features = pd.DataFrame(stat_columns).reindex(train_ts.index)

# Remove statistics left over from a previous run so the cell can be re-executed
# without creating duplicate columns (no-op on the first run).
train_ts_base = train_ts.drop(columns=list(stat_columns), errors='ignore')
train_ts_processed = train_ts_base.drop(columns=columns_to_drop)
train_ts = pd.concat([train_ts_base, stat_features], axis=1)

In [None]:
# KNN-impute the float64/int64 columns of train; this includes the target
# 'sii', which is rounded back to an integer class afterwards.
imputer = KNNImputer(n_neighbors=5)
numeric_cols = train.select_dtypes(include=['float64', 'int64']).columns
imputed_data = imputer.fit_transform(train[numeric_cols])
train_imputed = pd.DataFrame(imputed_data, columns=numeric_cols)
train_imputed['sii'] = train_imputed['sii'].round().astype(int)
# Carry over the columns that were not imputed.
for col in train.columns:
    if col not in numeric_cols:
        train_imputed[col] = train[col]

train = train_imputed

train = feature_engineering(train)
train = train.dropna(thresh=10, axis=0)
test = feature_engineering(test)

train = train.drop('id', axis=1)
test = test.drop('id', axis=1)

# The feature lists are defined once so train and test cannot drift apart.
raw_feature_cols = ['Basic_Demos-Age', 'Basic_Demos-Sex',
                    'CGAS-CGAS_Score', 'Physical-BMI',
                    'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                    'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                    'Fitness_Endurance-Max_Stage',
                    'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
                    'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
                    'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
                    'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
                    'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone',
                    'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                    'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
                    'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
                    'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
                    'BIA-BIA_TBW', 'PAQ_A-PAQ_A_Total',
                    'PAQ_C-PAQ_C_Total', 'SDS-SDS_Total_Raw',
                    'SDS-SDS_Total_T',
                    'PreInt_EduHx-computerinternet_hoursday']
engineered_feature_cols = ['BMI_Age', 'Internet_Hours_Age', 'BMI_Internet_Hours',
                           'BFP_BMI', 'FFMI_BFP', 'FMI_BFP', 'LST_TBW', 'BFP_BMR', 'BFP_DEE',
                           'BMR_Weight', 'DEE_Weight', 'SMM_Height', 'Muscle_to_Fat',
                           'Hydration_Status', 'ICW_TBW']

# Train keeps the target 'sii' (placed between the raw and engineered features).
train = train[raw_feature_cols + ['sii'] + engineered_feature_cols + time_series_cols]
train = train.dropna(subset='sii')

# Model inputs shared by train and test: everything except the target.
featuresCols = raw_feature_cols + engineered_feature_cols + time_series_cols
test = test[featuresCols]

In [None]:
# import matplotlib.pyplot as plt

# df = test_ts

# # 获取所有的 id
# unique_ids = df['id'].unique()

# # 选择一些要绘制的特征列
# features = ['X', 'Y', 'Z', 'enmo', 'anglez', 'light', 'battery_voltage']

# # 设置图表大小
# fig, axes = plt.subplots(len(features), 1, figsize=(12, 3 * len(features)), sharex=True)
# fig.suptitle("Feature Variation over day_time for each ID")

# # 对每个特征绘制图表
# for feature, ax in zip(features, axes):
#     for uid in unique_ids:
#         # 提取当前 id 的数据
#         df_id = df[df['id'] == uid]
#         ax.plot(df_id['day_time'], df_id[feature], label=f'ID {uid}')
    
#     # 设置标签和标题
#     ax.set_ylabel(feature)
#     ax.legend(loc="upper right", fontsize='small')
#     ax.grid(True)

# # 设置 x 轴标签
# axes[-1].set_xlabel("day_time")
# plt.tight_layout(rect=[0, 0, 1, 0.96])
# plt.show()


<h2>训练模型</h2>

<h3>评测指标</h3>

In [None]:
# The ratio features built by feature_engineering divide columns by each other
# and can therefore contain +/-inf; turn those into NaN. replace() leaves the
# data unchanged when there is nothing to replace, so no isinf guard is needed,
# and test receives the same treatment as train.
train = train.replace([np.inf, -np.inf], np.nan)
test = test.replace([np.inf, -np.inf], np.nan)

def quadratic_weighted_kappa(y_true, y_pred):
    """Return Cohen's kappa between ``y_true`` and ``y_pred`` using quadratic weights."""
    kappa = cohen_kappa_score(y_true, y_pred, weights='quadratic')
    return kappa

def threshold_Rounder(oof_non_rounded, thresholds):
    """Map continuous predictions to the classes 0-3 using three cut points.

    A value gets class 0, 1 or 2 for the first of ``thresholds[0]``,
    ``thresholds[1]``, ``thresholds[2]`` it is strictly below (checked in that
    order), and class 3 when it is below none of them.
    """
    below_cut = [oof_non_rounded < thresholds[k] for k in range(3)]
    # np.select takes the first condition that holds, mirroring nested np.where.
    return np.select(below_cut, [0, 1, 2], default=3)

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    """Objective for threshold tuning: the negated QWK of the thresholded predictions."""
    discretized = threshold_Rounder(oof_non_rounded, thresholds)
    kappa = quadratic_weighted_kappa(y_true, discretized)
    return -kappa

<h3>LGB</h3>

In [None]:


def TrainML(model_class, test_data, train_data, n_splits=5, use_group_kfold=False, optimize_method='Powell', test_ids=None):
    """Cross-validate a regressor on ``sii``, tune rounding thresholds and predict the test set.

    Args:
        model_class: Unfitted scikit-learn style estimator; it is cloned for every fold.
        test_data: Test features. Must contain every training feature column; an
            ``id`` column, if present, is used for the submission and is not fed
            to the model.
        train_data: Training frame with the target column ``sii`` and, when
            ``use_group_kfold`` is true, a ``group_column``.
        n_splits: Number of cross-validation folds.
        use_group_kfold: Use ``GroupKFold`` on ``group_column`` instead of a
            shuffled ``StratifiedKFold``.
        optimize_method: ``scipy.optimize.minimize`` method used to tune the thresholds.
        test_ids: Optional identifiers for the submission. Defaults to
            ``test_data['id']`` when that column exists, otherwise ``test_data.index``.

    Returns:
        DataFrame with the columns ``id`` and ``sii``, where ``sii`` is the
        thresholded mean of the per-fold test predictions.

    Raises:
        KeyError: If ``sii`` (or ``group_column`` when it is required) is missing
            from ``train_data``, or a training feature is missing from ``test_data``.
        AssertionError: If the threshold optimisation does not report success.
    """
    y = train_data['sii']
    # 'group_column' only exists when GroupKFold is used, so it is dropped only if present.
    X = train_data.drop(columns=['sii', 'group_column'], errors='ignore')

    if test_ids is None:
        test_ids = test_data['id'] if 'id' in test_data.columns else test_data.index
    # Predict on exactly the training feature columns, in the training order.
    X_test = test_data[X.columns]

    if use_group_kfold:
        cv = GroupKFold(n_splits=n_splits)
        groups = train_data['group_column']
    else:
        # Uses the module-level `seed` from the config cell.
        cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
        groups = None

    # Out-of-fold predictions, per-fold test predictions and per-fold scores.
    oof_non_rounded = np.zeros(len(y), dtype=float)
    oof_rounded = np.zeros(len(y), dtype=int)
    test_preds = np.zeros((len(X_test), n_splits))
    train_scores, val_scores = [], []

    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y, groups=groups)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = clone(model_class)
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        oof_non_rounded[val_idx] = y_val_pred
        oof_rounded[val_idx] = np.round(y_val_pred).astype(int)

        train_kappa = quadratic_weighted_kappa(y_train, np.round(y_train_pred).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val, oof_rounded[val_idx])

        train_scores.append(train_kappa)
        val_scores.append(val_kappa)

        test_preds[:, fold] = model.predict(X_test)

        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")

    print(f"Mean Train QWK: {np.mean(train_scores):.4f}")
    print(f"Mean Validation QWK: {np.mean(val_scores):.4f}")

    # Per-fold train/validation QWK.
    plt.figure(figsize=(10, 5))
    plt.plot(range(1, n_splits + 1), train_scores, label="Train QWK")
    plt.plot(range(1, n_splits + 1), val_scores, label="Validation QWK")
    plt.xlabel("Fold")
    plt.ylabel("Quadratic Weighted Kappa")
    plt.title("QWK Scores per Fold")
    plt.legend()
    plt.show()

    # Tune the three cut points on the out-of-fold predictions with the
    # module-level evaluate_predictions objective (negated QWK).
    KappaOptimizer = minimize(evaluate_predictions, x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded), method=optimize_method)
    assert KappaOptimizer.success, "Optimization did not converge."

    oof_tuned = threshold_Rounder(oof_non_rounded, KappaOptimizer.x)
    tuned_kappa = quadratic_weighted_kappa(y, oof_tuned)
    print(f"Optimized QWK Score: {tuned_kappa:.4f}")

    # Average the fold predictions and apply the tuned cut points.
    tpm = test_preds.mean(axis=1)
    tp_tuned = threshold_Rounder(tpm, KappaOptimizer.x)

    submission = pd.DataFrame({
        'id': test_ids,
        'sii': tp_tuned
    })

    return submission


In [None]:
# voting_model = VotingRegressor(estimators=[
#     ('lightgbm', Light)
    # ('xgboost', XGB_Model),
    # ('catboost', CatBoost_Model),
    # ('tabnet', TabNet_Model)
# ])

# NOTE(review): no estimator named LGB is defined in the visible notebook, so a
# default LightGBM regressor is created here - replace it with the tuned
# parameters if they are kept elsewhere.
LGB = lgb.LGBMRegressor(random_state=seed)

# TrainML takes (model_class, test_data, train_data, ...); train_data is required.
submission1 = TrainML(LGB, test, train)

<h3>xgboost模型</h3>

<h3>交叉验证</h3>

In [None]:
# importance = DataFrame()
# importance["特征"] = model_lgb[0].feature_name()
# importance["重要性"] = 0
# for model in model_lgb:
#     importance["重要性"] = importance["重要性"] + model.feature_importance()
# importance["重要性"] = importance["重要性"] / kfold.n_splits
# importance.sort_values("重要性", ascending=False)[0:50]

<h2>预测</h2>