In [1]:
import joblib
import glob
import numpy as np
import pandas as pd
from scipy.fft import fft
from scipy.signal import find_peaks
from tqdm.auto import tqdm

import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from joblib import Parallel, delayed

import warnings
warnings.filterwarnings("ignore")  # ignore all warnings

In [2]:
def split_dataframe_by_interval(df, time_column, interval_days):
    """
    Split a DataFrame into consecutive fixed-length time buckets.

    The values of ``time_column`` are cast to int64 and floor-divided by the
    bucket length in nanoseconds; rows sharing the same quotient form one
    bucket.

    Side effect: the bucket id is written to ``df['w_th']`` on the frame that
    was passed in (kept for backward compatibility with existing callers).

    :param df: input DataFrame; gains a ``w_th`` column
    :param time_column: name of the time column used for bucketing
                        (assumed to be nanosecond-resolution timedelta/datetime
                        values -- TODO confirm against the data)
    :param interval_days: bucket length in days
    :return: list of DataFrame copies, one per bucket, in order of first
             appearance; buckets with fewer than ``interval_days`` rows are
             dropped
    """
    # Keep the divisor an int so the floor division is exact integer
    # arithmetic instead of going through float64.
    nanoseconds_per_interval = int(round(interval_days * 24 * 60 * 60 * 1e9))
    df['w_th'] = df[time_column].astype('int64') // nanoseconds_per_interval

    sub_dfs = []
    # sort=False keeps buckets in order of first appearance, matching
    # iteration over ``df['w_th'].unique()``.
    for _, sub_df in df.groupby('w_th', sort=False):
        # NOTE(review): this compares a row count with a number of days;
        # with interval_days=1 it only ever filters out empty buckets.
        if len(sub_df) >= interval_days:
            sub_dfs.append(sub_df.copy())
    return sub_dfs

In [None]:
labels_df = pd.read_csv('../input/train_y_v0.1.0.csv')

# 1. Join the label columns of each row (every column after the first; 94 of
#    them according to the original note) into one underscore-separated string.
labels_df['combined_label'] = labels_df.iloc[:, 1:].apply(
    lambda row: '_'.join(row.values.astype(str)),
    axis=1,
)

# 2. Assign each distinct combined label an integer id, numbered in order of
#    first appearance.
label_to_id = {
    label: position
    for position, label in enumerate(labels_df['combined_label'].drop_duplicates())
}

# 3. Attach the integer id to every row.
labels_df['label_id'] = labels_df['combined_label'].map(label_to_id)

# Persist the mapping so ids can be decoded back into label strings later.
joblib.dump(label_to_id, 'label_to_id.pkl')

# Export the (file name, label id) pairs under the column names used downstream.
new_labels_df = labels_df[['filename', 'label_id']].rename(columns={'filename': 'file_name'})
new_labels_df.to_csv('../input/label.csv', index=False)

In [3]:
import os
import joblib
import numpy as np
import pandas as pd
from scipy.stats import skew, kurtosis, tvar, tstd
from scipy.fft import fft
from scipy.signal import find_peaks
from tqdm import tqdm

# 定义特征提取函数
def _ratio_or_zero(numerator, denominator):
    """Return ``numerator / denominator``, or 0 when the denominator equals zero."""
    return numerator / denominator if denominator != 0 else 0


def extract_features(sequence, pre_fix, is_v_seq=False, more_features=False):
    """
    Extract summary features from a one-dimensional time series.

    Always computed: mean, std, min, max, median, skewness, kurtosis, range,
    q1, q3, iqr.

    With ``is_v_seq``: dispersion ratios, time-domain features (zero
    crossings, energy, RMS, shape/impulse/crest/margin factors), magnitude
    statistics of the FFT, and basic peak statistics.

    With ``more_features``: extended statistics of the peak values and of the
    index distances between consecutive peaks. This no longer requires
    ``is_v_seq`` to be set as well.

    Ratios whose denominator is exactly zero are reported as 0.

    :param sequence: one-dimensional series (np.array or array-like)
    :param pre_fix: string prepended (with an underscore) to every feature name
    :param is_v_seq: enable the time-domain / frequency-domain / peak features
    :param more_features: enable the extended peak and peak-interval features
    :return: dict mapping ``f"{pre_fix}_{feature}"`` to its value, in a stable
             insertion order
    """
    # Installs a process-wide "ignore" filter on every call; presumably kept
    # here because this function also runs inside parallel workers -- confirm
    # before removing.
    warnings.filterwarnings("ignore")

    sequence = np.asarray(sequence)
    n_points = len(sequence)
    features = {}

    # 1. Basic statistics
    features['mean'] = np.mean(sequence)
    features['std'] = tstd(sequence)
    features['min'] = np.min(sequence)
    features['max'] = np.max(sequence)
    features['median'] = np.median(sequence)
    features['skewness'] = skew(sequence)
    features['kurtosis'] = kurtosis(sequence)
    features['range'] = features['max'] - features['min']
    features['q1'] = np.percentile(sequence, 25)
    features['q3'] = np.percentile(sequence, 75)
    features['iqr'] = features['q3'] - features['q1']

    # Peaks are needed by both optional feature groups, so locate them once
    # whenever either group is requested.
    if is_v_seq or more_features:
        peaks, _ = find_peaks(sequence)
        peak_values = sequence[peaks]
        peak_intervals = np.diff(peaks)

    if is_v_seq:
        abs_sequence = np.abs(sequence)
        abs_max = np.max(abs_sequence)

        # Mean absolute deviation and coefficient of variation (plus inverse)
        features['mad'] = np.mean(np.abs(sequence - features['mean']))
        features['cv'] = _ratio_or_zero(features['std'], features['mean'])
        features['reciprocal_cv'] = _ratio_or_zero(1, features['cv'])

        # 2. Time-domain features
        zero_crossings = np.sum(np.diff(np.sign(sequence)) != 0)
        features['zero_crossings'] = zero_crossings
        # Zero-lag autocorrelation: the centre element of the 'full'
        # self-correlation is the dot product of the series with itself, so
        # compute that directly instead of the whole O(n^2) correlation.
        features['autocorrelation'] = np.dot(sequence, sequence)
        features['energy'] = np.sum(np.square(sequence))
        # Root mean square (RMS)
        features['rms'] = np.sqrt(np.mean(np.square(sequence)))
        features['energy_rate'] = features['energy'] / n_points
        # Zero-crossing rate and its complement
        features['zero_crossing_rate'] = zero_crossings / n_points
        features['non_zero_crossing_rate'] = 1 - zero_crossings / n_points
        # Mean absolute value
        features['mean_absolute_value'] = np.mean(abs_sequence)

        # Shape factor and impulse factor
        features['shape_factor'] = _ratio_or_zero(features['rms'], features['mean_absolute_value'])
        features['impulse_factor'] = _ratio_or_zero(abs_max, features['mean_absolute_value'])
        # Crest factor
        features['crest_factor'] = _ratio_or_zero(abs_max, features['rms'])
        # Margin factor: peak magnitude over the squared mean of sqrt(|x|)
        features['margin_factor'] = _ratio_or_zero(abs_max, np.mean(np.sqrt(abs_sequence)) ** 2)

        # 3. Frequency-domain features (statistics of the FFT magnitudes)
        fft_values = np.abs(fft(sequence))
        features['fft_mean'] = np.mean(fft_values)
        features['fft_std'] = tstd(fft_values)
        features['fft_max'] = np.max(fft_values)
        features['fft_min'] = np.min(fft_values)
        # Index of the largest magnitude bin (bin 0 is included in the search)
        features['fft_dominant_freq'] = np.argmax(fft_values)
        features['fft_skewness'] = skew(fft_values)
        features['fft_kurtosis'] = kurtosis(fft_values)

        # 4. Basic peak features
        features['num_peaks'] = len(peaks)
        if len(peaks) > 0:
            features['peak_mean'] = np.mean(peak_values)
            features['peak_std'] = tstd(peak_values)
        else:
            features['peak_mean'] = 0
            features['peak_std'] = 0

    if more_features:
        # 5. Extended peak-value features. Keys are inserted in a fixed order
        # so the column layout of the resulting table stays stable.
        if len(peaks) > 0:
            peak_q1 = np.percentile(peak_values, 25)
            peak_q3 = np.percentile(peak_values, 75)
            features['num_peaks_ratio'] = len(peaks) / n_points
            features['peak_max'] = np.max(peak_values)
            features['peak_min'] = np.min(peak_values)
            features['peak_iqr'] = peak_q3 - peak_q1
            features['peak_skewness'] = skew(peak_values)
            features['peak_kurtosis'] = kurtosis(peak_values)
            # Spread of the peak values themselves (previously this repeated
            # the range of the whole sequence).
            features['peak_range'] = features['peak_max'] - features['peak_min']
            features['peak_q1'] = peak_q1
            features['peak_q3'] = peak_q3
            features['peak_mad'] = np.mean(np.abs(peak_values - np.mean(peak_values)))
        else:
            features['num_peaks_ratio'] = 0
            features['peak_max'] = 0
            features['peak_min'] = 0
            features['peak_iqr'] = 0
            features['peak_skewness'] = 0
            features['peak_kurtosis'] = 0
            features['peak_range'] = 0
            features['peak_q1'] = 0
            features['peak_q3'] = 0
            features['peak_mad'] = 0

        # 6. Features of the index distance between consecutive peaks
        if len(peak_intervals) > 0:
            features['peak_intervals_max'] = np.max(peak_intervals)
            features['peak_intervals_min'] = np.min(peak_intervals)
            features['peak_intervals_std'] = tstd(peak_intervals)
            features['peak_intervals_mean'] = np.mean(peak_intervals)
            features['peak_intervals_range'] = features['peak_intervals_max'] - features['peak_intervals_min']
            features['peak_intervals_skewness'] = skew(peak_intervals)
            features['peak_intervals_kurtosis'] = kurtosis(peak_intervals)
        else:
            features['peak_intervals_max'] = 0
            features['peak_intervals_min'] = 0
            features['peak_intervals_std'] = 0
            features['peak_intervals_mean'] = 0
            features['peak_intervals_range'] = 0
            features['peak_intervals_skewness'] = 0
            features['peak_intervals_kurtosis'] = 0

    return {f'{pre_fix}_{name}': value for name, value in features.items()}

def process_file(file_path):
    """
    Load one pickled recording and build one feature dict per time interval.

    The loaded object is wrapped in a DataFrame and must provide a ``t``
    column (accessed through ``.dt.total_seconds()``) and a ``v`` column.
    First differences of ``v`` and of the elapsed seconds are added, the
    frame is split into 1-day intervals, and ``extract_features`` is run on
    ``v``, ``v_diff`` and ``t_diff`` of every interval.

    :param file_path: path of the ``.pkl`` file to process
    :return: list of feature dicts, one per interval. Each dict also carries
             ``week_points_count``, ``week_points_count_ratio`` (relative to
             the first interval), ``total_weeks`` and ``file_name``. The
             "week" in these names is historical: intervals are 1 day long.
    """
    # Installs a process-wide "ignore" filter; presumably needed because this
    # function is executed inside parallel workers -- confirm before removing.
    warnings.filterwarnings("ignore")

    # SECURITY NOTE: joblib.load unpickles the file, which can execute
    # arbitrary code. Only run this on trusted input files.
    pkl_data = pd.DataFrame(joblib.load(file_path))
    pkl_data['t_scds'] = pkl_data['t'].dt.total_seconds()
    # NOTE(review): shift(1) leaves NaN in the first row of the file, so the
    # interval containing that row appears to get NaN for its v_diff/t_diff
    # statistics -- verify that downstream consumers expect this.
    pkl_data['v_diff'] = pkl_data['v'] - pkl_data['v'].shift(1)
    pkl_data['t_diff'] = pkl_data['t_scds'] - pkl_data['t_scds'].shift(1)

    # Split into 1-day intervals.
    sub_dfs = split_dataframe_by_interval(pkl_data, time_column='t', interval_days=1)
    # Fall back to the whole recording when the split yields no interval.
    if not sub_dfs:
        sub_dfs = [pkl_data]

    # Values shared by every interval of this file.
    file_name = os.path.basename(file_path)  # separator-agnostic, unlike split('/')
    first_interval_points = len(sub_dfs[0])
    total_intervals = len(sub_dfs)

    sub_df_features = []
    for tmp_df in sub_dfs:
        # Merge the three per-series feature dicts; prefixes keep keys distinct.
        features = {
            **extract_features(tmp_df['v'].values, pre_fix='v', is_v_seq=True, more_features=True),
            **extract_features(tmp_df['v_diff'].values, pre_fix='v_diff', is_v_seq=False, more_features=False),
            **extract_features(tmp_df['t_diff'].values, pre_fix='t_diff', is_v_seq=False, more_features=False),
        }

        features['week_points_count'] = len(tmp_df)
        features['week_points_count_ratio'] = len(tmp_df) / first_interval_points
        features['total_weeks'] = total_intervals
        features['file_name'] = file_name  # identifies the source file of this row
        sub_df_features.append(features)

    return sub_df_features

# 定义批量提取特征的函数
def extract_features_from_files(data_dir, output_file, n_jobs=8):
    """
    Extract features from every ``.pkl`` file in a directory and save them as CSV.

    Files are processed in parallel with ``process_file``; the per-interval
    feature dicts of all files are concatenated into one table.

    :param data_dir: directory containing the pkl files
    :param output_file: path of the CSV file the features are written to
    :param n_jobs: number of parallel joblib workers (default 8, the value
                   that used to be hard-coded)
    :return: None; the table is written to ``output_file`` and a summary line
             is printed
    """
    # os.listdir returns entries in arbitrary order; sort them so the row
    # order of the output is reproducible across runs and machines.
    file_names = sorted(f for f in os.listdir(data_dir) if f.endswith('.pkl'))
    file_paths = [os.path.join(data_dir, file_name) for file_name in file_names]

    # tqdm wraps the task generator, so the bar advances as tasks are handed
    # to the workers.
    results = Parallel(n_jobs=n_jobs)(
        delayed(process_file)(file_path) for file_path in tqdm(file_paths)
    )

    # Flatten the per-file lists of feature dicts into one list of rows.
    all_features = [row for file_rows in results for row in file_rows]

    # Save the features as a DataFrame exported to CSV.
    df_features = pd.DataFrame(all_features)
    df_features.to_csv(output_file, index=False)
    print(f"Features saved to {output_file}, shape is {df_features.shape}")

In [4]:
data_directory = '../input/train_X'  # replace with the directory containing the pkl files
output_csv = '../input/round2_train_X_extracted_features6.csv'  # output file path
extract_features_from_files(data_directory, output_csv)

100%|██████████| 31839/31839 [08:43<00:00, 60.85it/s] 


Features saved to ../input/round2_train_X_extracted_features6.csv, shape is (1038194, 79)


In [5]:
data_directory = '../input/test_X'  # replace with the directory containing the pkl files
output_csv = '../input/round2_test_X_extracted_features6.csv' # output file path
extract_features_from_files(data_directory, output_csv) 

100%|██████████| 315720/315720 [1:30:59<00:00, 57.83it/s]  


Features saved to ../input/round2_test_X_extracted_features6.csv, shape is (10138308, 79)


In [6]:
1

1