In [1]:
import pandas as pd
import numpy as np
# Load the training and test data from CSV files in the working directory.
train_df = pd.read_csv('train_d.csv')
test_df = pd.read_csv('test_d.csv')

In [2]:
# Count how many training rows each distinct value of 'name' has.
train_df['name'].value_counts()

name
Banana_10       5606
Toust_13        5606
Date_1          5602
Orange_14       5601
Bread_198       5600
                ... 
Watermelon_1       8
Decoration_0       8
Apple_71           7
Ginger_5           7
Strawberry_4       7
Name: count, Length: 2653, dtype: int64

In [3]:
# Count how many training rows each distinct 'product_unique_id' has.
train_df['product_unique_id'].value_counts()

product_unique_id
1279    5606
1009    5606
2087    5602
1280    5601
2312    5600
        ... 
32         8
2214       8
1226       7
1899       7
122        7
Name: count, Length: 2653, dtype: int64

In [4]:
# List the columns of the training frame as loaded.
train_df.columns

Index(['unique_id', 'date', 'warehouse', 'total_orders', 'sales',
       'sell_price_main', 'type_0_discount', 'type_1_discount',
       'type_2_discount', 'type_3_discount', 'type_4_discount',
       'type_5_discount', 'type_6_discount', 'holiday_name', 'holiday',
       'shops_closed', 'winter_school_holidays', 'school_holidays',
       'product_unique_id', 'name', 'L1_category_name_en',
       'L2_category_name_en', 'L3_category_name_en', 'L4_category_name_en',
       'year', 'month_sin', 'month_cos', 'day_sin', 'day_cos'],
      dtype='object')

## 处理 name 等类别列：用训练集的 sales 均值替换（year 保留原列并新增均值列）

In [5]:
# Columns to encode with the mean of `sales` observed in the training data.
columns_to_process = ['year','name', 'holiday_name', 'L2_category_name_en', 'L3_category_name_en', 'L4_category_name_en']

# Fallback for keys that have no group mean: categories that occur only in
# test, and NaN keys (groupby drops NaN groups by default). Without it the
# left merges below leave NaN in the encoded columns.
global_sales_mean = train_df['sales'].mean()

# NOTE(review): the group means are computed on the full training frame and
# merged back onto the same rows, so each training row's encoding includes its
# own `sales` value. Consider out-of-fold encoding if this leakage matters.
for column in columns_to_process:
    mean_col = f'{column}_mean'

    # Mean `sales` per distinct value of `column`, computed on train only.
    mean_sales = (
        train_df.groupby(column)['sales'].mean()
        .rename(mean_col)
        .reset_index()
    )

    # Attach the encoding to both frames; how='left' keeps every original row.
    train_df = train_df.merge(mean_sales, on=column, how='left')
    test_df = test_df.merge(mean_sales, on=column, how='left')

    # Keys without a group mean fall back to the global training mean.
    train_df[mean_col] = train_df[mean_col].fillna(global_sales_mean)
    test_df[mean_col] = test_df[mean_col].fillna(global_sales_mean)

    if column != 'year':
        # Replace the raw column by its encoding; `year` is kept alongside
        # `year_mean`.
        train_df.drop(columns=[column], inplace=True)
        test_df.drop(columns=[column], inplace=True)


In [6]:
# Check the training columns after the mean-encoding step.
train_df.columns

Index(['unique_id', 'date', 'warehouse', 'total_orders', 'sales',
       'sell_price_main', 'type_0_discount', 'type_1_discount',
       'type_2_discount', 'type_3_discount', 'type_4_discount',
       'type_5_discount', 'type_6_discount', 'holiday', 'shops_closed',
       'winter_school_holidays', 'school_holidays', 'product_unique_id',
       'L1_category_name_en', 'year', 'month_sin', 'month_cos', 'day_sin',
       'day_cos', 'year_mean', 'name_mean', 'holiday_name_mean',
       'L2_category_name_en_mean', 'L3_category_name_en_mean',
       'L4_category_name_en_mean'],
      dtype='object')

In [7]:
# Display the test frame after the group means were merged in.
# NOTE(review): consider `test_df.head()` to keep the rendered output small.
test_df

Unnamed: 0,unique_id,date,warehouse,total_orders,sell_price_main,type_0_discount,type_1_discount,type_2_discount,type_3_discount,type_4_discount,...,month_sin,month_cos,day_sin,day_cos,year_mean,name_mean,holiday_name_mean,L2_category_name_en_mean,L3_category_name_en_mean,L4_category_name_en_mean
0,1226,2024-06-03,Brno_1,8679.0,13.13,0.00000,0.0,0.0,0.0,0.0,...,1.224647e-16,-1.0,0.571268,0.820763,114.774336,11.186467,108.952108,67.817624,22.765791,136.896545
1,1226,2024-06-11,Brno_1,8795.0,13.13,0.15873,0.0,0.0,0.0,0.0,...,1.224647e-16,-1.0,0.790776,-0.612106,114.774336,11.186467,108.952108,67.817624,22.765791,136.896545
2,1226,2024-06-13,Brno_1,10009.0,13.13,0.15873,0.0,0.0,0.0,0.0,...,1.224647e-16,-1.0,0.485302,-0.874347,114.774336,11.186467,108.952108,67.817624,22.765791,136.896545
3,1226,2024-06-15,Brno_1,8482.0,13.13,0.15873,0.0,0.0,0.0,0.0,...,1.224647e-16,-1.0,0.101168,-0.994869,114.774336,11.186467,108.952108,67.817624,22.765791,136.896545
4,1226,2024-06-09,Brno_1,8195.0,13.13,0.00000,0.0,0.0,0.0,0.0,...,1.224647e-16,-1.0,0.968077,-0.250653,114.774336,11.186467,108.952108,67.817624,22.765791,136.896545
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47016,4572,2024-06-03,Munich_1,5254.0,2.09,0.00000,0.0,0.0,0.0,0.0,...,1.224647e-16,-1.0,0.571268,0.820763,114.774336,28.606474,108.952108,135.501565,127.007666,104.395348
47017,3735,2024-06-04,Prague_1,9698.0,11.00,0.00000,0.0,0.0,0.0,0.0,...,1.224647e-16,-1.0,0.724793,0.688967,114.774336,100.466355,108.952108,135.501565,172.588918,367.883015
47018,3735,2024-06-03,Prague_1,10256.0,11.00,0.00000,0.0,0.0,0.0,0.0,...,1.224647e-16,-1.0,0.571268,0.820763,114.774336,100.466355,108.952108,135.501565,172.588918,367.883015
47019,2129,2024-06-03,Brno_1,8679.0,37.75,0.00000,0.0,0.0,0.0,0.0,...,1.224647e-16,-1.0,0.571268,0.820763,114.774336,539.669723,108.952108,135.501565,175.948201,104.395348


In [8]:
# Largest 'holiday_name_mean' value present in the test frame.
test_df['holiday_name_mean'].max()

108.95210822780084

## 处理折扣列：新增“是否有折扣”与“最大折扣”两列，并删除原始折扣列

In [9]:
# Raw per-type discount columns that are summarised below.
discount_columns = [
    'type_0_discount', 'type_1_discount', 'type_2_discount',
    'type_3_discount', 'type_4_discount', 'type_5_discount', 'type_6_discount'
]

# Add the two summary features to both frames. The reductions are vectorised
# (axis=1) rather than a Python lambda applied per row, which gives the same
# values but avoids a Python-level call for each of the rows.
for df in [train_df, test_df]:
    discounts = df[discount_columns]

    # 1. Flag: True when any discount column in the row is non-zero
    #    (missing values are skipped, matching Series.any on a single row).
    df['is_discount'] = discounts.any(axis=1).astype(bool)

    # 2. Largest discount value across the discount columns of the row.
    df['max_discount'] = discounts.max(axis=1).astype('float64')

In [10]:
# Confirm that 'is_discount' and 'max_discount' were added.
train_df.columns

Index(['unique_id', 'date', 'warehouse', 'total_orders', 'sales',
       'sell_price_main', 'type_0_discount', 'type_1_discount',
       'type_2_discount', 'type_3_discount', 'type_4_discount',
       'type_5_discount', 'type_6_discount', 'holiday', 'shops_closed',
       'winter_school_holidays', 'school_holidays', 'product_unique_id',
       'L1_category_name_en', 'year', 'month_sin', 'month_cos', 'day_sin',
       'day_cos', 'year_mean', 'name_mean', 'holiday_name_mean',
       'L2_category_name_en_mean', 'L3_category_name_en_mean',
       'L4_category_name_en_mean', 'is_discount', 'max_discount'],
      dtype='object')

In [11]:
# Remove the raw per-type discount columns from both frames, in place.
for frame in (train_df, test_df):
    frame.drop(columns=discount_columns, inplace=True)

In [12]:
# Summarise column dtypes after the raw discount columns were dropped.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4007367 entries, 0 to 4007366
Data columns (total 25 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   unique_id                 int64  
 1   date                      object 
 2   warehouse                 object 
 3   total_orders              float64
 4   sales                     float64
 5   sell_price_main           float64
 6   holiday                   bool   
 7   shops_closed              bool   
 8   winter_school_holidays    bool   
 9   school_holidays           bool   
 10  product_unique_id         int64  
 11  L1_category_name_en       object 
 12  year                      object 
 13  month_sin                 float64
 14  month_cos                 float64
 15  day_sin                   float64
 16  day_cos                   float64
 17  year_mean                 float64
 18  name_mean                 float64
 19  holiday_name_mean         float64
 20  L2_category_name_en_mean

In [13]:
# Row/column counts of both frames at this point.
train_df.shape,test_df.shape

((4007367, 25), (47021, 24))

In [14]:
# Keep the identifier and date columns aside so they can be re-attached after
# the feature matrix has been encoded and scaled.
train_unid, val_unid = train_df['unique_id'], test_df['unique_id']
train_date, val_date = train_df['date'], test_df['date']

# Separate the target from the training features.
y_train = train_df['sales']
train_df.drop(columns=['sales'], inplace=True)

In [15]:
# Columns remaining in the training frame after 'sales' was removed.
train_df.columns

Index(['unique_id', 'date', 'warehouse', 'total_orders', 'sell_price_main',
       'holiday', 'shops_closed', 'winter_school_holidays', 'school_holidays',
       'product_unique_id', 'L1_category_name_en', 'year', 'month_sin',
       'month_cos', 'day_sin', 'day_cos', 'year_mean', 'name_mean',
       'holiday_name_mean', 'L2_category_name_en_mean',
       'L3_category_name_en_mean', 'L4_category_name_en_mean', 'is_discount',
       'max_discount'],
      dtype='object')

In [16]:
# Names of the float64 columns in the training frame.
train_df.select_dtypes(include=['float64']).columns.tolist()

['total_orders',
 'sell_price_main',
 'month_sin',
 'month_cos',
 'day_sin',
 'day_cos',
 'year_mean',
 'name_mean',
 'holiday_name_mean',
 'L2_category_name_en_mean',
 'L3_category_name_en_mean',
 'L4_category_name_en_mean',
 'max_discount']

In [17]:
from sklearn.preprocessing import StandardScaler

# Continuous features that are standardised below.
standard_col = ['total_orders',
 'sell_price_main',
 'name_mean',
 'year_mean',
 'holiday_name_mean',
 'L2_category_name_en_mean',
 'L3_category_name_en_mean',
 'L4_category_name_en_mean',
 'max_discount']


# Drop the identifier columns (unique_id, date, product_unique_id) so they are
# neither one-hot encoded nor scaled.
train_df.drop(['unique_id', 'date','product_unique_id'], axis=1, inplace=True)
test_df.drop(['unique_id', 'date','product_unique_id'], axis=1, inplace=True)

# Remember the sizes so the stacked frame can be split back positionally.
train_size = len(train_df)
val_size = len(test_df)

# Stack train and test so both receive the same dummy columns.
# The index is deliberately NOT reset: the identifier/date/target Series saved
# earlier are re-attached later by index alignment.
combined_data = pd.concat([train_df, test_df], axis=0)

# One-hot encode the remaining non-numeric columns.
combined_data = pd.get_dummies(combined_data, drop_first=True)

# Split back by position. Take explicit copies so the `.loc` assignments below
# write to independent frames instead of slices of `combined_data`
# (avoids SettingWithCopyWarning).
train_df = combined_data.iloc[:train_size, :].copy()
test_df = combined_data.iloc[train_size:, :].copy()

# Standardise: fit on train only, then apply the same transform to test.
scaler = StandardScaler()
train_df.loc[:, standard_col] = scaler.fit_transform(train_df[standard_col])
test_df.loc[:, standard_col] = scaler.transform(test_df[standard_col])
train_df.shape, test_df.shape


((4007367, 30), (47021, 30))

In [18]:
# Inspect column dtypes after one-hot encoding and scaling.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4007367 entries, 0 to 4007366
Data columns (total 30 columns):
 #   Column                                   Dtype  
---  ------                                   -----  
 0   total_orders                             float64
 1   sell_price_main                          float64
 2   holiday                                  bool   
 3   shops_closed                             bool   
 4   winter_school_holidays                   bool   
 5   school_holidays                          bool   
 6   month_sin                                float64
 7   month_cos                                float64
 8   day_sin                                  float64
 9   day_cos                                  float64
 10  year_mean                                float64
 11  name_mean                                float64
 12  holiday_name_mean                        float64
 13  L2_category_name_en_mean                 float64
 14  L3_category_name_en_mea

In [19]:
# Work on independent copies before adding columns.
train_df, test_df = train_df.copy(), test_df.copy()

# Put the identifier and date columns back on each frame; the Series are
# matched to rows by index.
for frame, ids, dates in (
    (train_df, train_unid, train_date),
    (test_df, val_unid, val_date),
):
    frame.loc[:, 'unique_id'] = ids
    frame.loc[:, 'date'] = dates

# Only the training frame gets the target back.
train_df.loc[:, 'sales'] = y_train
train_df.shape, test_df.shape

((4007367, 33), (47021, 32))

In [20]:
# Check the re-attached target column.
train_df['sales']

0          16.34
1          12.63
2          34.55
3          34.52
4          35.92
           ...  
4007362    26.56
4007363    27.42
4007364    33.39
4007365    22.88
4007366    32.10
Name: sales, Length: 4007367, dtype: float64

In [21]:
# Write the processed frames to CSV without the index column.
for frame, path in ((train_df, 'train.csv'), (test_df, 'test.csv')):
    frame.to_csv(path, index=False)