In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

df_train = pd.read_csv('G:\\Browser\\GAIDC_preliminary_A\\train.csv') #.sample(frac=0.1)
df_test = pd.read_csv('G:\\Browser\\GAIDC_preliminary_A\\preliminary_A.csv') #.sample(frac=0.1)
df = df_train.append(df_test, sort=False)

In [2]:
le = LabelEncoder()
for f in tqdm(list(df.select_dtypes(include='object'))):
    '''
    LabelEncoding (for object)
    '''
    try:
        if f=="id": # 保留id
            continue
        df[f] = df[f].astype('float')# 将数字转成数值型数据
    except:
        df[f] = le.fit_transform(df[f].astype('str')).astype('int')# 字符串label化

100%|██████████| 4/4 [00:00<00:00, 190.51it/s]


# 测试

In [None]:
# 数量编码
groupby_list = ['transformers', 'month','date_id', 'is_weekend', 'time', 'L1', 'L2', 'L3', 'L4', 'L5', 'L6']
for name in tqdm(groupby_list):
    # 加上同类数属性列
    df_temp = df.groupby(name).size().reset_index()#重置索引
    df_temp.columns = [name] + ["num_in_"+name]
    df = df.merge(df_temp, how='left')  # 左合并   

100%|██████████| 11/11 [00:00<00:00, 53.40it/s]


In [None]:
L_list = ['L1', 'L2', 'L3', 'L4', 'L5', 'L6']
L_norm_list = [i+"_norm" for i in L_list]
max_min_scaler = lambda x : (x-x.min())/(x.max()-x.min())
for i in L_list: # 将L1-6进行min-max归一化
    df[f'{i}_norm'] = df[[i]].apply(max_min_scaler).values
df["L_norm_sum"] = df[L_norm_list].sum(axis=1) # 归一化总和
df["L_sum"] = df[L_list].sum(axis=1) # 总和
df['L_norm_std']  = df[L_norm_list].std(axis=1) # 方差
df['L_std']  = df[L_list].std(axis=1) # 方差

In [None]:
# 猜测复杂L1-L6是否大于0可能会有影响
ge0 = lambda x: 1 if x>=0 else 0
L_ge0_list = [f"is_{i}_ge0" for i in L_list]
for i in L_list:
    df[f'is_{i}_ge0'] = df[[i]].applymap(ge0).values
df["num_L_ge0"] = df[L_ge0_list].sum(axis=1)
df["is_L_all_ge0"] = df["is_L1_ge0"] * df["is_L2_ge0"] * df["is_L3_ge0"] * df["is_L4_ge0"] * df["is_L5_ge0"] * df["is_L6_ge0"]

# 测试

In [3]:
# 新建一个dataframe，总行数为原df的1/48，将原df的所有feat都整合成长48的list
column_names = list(df.columns.values) #.remove("id")
# column_names.remove("id")
df_48 = pd.DataFrame(columns=column_names)
# test_id_list = df_test["id"].values.tolist() # 存储测试集id，便于分隔
# df_48.info()

In [4]:
feat_name = df_48.columns.values
# 把M1和M2分开
for transformers in df["transformers"].value_counts().index.values:
    date_id_list = df[df["transformers"] == transformers]["date_id"].value_counts().index.values
    for date_id in tqdm(date_id_list):
        tmp = df.query(f' transformers=={transformers} and date_id=={date_id} ')
        data = {i: tmp[i].values for i in feat_name}
        # data = {
        #     "y": df_list["y"].values,
        #     "L1": df_list["L1"].values,
        #     "L2": df_list["L2"].values,
        #     "L3": df_list["L3"].values,
        #     }
        df_48 = df_48.append(data, ignore_index=True)
df_48[["id","y"]]

100%|██████████| 517/517 [00:07<00:00, 70.32it/s]
100%|██████████| 483/483 [00:06<00:00, 71.54it/s]


Unnamed: 0,id,y
0,"[M1_D00003_00_00, M1_D00003_00_30, M1_D00003_0...","[9.7434, 9.955, 10.1302, 10.5524, 10.377, 10.9..."
1,"[M1_D00582_00_00, M1_D00582_00_30, M1_D00582_0...","[13.472, 13.4716, 13.4719, 13.3662, 13.4366, 1..."
2,"[M1_D00631_00_00, M1_D00631_00_30, M1_D00631_0...","[16.7076, 16.884, 17.657, 17.0595, 17.3757, 17..."
3,"[M1_D00630_00_00, M1_D00630_00_30, M1_D00630_0...","[15.9694, 16.0042, 16.2151, 16.1796, 16.1799, ..."
4,"[M1_D00627_00_00, M1_D00627_00_30, M1_D00627_0...","[17.1651, 17.9387, 18.0092, 17.8331, 17.9038, ..."
...,...,...
995,"[M2_D00299_00_00, M2_D00299_00_30, M2_D00299_0...","[27.4078, 27.2975, 26.9678, 26.8582, 26.529, 2..."
996,"[M2_D00298_00_00, M2_D00298_00_30, M2_D00298_0...","[27.5171, 27.1877, 26.8583, 26.6381, 26.3086, ..."
997,"[M2_D00296_00_00, M2_D00296_00_30, M2_D00296_0...","[13.6133, 13.6133, 13.1744, 13.0644, 12.8443, ..."
998,"[M2_D00295_00_00, M2_D00295_00_30, M2_D00295_0...","[15.5177, 15.5175, 15.5179, 15.5177, 15.5175, ..."


In [5]:
# 区分train和test
test_no_list = []
train_no_list = [i for i in range(1000)]
for i,j in enumerate(df_48["y"]):
    if j.tolist() != j.tolist(): # 即为nan
        # print(j)
        test_no_list.append(i)
        train_no_list.remove(i)

len(test_no_list), len(train_no_list)

(200, 800)

In [6]:
# df_48.drop(columns="id").info()
df_48.iloc[train_no_list].drop(columns="id").info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 800 entries, 0 to 998
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   transformers  800 non-null    object
 1   date_id       800 non-null    object
 2   month         800 non-null    object
 3   is_weekend    800 non-null    object
 4   time          800 non-null    object
 5   L1            800 non-null    object
 6   L2            800 non-null    object
 7   L3            800 non-null    object
 8   L4            800 non-null    object
 9   L5            800 non-null    object
 10  L6            800 non-null    object
 11  y             800 non-null    object
dtypes: object(12)
memory usage: 81.2+ KB


In [7]:
# 数据框转换为numpy数组
df_train_np = df_48.iloc[train_no_list].drop(columns="id").to_numpy() # 抽取出train

diff = len(["id","y"])
# 提取出list
df_train_list = []
for i in range(len(df_48.columns.values) - diff): # 去除y和id所以减2
    df_train_list.append(df_train_np[:, i])
label = df_train_np[:, -1] # 

# 将特征A，特征B和标签转换为三维张量。将数值类型从list转换为array
x_train = np.array(
    [
        [df_train_list[i][j] for i in range(len(df_48.columns.values) - diff)] 
        for j in range(len(df_train_list[0]))
    ]
    )
y_train = np.array([label[i] for i in range(len(label))])


df_test_np = df_48.iloc[test_no_list].drop(columns="id").to_numpy() # 抽取出test
# 提取出list
df_test_list = []
for i in range(len(df_48.columns.values) - diff):
    df_test_list.append(df_test_np[:, i])

x_test = np.array(
    [
        [df_test_list[i][j] for i in range(len(df_48.columns.values) - diff)] 
        for j in range(len(df_test_list[0]))
    ]
    )

In [8]:
# import pandas as pd
# import numpy as np

# # 生成有内部关联性的数据
# feature_a = np.random.normal(10, 2, (10000, 2)).tolist()
# feature_b = np.random.normal(15, 3, (10000, 2)).tolist()
# label = np.random.normal(20, 5, (10000, 2)).tolist()

# # 创建数据框
# data = {
#     'feature_a': feature_a,
#     'feature_b': feature_b,
#     'label': label
# }
# df = pd.DataFrame(data,index=range(10000,20000,1))

# folds = KFold(n_splits=5, shuffle=True, random_state=2019)

# for fold_, (trn_idx, val_idx) in enumerate(folds.split(range(10000), df['label'])):
#     print(val_idx)

In [12]:
x_test.shape,x_train.shape

((200, 11, 48), (800, 11, 48))

In [9]:
import time
from keras.layers import LSTM, Dense
from keras.models import Sequential
from keras.callbacks import EarlyStopping
from sklearn.model_selection import KFold
# 五折交叉验证
folds = KFold(n_splits=5, shuffle=True, random_state=2019)
oof = np.zeros([len(x_train), 48])
predictions = np.zeros([len(x_test), 48])   

for fold_, (trn_idx, val_idx) in enumerate(
    folds.split(range(len(x_train)), y_train)
    ):
    # KFold五折交叉
    print(f'\nFold_{fold_+1} Featuring ==========={time.asctime(time.localtime(time.time()))}\n')
    X_tra, X_val = x_train[trn_idx], x_train[val_idx]
    y_tra, y_val = y_train[trn_idx], y_train[val_idx]

    model = Sequential()
    model.add(LSTM(320, input_shape=(11, 48))) # input_shape（特征数量，特征值维数）
    # model.add(LSTM(320, input_shape=(640, 1))) # input_shape（特征数量，特征值维数）
    model.add(Dense(48))
    model.compile(loss='mean_squared_error', optimizer='adam',metrics=['mae'])

    # 训练模型
    # x_train是特征A和特征B的训练数据，y_train是标签的训练数据
    # x_train和y_train需要转换成三维张量
    model.fit(x_train, y_train, epochs=1000, batch_size=8,
        callbacks=[EarlyStopping(monitor='loss', patience=5)])

    oof[val_idx] = model.predict(X_val)

    
    predictions += model.predict(x_test) / folds.n_splits
    # print(predictions)
    del model






MemoryError: Unable to allocate 1.61 MiB for an array with shape (800, 11, 48) and data type float32

In [None]:
# len(y_pred)
# y_pred[0]
from sklearn.metrics import mean_squared_error, mean_absolute_error
score = mean_absolute_error(y_train, oof)
score

2.3009556762084844

In [None]:
df_test_48 = df_48.iloc[test_no_list]
for i,j in enumerate(df_test_48["y"]):
    df_test_48.iloc[i,-1] = predictions[i]
df_test_48[["id","y"]]

Unnamed: 0,id,y
132,"[M1_D00525_00_00, M1_D00525_00_30, M1_D00525_0...","[17.960859775543213, 18.03029155731201, 18.013..."
133,"[M1_D00524_00_00, M1_D00524_00_30, M1_D00524_0...","[0.4580961838364601, 0.48713668156415224, 0.16..."
134,"[M1_D00512_00_00, M1_D00512_00_30, M1_D00512_0...","[21.23642587661743, 21.12035059928894, 21.0759..."
135,"[M1_D00501_00_00, M1_D00501_00_30, M1_D00501_0...","[6.550054311752319, 6.462775826454163, 6.51308..."
136,"[M1_D00484_00_00, M1_D00484_00_30, M1_D00484_0...","[15.822006940841675, 15.91051459312439, 15.856..."
...,...,...
753,"[M2_D00075_00_00, M2_D00075_00_30, M2_D00075_0...","[12.560163497924805, 12.326066493988037, 12.16..."
754,"[M2_D00073_00_00, M2_D00073_00_30, M2_D00073_0...","[32.80941581726074, 32.43967008590698, 32.3510..."
755,"[M2_D00066_00_00, M2_D00066_00_30, M2_D00066_0...","[16.509177684783936, 16.18205213546753, 16.175..."
756,"[M2_D00053_00_00, M2_D00053_00_30, M2_D00053_0...","[23.47450065612793, 23.328338623046875, 23.062..."


In [None]:
df_test

Unnamed: 0,id,transformers,date_id,month,is_weekend,time,L1,L2,L3,L4,L5,L6
0,M1_D00000_00_00,M1,D00000,3,1,0:00,13.0278,8.4753,4.3406,1.7753,-0.2663,11.782043
1,M1_D00000_00_30,M1,D00000,3,1,0:30,12.3578,8.1378,4.0206,0.9720,-0.5864,11.772829
2,M1_D00000_01_00,M1,D00000,3,1,1:00,13.8315,9.8085,3.9294,2.3443,0.3556,11.854339
3,M1_D00000_01_30,M1,D00000,3,1,1:30,13.8988,9.7902,3.9297,1.3737,-0.2842,11.786057
4,M1_D00000_02_00,M1,D00000,3,1,2:00,13.8315,9.8256,3.9751,1.9085,0.1776,11.518905
...,...,...,...,...,...,...,...,...,...,...,...,...
9595,M2_D00712_21_30,M2,D00712,2,1,21:30,38.5751,46.9907,-8.6837,0.0000,2.0513,12.698127
9596,M2_D00712_22_00,M2,D00712,2,1,22:00,36.8583,45.0476,-8.0806,0.0003,1.6087,12.376602
9597,M2_D00712_22_30,M2,D00712,2,1,22:30,34.8477,43.8814,-8.8380,0.0002,2.0377,12.180814
9598,M2_D00712_23_00,M2,D00712,2,1,23:00,34.0516,43.1575,-9.5099,0.0001,1.6085,12.049113


In [None]:
# df_test_48[["id",]].values[0].tolist()
# arr.flatten()
test_id_list = np.concatenate(df_test_48["id"].values)
test_y_list = np.concatenate(df_test_48["y"].values)
df_test["y"] = -100 # 添加新列置为-100
for i,j in tqdm(zip(test_id_list, test_y_list)):
    df_test.loc[df_test["id"]==i,"y"] = j # 修改值不能用[]
df_test[["id","y"]]

9600it [00:04, 2031.55it/s]


In [None]:
df_test[["id","y"]].to_csv(f'lstm_{int(100*score)}.csv', index=False)

KeyError: "['y'] not in index"

In [None]:
# int(100*round(2.352429460351817, 2))
int(100*2.352429460351817)

235