In [1]:
import json
import pickle
import numpy as np
import pandas as pd
import collections
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Number of consecutive time steps that form the input window of one sample.
input_len = 144

# Read the raw turbine records and replace every missing value with 0
# in a single expression.
df = pd.read_csv('data/wtbdata_245days.csv').fillna(0)

In [3]:
## drop abnormal samples
def drop_abnormal(df_train, max_bad_points=100, patv_floor=0, pab1_ceiling=89):
    """Remove samples whose input window is dominated by abnormal readings.

    A row is discarded when either of the following holds:

    * more than ``max_bad_points`` values in its ``Patv_seq`` list are
      strictly below ``patv_floor``;
    * more than ``max_bad_points`` values in its ``Pab1_seq`` list are
      strictly above ``pab1_ceiling``.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Frame with ``Patv_seq`` and ``Pab1_seq`` columns whose cells are
        iterables of numbers.
    max_bad_points : int, default 100
        Largest number of offending values a sequence may contain and still
        be kept.
    patv_floor : number, default 0
        ``Patv_seq`` values below this are counted as offending.
    pab1_ceiling : number, default 89
        ``Pab1_seq`` values above this are counted as offending.

    Returns
    -------
    pandas.DataFrame
        The surviving rows in their original order, keeping their original
        index labels.
    """
    def _count(seq, is_bad):
        # Number of elements of ``seq`` for which ``is_bad`` is true.
        return sum(1 for value in seq if is_bad(value))

    keep_positions = []
    for pos, (patv_seq, pab1_seq) in enumerate(zip(df_train['Patv_seq'], df_train['Pab1_seq'])):
        too_many_low_patv = _count(patv_seq, lambda v: v < patv_floor) > max_bad_points
        too_many_high_pab1 = _count(pab1_seq, lambda v: v > pab1_ceiling) > max_bad_points
        if not (too_many_low_patv or too_many_high_pab1):
            keep_positions.append(pos)

    # Rows were identified by position, so they must be selected by position;
    # label-based selection would only be correct for a default 0..n-1 index.
    return df_train.iloc[keep_positions]

## generate sequences of each turbine

In [4]:
# Build sliding-window samples for each of the 134 turbines and write one CSV
# per turbine. Every sample holds `input_len` consecutive values of five
# measured columns plus the following `horizon` values of Patv as the target.
horizon = 288
feature_cols = ['Wspd', 'Patv', 'Etmp', 'Itmp', 'Pab1']
for turbid in range(134):
    full_dict = collections.defaultdict(list)
    turb_data = df[df['TurbID']==turbid+1].reset_index(drop=True)

    # Convert each column to a plain list once per turbine; doing this inside
    # the window loop rebuilt the full lists on every iteration.
    values = {col: list(turb_data[col]) for col in feature_cols}
    n_rows = len(values['Patv'])

    # NOTE(review): this range stops one step before the last window that
    # still has a complete target; kept as-is so the sample count is unchanged.
    for i in range(0, n_rows-horizon-input_len):
        for col in feature_cols:
            full_dict[col+'_seq'].append(values[col][i:i+input_len])
        full_dict['target'].append(values['Patv'][i+input_len:i+input_len+horizon])

    df_his = pd.DataFrame(full_dict)
    # Written below the relative `data/` directory, which is where the
    # sampling cells read `data/turbine_data<N>.csv` from.
    df_his.to_csv('data/turbine_data'+str(turbid+1)+'.csv', index=False)

## get space information

In [5]:
# For every turbine, rank all turbines by Euclidean distance in the (x, y)
# plane and store the TurbIDs from nearest to farthest.
df_loc = pd.read_csv('data/sdwpf_baidukddcup2022_turb_location.CSV')

dist_dict = {}
for i in tqdm(range(134)):
    here = df_loc.loc[df_loc['TurbID']==i+1, ['x', 'y']]
    # `.item()` extracts the single matching value and raises if the TurbID
    # is missing or duplicated; calling float() on a Series is deprecated.
    x = float(here['x'].item())
    y = float(here['y'].item())
    # Scratch column, overwritten on every iteration.
    df_loc['dist'] = np.sqrt((df_loc['x']-x)**2+(df_loc['y']-y)**2)
    # The first entry of the ranking is skipped; presumably it is the turbine
    # itself at distance 0 — holds only if no two turbines share coordinates.
    dist_dict[i+1] = list(df_loc.sort_values('dist', ascending=True)['TurbID'])[1:]

100%|██████████| 134/134 [00:00<00:00, 968.02it/s]


In [6]:
# Gather, for every (Day, Tmstamp) pair, the Patv values of all rows sharing
# that timestamp into one list stored in the `Patv_list` column.
df_group = (
    df[['Day','Tmstamp','Patv']]
    .groupby(['Day','Tmstamp'], as_index=False)
    .agg(list)
    .rename(columns={'Patv': 'Patv_list'})
)

In [7]:
# Persist the per-turbine neighbour ranking to disk.
dist_dict_path = 'data/dist_dict.pickle'
with open(dist_dict_path, 'wb') as pickle_file:
    pickle.dump(dist_dict, pickle_file)

## sample data

In [9]:
## seed = 2
# Draw 1% of the windows of every turbine and attach, to each drawn window,
# the Patv of the 121 nearest turbines at the window's last input time step.
seed = 2
sampled_frames = []
for i in tqdm(range(134)):
    df_tmp = pd.read_csv('data/turbine_data'+str(i+1)+'.csv')
    # Remember each window's start row before sampling shuffles the frame.
    df_tmp['index'] = list(range(len(df_tmp)))
    df_tmp = df_tmp.sample(frac=0.01, random_state=seed)
    index_list = list(df_tmp['index'])
    df_tmp['TurbID'] = i+1

    turb_data = df[df['TurbID']==i+1].reset_index(drop=True)
    near_turbs = dist_dict[i+1][:121]
    # Row of the last input step of every sampled window.
    index_new = [x+input_len-1 for x in index_list]
    selected_df = turb_data.loc[index_new]
    # A left merge keeps the row order of `selected_df`, so the result lines
    # up positionally with `df_tmp`.
    selected_df = pd.merge(selected_df, df_group, how='left', on=['Day','Tmstamp'])
    # assumes every Patv_list holds one value per turbine ordered by TurbID,
    # so that position k-1 belongs to turbine k — TODO confirm
    selected_df['Patv_space'] = selected_df['Patv_list'].apply(lambda x: [x[k-1] for k in near_turbs])
    df_tmp['Patv_space'] = list(selected_df['Patv_space'])

    sampled_frames.append(df_tmp)

# Concatenate once instead of growing the frame inside the loop.
train = pd.concat(sampled_frames).reset_index(drop=True)
train = train.sample(frac=1, random_state=seed).reset_index(drop=True)

100%|██████████| 134/134 [03:55<00:00,  1.76s/it]


In [11]:
# The sequence and target columns come back from CSV as text; decode each
# cell into a real Python list.
cols = [name for name in train.columns if name == 'target' or 'seq' in name]
for name in cols:
    train[name] = train[name].apply(json.loads)

# Filter out abnormal samples, then store the result.
train = drop_abnormal(train)
train.to_csv('data/train_data_normal.csv', index=False)

In [12]:
## seed = 42
# Draw 1% of the windows of every turbine and attach, to each drawn window,
# the Patv of the 121 nearest turbines at the window's last input time step.
# The seed below now matches the heading and the `train_data42.csv` output;
# with random_state=2 this cell reproduced the seed-2 sample exactly.
seed = 42
sampled_frames = []
for i in tqdm(range(134)):
    df_tmp = pd.read_csv('data/turbine_data'+str(i+1)+'.csv')
    # Remember each window's start row before sampling shuffles the frame.
    df_tmp['index'] = list(range(len(df_tmp)))
    df_tmp = df_tmp.sample(frac=0.01, random_state=seed)
    index_list = list(df_tmp['index'])
    df_tmp['TurbID'] = i+1

    turb_data = df[df['TurbID']==i+1].reset_index(drop=True)
    near_turbs = dist_dict[i+1][:121]
    # Row of the last input step of every sampled window.
    index_new = [x+input_len-1 for x in index_list]
    selected_df = turb_data.loc[index_new]
    # A left merge keeps the row order of `selected_df`, so the result lines
    # up positionally with `df_tmp`.
    selected_df = pd.merge(selected_df, df_group, how='left', on=['Day','Tmstamp'])
    # assumes every Patv_list holds one value per turbine ordered by TurbID,
    # so that position k-1 belongs to turbine k — TODO confirm
    selected_df['Patv_space'] = selected_df['Patv_list'].apply(lambda x: [x[k-1] for k in near_turbs])
    df_tmp['Patv_space'] = list(selected_df['Patv_space'])

    sampled_frames.append(df_tmp)

# Concatenate once instead of growing the frame inside the loop.
train = pd.concat(sampled_frames).reset_index(drop=True)
train = train.sample(frac=1, random_state=seed).reset_index(drop=True)

100%|██████████| 134/134 [02:27<00:00,  1.10s/it]


In [13]:
# The sequence and target columns come back from CSV as text; decode each
# cell into a real Python list.
cols = [name for name in train.columns if name == 'target' or 'seq' in name]
for name in cols:
    train[name] = train[name].apply(json.loads)

# Filter out abnormal samples, then store the result.
train = drop_abnormal(train)
train.to_csv('data/train_data42.csv', index=False)

In [14]:
## seed = 2022
# Draw 1% of the windows of every turbine and attach, to each drawn window,
# the Patv of the 121 nearest turbines at the window's last input time step.
# The seed below now matches the heading and the `train_data2022.csv` output;
# with random_state=2 this cell reproduced the seed-2 sample exactly.
seed = 2022
sampled_frames = []
for i in tqdm(range(134)):
    df_tmp = pd.read_csv('data/turbine_data'+str(i+1)+'.csv')
    # Remember each window's start row before sampling shuffles the frame.
    df_tmp['index'] = list(range(len(df_tmp)))
    df_tmp = df_tmp.sample(frac=0.01, random_state=seed)
    index_list = list(df_tmp['index'])
    df_tmp['TurbID'] = i+1

    turb_data = df[df['TurbID']==i+1].reset_index(drop=True)
    near_turbs = dist_dict[i+1][:121]
    # Row of the last input step of every sampled window.
    index_new = [x+input_len-1 for x in index_list]
    selected_df = turb_data.loc[index_new]
    # A left merge keeps the row order of `selected_df`, so the result lines
    # up positionally with `df_tmp`.
    selected_df = pd.merge(selected_df, df_group, how='left', on=['Day','Tmstamp'])
    # assumes every Patv_list holds one value per turbine ordered by TurbID,
    # so that position k-1 belongs to turbine k — TODO confirm
    selected_df['Patv_space'] = selected_df['Patv_list'].apply(lambda x: [x[k-1] for k in near_turbs])
    df_tmp['Patv_space'] = list(selected_df['Patv_space'])

    sampled_frames.append(df_tmp)

# Concatenate once instead of growing the frame inside the loop.
train = pd.concat(sampled_frames).reset_index(drop=True)
train = train.sample(frac=1, random_state=seed).reset_index(drop=True)

100%|██████████| 134/134 [02:28<00:00,  1.11s/it]


In [15]:
# The sequence and target columns come back from CSV as text; decode each
# cell into a real Python list.
cols = [name for name in train.columns if name == 'target' or 'seq' in name]
for name in cols:
    train[name] = train[name].apply(json.loads)

# Filter out abnormal samples, then store the result.
train = drop_abnormal(train)
train.to_csv('data/train_data2022.csv', index=False)