In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import random
import tensorflow as tf
import matplotlib.pyplot as plt
import os

np.random.seed(7)
random.seed(7)
tf.random.set_seed(7)

In [2]:
"""
Read.me

#1. csv파일 읽어오기, x 칼럼 정리
#2. Pre-processing - Train, Test 데이터를 수정 
                    시간추가, 
                    누적데이터를 10분간의 데이터로 변경, 
                    풍향을 16방위로 변경, 
                    고장난 센서 삭제       
                    
#3. X변수들을 산술평균, 가중평균으로 계산
                    - 변수를 9개로 압축하면, 오히려 mse값이 증가하는 문제가 있음
                      일단 전체 X를 사용하도록, 주석처리 해놨음
                      
#4. 데이터를 120분씩 묶는 함수 (0~120, 10~130 ...) 
#5. X를 모두 이용한 train_df 생성, 데이터 정규화, 지정한 Y만큼 데이터 늘리기
#6. X를 모두 이용한 test_df 생성, 정규화
#7. 모델 생성
#8. 모델 학습
#9. 마지막 3일 데이터('Y18')를 사용하여 fine-tuning 
#10. 결과 예측하기
"""

"\nRead.me\n\n#1. csv파일 읽어오기, x 칼럼 정리\n#2. Pre-processing - Train, Test 데이터를 수정 \n                    시간추가, \n                    누적데이터를 10분간의 데이터로 변경, \n                    풍향을 16방위로 변경, \n                    고장난 센서 삭제       \n                    \n#3. X변수들을 산술평균, 가중평균으로 계산\n                    - 변수를 9개로 압축하면, 오히려 mse값이 증가하는 문제가 있음\n                      일단 전체 X를 사용하도록, 주석처리 해놨음\n                      \n#4. 데이터를 120분씩 묶는 함수 (0~120, 10~130 ...) \n#5. X를 모두 이용한 train_df 생성, 데이터 정규화, 지정한 Y만큼 데이터 늘리기\n#6. X를 모두 이용한 test_df 생성, 정규화\n#7. 모델 생성\n#8. 모델 학습\n#9. 마지막 3일 데이터('Y18')를 사용하여 fine-tuning \n#10. 결과 예측하기\n"

In [3]:
#1
#csv파일 불러오기
train = pd.read_csv('/Users/keom/Desktop/work/Weather Forecast/train.csv')
test = pd.read_csv('/Users/keom/Desktop/work/Weather Forecast/test.csv')
    
#x칼럼 정리.
x_time = ['id'] #시간
x_temperature = ['X00', 'X07', 'X28', 'X31', 'X32'] #기온
x_local_pressure = ['X01', 'X06', 'X22', 'X27', 'X29'] #현지기압
x_wind_speed = ['X02', 'X03', 'X18', 'X24', 'X26'] #풍속
x_daily_rainfall = ['X04', 'X10', 'X21', 'X36', 'X39'] #일일 누적강수량
x_sealevel_pressure = ['X05', 'X08', 'X09', 'X23', 'X33'] #해면기압
x_daily_sun = ['X11', 'X34'] #일일 누적일사량 (X14, X16, X19는 제거됨)
x_humidity = ['X12', 'X20', 'X30', 'X37', 'X38'] #습도
x_wind_direction = ['X13', 'X15', 'X17', 'X25', 'X35'] #풍향

#y칼럼 정리
y_sensor = ['Y00', 'Y01','Y02','Y03','Y04', 'Y05', 'Y06', 'Y07', 'Y08', 
            'Y09', 'Y10', 'Y11', 'Y12', 'Y13', 'Y14', 'Y15', 'Y16', 'Y17'] #센서측정온도
y_target = ['Y18'] #예측대상

In [4]:
"""
#2 pre-processing(1)

시간변경, 누적데이터를 10분간의 데이터로 변경, 풍향을 16방위로 변경, 고장난 센서 삭제

"""
# 날짜를 나타내주는 변수 추가
def make_day(df):
    for j in np.arange(len(df["id"])) :
        df.loc[j,"day"] = (df.loc[j, "id"] // 144) + 1 

#id값들을 시간형 데이터로 변경 (y=x)
def make_timedata(df):
    minute = (df['id'] % 144).astype(int)
    df['id'] = minute
    print("id를 y=x로 변경 완료")

    
# 누적강수량/누적일사량을 10분 단위로 변환
def accumulate_to_10minute(df, col):
    for i in col:
        #누적데이터를 10분간의 데이터로 분류
        tmp = df[i].iloc[0]
        df[i] = df[i] - df[i].shift(1)
        df[i].iloc[0] = tmp
        
        #24시를 넘어서 데이터가 음수가 된다면, 0으로 변경
        for j in np.arange(0,len(df[i])):
            if df[i].iloc[j] < 0:
                df[i].iloc[j] = 0   
    print("누적 데이터 변경 완료")
     

#풍향을 0은 1, 360은 -1로 가지는 값으로 변경
def make_wind_direction(df):
    df['X13'] = np.cos(df['X13'] * np.pi /360)
    df['X15'] = np.cos(df['X15'] * np.pi /360)
    df['X17'] = np.cos(df['X17'] * np.pi /360)
    df['X25'] = np.cos(df['X25'] * np.pi /360)
    df['X35'] = np.cos(df['X35'] * np.pi /360)
    print('풍향을 cos데이터로 변경 완료')
    

    
#고장난 센서 X14, X16, X19 삭제
train = train.drop(['X14', 'X16', 'X19'], axis =1)
test = test.drop(['X14', 'X16', 'X19'], axis =1)
print('X14, X16, X19 삭제 완료')

#시간 생성
make_day(train)
make_day(test)

make_timedata(train)
make_timedata(test)

#누적 강수량 변경
accumulate_to_10minute(train, x_daily_rainfall)
accumulate_to_10minute(test, x_daily_rainfall)

#누적 일사량 변경
accumulate_to_10minute(train, x_daily_sun)
accumulate_to_10minute(test, x_daily_sun)

#wind_direction을 cos형 데이터로 변경
make_wind_direction(train)
make_wind_direction(test)

X14, X16, X19 삭제 완료
id를 y=x로 변경 완료
id를 y=x로 변경 완료


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


누적 데이터 변경 완료
누적 데이터 변경 완료
누적 데이터 변경 완료
누적 데이터 변경 완료
풍향을 cos데이터로 변경 완료
풍향을 cos데이터로 변경 완료


In [5]:
print(train); print(test)

       id   X00    X01  X02  X03  X04     X05    X06   X07     X08  ...  Y10  \
0       0   9.7  988.8  1.2  0.6  0.0  1009.3  989.6  12.2  1009.9  ...  7.5   
1       1   9.3  988.9  1.7  1.9  0.0  1009.3  989.6  12.1  1010.0  ...  7.5   
2       2   9.4  989.0  1.1  2.3  0.0  1009.2  989.7  12.1  1010.1  ...  7.5   
3       3   9.4  988.9  1.5  0.7  0.0  1009.2  989.6  12.0  1010.0  ...  7.0   
4       4   9.2  988.9  0.8  1.7  0.0  1009.2  989.7  12.0  1010.1  ...  7.0   
...   ...   ...    ...  ...  ...  ...     ...    ...   ...     ...  ...  ...   
4747  139  19.9  987.6  0.9  0.8  0.0  1006.9  987.7  21.7  1007.5  ...  NaN   
4748  140  19.9  987.6  0.5  0.7  0.0  1006.8  987.7  21.6  1007.5  ...  NaN   
4749  141  19.7  987.7  0.9  0.6  0.0  1006.9  987.6  21.4  1007.4  ...  NaN   
4750  142  19.4  987.7  0.9  0.8  0.0  1006.9  987.8  21.3  1007.6  ...  NaN   
4751  143  19.1  987.6  1.0  0.3  0.0  1006.8  987.8  21.2  1007.5  ...  NaN   

      Y11  Y12   Y13  Y14  Y15  Y16  Y1

In [34]:
x = train.iloc[:,1:38]
train.loc[:4319, "Y18"] = train.loc[:4319,y_sensor].mean(axis=1)
y = train["Y18"]

In [39]:
from sklearn.ensemble import ExtraTreesRegressor

et_model = ExtraTreesRegressor()
et_model.fit(x,y)

print(et_model.feature_importances_)
feature_list = pd.concat([pd.Series(x.columns), pd.Series(et_model.feature_importances_)], axis=1)
feature_list.columns = ['features_name', 'importance']
feature_list.sort_values("importance", ascending =False)

[1.74553416e-01 5.25159478e-04 4.27855899e-04 3.74954499e-04
 5.10998654e-05 6.61819988e-04 7.65721540e-04 1.29180441e-01
 8.10805585e-04 6.50430810e-04 1.22306239e-05 3.44916253e-02
 8.96492908e-02 6.40574052e-04 6.71607016e-04 4.93308509e-04
 4.34599922e-04 5.58970852e-02 4.09445047e-06 6.16318986e-04
 6.90037834e-04 3.77268404e-04 5.18102845e-04 5.61034681e-04
 6.67803580e-04 6.63767646e-02 6.32439042e-04 5.66007624e-02
 1.74474214e-01 1.57862282e-01 6.86111701e-04 3.54884420e-02
 5.34173081e-04 5.92876111e-05 1.21215539e-02 1.35750134e-03
 7.97802512e-05]


Unnamed: 0,features_name,importance
0,X00,0.174553
28,X31,0.174474
29,X32,0.157862
7,X07,0.12918
12,X12,0.089649
25,X28,0.066377
27,X30,0.056601
17,X20,0.055897
31,X34,0.035488
11,X11,0.034492


In [46]:
x = train.iloc[:4320,1:38]
y = train.loc[:4319,"Y16"]

In [47]:
from sklearn.ensemble import ExtraTreesRegressor

et_model = ExtraTreesRegressor()
et_model.fit(x,y)

print(et_model.feature_importances_)
feature_list = pd.concat([pd.Series(x.columns), pd.Series(et_model.feature_importances_)], axis=1)
feature_list.columns = ['features_name', 'importance']
feature_list.sort_values("importance", ascending =False)

[1.47577687e-01 6.98004128e-04 6.44060270e-04 5.94389130e-04
 4.29297216e-05 6.61038095e-04 7.35911249e-04 6.54249857e-02
 5.98294216e-04 6.69694011e-04 1.31246988e-05 1.32316291e-01
 1.09960702e-01 6.77152441e-04 6.39991548e-04 5.26340068e-04
 5.58718798e-04 4.31561986e-02 5.24415602e-06 6.59705346e-04
 6.93313354e-04 5.60211558e-04 7.77669226e-04 6.55508054e-04
 6.98143110e-04 1.65774940e-02 7.07693147e-04 3.01482408e-02
 1.36566030e-01 2.14288590e-01 8.05199274e-04 8.21217048e-02
 5.10843572e-04 1.99797747e-05 6.68545226e-03 1.92916905e-03
 9.42960178e-05]


Unnamed: 0,features_name,importance
29,X32,0.214289
0,X00,0.147578
28,X31,0.136566
11,X11,0.132316
12,X12,0.109961
31,X34,0.082122
7,X07,0.065425
17,X20,0.043156
27,X30,0.030148
25,X28,0.016577


In [120]:
#3 x변수들을 평균 값들로 변경하는 부분. (산술평균, 가중평균)

"""
변수는 총 9개

변경X - 시간, 풍향
산술평균 - 기온, 강수량, 일사량
가중평균 - 해면기압, 지면기압, 풍속, 습도

"""
# Train의 산술평균 (기온, 강수량, 일사량)
def make_train_avg(df, col_name, col_num=[]):
    df[col_name] = pd.Series(train[col_num].mean(axis = 1))
    
    
# Train의 가중평균 (해면기압, 지면기압, 풍속, 습도)
def make_train_w_avg(df, col_name, col_num=[]):
    mat = pd.concat([train.loc[:4319,col_num], train.loc[:4319,y_sensor]], axis=1)
    cor_matrix = mat.corr().iloc[5:,:5]
    cor = pd.DataFrame(np.abs(cor_matrix).sum())
    cor_sum = pd.DataFrame(cor.sum(axis=0))
    weight = pd.Series()
    weighted_mean = []
    
    for i in col_num:
        weight[i] = (cor.loc[i]/cor_sum).values
    for j in np.arange(len(train["X12"])):
        weighted_mean.append(np.average(train.loc[j, col_num], 
                                        weights = weight.values, axis=0)[0][0])

    df[col_name] = pd.Series(weighted_mean)
    print(df)
    print("가중평균 변환 완료")  

    
# Test의 산술평균 (기온, 강수량, 일사량)
def make_test_avg(df, col_name, col_num=[]):
    df[col_name] = pd.Series(test[col_num].mean(axis = 1))
    
    
# Test의 가중평균 (해면기압, 지면기압, 풍속, 습도)
def make_test_w_avg(df, col_name, col_num=[]):
    mat = pd.concat([train.loc[:,col_num], train.loc[:,y_sensor]], axis=1)
    cor_matrix = mat.corr().iloc[5:,:5]
    cor = pd.DataFrame(np.abs(cor_matrix).sum())
    cor_sum = pd.DataFrame(cor.sum(axis=0))
    weight = pd.Series()
    weighted_mean = []
    
    for i in col_num:
        weight[i] = (cor.loc[i]/cor_sum).values
    for j in np.arange(len(test["X12"])):
        weighted_mean.append(np.average(test.loc[j, col_num],
                                        weights = weight.values, axis=0)[0][0])

    df[col_name] = pd.Series(weighted_mean)
    print(df)
    print("가중평균 변환 완료")  
    

#학습용 데이터프레임 생성
train_df = pd.DataFrame()


#X 대표값 추가
make_train_avg(train_df, 'x_temperature', x_temperature)
make_train_avg(train_df, 'x_local_pressure', x_local_pressure)
make_train_avg(train_df, 'x_wind_speed', x_wind_speed)
make_train_avg(train_df, 'x_daily_rainfall', x_daily_rainfall)
make_train_avg(train_df, 'x_sealevel_pressure', x_sealevel_pressure)
make_train_avg(train_df, 'x_daily_sun', x_daily_sun)
make_train_avg(train_df, 'x_humidity', x_humidity)

#id값을 추가
train_df.insert(loc=0, column='id', value=train['id'])

#풍향도 일단 산술평균
make_train_avg(train_df, 'x_wind_direction', x_wind_direction)


# standardization을 위해 평균과 표준편차 구하기
MEAN = train_df.mean()
STD = train_df.std()

# 표준편차가 0일 경우 대비하여 1e-07 추가 
train_df = (train_df - MEAN) / (STD + 1e-07)

#day변수 추가
train_df.insert(loc=0, column='day', value=train['day'])
train_df



# 테스트용 데이터프레임 생성
test_df = pd.DataFrame()

#X 대표값 추가
make_test_avg(test_df, 'x_temperature', x_temperature)
make_test_avg(test_df, 'x_local_pressure', x_local_pressure)
make_test_avg(test_df, 'x_wind_speed', x_wind_speed)
make_test_avg(test_df, 'x_daily_rainfall', x_daily_rainfall)
make_test_avg(test_df, 'x_sealevel_pressure', x_sealevel_pressure)
make_test_avg(test_df, 'x_daily_sun', x_daily_sun)
make_test_avg(test_df, 'x_humidity', x_humidity)

#id값 추가
test_df.insert(loc=0, column='id', value=test['id'])

#풍향도 일단 산술평균
make_test_avg(test_df, 'x_wind_direction', x_wind_direction)

test_df = (test_df - MEAN) / (STD + 1e-07)

#day변수 추가
test_df.insert(loc=0, column='day', value=test['day'])
train_df


Unnamed: 0,day,id,x_temperature,x_local_pressure,x_wind_speed,x_daily_rainfall,x_sealevel_pressure,x_daily_sun,x_humidity,x_wind_direction
0,1.0,-1.719883,-2.140777,0.202786,-0.703308,-0.182847,0.316214,-0.795451,0.530917,1.293910
1,1.0,-1.695829,-2.182981,0.214597,-0.632867,-0.182847,0.333602,-0.795451,0.550002,1.094455
2,1.0,-1.671775,-2.216744,0.208692,-0.632867,-0.182847,0.333602,-0.795451,0.629354,0.646140
3,1.0,-1.647720,-2.246286,0.196880,-0.985069,-0.182847,0.322010,-0.795451,0.671541,0.642064
4,1.0,-1.623666,-2.271609,0.202786,-0.985069,-0.182847,0.327806,-0.795451,0.711719,1.689470
...,...,...,...,...,...,...,...,...,...,...
4747,33.0,1.623666,-0.068562,-0.393686,-0.679827,-0.182847,-0.385086,-0.795451,0.867410,-0.936496
4748,33.0,1.647720,-0.077003,-0.399592,-0.750268,-0.182847,-0.390882,-0.795451,0.895535,-0.279054
4749,33.0,1.671775,-0.089664,-0.387781,-1.032030,-0.182847,-0.379290,-0.795451,0.897543,-0.371285
4750,33.0,1.695829,-0.119207,-0.375969,-1.008550,-0.182847,-0.361903,-0.795451,0.914619,-0.264185


In [121]:
#4. RNN 모델에 입력 할 수 있는 시계열 형태로 데이터 변환 
def convert_to_timeseries(df, interval):
    sequence_list = []
    target_list = []
    
    for i in tqdm(range(df.shape[0] - interval)):
        sequence_list.append(np.array(df.iloc[i:i+interval,:-1]))
        target_list.append(df.iloc[i+interval,-1])
    
    sequence = np.array(sequence_list)
    target = np.array(target_list)
    
    return sequence, target

In [122]:
x_num = 10
#학습에 사용할 Y값 지정
y_columns = ['Y09', 'Y15', 'Y16']


#지정한 Y들만큼 데이터의 길이를 늘림
sequence = np.empty((0, 12, x_num))
target = np.empty((0,))

for column in y_columns :
    
    concat = pd.concat([train_df, train[column]], axis = 1)

    _sequence, _target = convert_to_timeseries(concat.head(144*30), interval = 12)

    sequence = np.vstack((sequence, _sequence))
    target = np.hstack((target, _target))

100%|██████████| 4308/4308 [00:01<00:00, 4065.23it/s]
100%|██████████| 4308/4308 [00:00<00:00, 4558.05it/s]
100%|██████████| 4308/4308 [00:00<00:00, 4846.66it/s]


In [123]:
"""
#6. test_df 생성 (x변수를 평균 사용)
"""

train_df['dummy'] = 0
test_df['dummy'] = 0

X_test, _ = convert_to_timeseries(pd.concat([train_df, test_df], axis = 0), interval=12)
X_test = X_test[-11520:, :, :]

# 만들어 두었던 dummy feature 제거
train_df.drop('dummy', axis = 1, inplace = True)
test_df.drop('dummy', axis = 1, inplace = True)

100%|██████████| 16260/16260 [00:07<00:00, 2125.77it/s]


In [124]:
"""

#7. 학습 모델 생성

"""
optimizer = tf.keras.optimizers.SGD(clipnorm=1.0)

simple_lstm_model = tf.keras.models.Sequential([
    tf.keras.layers.LSTM(128, input_shape=sequence.shape[-2:]),
    tf.keras.layers.Dense(256, activation='elu', kernel_initializer = "he_normal"),
    tf.keras.layers.Dense(128, activation='elu', kernel_initializer = "he_normal"),
    tf.keras.layers.Dense(1)
])

simple_lstm_model.compile(optimizer='adam', loss='mse')
simple_lstm_model.summary()


# loss가 4미만으로 떨어지면 학습 종료 시키는 기능
class myCallback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs = None):
        if(logs.get('loss') < 4):
            print('\n Loss is under 4, cancelling training')
            self.model.stop_training = True
            

callbacks = myCallback()

Model: "sequential_20"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_12 (LSTM)               (None, 128)               71168     
_________________________________________________________________
dense_44 (Dense)             (None, 256)               33024     
_________________________________________________________________
dense_45 (Dense)             (None, 128)               32896     
_________________________________________________________________
dense_46 (Dense)             (None, 1)                 129       
Total params: 137,217
Trainable params: 137,217
Non-trainable params: 0
_________________________________________________________________


In [125]:
"""
#8. 모델 학습
"""


simple_lstm_model.fit(    
    sequence, target,
    epochs=200,
    batch_size=64,
    verbose=1,
    shuffle=False,
    callbacks = [callbacks]
)

Train on 12924 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
 Loss is under 4, cancelling training


<tensorflow.python.keras.callbacks.History at 0x14fbd3d10>

In [126]:
from tensorflow import keras

finetune_X, finetune_y = convert_to_timeseries(pd.concat([train_df.tail(432), 
                                                          train['Y18'].tail(432)], axis = 1), 
                                               interval=12)
transfer_model = keras.models.Sequential(simple_lstm_model.layers[:-1])#output전까지
transfer_model.add(keras.layers.Dense(1))

simple_lstm_clone = keras.models.clone_model(simple_lstm_model)
simple_lstm_clone.set_weights(simple_lstm_model.get_weights())
#clone method로 weights는 복사 안됨

#freeze the reused layers for first few epochs, giving the new layer some time to learn reasonable weights

for layer in transfer_model.layers[:-1]:
    layer.trainable = False
transfer_model.compile(loss="mse", optimizer="adam")

# you must always compile after you freeze or unfreeze layers

# 이제 freeze로 few epochs만 train:
finetune_history = transfer_model.fit(finetune_X, finetune_y, epochs=2)

#다시 unfreeze:
for layer in transfer_model.layers[2:-1]:
    layer.trainable=True

#unfreeze했으니깐 compile:
transfer_model.compile(loss="mse", optimizer="adam")

#다시 train:
finetune_history = transfer_model.fit(finetune_X, finetune_y, epochs=15, batch_size=64,
                                    shuffle=False, verbose = 1)




100%|██████████| 420/420 [00:00<00:00, 2485.79it/s]


Train on 420 samples
Epoch 1/2
Epoch 2/2
Train on 420 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [127]:
#10. 결과 예측하기 
finetune_pred = simple_lstm_model.predict(X_test)


# 제출 파일 만들기
submit = pd.DataFrame({'id':range(144*33, 144*113),
              'Y18':finetune_pred.reshape(1,-1)[0]})

submit.to_csv('/Users/keom/Desktop/result1.csv', index = False)