In [None]:
import warnings
warnings.filterwarnings(action='ignore')
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder, RobustScaler, MinMaxScaler
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, recall_score, roc_curve, precision_score, f1_score, auc, mean_absolute_error
import matplotlib.pyplot as plt
from datetime import datetime

# 데이터 불러오기

In [None]:
# Root folder of the competition data (Google Drive mount).
filepath = '/content/drive/MyDrive/dacon/solar/'

# Training data and the submission template.
train = pd.read_csv(f'{filepath}train/train.csv')
submission = pd.read_csv(f'{filepath}sample_submission.csv')

# The per-file test sets (test/0.csv ... test/80.csv) are read further down,
# right before their features are built.

# Train Set 파생변수 생성

In [None]:
# Add a column holding each day's maximum TARGET value.
def max_feature(data):
  """Return ``data`` with the per-day maximum of TARGET appended.

  Parameters
  ----------
  data : pandas.DataFrame
      Must contain the columns 'Day' and 'TARGET'. It is not modified.

  Returns
  -------
  pandas.DataFrame
      A new frame (the merge gives it a fresh 0..n-1 index) with one extra
      column, 'max_tagrget', holding the largest TARGET of the row's 'Day'.
      The misspelled column name is kept on purpose: the rest of the
      notebook selects model features by column name.
  """
  # Aggregate TARGET only. The previous version took the max of every column
  # and stored it in a local named `max`, shadowing the builtin.
  daily_max = data.groupby('Day')['TARGET'].max().rename('max_tagrget').reset_index()
  return pd.merge(data, daily_max, on='Day', how='left')

In [None]:
# Add sunrise / sunset time columns.
def suntime_feature(data):
  """Add the time-of-day slot and the first/last generating slot of each day.

  'day_time' (Hour * 2 + Minute / 30) is written into ``data`` itself.
  'rising_time' and 'set_time' are the smallest and largest 'day_time' of the
  rows of the same 'Day' whose TARGET is not 0; days without such rows get
  NaN in both columns.

  Parameters
  ----------
  data : pandas.DataFrame
      Must contain 'Day', 'Hour', 'Minute' and 'TARGET'.

  Returns
  -------
  pandas.DataFrame
      A new merged frame with 'day_time', 'rising_time' and 'set_time'.
  """
  data['day_time'] = data['Hour'] * 2 + data['Minute'] * 1/30

  # Slots of the rows that actually produced power, grouped per day.
  daylight = data[data['TARGET'] != 0].groupby('Day')['day_time']

  for column, per_day in (('rising_time', daylight.min()), ('set_time', daylight.max())):
    data = pd.merge(data, per_day.rename(column).reset_index(), on='Day', how='left')

  return data

In [None]:
# Rolling 4- and 7-day means of TARGET, DHI, DNI, T taken at the same time of day.
step = [4,7]
# One entry per 'day_time' slot (0..47). No longer read by mean_feature;
# kept because it is a module-level name other cells may rely on.
date_time = np.arange(0,48,1)

def mean_feature(data, feature):
  """Add same-time-of-day rolling means of ``feature`` to ``data`` in place.

  For every window length in the module-level ``step`` list, rows are grouped
  by 'day_time' and the mean over the last ``window`` rows of each group is
  stored in a new column named 'mean_<feature><window>'. The first
  ``window - 1`` rows of every group are NaN.

  Parameters
  ----------
  data : pandas.DataFrame
      Must contain 'day_time' and ``feature``. Rows are taken in their
      existing order, so the frame is expected to be sorted chronologically.
      Modified in place.
  feature : str
      Name of the column to average.

  Returns
  -------
  None
  """
  # One group-by replaces the per-slot filter + DataFrame.append loop:
  # append was removed in pandas 2.0 and rebuilt the frame on every iteration.
  same_slot = data.groupby('day_time')[feature]
  for window in tqdm(step):
    rolled = same_slot.transform(lambda values: values.rolling(window=window).mean())
    # transform keeps the original row order, so positional assignment is safe
    # even when the frame's index labels are not unique.
    data['mean_{}{}'.format(feature, window)] = rolled.to_numpy()

In [None]:
def new_day(data):
  """Number the days of ``data`` consecutively, starting from 0.

  The frame's index is first moved into an 'index' column (in place). A new
  day starts on every row whose 'Day' differs from the previous row's, so
  repeated 'Day' values that are not adjacent get different numbers.

  Parameters
  ----------
  data : pandas.DataFrame
      Must contain 'Day'. Modified in place.

  Returns
  -------
  pandas.DataFrame
      The same object, now with the integer column 'new_day'.
  """
  data.reset_index(inplace=True)

  # Vectorised replacement for the row-by-row loop, which relied on chained
  # assignment (data['new_day'][i] = ...) and produced an object column.
  # The first row compares against NaN, so it always opens day 0.
  day_changed = data['Day'].ne(data['Day'].shift())
  data['new_day'] = day_changed.cumsum() - 1
  return data

In [None]:
# Build the derived features on the training set.
train = suntime_feature(max_feature(train))

# Same-time-of-day rolling means are added to `train` in place.
feature = ['TARGET', 'DHI', 'DNI', 'T']
for feat in feature:
  mean_feature(train, feat)

train = new_day(train)

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=52559.0), HTML(value='')))




In [None]:
# Training targets: TARGET shifted up by 48 rows (1 day) and by 96 rows (2 days).
train = train.assign(
    target1=train['TARGET'].shift(-48),
    target2=train['TARGET'].shift(-96),
)

# Binary Classification Modeling

In [None]:
# Display the engineered training frame (rendered as a truncated preview).
train

Unnamed: 0,index,Day,Hour,Minute,DHI,DNI,WS,RH,T,TARGET,max_tagrget,day_time,rising_time,set_time,mean_TARGET4,mean_TARGET7,mean_DHI4,mean_DHI7,mean_DNI4,mean_DNI7,mean_T4,mean_T7,new_day,target1,target2
0,0,0,0,0,0,0,1.5,69.08,-12,0.0,33.129393,0.0,16.0,33.0,,,,,,,,,0,0.0,0.0
1,1,0,0,30,0,0,1.5,69.06,-12,0.0,33.129393,1.0,16.0,33.0,,,,,,,,,0,0.0,0.0
2,2,0,1,0,0,0,1.6,71.78,-12,0.0,33.129393,2.0,16.0,33.0,,,,,,,,,0,0.0,0.0
3,3,0,1,30,0,0,1.6,71.75,-12,0.0,33.129393,3.0,16.0,33.0,,,,,,,,,0,0.0,0.0
4,4,0,2,0,0,0,1.6,75.20,-12,0.0,33.129393,4.0,16.0,33.0,,,,,,,,,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52555,52555,1094,21,30,0,0,2.4,70.70,-4,0.0,39.322261,43.0,16.0,33.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.00,-3.000000,1094,,
52556,52556,1094,22,0,0,0,2.4,66.79,-4,0.0,39.322261,44.0,16.0,33.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.50,-3.285714,1094,,
52557,52557,1094,22,30,0,0,2.2,66.78,-4,0.0,39.322261,45.0,16.0,33.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.50,-3.285714,1094,,
52558,52558,1094,23,0,0,0,2.1,67.72,-4,0.0,39.322261,46.0,16.0,33.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.75,-3.571429,1094,,


In [None]:
# Drop the rows that contain nulls introduced during preprocessing
# (rolling-mean warm-up rows and the shifted targets at the end of the frame).
# Null counts and shape are printed before and after for comparison.
print(train.isnull().sum(),train.shape)
train = train.dropna()
print(train.isnull().sum(),train.shape)

index           0
Day             0
Hour            0
Minute          0
DHI             0
DNI             0
WS              0
RH              0
T               0
TARGET          0
max_tagrget     0
day_time        0
rising_time     0
set_time        0
mean_TARGET4    0
mean_TARGET7    0
mean_DHI4       0
mean_DHI7       0
mean_DNI4       0
mean_DNI7       0
mean_T4         0
mean_T7         0
new_day         0
target1         0
target2         0
dtype: int64 (52176, 25)
(52176, 25)


In [None]:
# Renumber the surviving rows 0..n-1 and keep an untouched copy of the frame.
train = train.reset_index(drop=True)
train_copy = train.copy()

In [None]:
# Model inputs: every column except the shifted targets, the raw calendar
# columns and the two row counters.
# 'index' and 'new_day' are excluded because they only number rows/days and do
# not mean the same thing in both sets: in train they run over the whole
# history, while in the test frame 'index' is the row position inside each
# file and 'new_day' counts days across the concatenated files.
features = train.columns
features = features.drop(['target1', 'target2', 'Minute', 'Day', 'Hour', 'index', 'new_day'])

# Binary labels: 1 when the future target is positive, otherwise 0.
train['target1_c'] = (train['target1'] > 0).astype(int)
train['target2_c'] = (train['target2'] > 0).astype(int)

# NOTE(review): this is a random split of time-ordered rows, so neighbouring
# half-hours land on both sides; the validation accuracy is likely optimistic.
X_train_1, X_valid_1, Y_train_1, Y_valid_1 = train_test_split(train[features], train['target1_c'], test_size=0.2, random_state=0)
X_train_2, X_valid_2, Y_train_2, Y_valid_2 = train_test_split(train[features], train['target2_c'], test_size=0.2, random_state=0)

In [None]:
from sklearn.ensemble import RandomForestClassifier
import time

# Fit one binary classifier per horizon (target1_c, target2_c) with default
# hyper-parameters and a fixed seed.
start = time.time()
RF1 = RandomForestClassifier(random_state=0)
RF1.fit(X_train_1, Y_train_1)
# Seconds spent fitting RF1.
print(time.time() - start)

RF2 = RandomForestClassifier(random_state=0)
RF2.fit(X_train_2, Y_train_2)
# `start` is not reset, so this is the cumulative time for RF1 + RF2.
print(time.time() - start)

6.483649730682373
13.16815972328186


In [None]:
from sklearn.metrics import accuracy_score

# Hold-out accuracy of each binary classifier.
print(accuracy_score(Y_valid_1, RF1.predict(X_valid_1)))
print(accuracy_score(Y_valid_2, RF2.predict(X_valid_2)))

# Confirms that both binary classifiers reach more than 99% accuracy.

0.9977002683020314
0.996071291682637


# Test Set 파생변수 생성 

In [None]:
def make_features(train):
  """Build the day-level derived features on a stacked frame.

  Unlike the training helpers, every per-day statistic is grouped by
  'new_day' (a running day counter) rather than by 'Day', so days that share
  a 'Day' value but are not adjacent are kept apart.

  Parameters
  ----------
  train : pandas.DataFrame
      Must contain 'Day', 'Hour', 'Minute' and 'TARGET'. Its index is moved
      into an 'index' column and 'new_day' is added in place.

  Returns
  -------
  pandas.DataFrame
      A new merged frame with the added columns, in this order: 'new_day',
      'max_tagrget', 'day_time', 'rising_time', 'set_time'. Days whose TARGET
      is always 0 get NaN in 'rising_time' and 'set_time'.
  """
  train.reset_index(inplace=True)

  # Running day counter: a new day starts wherever 'Day' changes between
  # consecutive rows. Replaces a per-row loop built on chained assignment.
  train['new_day'] = train['Day'].ne(train['Day'].shift()).cumsum() - 1

  # Per-day maximum of TARGET (misspelled column name kept for downstream use).
  daily_max = train.groupby('new_day')['TARGET'].max().rename('max_tagrget').reset_index()
  train = pd.merge(train, daily_max, on='new_day', how='left')

  # Time-of-day slot, then the first/last slot with non-zero TARGET per day.
  train['day_time'] = train['Hour'] * 2 + train['Minute'] * 1/30
  daylight = train[train['TARGET'] != 0].groupby('new_day')['day_time']
  for column, per_day in (('rising_time', daylight.min()), ('set_time', daylight.max())):
    train = pd.merge(train, per_day.rename(column).reset_index(), on='new_day', how='left')

  return train

In [None]:
# Read the 81 test files (test/0.csv ... test/80.csv) and stack them once.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every iteration; a single concat gives the same result, with each file
# keeping its own row labels.
test = pd.concat(
    [pd.read_csv(filepath + 'test/' + str(i) + '.csv') for i in tqdm(range(81))]
)

HBox(children=(FloatProgress(value=0.0, max=81.0), HTML(value='')))




In [None]:
# `df_test` was never defined in this notebook; the stacked test frame built
# in the previous cell is named `test`.
test = make_features(test)

HBox(children=(FloatProgress(value=0.0, max=27215.0), HTML(value='')))




In [None]:
# Add the same-time-of-day rolling means to the test frame (in place),
# one pass per base feature.
feature = ['TARGET','DHI','DNI','T']
for feat in feature:
  mean_feature(test,feat)

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




In [None]:
# Keep only the rows whose Day is 6, renumber them from 0, and print the
# remaining null count of every column.
test = test[test['Day'] == 6].reset_index(drop=True)
print(test.isnull().sum())

index           0
Day             0
Hour            0
Minute          0
DHI             0
DNI             0
WS              0
RH              0
T               0
TARGET          0
new_day         0
max_tagrget     0
day_time        0
rising_time     0
set_time        0
mean_TARGET4    0
mean_TARGET7    0
mean_DHI4       0
mean_DHI7       0
mean_DNI4       0
mean_DNI7       0
mean_T4         0
mean_T7         0
dtype: int64


In [None]:
# Classify every test row as 0/1 for both horizons, then collect the index
# labels of the rows predicted as 1.

preds1, preds2 = (clf.predict(test[features]) for clf in (RF1, RF2))
test = test.assign(pred_c_1=preds1, pred_c_2=preds2)

# Rows that will be passed on to the regression step.
reg1 = test[test['pred_c_1'] == 1]
reg2 = test[test['pred_c_2'] == 1]

ind1, ind2 = reg1.index, reg2.index

# Light GBM (예측값 0 제외 Regression)

In [None]:
from lightgbm import LGBMRegressor

# Quantile levels for which a separate regressor is fitted.
quantiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

def LGBM(q, X_train, y_train, X_valid, y_valid, X_test):
  """Fit one LightGBM quantile regressor and predict ``X_test``.

  Parameters
  ----------
  q : float
      Quantile level, passed to LightGBM as ``alpha``.
  X_train, y_train
      Data the model is fitted on.
  X_valid, y_valid
      Evaluation set monitored for early stopping (300 rounds).
  X_test
      Rows to predict.

  Returns
  -------
  (pandas.Series, LGBMRegressor)
      Predictions rounded to 2 decimals, and the fitted model.
  """
  params = dict(objective='quantile', alpha=q,
                n_estimators=500, bagging_fraction=0.7, learning_rate=0.027, subsample=0.7)
  model = LGBMRegressor(**params)
  model.fit(X_train, y_train, eval_metric = ['quantile'],
            eval_set=[(X_valid, y_valid)], early_stopping_rounds=300, verbose=500)
  rounded = model.predict(X_test).round(2)
  return pd.Series(rounded), model

# Predict the target for every quantile level.
def prediction(X_train, y_train, X_valid, y_valid, X_test):
    """Fit one quantile model per entry of ``quantiles`` and predict ``X_test``.

    Each quantile level is printed before its model is fitted.

    Returns
    -------
    (list, pandas.DataFrame)
        The fitted models in the order of ``quantiles``, and a frame with one
        prediction column per quantile (columns labelled by the quantile).
    """
    LGBM_models = []
    # Start from an empty frame, as the original incremental concat did.
    pieces = [pd.DataFrame()]
    for q in quantiles:
        print(q)
        pred, model = LGBM(q, X_train, y_train, X_valid, y_valid, X_test)
        LGBM_models.append(model)
        pieces.append(pred)
    LGBM_actual_pred = pd.concat(pieces, axis=1)
    LGBM_actual_pred.columns = quantiles
    return LGBM_models, LGBM_actual_pred

In [None]:
# Split by position: the first 42,000 rows are used for fitting, the rest for validation.
a = train.iloc[:42000]
b = train.iloc[42000:]

# Only rows whose 1-day-ahead target is positive take part in the regression.
fit_rows_1 = a[a['target1'] > 0]
valid_rows_1 = b[b['target1'] > 0]

X_train_1 = np.array(fit_rows_1[features])
y_train_1 = np.array(fit_rows_1['target1'])
X_valid_1 = np.array(valid_rows_1[features])
y_valid_1 = np.array(valid_rows_1['target1'])
# Test rows the classifier flagged as generating for horizon 1.
X_test = np.array(test[features].iloc[ind1])

start = time.time()
models_1, results_1 = prediction(X_train_1, y_train_1, X_valid_1, y_valid_1, X_test)
print(time.time() - start)


0.1
Training until validation scores don't improve for 300 rounds.
Early stopping, best iteration is:
[71]	valid_0's quantile: 2.91892
0.2
Training until validation scores don't improve for 300 rounds.
Early stopping, best iteration is:
[141]	valid_0's quantile: 4.66405
0.3
Training until validation scores don't improve for 300 rounds.
Early stopping, best iteration is:
[130]	valid_0's quantile: 5.49456
0.4
Training until validation scores don't improve for 300 rounds.
Early stopping, best iteration is:
[160]	valid_0's quantile: 5.66896
0.5
Training until validation scores don't improve for 300 rounds.
Early stopping, best iteration is:
[166]	valid_0's quantile: 5.3499
0.6
Training until validation scores don't improve for 300 rounds.
Early stopping, best iteration is:
[189]	valid_0's quantile: 4.77937
0.7
Training until validation scores don't improve for 300 rounds.
[500]	valid_0's quantile: 4.12404
Did not meet early stopping. Best iteration is:
[243]	valid_0's quantile: 3.93598
0.8

In [None]:
# Horizon 2: rows are filtered on target2 AND labelled with target2.
# The labels were previously read from 'target1', so the "day 2" models were
# in fact trained and validated on the day-1 target.
fit_rows_2 = a[a['target2'] > 0]
valid_rows_2 = b[b['target2'] > 0]

X_valid_2 = np.array(valid_rows_2[features])
y_valid_2 = np.array(valid_rows_2['target2'])
X_train_2 = np.array(fit_rows_2[features])
y_train_2 = np.array(fit_rows_2['target2'])
# Test rows the classifier flagged as generating for horizon 2.
X_test = np.array(test[features].iloc[ind2])

start = time.time()
models_2, results_2 = prediction(X_train_2, y_train_2, X_valid_2, y_valid_2, X_test)
print(time.time() - start)

0.1
Training until validation scores don't improve for 300 rounds.
Early stopping, best iteration is:
[118]	valid_0's quantile: 2.89822
0.2
Training until validation scores don't improve for 300 rounds.
Early stopping, best iteration is:
[111]	valid_0's quantile: 4.67393
0.3
Training until validation scores don't improve for 300 rounds.
Early stopping, best iteration is:
[137]	valid_0's quantile: 5.48814
0.4
Training until validation scores don't improve for 300 rounds.
Early stopping, best iteration is:
[159]	valid_0's quantile: 5.66798
0.5
Training until validation scores don't improve for 300 rounds.
Early stopping, best iteration is:
[178]	valid_0's quantile: 5.37072
0.6
Training until validation scores don't improve for 300 rounds.
Early stopping, best iteration is:
[181]	valid_0's quantile: 4.80872
0.7
Training until validation scores don't improve for 300 rounds.
Early stopping, best iteration is:
[195]	valid_0's quantile: 3.94762
0.8
Training until validation scores don't impro

In [None]:
# Tag each prediction row with the label of the test row it belongs to,
# so the results can be merged back on 'index'.
results_1 = results_1.assign(index=ind1)
results_2 = results_2.assign(index=ind2)

In [None]:
# Replace the old 'index' column with the frame's current row labels.
test = test.drop(columns='index').reset_index()
test

Unnamed: 0,index,Day,Hour,Minute,DHI,DNI,WS,RH,T,TARGET,new_day,max_tagrget,day_time,rising_time,set_time,mean_TARGET4,mean_TARGET7,mean_DHI4,mean_DHI7,mean_DNI4,mean_DNI7,mean_T4,mean_T7,pred_c_1,pred_c_2
0,0,6,0,0,0,0,0.8,80.92,-2.8,0.0,6,15.954963,0.0,15.0,33.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.275,-0.914286,0,0
1,1,6,0,30,0,0,0.9,81.53,-2.9,0.0,6,15.954963,1.0,15.0,33.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.525,-1.071429,0,0
2,2,6,1,0,0,0,1.0,79.91,-3.0,0.0,6,15.954963,2.0,15.0,33.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.750,-1.214286,0,0
3,3,6,1,30,0,0,0.9,79.91,-3.0,0.0,6,15.954963,3.0,15.0,33.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.825,-1.285714,0,0
4,4,6,2,0,0,0,0.9,77.20,-3.0,0.0,6,15.954963,4.0,15.0,33.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.875,-1.357143,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3883,3883,6,21,30,0,0,0.8,63.35,13.7,0.0,566,80.781000,43.0,9.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,12.875,11.171429,0,0
3884,3884,6,22,0,0,0,0.7,64.82,13.1,0.0,566,80.781000,44.0,9.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,12.325,10.657143,0,0
3885,3885,6,22,30,0,0,0.7,66.10,12.8,0.0,566,80.781000,45.0,9.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,11.925,10.228571,0,0
3886,3886,6,23,0,0,0,0.6,67.64,12.4,0.0,566,80.781000,46.0,9.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,11.500,9.785714,0,0


In [None]:
# Left-join both sets of quantile predictions onto the test rows via 'index'.
# Rows without a prediction get NaN in the quantile columns. The shape is
# printed after each step to confirm the row count stays the same.
# Both result frames share the same column labels, so the second merge
# suffixes them with '_x' (results_1) and '_y' (results_2).
print(test.shape)
test = pd.merge(test, results_1, on='index',how='left')
print(test.shape)
test = pd.merge(test, results_2, on='index',how='left')
print(test.shape)

(3888, 25)
(3888, 34)
(3888, 43)


In [None]:
# Merged prediction columns: the '_x' suffix marks the first horizon and
# '_y' the second.
first_cols = [f'{q}_x' for q in quantiles]
second_cols = [f'{q}_y' for q in quantiles]
# Every submission column except the leading id column.
cols = submission.columns.tolist()[1:]

In [None]:
# For each of the 81 blocks of 48 test rows, queue the 48 first-horizon rows
# followed by the 48 second-horizon rows.
# NOTE(review): this list reuses the name of the `prediction` function
# defined earlier, so the regression cells cannot be re-run after this cell
# without re-running the function definition first.
prediction = []
for a in range(81):
    block = test.iloc[a*48:(a + 1)*48]
    for horizon_cols in (first_cols, second_cols):
        prediction.extend(block[horizon_cols].values)
len(prediction)

7776

In [None]:
# Write the predictions into the template; rows without a prediction become 0.
submission[cols] = prediction
submission.fillna(0, inplace=True)

# Clean the numeric columns with 'id' moved out of the way.
clipped = submission.set_index('id')
clipped[clipped <= 0] = 0       # negative values become 0
clipped[clipped >= 100] = 99.9  # values of 100 or more are capped at 99.9
submission = clipped.reset_index()

submission.iloc[10:30]

Unnamed: 0,id,q_0.1,q_0.2,q_0.3,q_0.4,q_0.5,q_0.6,q_0.7,q_0.8,q_0.9
10,0.csv_Day7_5h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.csv_Day7_5h30m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.csv_Day7_6h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,0.csv_Day7_6h30m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,0.csv_Day7_7h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,0.csv_Day7_7h30m,1.11,1.07,1.63,1.82,2.2,2.34,2.24,3.03,2.44
16,0.csv_Day7_8h00m,1.64,3.52,4.26,6.86,5.93,7.77,8.0,9.95,10.77
17,0.csv_Day7_8h30m,2.91,5.64,6.14,10.47,11.11,11.77,15.44,16.72,18.67
18,0.csv_Day7_9h00m,5.25,9.26,10.24,15.51,17.92,21.04,25.26,24.94,27.92
19,0.csv_Day7_9h30m,6.68,12.38,18.71,20.33,25.84,29.78,32.66,35.35,36.01


In [None]:
# Save the final submission next to the input data, without the row index.
submission.to_csv(filepath + 'submission(lgbm).csv', index=False)