In [30]:
import random
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from math import pi
from scipy.stats import norm, skew

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.ensemble import VotingRegressor
from lightgbm import LGBMRegressor

from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

In [31]:
# RandomSeed
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(41)

In [32]:
# error
def NMAE(true, pred):
    mae = np.mean(np.abs(true-pred))
    score = mae / np.mean(np.abs(true))
    return score

In [33]:
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

In [34]:
train = train.drop(['ID'], axis=1)
test = test.drop(['ID'], axis=1)

In [35]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2207 entries, 0 to 2206
Columns: 183 entries, 착과량(int) to 2022-11-28 엽록소
dtypes: float64(182), int64(1)
memory usage: 3.1 MB


In [36]:
train.head()

Unnamed: 0,착과량(int),수고(m),수관폭1(min),수관폭2(max),수관폭평균,2022-09-01 새순,2022-09-02 새순,2022-09-03 새순,2022-09-04 새순,2022-09-05 새순,...,2022-11-19 엽록소,2022-11-20 엽록소,2022-11-21 엽록소,2022-11-22 엽록소,2022-11-23 엽록소,2022-11-24 엽록소,2022-11-25 엽록소,2022-11-26 엽록소,2022-11-27 엽록소,2022-11-28 엽록소
0,692,275.0,287.0,292.0,289.5,2.8,2.8,2.7,2.7,2.7,...,70.978249,70.876794,70.705253,70.559603,70.427356,70.340491,70.29383,70.262422,70.169841,70.043251
1,534,293.0,284.0,336.0,310.0,3.3,3.3,3.3,3.2,3.2,...,71.535483,71.382303,71.253604,71.092665,70.955608,70.79663,70.59755,70.565088,70.560502,70.4276
2,634,300.0,392.0,450.0,421.0,3.0,2.9,2.9,2.9,2.9,...,71.279804,71.19957,71.14402,71.02674,70.920038,70.876723,70.710129,70.595971,70.418203,70.399578
3,639,289.0,368.0,379.0,373.5,3.1,3.0,3.0,3.0,3.0,...,69.934615,69.884124,69.845683,69.794682,69.779813,69.614644,69.455404,69.28159,69.238689,69.13397
4,496,306.0,353.0,358.0,355.5,3.7,3.6,3.6,3.6,3.5,...,68.313016,68.285364,68.20986,68.209458,68.040083,67.859963,67.775556,67.701456,67.504244,67.410093


In [37]:
test.head()

Unnamed: 0,수고(m),수관폭1(min),수관폭2(max),수관폭평균,2022-09-01 새순,2022-09-02 새순,2022-09-03 새순,2022-09-04 새순,2022-09-05 새순,2022-09-06 새순,...,2022-11-19 엽록소,2022-11-20 엽록소,2022-11-21 엽록소,2022-11-22 엽록소,2022-11-23 엽록소,2022-11-24 엽록소,2022-11-25 엽록소,2022-11-26 엽록소,2022-11-27 엽록소,2022-11-28 엽록소
0,231.0,219.0,302.0,260.5,4.3,4.2,4.2,4.1,4.1,4.1,...,72.091497,71.906176,71.886799,71.85872,71.72554,71.531618,71.476935,71.321237,71.184295,71.000223
1,206.0,324.0,346.0,335.0,2.5,2.5,2.5,2.5,2.5,2.4,...,68.423776,68.3532,68.346777,68.188159,67.995233,67.879958,67.74482,67.673522,67.539973,67.447849
2,282.0,373.0,404.0,388.5,4.7,4.7,4.6,4.6,4.5,4.5,...,74.435783,74.247468,74.142664,74.070401,73.892408,73.796143,73.615838,73.440136,73.387051,73.228319
3,234.0,337.0,357.0,347.0,3.6,3.6,3.6,3.6,3.5,3.5,...,73.570735,73.420243,73.313538,73.149298,73.109599,72.924511,72.847411,72.838263,72.672156,72.538171
4,222.0,329.0,347.0,338.0,2.7,2.7,2.7,2.7,2.6,2.6,...,72.216758,72.163153,72.052952,71.905982,71.766141,71.746813,71.668847,71.501043,71.419715,71.328224


### Feature Engineering

- start_end_prob_s + start_end_prob_y + 수목

In [38]:
train['start_end_prob_s1'] = train['2022-09-07 새순'] / train['2022-09-01 새순']
test['start_end_prob_s1'] = test['2022-09-07 새순'] / test['2022-09-01 새순']

train['start_end_prob_s2'] = train['2022-09-14 새순'] / train['2022-09-08 새순']
test['start_end_prob_s2'] = test['2022-09-14 새순'] / test['2022-09-08 새순']

train['start_end_prob_s3'] = train['2022-09-21 새순'] / train['2022-09-15 새순']
test['start_end_prob_s3'] = test['2022-09-21 새순'] / test['2022-09-15 새순']

train['start_end_prob_s4'] = train['2022-09-28 새순'] / train['2022-09-22 새순']
test['start_end_prob_s4'] = test['2022-09-28 새순'] / test['2022-09-22 새순']

train['start_end_prob_s5'] = train['2022-10-05 새순'] / train['2022-09-29 새순']
test['start_end_prob_s5'] = test['2022-10-05 새순'] / test['2022-09-29 엽록소']

train['start_end_prob_s6'] = train['2022-10-12 새순'] / train['2022-10-06 새순']
test['start_end_prob_s6'] = test['2022-10-12 새순'] / test['2022-10-06 새순']

train['start_end_prob_s7'] = train['2022-10-19 새순'] / train['2022-10-13 새순']
test['start_end_prob_s7'] = test['2022-10-19 새순'] / test['2022-10-13 새순']

train['start_end_prob_s8'] = train['2022-10-26 새순'] / train['2022-10-20 새순']
test['start_end_prob_s8'] = test['2022-10-26 새순'] / test['2022-10-20 새순']

train['start_end_prob_s9'] = train['2022-11-02 새순'] / train['2022-10-27 새순']
test['start_end_prob_s9'] = test['2022-11-02 새순'] / test['2022-10-27 새순']

train['start_end_prob_s10'] = train['2022-11-09 새순'] / train['2022-11-03 새순']
test['start_end_prob_s10'] = test['2022-11-09 새순'] / test['2022-11-03 새순']

train['start_end_prob_s11'] = train['2022-11-16 새순'] / train['2022-11-10 새순']
test['start_end_prob_s11'] = test['2022-11-16 새순'] / test['2022-11-10 새순']

train['start_end_prob_s12'] = train['2022-11-23 새순'] / train['2022-11-17 새순']
test['start_end_prob_s12'] = test['2022-11-23 새순'] / test['2022-11-17 새순']

train['start_end_prob_s13'] = train['2022-11-28 새순'] / train['2022-11-24 새순']
test['start_end_prob_s13'] = test['2022-11-28 새순'] / test['2022-11-24 새순']

In [18]:
# train['start_end_prob_y1'] = train['2022-09-07 엽록소'] / train['2022-09-01 엽록소']
# test['start_end_prob_y1'] = test['2022-09-07 엽록소'] / test['2022-09-01 엽록소']

# train['start_end_prob_y2'] = train['2022-09-14 엽록소'] / train['2022-09-08 엽록소']
# test['start_end_prob_y2'] = test['2022-09-14 엽록소'] / test['2022-09-08 엽록소']

# train['start_end_prob_y3'] = train['2022-09-21 엽록소'] / train['2022-09-15 엽록소']
# test['start_end_prob_y3'] = test['2022-09-21 엽록소'] / test['2022-09-15 엽록소']

# train['start_end_prob_y4'] = train['2022-09-28 엽록소'] / train['2022-09-22 엽록소']
# test['start_end_prob_y4'] = test['2022-09-28 엽록소'] / test['2022-09-22 엽록소']

# train['start_end_prob_y5'] = train['2022-10-05 엽록소'] / train['2022-09-29 엽록소']
# test['start_end_prob_y5'] = test['2022-10-05 엽록소'] / test['2022-09-29 엽록소']

# train['start_end_prob_y6'] = train['2022-10-12 엽록소'] / train['2022-10-06 엽록소']
# test['start_end_prob_y6'] = test['2022-10-12 엽록소'] / test['2022-10-06 엽록소']

# train['start_end_prob_y7'] = train['2022-10-19 엽록소'] / train['2022-10-13 엽록소']
# test['start_end_prob_y7'] = test['2022-10-19 엽록소'] / test['2022-10-13 엽록소']

# train['start_end_prob_y8'] = train['2022-10-26 엽록소'] / train['2022-10-20 엽록소']
# test['start_end_prob_y8'] = test['2022-10-26 엽록소'] / test['2022-10-20 엽록소']

# train['start_end_prob_y9'] = train['2022-11-02 엽록소'] / train['2022-10-27 엽록소']
# test['start_end_prob_y9'] = test['2022-11-02 엽록소'] / test['2022-10-27 엽록소']

# train['start_end_prob_y10'] = train['2022-11-09 엽록소'] / train['2022-11-03 엽록소']
# test['start_end_prob_y10'] = test['2022-11-09 엽록소'] / test['2022-11-03 엽록소']

# train['start_end_prob_y11'] = train['2022-11-16 엽록소'] / train['2022-11-10 엽록소']
# test['start_end_prob_y11'] = test['2022-11-16 엽록소'] / test['2022-11-10 엽록소']

# train['start_end_prob_y12'] = train['2022-11-23 엽록소'] / train['2022-11-17 엽록소']
# test['start_end_prob_y12'] = test['2022-11-23 엽록소'] / test['2022-11-17 엽록소']

# train['start_end_prob_y13'] = train['2022-11-28 엽록소'] / train['2022-11-24 엽록소']
# test['start_end_prob_y13'] = test['2022-11-28 엽록소'] / test['2022-11-24 엽록소']

In [39]:
train['start_end_prob_s'] = train['2022-11-28 새순'] / train['2022-09-01 새순']
test['start_end_prob_s'] = test['2022-11-28 새순'] / test['2022-09-01 새순']

# train['start_end_prob_y'] = train['2022-11-28 엽록소'] / train['2022-09-01 엽록소']
# test['start_end_prob_y'] = test['2022-11-28 엽록소'] / test['2022-09-01 엽록소']

train['부피'] = train['수관폭1(min)'] ** 2 * train['수고(m)']
test['부피'] = test['수관폭1(min)'] ** 2 * test['수고(m)']
# train['부피'] = (((train['수관폭평균'] / 2) ** 2) * pi) * train['수고(m)']
# test['부피'] = (((test['수관폭평균'] / 2) ** 2) * pi) * test['수고(m)']

train['면적'] = train['수관폭평균'] ** 2
test['면적'] = test['수관폭평균'] ** 2

In [40]:
train['수목'] = train['수고(m)'] * train['수관폭평균']
test['수목'] = test['수고(m)'] * test['수관폭평균']

In [488]:
# train['햇빛받는잎사귀넓이'] = (((train['수관폭평균'] / 2) ** 2) * pi) * 4 / 2
# test['햇빛받는잎사귀넓이'] = (((test['수관폭평균'] / 2) ** 2) * pi) * 4 / 2

In [337]:
# #최대 - 최소
# train['수관폭차'] = train['수관폭2(max)'] - train['수관폭1(min)']
# test['수관폭차'] = test['수관폭2(max)'] - test['수관폭1(min)']

In [338]:
# #75
# train['수관폭(75)'] = train['수관폭2(max)'] - train['수관폭평균']
# test['수관폭(75)'] = test['수관폭2(max)'] - test['수관폭평균']

# #25
# train['수관폭(25)'] = train['수관폭평균'] - train['수관폭1(min)']
# test['수관폭(25)'] = test['수관폭평균'] - test['수관폭1(min)']

In [339]:
# new_soon = [s for s in train if "새순" in s]
# greengreen = [s for s in train if "엽록" in s]
# new_soon1 = [s for s in test if "새순" in s]
# greengreen1 = [s for s in test if "엽록" in s]

# tr_green = train[greengreen]
# tr_saesoon = train[new_soon]
# ts_green = test[greengreen1]
# ts_saesoon = test[new_soon1]

# train['엽록소기울기'] = (tr_green.iloc[:, 70] - tr_green.iloc[:, 0]) / 70
# train['새순기울기'] = (tr_saesoon.iloc[:, 70] - tr_saesoon.iloc[:, 0]) / 70
# test['엽록소기울기'] = (ts_green.iloc[:, 70] - ts_green.iloc[:, 0]) / 70
# test['새순기울기'] = (ts_saesoon.iloc[:, 70] - ts_saesoon.iloc[:, 0]) / 70

### Data Split

In [41]:
# x, y split
x = train.drop(['착과량(int)'], axis=1)
y = train['착과량(int)']

In [42]:
# train valid split
x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.25, random_state=42, shuffle=False)
#x_train, x_valid, y_train, y_valid = train_test_split(train_knn, y, test_size=0.25, random_state=42, shuffle=False)

### Modeling

#### RF

In [36]:
# tuning ver
params = {
    'n_estimators': range(150, 201, 5),
}

rf = RandomForestRegressor(random_state=2022)
rf = GridSearchCV(rf, params, cv=5)
rf.fit(x_train, y_train)
rf.best_params_

{'n_estimators': 175}

In [37]:
pred_rf = rf.predict(x_valid)

In [15]:
print('NMAE:', NMAE(y_valid, pred_rf))

# 0.07939858752481645

NMAE: 0.07939858752481645


#### GB

In [38]:
# {'learning_rate': 0.03, 'n_estimators': 145}

# tuning
params = {'n_estimators': range(50, 150, 5),
          'learning_rate': [0.01, 0.02, 0.03]}

gb = GradientBoostingRegressor()
gb = GridSearchCV(gb, params, cv=5)
gb.fit(x_train, y_train)
gb.best_params_

{'learning_rate': 0.03, 'n_estimators': 145}

In [39]:
pred_gb = gb.predict(x_valid)

In [40]:
print('NMAE:', NMAE(y_valid, pred_gb))

NMAE: 0.07825392012102447


#### LGBM

In [241]:
params = {
    'max_depth': range(1, 20),
    'num_leaves': range(1, 10),
}
lgbm = LGBMRegressor()
model_lgbm = GridSearchCV(lgbm, params, cv=5)
model_lgbm.fit(x_train, y_train)
model_lgbm.best_params_

{'max_depth': 1, 'num_leaves': 2}

In [243]:
pred_lgbm = model_lgbm.predict(x_valid)

In [244]:
print('NMAE:', NMAE(y_valid, pred_lgbm))
# NMAE: 0.0774292733729582

NMAE: 0.0774292733729582


#### Stacking

In [43]:
models = {
    'rf': RandomForestRegressor(n_estimators = 195, random_state=2022),
    'gb': GradientBoostingRegressor(learning_rate = 0.03, n_estimators = 145, random_state=2022),
    'lgbm': LGBMRegressor(random_state=2022),
}

stacking = StackingRegressor(
    estimators=list(models.items()),
    final_estimator=LinearRegression(),
    cv=5
)

stacking.fit(x_train, y_train)

StackingRegressor(cv=5,
                  estimators=[('rf',
                               RandomForestRegressor(n_estimators=195,
                                                     random_state=2022)),
                              ('gb',
                               GradientBoostingRegressor(learning_rate=0.03,
                                                         n_estimators=145,
                                                         random_state=2022)),
                              ('lgbm', LGBMRegressor(random_state=2022))],
                  final_estimator=LinearRegression())

In [44]:
pred_stacking = stacking.predict(x_valid)

In [45]:
# NMAE: 0.07718396449379535
# NMAE: 0.07764071516064674
print('NMAE:', NMAE(y_valid, pred_stacking))

NMAE: 0.07726260567119689


#### soft voting

In [277]:
rf = RandomForestRegressor(n_estimators=195, random_state=2022)
gb = GradientBoostingRegressor(learning_rate=0.03, n_estimators=145, random_state=2022)
lgbm = LGBMRegressor(max_depth=1, num_leaves=2)

rf.fit(x_train, y_train)
gb.fit(x_train, y_train)
lgbm.fit(x_train, y_train)

pred_rf = rf.predict(x_valid)
pred_gb = gb.predict(x_valid)
pred_lgbm = lgbm.predict(x_valid)

pred = (0.15*pred_rf + 0.20*pred_gb + 0.65*pred_lgbm)

print('NMAE:', NMAE(y_valid, pred))

In [278]:
print('NMAE:', NMAE(y_valid, pred_rf))
print('NMAE:', NMAE(y_valid, pred_gb))
print('NMAE:', NMAE(y_valid, pred_lgbm))

NMAE: 0.07852420850720318
NMAE: 0.07762133065734089
NMAE: 0.0774292733729582


In [286]:
# hard voting
pred = (pred_rf + pred_gb + pred_lgbm) / 3

print('NMAE:', NMAE(y_valid, pred))

NMAE: 0.07732533465201491


In [294]:
# soft voting
# NMAE: 0.07730783824543626
# 
pred = (0.15*pred_rf + 0.20*pred_gb + 0.65*pred_lgbm)

print('NMAE:', NMAE(y_valid, pred))

NMAE: 0.07728553148260693


### TEST SET INFERENCE

In [None]:
# test = test.drop(['ID'], axis=1)
# pred = stacking.predict(test)

In [297]:
# soft voting
pred_rf = rf.predict(test)
pred_gb = gb.predict(test)
pred_lgbm = lgbm.predict(test)
pred = (0.15*pred_rf + 0.20*pred_gb + 0.65*pred_lgbm)

### Submission

In [46]:
sample_submission = pd.read_csv('./sample_submission.csv')

In [47]:
pred = stacking.predict(test)

In [48]:
sample_submission['착과량(int)'] = pred
sample_submission.to_csv('hard_coding_3.csv', index=False)

In [49]:
sample_submission

Unnamed: 0,ID,착과량(int)
0,TEST_0000,248.239094
1,TEST_0001,745.596872
2,TEST_0002,153.097636
3,TEST_0003,441.536569
4,TEST_0004,690.054065
...,...,...
2203,TEST_2203,739.152907
2204,TEST_2204,315.399010
2205,TEST_2205,370.233386
2206,TEST_2206,235.651894
