# __!!주의!!__
to_EDA_PGR.ipynb 노트북을 실행해서 얻은 결과물인 'df_base_pre.csv', 'df_pre.csv' 파일을 사용하며, 'train.csv' 파일도 함께 읽습니다.  
먼저 to_EDA_PGR.ipynb를 실행한 뒤, 세 파일이 이 노트북의 작업 경로에 있는지 확인하고 이 노트북을 실행하세요.

# Machine learning

In [None]:
# Core numerical / tabular libraries.
import numpy as np
import pandas as pd

# Estimators, the splitting utility and the evaluation metric.
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

# One shared instance per algorithm; later cells refit these same objects.
model_lr = LinearRegression()
model_rg = Ridge()
model_ls = Lasso()
model_en = ElasticNet(alpha=1)
model_rf = RandomForestRegressor()
model_xg = XGBRegressor()

## Data preparation
### baseline data

In [None]:
# Load the baseline table written by the EDA notebook; the first CSV column
# is used as the index.
df_base_pre = pd.read_csv(
    "df_base_pre.csv",
    index_col=0,
)
df_base_pre  # display in the notebook

In [None]:
# Discard every row that contains at least one missing value.
df_base_pre = df_base_pre.dropna(axis=0, how="any")
df_base_pre  # display in the notebook

In [None]:
# Column names arrive as "yyyy-mm-dd hh:mm:ss"; shorten them to "yyyy-mm-dd".
def _to_date_label(label):
    """Return *label* formatted as YYYY-MM-DD when it parses as a datetime.

    Labels that cannot be converted (a ValueError is raised) are returned
    unchanged, so non-date columns keep their original names.
    """
    try:
        return pd.to_datetime(label).strftime('%Y-%m-%d')
    except ValueError:
        return label


new_column_names_with_dates = [
    _to_date_label(label) for label in df_base_pre.columns
]

df_base_pre.columns = new_column_names_with_dates
df_base_pre  # display in the notebook

### Our data

In [None]:
# Load the engineered ("our") table written by the EDA notebook; the first
# CSV column is used as the index.
df_pre = pd.read_csv(
    "df_pre.csv",
    index_col=0,
)
df_pre  # display in the notebook

In [None]:
# Discard every row that contains at least one missing value.
df_pre = df_pre.dropna(axis=0, how="any")
df_pre  # display in the notebook

## Encoding
### baseline data

In [None]:
# Dummy-encode the baseline table with pandas' default settings.
df_base_final = pd.get_dummies(data=df_base_pre)
df_base_final  # display in the notebook

### Our data

In [None]:
# Dummy-encode the engineered table with pandas' default settings.
df_pre_final = pd.get_dummies(data=df_pre)
df_pre_final  # display in the notebook

## baseline machine learning

### data splitting

In [None]:
# Load the training table; the target columns are sliced from it later.
train = pd.read_csv("train.csv")
train  # display in the notebook

In [None]:
# Features: the dummy-encoded baseline table.
X = df_base_final
# Targets: every column of `train` from position 6 onward (keep only the
# date columns).
y = train.iloc[:, 6:]

# NOTE(review): X went through dropna() while y comes from the raw `train`
# frame; presumably no rows were dropped so both still line up -- confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=15,
)

print(f"{X_train.shape} {y_train.shape}")
print(f"{X_test.shape} {y_test.shape}")

In [None]:
# Build the data used for the sampled-target experiments.
np.random.seed(15)  # fix the seed so the result is reproducible
random_columns = np.random.choice(
    train.columns[6:],
    size=100,
    replace=False,
)

# Use only the randomly chosen columns as the target data.
y_random_sampled = train.loc[:, random_columns]

# Split the feature table X defined in the previous cell against the
# sampled targets.
(
    X_train_random,
    X_test_random,
    y_train_random,
    y_test_random,
) = train_test_split(X, y_random_sampled, test_size=0.2, random_state=15)

print(f"{X_train_random.shape} {y_train_random.shape}")
print(f"{X_test_random.shape} {y_test_random.shape}")

### Linear Regression

In [None]:
# Linear regression on the sampled target columns: fit, then predict the
# held-out rows.
model_lr.fit(X_train_random, y_train_random)
pred = model_lr.predict(X_test_random)

# Evaluation: compute the MSE once and derive the RMSE from it.
mse_lr = mean_squared_error(y_test_random, pred)
rmse_lr = np.sqrt(mse_lr)

# Built-in round() works for NumPy scalars and plain Python floats alike.
print("mse:", round(mse_lr, 2), "\nrmse:", round(rmse_lr, 2))

In [None]:
# Linear regression on all target columns: fit, then predict the held-out
# rows.
model_lr.fit(X_train, y_train)
pred = model_lr.predict(X_test)

# Evaluation: compute the MSE once and derive the RMSE from it.
mse = mean_squared_error(y_test, pred)
rmse = np.sqrt(mse)

# Built-in round() works for NumPy scalars and plain Python floats alike.
print("mse:", round(mse, 2), "\nrmse:", round(rmse, 2))

### Ridge

In [None]:
# Ridge regression on the sampled target columns: fit, then predict the
# held-out rows.
model_rg.fit(X_train_random, y_train_random)
pred = model_rg.predict(X_test_random)

# Evaluation: compute the MSE once and derive the RMSE from it.
mse_rg = mean_squared_error(y_test_random, pred)
rmse_rg = np.sqrt(mse_rg)

# Built-in round() works for NumPy scalars and plain Python floats alike.
print("mse:", round(mse_rg, 2), "\nrmse:", round(rmse_rg, 2))

In [None]:
# Ridge regression on all target columns: fit, then predict the held-out
# rows.
model_rg.fit(X_train, y_train)
pred = model_rg.predict(X_test)

# Evaluation: compute the MSE once and derive the RMSE from it.
mse = mean_squared_error(y_test, pred)
rmse = np.sqrt(mse)

# Built-in round() works for NumPy scalars and plain Python floats alike.
print("mse:", round(mse, 2), "\nrmse:", round(rmse, 2))

### Lasso

In [None]:
# Lasso regression on the sampled target columns: fit, then predict the
# held-out rows.
model_ls.fit(X_train_random, y_train_random)
pred = model_ls.predict(X_test_random)

# Evaluation: compute the MSE once and derive the RMSE from it.
mse_ls = mean_squared_error(y_test_random, pred)
rmse_ls = np.sqrt(mse_ls)

# Built-in round() works for NumPy scalars and plain Python floats alike.
print("mse:", round(mse_ls, 2), "\nrmse:", round(rmse_ls, 2))

In [None]:
# Lasso regression on all target columns: fit, then predict the held-out
# rows.
model_ls.fit(X_train, y_train)
pred = model_ls.predict(X_test)

# Evaluation: compute the MSE once and derive the RMSE from it.
mse = mean_squared_error(y_test, pred)
rmse = np.sqrt(mse)

# Built-in round() works for NumPy scalars and plain Python floats alike.
print("mse:", round(mse, 2), "\nrmse:", round(rmse, 2))

### ElasticNet

In [None]:
# ElasticNet on the sampled target columns: fit, then predict the held-out
# rows.
model_en.fit(X_train_random, y_train_random)
pred = model_en.predict(X_test_random)

# Evaluation: compute the MSE once and derive the RMSE from it.
mse_en = mean_squared_error(y_test_random, pred)
rmse_en = np.sqrt(mse_en)

# Built-in round() works for NumPy scalars and plain Python floats alike.
print("mse:", round(mse_en, 2), "\nrmse:", round(rmse_en, 2))

In [None]:
# ElasticNet on all target columns: fit, then predict the held-out rows.
model_en.fit(X_train, y_train)
pred = model_en.predict(X_test)

# Evaluation: compute the MSE once and derive the RMSE from it.
mse = mean_squared_error(y_test, pred)
rmse = np.sqrt(mse)

# Built-in round() works for NumPy scalars and plain Python floats alike.
print("mse:", round(mse, 2), "\nrmse:", round(rmse, 2))

### Random Forest

In [None]:
# Random forest on the sampled target columns: fit, then predict the
# held-out rows.
model_rf.fit(X_train_random, y_train_random)
pred = model_rf.predict(X_test_random)

# Evaluation: compute the MSE once and derive the RMSE from it.
mse_rf = mean_squared_error(y_test_random, pred)
rmse_rf = np.sqrt(mse_rf)

# Built-in round() works for NumPy scalars and plain Python floats alike.
print("mse:", round(mse_rf, 2), "\nrmse:", round(rmse_rf, 2))

In [None]:
# Random forest on all target columns: fit, then predict the held-out rows.
model_rf.fit(X_train, y_train)
pred = model_rf.predict(X_test)

# Evaluation: compute the MSE once and derive the RMSE from it.
mse = mean_squared_error(y_test, pred)
rmse = np.sqrt(mse)

# Built-in round() works for NumPy scalars and plain Python floats alike.
print("mse:", round(mse, 2), "\nrmse:", round(rmse, 2))

### XGBoost

In [None]:
# XGBoost on the sampled target columns: fit, then predict the held-out
# rows.
model_xg.fit(X_train_random, y_train_random)
pred = model_xg.predict(X_test_random)

# Evaluation: compute the MSE once and derive the RMSE from it.
mse_xg = mean_squared_error(y_test_random, pred)
rmse_xg = np.sqrt(mse_xg)

# Built-in round() works for NumPy scalars and plain Python floats alike.
print("mse:", round(mse_xg, 2), "\nrmse:", round(rmse_xg, 2))

In [None]:
# XGBoost on all target columns: fit, then predict the held-out rows.
model_xg.fit(X_train, y_train)
pred = model_xg.predict(X_test)

# Evaluation: compute the MSE once and derive the RMSE from it.
mse = mean_squared_error(y_test, pred)
rmse = np.sqrt(mse)

# Built-in round() works for NumPy scalars and plain Python floats alike.
print("mse:", round(mse, 2), "\nrmse:", round(rmse, 2))

## Our data machine learning

### data splitting

In [None]:
# Features: the dummy-encoded engineered table.
X = df_pre_final
# Targets: every column of `train` from position 6 onward (keep only the
# date columns).
y = train.iloc[:, 6:]

# NOTE(review): X went through dropna() while y comes from the raw `train`
# frame; presumably no rows were dropped so both still line up -- confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=15,
)

print(f"{X_train.shape} {y_train.shape}")
print(f"{X_test.shape} {y_test.shape}")

In [None]:
# Build the data used for the sampled-target experiments.
np.random.seed(15)  # fix the seed so the result is reproducible
random_columns = np.random.choice(
    train.columns[6:],
    size=100,
    replace=False,
)

# Use only the randomly chosen columns as the target data.
y_random_sampled = train.loc[:, random_columns]

# Split the feature table X defined in the previous cell against the
# sampled targets.
(
    X_train_random,
    X_test_random,
    y_train_random,
    y_test_random,
) = train_test_split(X, y_random_sampled, test_size=0.2, random_state=15)

print(f"{X_train_random.shape} {y_train_random.shape}")
print(f"{X_test_random.shape} {y_test_random.shape}")

### Linear Regression

In [None]:
# Linear regression on the sampled target columns: fit, then predict the
# held-out rows.
model_lr.fit(X_train_random, y_train_random)
pred = model_lr.predict(X_test_random)

# Evaluation: compute the MSE once and derive the RMSE from it.
mse_lr = mean_squared_error(y_test_random, pred)
rmse_lr = np.sqrt(mse_lr)

# Built-in round() works for NumPy scalars and plain Python floats alike.
print("mse:", round(mse_lr, 2), "\nrmse:", round(rmse_lr, 2))

In [None]:
# Linear regression on all target columns: fit, then predict the held-out
# rows.
model_lr.fit(X_train, y_train)
pred = model_lr.predict(X_test)

# Evaluation: compute the MSE once and derive the RMSE from it.
mse = mean_squared_error(y_test, pred)
rmse = np.sqrt(mse)

# Built-in round() works for NumPy scalars and plain Python floats alike.
print("mse:", round(mse, 2), "\nrmse:", round(rmse, 2))

### Ridge

In [None]:
# Ridge regression on the sampled target columns: fit, then predict the
# held-out rows.
model_rg.fit(X_train_random, y_train_random)
pred = model_rg.predict(X_test_random)

# Evaluation: compute the MSE once and derive the RMSE from it.
mse_rg = mean_squared_error(y_test_random, pred)
rmse_rg = np.sqrt(mse_rg)

# Built-in round() works for NumPy scalars and plain Python floats alike.
print("mse:", round(mse_rg, 2), "\nrmse:", round(rmse_rg, 2))

In [None]:
# Ridge regression on all target columns: fit, then predict the held-out
# rows.
model_rg.fit(X_train, y_train)
pred = model_rg.predict(X_test)

# Evaluation: compute the MSE once and derive the RMSE from it.
mse = mean_squared_error(y_test, pred)
rmse = np.sqrt(mse)

# Built-in round() works for NumPy scalars and plain Python floats alike.
print("mse:", round(mse, 2), "\nrmse:", round(rmse, 2))

### Lasso

In [None]:
# Lasso regression on the sampled target columns: fit, then predict the
# held-out rows.
model_ls.fit(X_train_random, y_train_random)
pred = model_ls.predict(X_test_random)

# Evaluation: compute the MSE once and derive the RMSE from it.
mse_ls = mean_squared_error(y_test_random, pred)
rmse_ls = np.sqrt(mse_ls)

# Built-in round() works for NumPy scalars and plain Python floats alike.
print("mse:", round(mse_ls, 2), "\nrmse:", round(rmse_ls, 2))

In [None]:
# Lasso regression on all target columns: fit, then predict the held-out
# rows.
model_ls.fit(X_train, y_train)
pred = model_ls.predict(X_test)

# Evaluation: compute the MSE once and derive the RMSE from it.
mse = mean_squared_error(y_test, pred)
rmse = np.sqrt(mse)

# Built-in round() works for NumPy scalars and plain Python floats alike.
print("mse:", round(mse, 2), "\nrmse:", round(rmse, 2))

### ElasticNet

In [None]:
# ElasticNet on the sampled target columns: fit, then predict the held-out
# rows.
model_en.fit(X_train_random, y_train_random)
pred = model_en.predict(X_test_random)

# Evaluation: compute the MSE once and derive the RMSE from it.
mse_en = mean_squared_error(y_test_random, pred)
rmse_en = np.sqrt(mse_en)

# Built-in round() works for NumPy scalars and plain Python floats alike.
print("mse:", round(mse_en, 2), "\nrmse:", round(rmse_en, 2))

In [None]:
# ElasticNet on all target columns: fit, then predict the held-out rows.
model_en.fit(X_train, y_train)
pred = model_en.predict(X_test)

# Evaluation: compute the MSE once and derive the RMSE from it.
mse = mean_squared_error(y_test, pred)
rmse = np.sqrt(mse)

# Built-in round() works for NumPy scalars and plain Python floats alike.
print("mse:", round(mse, 2), "\nrmse:", round(rmse, 2))

### Random Forest

In [None]:
# Random forest on the sampled target columns: fit, then predict the
# held-out rows.
model_rf.fit(X_train_random, y_train_random)
pred = model_rf.predict(X_test_random)

# Evaluation: compute the MSE once and derive the RMSE from it.
mse_rf = mean_squared_error(y_test_random, pred)
rmse_rf = np.sqrt(mse_rf)

# Built-in round() works for NumPy scalars and plain Python floats alike.
print("mse:", round(mse_rf, 2), "\nrmse:", round(rmse_rf, 2))

In [None]:
# Random forest on all target columns: fit, then predict the held-out rows.
model_rf.fit(X_train, y_train)
pred = model_rf.predict(X_test)

# Evaluation: compute the MSE once and derive the RMSE from it.
mse = mean_squared_error(y_test, pred)
rmse = np.sqrt(mse)

# Built-in round() works for NumPy scalars and plain Python floats alike.
print("mse:", round(mse, 2), "\nrmse:", round(rmse, 2))

### XGBoost

In [None]:
# XGBoost on the sampled target columns: fit, then predict the held-out
# rows.
model_xg.fit(X_train_random, y_train_random)
pred = model_xg.predict(X_test_random)

# Evaluation: compute the MSE once and derive the RMSE from it.
mse_xg = mean_squared_error(y_test_random, pred)
rmse_xg = np.sqrt(mse_xg)

# Built-in round() works for NumPy scalars and plain Python floats alike.
print("mse:", round(mse_xg, 2), "\nrmse:", round(rmse_xg, 2))

In [None]:
# XGBoost on all target columns: fit, then predict the held-out rows.
model_xg.fit(X_train, y_train)
pred = model_xg.predict(X_test)

# Evaluation: compute the MSE once and derive the RMSE from it.
mse = mean_squared_error(y_test, pred)
rmse = np.sqrt(mse)

# Built-in round() works for NumPy scalars and plain Python floats alike.
print("mse:", round(mse, 2), "\nrmse:", round(rmse, 2))

df_pre_final도 컬럼이 100개에 가까워서 차원의 저주에 빠질 가능성이 높음.
후속 작업으로 elbow graph를 그려 보고 PCA로 차원 축소까지 진행해 보면 좋을 것 같지만, 실제로 효과가 있을지는 확인이 필요함.