# 선박 대기시간 예측

In [49]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import pandas as pd
import numpy as np
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import OLSInfluence
from category_encoders import TargetEncoder #####
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
%matplotlib inline

# 선박 대기시간 예측 데이터
ship = pd.read_csv('ship.csv')

##### ship columns
- DIST = 선박과 접안지 사이의 거리
- ATA = 접안지에 들어온 시각
- ID = 선박의 고유번호 (index와 다릅니다)
- BREADTH = 선박의 폭
- DEADWEIGHT = 선박의 무게(선박무게 + 적재물량 무게)
- DEPTH = 선박의 높이
- GT = 선박의 부피
- LENGTH = 선박의 길이
- PORT_SIZE = 항구의 접안 지역 크기
- CI_HOUR = 선박 대기시간 -> target

##### 문제 1 : smf의 OLS를 사용해 회귀분석 진행 (6점)
##### (1) ATA와 ID column을 제외한 numeric feature들만 가지는 ship_num을 생성 후 ols로 회귀분석을 진행 (2점)
##### (2) numeric feature에 StandardScaler를 적용시킨 ship_num_scaled로 한번 더 회귀분석을 진행 (2점)
##### (3) 두 결과에서 각 feature의 t 통계량에 어떤 변화가 있는지 확인합니다. (2점)

In [50]:
# ship_num 데이터프레임 생성
ship_num = ship.drop(columns = ['ATA', 'ID'])

In [51]:
# ols를 사용하면 각 종속변수에 대한 회귀계수와 t 통계량 p-value등 회귀분석을 진행할 수 있습니다.
model = smf.ols('CI_HOUR ~ '+"+".join(ship_num.columns[:-1]), data = ship_num).fit()
model.summary()

0,1,2,3
Dep. Variable:,CI_HOUR,R-squared:,0.056
Model:,OLS,Adj. R-squared:,0.056
Method:,Least Squares,F-statistic:,624.1
Date:,"Sat, 28 Oct 2023",Prob (F-statistic):,0.0
Time:,22:17:27,Log-Likelihood:,-485750.0
No. Observations:,73468,AIC:,971500.0
Df Residuals:,73460,BIC:,971600.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,55.6165,2.673,20.803,0.000,50.377,60.856
DIST,1.3040,0.022,59.122,0.000,1.261,1.347
BREADTH,-0.1317,0.180,-0.733,0.464,-0.484,0.221
DEADWEIGHT,0.0005,3.38e-05,14.061,0.000,0.000,0.001
DEPTH,2.7083,0.201,13.463,0.000,2.314,3.103
GT,-0.0001,6.68e-05,-2.166,0.030,-0.000,-1.37e-05
LENGTH,-0.4025,0.030,-13.236,0.000,-0.462,-0.343
PORT_SIZE,4986.1652,797.721,6.251,0.000,3422.636,6549.695

0,1,2,3
Omnibus:,81800.513,Durbin-Watson:,2.004
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6423976.575
Skew:,5.882,Prob(JB):,0.0
Kurtosis:,47.274,Cond. No.,111000000.0


In [52]:
# 스케일링 진행
scaler = StandardScaler()

# ship_num_scaled 데이터프레임 생성
ship_num_scaled = pd.DataFrame(scaler.fit_transform(ship_num), columns=ship_num.columns)

In [53]:
# ols를 사용해 한 번 더 회귀분석 진행
model = smf.ols('CI_HOUR ~ '+"+".join(ship_num_scaled.columns[:-1]), data = ship_num_scaled).fit()
model.summary()

0,1,2,3
Dep. Variable:,CI_HOUR,R-squared:,0.056
Model:,OLS,Adj. R-squared:,0.056
Method:,Least Squares,F-statistic:,624.1
Date:,"Sat, 28 Oct 2023",Prob (F-statistic):,0.0
Time:,22:17:28,Log-Likelihood:,-102120.0
No. Observations:,73468,AIC:,204300.0
Df Residuals:,73460,BIC:,204300.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-1.266e-16,0.004,-3.53e-14,1.000,-0.007,0.007
DIST,0.2161,0.004,59.122,0.000,0.209,0.223
BREADTH,-0.0085,0.012,-0.733,0.464,-0.031,0.014
DEADWEIGHT,0.1604,0.011,14.061,0.000,0.138,0.183
DEPTH,0.0969,0.007,13.463,0.000,0.083,0.111
GT,-0.0296,0.014,-2.166,0.030,-0.056,-0.003
LENGTH,-0.1755,0.013,-13.236,0.000,-0.201,-0.149
PORT_SIZE,0.0239,0.004,6.251,0.000,0.016,0.031

0,1,2,3
Omnibus:,81800.513,Durbin-Watson:,2.004
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6423976.575
Skew:,5.882,Prob(JB):,0.0
Kurtosis:,47.274,Cond. No.,10.7


In [6]:
# 각 회귀계수들의 t 통계량에는 어떤 변화가 있었나요?



---

##### 문제 2 : XGBRegressor를 이용한 예측 진행(7점)
##### (1) ATA column을 datetime 으로 변환 후 월, 일, 요일 column 생성(1점)
##### (2) CI_HOUR column을 제외시킨 feature 생성, CI_HOUR를 target값에 할당, train_test 데이터셋 분리(1점)
##### (3) XGBRegressor()를 이용해 모델 적합 후 예측 - RMSE(1점)
##### (4) 범주형 변수처리 혹은 이상치처리 등 자유롭게 평균 RMSE값을 150아래로 낮춰주세요. 단, CI_HOUR column은 건드리지 말고 진행해주세요 (5점)

In [36]:
# datetime 변환
ship.ATA = pd.to_datetime(ship.ATA)

In [37]:
# 월, 일, 요일 column 생성
ship['ATA_MONTH'] = ship.ATA.dt.month
ship['ATA_DAY'] = ship.ATA.dt.day
ship['ATA_WEEK'] = ship.ATA.dt.dayofweek

In [38]:
ship.drop(columns = 'ATA', inplace = True)

In [39]:
target_encoder = TargetEncoder()
ship['ID'] = target_encoder.fit_transform(ship['ID'], ship['CI_HOUR'])

In [40]:
# feature, target 설정
feature = ship.drop(columns = 'CI_HOUR')
target = ship.CI_HOUR

In [41]:
# train, test 데이터셋 분리
X_train , X_test, y_train , y_test = train_test_split(feature, target, test_size=0.3, random_state=0)

In [42]:
# xgb 모델 적합 후 예측, RMSE
xgb = XGBRegressor(n_estimators = 1000, n_jobs = -1)
neg_mse_scores = cross_val_score(xgb, X_train, y_train, scoring="neg_mean_squared_error", cv = 5)
rmse_scores  = np.sqrt(-1 * neg_mse_scores)
avg_rmse = np.mean(rmse_scores)

print(' 5 교차 검증의 개별 RMSE scores : ', np.round(rmse_scores, 2))
print(' 5 교차 검증의 평균 RMSE : {0:.3f} '.format(avg_rmse))

 5 교차 검증의 개별 RMSE scores :  [130.58 138.23 142.03 137.88 145.77]
 5 교차 검증의 평균 RMSE : 138.897 
