# 주제: 제주도 도로 교통량과 유의미한 변수

### 모듈 import 및 data 불러오기

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import gc

import os
import glob

# scaler
from sklearn.preprocessing import MinMaxScaler

# 요인분석
from factor_analyzer import FactorAnalyzer

# 회귀분석
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_absolute_error
import time

import warnings
warnings.filterwarnings('ignore')

data = pd.read_csv('dataset/train.csv')

## 데이터셋 분할

In [2]:
# road_name = 지방도 추출

road = data.road_name.unique()
road = [i for i in road if "지방" in i]

data = data.query("road_name in @road")

# 2202년 4월 1일 이전, 이후로 분할
test = data[data.base_date >= 20220401]
train = data[data.base_date < 20220401]

In [3]:
rt = test.shape[0]/train.shape[0]
print(f"train : test = {1-rt} : {rt}")

train : test = 0.661040270128357 : 0.338959729871643


## 전처리

In [4]:
train["month"] = train['base_date'].astype("str").apply(
                               lambda x : x[4:6]).apply(
                               lambda x : int(x) if x[0]!=0 else int(x[-1]))

train["day"] = train['base_date'].astype("str").apply(
                               lambda x : x[-2:]).apply(
                               lambda x : int(x) if x[0]!=0 else int(x[-1]))

test["month"] = test['base_date'].astype("str").apply(
                               lambda x : x[4:6]).apply(
                               lambda x : int(x) if x[0]!=0 else int(x[-1]))

test["day"] = test['base_date'].astype("str").apply(
                               lambda x : x[-2:]).apply(
                               lambda x : int(x) if x[0]!=0 else int(x[-1]))



train["start_turn_restricted"] = train["start_turn_restricted"].map({"없음":0, "있음":1})
test["start_turn_restricted"] = test["start_turn_restricted"].map({"없음":0, "있음":1})

train["end_turn_restricted"] = train["end_turn_restricted"].map({"없음":0, "있음":1})
test["end_turn_restricted"] = test["end_turn_restricted"].map({"없음":0, "있음":1})

In [5]:
day_mean = train.groupby("day_of_week")["target"].mean().reset_index().values
day_mean = {i : j for i, j in day_mean}

train["day_of_week"] = train["day_of_week"].map(day_mean)
test["day_of_week"] = test["day_of_week"].map(day_mean)

In [6]:
road_info = ["lane_count", "maximum_speed_limit", "weight_restricted", "road_type", "start_turn_restricted",
            "end_turn_restricted"] 
day = ["day_of_week", "base_hour", "month", "day"]
col = road_info + day

In [7]:
y_tr = train["target"]
y_te = test["target"]

train = train[col]
test = test[col]

In [8]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

scaler.fit(train)
train.loc[:, :] = scaler.transform(train)
test.loc[:, :] = scaler.transform(test)

# 3. 도로관련 변수 요인분석

In [9]:
from factor_analyzer import FactorAnalyzer


# 탐색적요인분석
fa = FactorAnalyzer(n_factors=2, rotation='varimax').fit(train[road_info])
# 결과 출력
print('요인적재량 :\n',pd.DataFrame(fa.loadings_, index=road_info))

요인적재량 :
                               0         1
lane_count            -0.095888  1.006479
maximum_speed_limit   -0.084552  0.510503
weight_restricted      0.795582 -0.219084
road_type              0.979173  0.230136
start_turn_restricted  0.029149  0.367815
end_turn_restricted    0.033676  0.258533


- Factor0
    - 통과제한하중, 도로유형에 대한 factor loading 값이 큼
    - "속도에 대한 간접적 요인"라고 명명
- Factor2
    - 차로수, 속도제한, 시작/도착지점 회전제한 유무 대한 factor loading 값이 큼
    - "속도에 대한 간접적 요인"라고 명명 

In [10]:
print('\n공통성 :\n', pd.DataFrame(fa.get_communalities(), index=road_info))


공통성 :
                               0
lane_count             1.022194
maximum_speed_limit    0.267762
weight_restricted      0.680949
road_type              1.011742
start_turn_restricted  0.136138
end_turn_restricted    0.067973


In [11]:
print('\n요인점수 :\n', fa.transform(train[road_info]))


요인점수 :
 [[-0.52681027 -0.72142485]
 [-0.52681027 -0.72142485]
 [-0.99757481  1.66614112]
 ...
 [ 1.35971042  2.20839385]
 [-0.73345151  0.43622822]
 [ 1.35971042  2.20839385]]


# 4. 요인점수 적용, 미적용 회귀분석 결과 비교

In [12]:
train["간접적요인"] = fa.transform(train[road_info])[:, 0]
train["직접적요인"] = fa.transform(train[road_info])[:, 1]

test["간접적요인"] = fa.transform(test[road_info])[:, 0]
test["직접적요인"] = fa.transform(test[road_info])[:, 1]

In [13]:
train_f = train.drop(road_info, axis = 1)
test_f = test.drop(road_info, axis = 1)

In [14]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_absolute_error
import time

### 4.1. 요인점수 적용

In [15]:
linear = LinearRegression()
ridge = Ridge(random_state = 0)
lasso = Lasso(random_state = 0)
elastic = ElasticNet(random_state = 0)

models = [linear, lasso, ridge, elastic]

benchmark_factor = pd.DataFrame()
for model in models:
    start = time.time()
    model.fit(train_f, y_tr)
    pred = model.predict(test_f)
    tm = time.time() - start
    mae = mean_absolute_error(y_te, pred)
    
    benchmark_factor[str(model.__class__.__name__)] = [mae, tm]
        
    

benchmark_factor.index = (["MAE_factor", "time_factor"])
benchmark_factor.T

Unnamed: 0,MAE_factor,time_factor
LinearRegression,9.158527,0.05565
Lasso,9.361843,0.037928
Ridge,9.158527,0.019522
ElasticNet,9.392786,0.020806


### 4.2. 요인점수 미적용

In [16]:
linear = LinearRegression()
ridge = Ridge(random_state = 0)
lasso = Lasso(random_state = 0)
elastic = ElasticNet(random_state = 0)

models = [linear, lasso, ridge, elastic]

benchmark = pd.DataFrame()
for model in models:
    start = time.time()
    model.fit(train[col], y_tr)
    pred = model.predict(test[col])
    tm = time.time() - start
    mae = mean_absolute_error(y_te, pred)
    
    benchmark[str(model.__class__.__name__)] = [mae, tm]
        
    

benchmark.index = (["MAE_nofactor", "time_nofactor"])
benchmark.T

Unnamed: 0,MAE_nofactor,time_nofactor
LinearRegression,7.535561,0.088771
Lasso,9.549749,0.044216
Ridge,7.535572,0.039577
ElasticNet,9.594355,0.053628


In [17]:
final_benchmark = pd.concat([benchmark.T, benchmark_factor.T], axis = 1)

### 4.3. 최종비교

In [18]:
final_benchmark

Unnamed: 0,MAE_nofactor,time_nofactor,MAE_factor,time_factor
LinearRegression,7.535561,0.088771,9.158527,0.05565
Lasso,9.549749,0.044216,9.361843,0.037928
Ridge,7.535572,0.039577,9.158527,0.019522
ElasticNet,9.594355,0.053628,9.392786,0.020806


In [19]:
# 속도 비교
final_benchmark.time_nofactor.mean() / final_benchmark.time_factor.mean()

1.6891893576145616

In [20]:
# Mae 비교
final_benchmark.MAE_factor.mean() / final_benchmark.MAE_nofactor.mean()

1.0834846010429615

- MAE
    - 요인변수를 활용해 feature를 줄인 데이터셋을 활용한 경우가 MAE가 평균적으로 1.08배 증가
    - 하지만 Ridge회귀나 Elastic회귀 사용시 오히려 감소
 
- 모델 구현 속도
    - 요인변수를 활용한 경우가 속도가 1.7배 향상