In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import matplotlib.font_manager as fm
# Use the Malgun Gothic font file for all matplotlib text (presumably so the
# Korean labels in this notebook render correctly).
# The path is a raw string: in a normal literal "\W" and "\F" are invalid escape
# sequences, which newer Python versions warn about. The resulting path is unchanged.
# NOTE(review): Windows-only absolute path; this cell fails on other platforms.
font_name = fm.FontProperties(fname=r"C:\Windows\Fonts\malgun.ttf").get_name()
plt.rc("font", family=font_name)
import matplotlib as mlp
# Draw minus signs with the ASCII hyphen instead of the Unicode minus glyph,
# which some fonts do not provide.
mlp.rcParams["axes.unicode_minus"] = False

from datetime import datetime
import zipfile
import shutil
import os
import glob
import json
from tqdm.notebook import tqdm
import cv2
import random

In [77]:
import mglearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression, ElasticNet, ElasticNetCV
from sklearn.svm import LinearSVC
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import statsmodels.api as sm
from sklearn.datasets import load_iris, load_boston, load_breast_cancer, fetch_california_housing
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, confusion_matrix 
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import classification_report, roc_curve
from sklearn.metrics import precision_score, recall_score

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVC, SVR                 # 비선형을 처리하기 위한 알고리즘(커널 방식) -> 고차원(평면->입체)으로 mapping시킴
from sklearn.svm import LinearSVC, LinearSVR     # 선형을 처리하기 위한 알고리즘
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# Read the two other HQ CSV exports, in the same order as before.
# NOTE(review): judging by the names these are the sheet-metal and exchange
# counterparts of the coating data; neither frame is used in the cells shown here.
HQ_sheet_metal, HQ_exchange = (
    pd.read_csv(csv_name)
    for csv_name in ("HQ_sheet_metal2.csv", "HQ_exchange2.csv")
)

# 1. 코팅(coating)의 HQ 예측해보기 [회귀]

## [1] 데이터 준비

In [52]:
# Load the coating data from a CSV in the current working directory.
HQ_coating = pd.read_csv("HQ_coating2.csv")

In [55]:
# Preview the first rows of the raw coating table.
HQ_coating.head()

Unnamed: 0,category_id,part,damage,repair,supercategory_name,HQ
0,as-0000025,Bumper,Crushed,coating,VAN,1.97
1,as-0000025,Door,Separated,coating,VAN,4.15
2,as-0000025,Bumper,Scratched,coating,VAN,1.97
3,as-0000025,Fender,Crushed,coating,VAN,2.8
4,as-0000025,Fender,Separated,coating,VAN,2.8


In [53]:
# Keep only the three categorical predictors and the HQ target for modelling.
HQ_coating_ML = HQ_coating.loc[:, ['part', 'damage', 'supercategory_name', 'HQ']]

## [2] 변수 전처리

### 1-1) 범주형 변수의 원 핫 인코딩

- 머신 러닝 알고리즘은 숫자형 데이터만 받아들이기 때문에, 문자열을 숫자형 데이터로 바꾸어주어야 함.
- 레이블 인코딩은 간단하게 문자열을 숫자형 카테고리 값으로 변환해주지만 선형 머신러닝 알고리즘에는 사용하면 안됨.
    - WHY?
    - 레이블 인코딩된 숫자들은 크고 작음의 순서가 존재한다는 문제가 있음. 
    - ex) 머신러닝 알고리즘에서 짜장면<치킨<탕수육<피자 순으로 가중치를 부여하거나 중요하게 인식할 가능성이 발생.
    - 이러한 특성 때문에 Label encoding은 선형회귀같은 머신러닝 알고리즘에는 적용하지 않아야 함.
- 따라서, 원 핫 인코딩을 사용한다.

In [54]:
# Preview the modelling subset before encoding.
HQ_coating_ML.head()

Unnamed: 0,part,damage,supercategory_name,HQ
0,Bumper,Crushed,VAN,1.97
1,Door,Separated,VAN,4.15
2,Bumper,Scratched,VAN,1.97
3,Fender,Crushed,VAN,2.8
4,Fender,Separated,VAN,2.8


In [56]:
# One-hot encode the frame. As the displayed result shows, HQ is kept as-is and
# each category of part / damage / supercategory_name becomes its own 0/1 column
# named "<original column>_<category>".
# Note that this rebinds HQ_coating_ML to the encoded frame.
HQ_coating_ML = pd.get_dummies(HQ_coating_ML)
HQ_coating_ML

Unnamed: 0,HQ,part_Bonnet,part_Bumper,part_Door,part_Fender,part_Head lights,part_Rear lamp,part_Rocker panel,part_Roof,part_Side mirror,...,damage_Breakage,damage_Crushed,damage_Scratched,damage_Separated,supercategory_name_City car,supercategory_name_Compact car,supercategory_name_Full-size car,supercategory_name_Mid-size car,supercategory_name_SUV,supercategory_name_VAN
0,1.97,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
1,4.15,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
2,1.97,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
3,2.80,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
4,2.80,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
504484,2.19,0,0,1,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
504485,1.24,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
504486,1.24,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
504487,1.24,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0


### 1-2) 컬럼명 정리

In [62]:
# Shorten the one-hot column names by removing the "<source column>_" prefix that
# pd.get_dummies added (e.g. "part_Bumper" -> "Bumper").
# The names are derived from the existing columns rather than written out as a
# fixed positional list, so a change in the set or order of categories in the
# data cannot silently mislabel a column.
DUMMY_PREFIXES = ('supercategory_name_', 'part_', 'damage_')


def _strip_dummy_prefix(column_name):
    """Return column_name without its get_dummies prefix; unchanged if none matches."""
    for prefix in DUMMY_PREFIXES:
        if column_name.startswith(prefix):
            return column_name[len(prefix):]
    return column_name


HQ_coating_ML.columns = [_strip_dummy_prefix(name) for name in HQ_coating_ML.columns]

# Stripping prefixes must not make two columns collide.
assert HQ_coating_ML.columns.is_unique, "duplicate column names after stripping prefixes"

In [65]:
# Preview the encoded frame after the column names were shortened.
HQ_coating_ML.head()

Unnamed: 0,HQ,Bonnet,Bumper,Door,Fender,Head lights,Rear lamp,Rocker panel,Roof,Side mirror,...,Breakage,Crushed,Scratched,Separated,City car,Compact car,Full-size car,Mid-size car,SUV,VAN
0,1.97,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
1,4.15,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
2,1.97,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
3,2.8,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
4,2.8,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1


## 2. 특성(feature)과 레이블(target) 변수 나누기

In [68]:
# Features: every column except the target. Target: the HQ value.
# HQ is selected by name instead of by position, so the split stays correct
# even if HQ is not the first column.
X = HQ_coating_ML.drop(columns='HQ')
y = HQ_coating_ML['HQ']

In [71]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [74]:
# Show the training feature matrix (pandas truncates the display for large frames).
X_train

Unnamed: 0,Bonnet,Bumper,Door,Fender,Head lights,Rear lamp,Rocker panel,Roof,Side mirror,Trunk lid,...,Breakage,Crushed,Scratched,Separated,City car,Compact car,Full-size car,Mid-size car,SUV,VAN
460098,0,0,0,1,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
347113,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
267903,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
130723,0,0,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
41541,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259178,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
365838,0,0,1,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
131932,0,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
146867,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0


In [78]:
# Step 1: choose the L1/L2 mixing ratio for ElasticNet by cross-validation.
# ElasticNetCV only searches over l1_ratio when it is given a list of candidates;
# with the default single value (0.5), `l1_ratio_` would always come back as 0.5.
l1_ratio_candidates = [0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1.0]
elastic_net_cv = ElasticNetCV(l1_ratio=l1_ratio_candidates, cv=5, n_jobs=-1)
elastic_net_cv.fit(X_train, y_train)
best_l1_ratio = elastic_net_cv.l1_ratio_

# Step 2: grid of regularisation strengths to search, with l1_ratio fixed to the
# value selected above.
alphas = [1e-5, 1e-4, 1e-3, 1e-2, 0.1, 0.5, 1, 5, 10]

param_grid = {'alpha': alphas, 'l1_ratio': [best_l1_ratio]}

# Define the grid search. n_jobs=-1 runs the cross-validation fits in parallel
# on all available cores.
# NOTE(review): the recorded run emitted coordinate-descent warnings; if they
# persist, a larger max_iter may be needed.
grid_search = GridSearchCV(
    ElasticNet(max_iter=1000),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)

# Run the grid search.
grid_search.fit(X_train, y_train)

# Keep the best model, its parameters and the full cross-validation results.
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
cv_results = grid_search.cv_results_

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


KeyboardInterrupt: 

In [None]:
# Predict on the hold-out set with the best estimator found by the grid search.
# (The previous name `ElasticNet_grid` is never defined in the cells shown here
# and raised a NameError.)
y_test_predict = best_model.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_test_predict))  # loss: root mean squared error
r2 = r2_score(y_test, y_test_predict)                       # explanatory power: R^2

print("RMSE:", rmse)
print("r2:", r2)