# 데이터 기본 설정

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler, RobustScaler, PowerTransformer, StandardScaler, OneHotEncoder
import gc

cv_str_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_kfold = KFold(n_splits=5, shuffle=True, random_state=42)

In [3]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
submission = pd.read_csv("sample_submission.csv")

train.shape, test.shape, submission.shape

((5497, 14), (1000, 13), (1000, 2))

## 한글 피처 변경

In [4]:
train = train.rename(columns={"fixed acidity":"산도", "volatile acidity": "휘발성산", "citric acid":"시트르산", "residual sugar":"잔여당분",
                      "chlorides":"염화물", "free sulfur dioxide":"독립_이산화황","total sulfur dioxide":"총_이산화황", "density":"밀도",
                      "pH":"수소이온농도", "sulphates":"황산염", "alcohol":"도수", "type":"종류"})

test = test.rename(columns={"fixed acidity":"산도", "volatile acidity": "휘발성산", "citric acid":"시트르산", "residual sugar":"잔여당분",
                      "chlorides":"염화물", "free sulfur dioxide":"독립_이산화황","total sulfur dioxide":"총_이산화황", "density":"밀도",
                      "pH":"수소이온농도", "sulphates":"황산염", "alcohol":"도수", "type":"종류"})

In [5]:
quality_ = train[["quality"]]
train = train.drop(columns="quality")
train = pd.concat([train, quality_], axis=1)

train

Unnamed: 0,index,산도,휘발성산,시트르산,잔여당분,염화물,독립_이산화황,총_이산화황,밀도,수소이온농도,황산염,도수,종류,quality
0,0,5.6,0.695,0.06,6.8,0.042,9.0,84.0,0.99432,3.44,0.44,10.2,white,5
1,1,8.8,0.610,0.14,2.4,0.067,10.0,42.0,0.99690,3.19,0.59,9.5,red,5
2,2,7.9,0.210,0.39,2.0,0.057,21.0,138.0,0.99176,3.05,0.52,10.9,white,5
3,3,7.0,0.210,0.31,6.0,0.046,29.0,108.0,0.99390,3.26,0.50,10.8,white,6
4,4,7.8,0.400,0.26,9.5,0.059,32.0,178.0,0.99550,3.04,0.43,10.9,white,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5492,5492,7.7,0.150,0.29,1.3,0.029,10.0,64.0,0.99320,3.35,0.39,10.1,white,5
5493,5493,6.3,0.180,0.36,1.2,0.034,26.0,111.0,0.99074,3.16,0.51,11.0,white,6
5494,5494,7.8,0.150,0.34,1.1,0.035,31.0,93.0,0.99096,3.07,0.72,11.3,white,7
5495,5495,6.6,0.410,0.31,1.6,0.042,18.0,101.0,0.99195,3.13,0.41,10.5,white,5


In [6]:
del quality_
gc.collect();

## 종류 라벨링

In [7]:
type_dict = {"white":0, "red":1}

train["종류"] = train["종류"].map(type_dict)
test["종류"] = test["종류"].map(type_dict)

del type_dict
gc.collect();

train.shape, test.shape

((5497, 14), (1000, 13))

## 이상치 제거(등급제외)
- wine_EDA.ipynb 참고

### "산도"

In [3034]:
# idx_ = train.sort_values("산도").head().index

# train = train.drop(idx_)

### "휘발성산" 

### "시트르산"

In [3035]:
idx_ = train[train["시트르산"]>=0.9].index
train = train.drop(idx_)

del idx_
gc.collect();

### "잔여당분"

### "염화물"

In [3036]:
# idx_ = train[train["염화물"]>=0.415].index
# train = train.drop(idx_)

# idx_

### "독립_이산화황"

In [3037]:
idx_ = train[train["독립_이산화황"]>=250].index

train = train.drop(idx_)
del idx_
gc.collect();

### "총_이산화황"

In [3038]:
# idx_ = train[train["총_이산화황"]>255].index

# train = train.drop(idx_)
# idx_

### "밀도"

In [3039]:
idx_ = train[train["밀도"]==1.01030].index
train = train.drop(idx_)

del idx_
gc.collect();

### "수소이온농도"

In [3040]:
# train = train.drop(train[train["수소이온농도"]>4].index)

### "황산염"

In [3041]:
# idx_ = train[(train["황산염"]>=1.25) & (train["황산염"]<1.75)].index

# train = train.drop(idx_)

### "도수"

In [3042]:
# idx_ = train[train["도수"]>14.0].index

# train = train.drop(idx_)
# idx_

In [3043]:
train

Unnamed: 0,index,산도,휘발성산,시트르산,잔여당분,염화물,독립_이산화황,총_이산화황,밀도,수소이온농도,황산염,도수,종류,quality
0,0,5.6,0.695,0.06,6.8,0.042,9.0,84.0,0.99432,3.44,0.44,10.2,0,5
1,1,8.8,0.610,0.14,2.4,0.067,10.0,42.0,0.99690,3.19,0.59,9.5,1,5
2,2,7.9,0.210,0.39,2.0,0.057,21.0,138.0,0.99176,3.05,0.52,10.9,0,5
3,3,7.0,0.210,0.31,6.0,0.046,29.0,108.0,0.99390,3.26,0.50,10.8,0,6
4,4,7.8,0.400,0.26,9.5,0.059,32.0,178.0,0.99550,3.04,0.43,10.9,0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5492,5492,7.7,0.150,0.29,1.3,0.029,10.0,64.0,0.99320,3.35,0.39,10.1,0,5
5493,5493,6.3,0.180,0.36,1.2,0.034,26.0,111.0,0.99074,3.16,0.51,11.0,0,6
5494,5494,7.8,0.150,0.34,1.1,0.035,31.0,93.0,0.99096,3.07,0.72,11.3,0,7
5495,5495,6.6,0.410,0.31,1.6,0.042,18.0,101.0,0.99195,3.13,0.41,10.5,0,5


## 등급 이상치

In [3044]:
train["quality"].value_counts(normalize=True)

quality
6    0.438628
5    0.325916
7    0.168521
4    0.033741
8    0.027722
3    0.004560
9    0.000912
Name: proportion, dtype: float64

# 중간 저장

In [3054]:
# train.to_csv("train_common.csv",index=False)
# test.to_csv("test_common.csv",index=False)

# 인코딩

# 스케일링

In [3045]:
train_ft = train.iloc[:,1:-1]
test_ft = test.iloc[:,1:]
quality = train["quality"]

train_ft.shape, test_ft.shape

((5483, 12), (1000, 12))

In [3046]:
std_scaler = StandardScaler()
minmax_scaler = MinMaxScaler()
robust_scaler = RobustScaler()
power_scaler = PowerTransformer()

scalers = [std_scaler, minmax_scaler, robust_scaler, power_scaler]

# 모델링

In [3047]:
rf_model = RandomForestClassifier(random_state=42)
cat_model = CatBoostClassifier(random_state=42)
xgb_model = XGBClassifier(random_state=42)
hgb_model = HistGradientBoostingClassifier(random_state=42)
lgbm_model = LGBMClassifier(random_state=42)

models = [rf_model, cat_model, xgb_model, hgb_model, lgbm_model]
models_without_xgb = [rf_model, cat_model, hgb_model, lgbm_model]

## 피처 테스트용

In [3048]:
scores = cross_val_score(rf_model, train_ft, quality, scoring="accuracy", cv=cv_kfold)
scores.mean()

Flushing oldest 200 entries.
  warn('Output cache limit (currently {sz} entries) hit.\n'


0.6802836534942677

In [3003]:
rf_model.fit(train_ft, quality)
pred = rf_model.predict(test_ft)
submission["quality"] = pred
submission.to_csv("1126_8.csv", index=False)

In [2789]:
# score = []
# for scaler in scalers:
#     train_ft[train_ft.columns] = scaler.fit_transform(train_ft)
#     test_ft[test_ft.columns] = scaler.transform(test_ft)
#     scores = cross_val_score(rf_model, train_ft, quality, scoring="accuracy", cv=cv_kfold)
#     score.append(scores.mean())
# print(f"std_scaler_score: {score[0]}")
# print(f"minmax_scaler_score: {score[1]}")
# print(f"robust_scaler_score: {score[2]}")
# print(f"power_scaler_score: {score[3]}")

std_scaler_score: 0.6808307660573961
minmax_scaler_score: 0.6821074729354777
robust_scaler_score: 0.6811945651378344
power_scaler_score: 0.6801003400115777


## 5가지 기본 모델 테스트

In [2817]:
quality_mapping = {3:0, 4:1, 5:2, 6:3, 7:4, 8:5, 9:6}

score = []
for model in models_without_xgb:
    if model is xgb_model:
        xgb_quality = quality.map(quality_mapping)
        scores = cross_val_score(model, train_ft, xgb_quality, scoring="accuracy", cv=cv_kfold)
    else:
        scores = cross_val_score(model, train_ft, quality, scoring="accuracy", cv=cv_kfold)
    score.append(scores.mean())

Learning rate set to 0.085324
0:	learn: 1.8109968	total: 4.01ms	remaining: 4.01s
1:	learn: 1.7032907	total: 6.3ms	remaining: 3.15s
2:	learn: 1.6229854	total: 8.83ms	remaining: 2.94s
3:	learn: 1.5549648	total: 11.1ms	remaining: 2.77s
4:	learn: 1.4988498	total: 13.5ms	remaining: 2.69s
5:	learn: 1.4504634	total: 15.9ms	remaining: 2.63s
6:	learn: 1.4101014	total: 18.4ms	remaining: 2.61s
7:	learn: 1.3735940	total: 21.1ms	remaining: 2.61s
8:	learn: 1.3403653	total: 23.5ms	remaining: 2.59s
9:	learn: 1.3124149	total: 26.3ms	remaining: 2.61s
10:	learn: 1.2892554	total: 28.8ms	remaining: 2.59s
11:	learn: 1.2674006	total: 31ms	remaining: 2.56s
12:	learn: 1.2469327	total: 33.5ms	remaining: 2.54s
13:	learn: 1.2287332	total: 35.8ms	remaining: 2.52s
14:	learn: 1.2104448	total: 38.4ms	remaining: 2.52s
15:	learn: 1.1941418	total: 40.8ms	remaining: 2.51s
16:	learn: 1.1795538	total: 43.2ms	remaining: 2.5s
17:	learn: 1.1663366	total: 45.6ms	remaining: 2.49s
18:	learn: 1.1541712	total: 48.1ms	remaining: 2.

In [2818]:
print(f"rf_model_score: {score[0]}")
print(f"cat_model_score: {score[1]}")
print(f"xgb_model_score: {score[2]}")
print(f"hgb_model_score: {score[3]}")
print(f"lgbm_model_score: {score[4]}")

rf_model_score: 0.6802836534942677
cat_model_score: 0.650920060683084
xgb_model_score: 0.6290450398898123
hgb_model_score: 0.6552954640725535


IndexError: list index out of range