# 데이터 불러오기

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
submission = pd.read_csv("sample_submission.csv")

train.shape, test.shape, submission.shape

((5497, 14), (1000, 13), (1000, 2))

# 한글 피처 변경

In [3]:
train = train.rename(columns={"fixed acidity":"산도", "volatile acidity": "휘발성산", "citric acid":"시트르산", "residual sugar":"잔여당분",
                      "chlorides":"염화물", "free sulfur dioxide":"독립_이산화황","total sulfur dioxide":"총_이산화황", "density":"밀도",
                      "pH":"수소이온농도", "sulphates":"황산염", "alcohol":"도수", "type":"종류"})

test = test.rename(columns={"fixed acidity":"산도", "volatile acidity": "휘발성산", "citric acid":"시트르산", "residual sugar":"잔여당분",
                      "chlorides":"염화물", "free sulfur dioxide":"독립_이산화황","total sulfur dioxide":"총_이산화황", "density":"밀도",
                      "pH":"수소이온농도", "sulphates":"황산염", "alcohol":"도수", "type":"종류"})

# 타겟값 추출

In [4]:
quality = train["quality"]
train = train.drop(columns="quality")

# 인코딩

In [5]:
dict_ = {
    "white":0,
    "red":1
}
train["종류"] = train["종류"].map(dict_)
test["종류"] = test["종류"].map(dict_)

# 모델링

In [6]:
rf_model = RandomForestClassifier(random_state=42)
xgb_model = XGBClassifier(random_state=42)
knn_model = KNeighborsClassifier()
lgbm_model = LGBMClassifier(random_state=42)
lr_model = LogisticRegression(random_state=42)
cat_model = CatBoostClassifier(random_state=42)
mlp_model = MLPClassifier(random_state=42)
gb_model = GradientBoostingClassifier(random_state=42)
hgb_model = HistGradientBoostingClassifier(random_state=42)
tree_model = DecisionTreeClassifier(random_state=42)

models = [rf_model, xgb_model, knn_model, lgbm_model, lr_model, cat_model, mlp_model, gb_model, hgb_model, tree_model]

In [8]:
cross_val_score(rf_model, train, quality,scoring="accuracy", cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42)).mean()

0.6683652907601952

In [9]:
quality

0       5
1       5
2       5
3       6
4       6
       ..
5492    5
5493    6
5494    7
5495    5
5496    6
Name: quality, Length: 5497, dtype: int64

In [10]:
score = []
for model in models:
    scores = cross_val_score(model, train, quality, scoring="accuracy", cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42))
    score.append(scores.mean())

ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/xgboost/core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/xgboost/sklearn.py", line 1491, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2 3 4 5 6], got [3 4 5 6 7 8 9]


In [40]:
print(f"rf_model_score: {score[0]}")
print(f"xgb_model: {score[1]}")
print(f"knn_model: {score[2]}")
print(f"lgbm_model: {score[3]}")
print(f"lr_model: {score[4]}")
print(f"cat_model: {score[5]}")
print(f"mlp_model: {score[6]}")
print(f"gb_model: {score[7]}")
print(f"hgb_model: {score[8]}")
print(f"tree_model: {score[9]}")

rf_model_score: 0.6683652907601952
xgb_model: 0.6363442799238977
knn_model: 0.3923997022086194
lgbm_model: 0.6359841177930351
lr_model: 0.4393286458764166
cat_model: 0.629070725452891
mlp_model: 0.3809749358921333
gb_model: 0.5828617751675076
hgb_model: 0.6368922160641907
tree_model: 0.5635864008602862


In [22]:
class_mapping = {3: 0, 4: 1, 5: 2, 6: 3, 7: 4, 8: 5, 9: 6}
quality_mapped = quality.map(class_mapping)

In [32]:
tree_model.fit(train, quality)
pred = tree_model.predict(test)

# xgb_mapping = {0:3, 1:4, 2:5, 3:6, 4:7, 5:8, 6:9}
# pred_xgb = pred.map(xgb_mapping)
submission["quality"] = pred
submission.to_csv("1125_8.csv", index=False)

In [11]:
train.shape, test.shape

((5497, 13), (1000, 13))

# 기본 모델별 cv점수, 데이콘 점수

- rf_model
    - cv: 0.668
    - 데이콘: 0.683
    -  +0.015
- cat_model
    - cv: 0.629
    - 데이콘: 0.667
    - +0.038
- xgb_model
    - cv: 0.636
    - 데이콘: 0.665
    - +0.029
- hgb_model
    - cv: 0.636
    - 데이콘: 0.659
    - +0.023
- lgbm_model
    - cv: 0.635
    - 데이콘: 0.656
    - +0.021