## 문제 1. 
- 다음은 Customer Segmentation 데이터 세트이다. 주어진 훈련 데이터 세트를 활용하여 고객이 속한 세그먼트(Segmentation)를 예측하고 해당 예측 결과를 csv 파일로 제출하시오.
- 결과 제출 양식 : 제출한 예측값의 macro_f1 결과를 통해 영역별 배점에 따라 최종 점수가 반영될 예정

In [1]:
import pandas as pd 
pd.DataFrame({
    "ID"  : [1, 2, 3],
    "y_pred" : ["A", "B", "C"]
})

Unnamed: 0,ID,y_pred
0,1,A
1,2,B
2,3,C


## 데이터 불러오기

In [2]:
import pandas as pd 
X_test = pd.read_csv("data/4회/404_x_test.csv")
X_train = pd.read_csv("data/4회/404_x_train.csv")
y_train = pd.read_csv("data/4회/404_y_train.csv")

In [3]:
y_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6718 entries, 0 to 6717
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   ID            6718 non-null   int64 
 1   Segmentation  6718 non-null   object
dtypes: int64(1), object(1)
memory usage: 105.1+ KB


In [4]:
import pandas as pd 
pd.DataFrame({
    "변수" : list(X_train.columns) + ["Segmentation"], 
    "설명" : [
        "고객 ID 번호", 
        "성별", 
        "결혼 여부", 
        "나이", 
        "대학 졸업 여부", 
        "직업",
        "근무 연수", 
        "지출 수준", 
        "가족 수(본인 포함)",   
        "고객 세그먼트(A, B, C, D 중 하나)"
    ]
})

Unnamed: 0,변수,설명
0,ID,고객 ID 번호
1,Gender,성별
2,Ever_Married,결혼 여부
3,Age,나이
4,Graduated,대학 졸업 여부
5,Profession,직업
6,Work_Experience,근무 연수
7,Spending_Score,지출 수준
8,Family_Size,가족 수(본인 포함)
9,Segmentation,"고객 세그먼트(A, B, C, D 중 하나)"


## 데이터 확인

In [5]:
X_train.head(1)

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size
0,462809,Male,No,22,No,Healthcare,1.0,Low,4.0


In [6]:
X_test.head(1)

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size
0,458989,Female,Yes,36,Yes,Engineer,0.0,Low,1.0


In [7]:
y_train.head(1)

Unnamed: 0,ID,Segmentation
0,462809,D


## 결측치 확인

In [8]:
X_train.isnull().sum()

ID                 0
Gender             0
Ever_Married       0
Age                0
Graduated          0
Profession         0
Work_Experience    0
Spending_Score     0
Family_Size        0
dtype: int64

In [9]:
X_test.isnull().sum()

ID                 0
Gender             0
Ever_Married       0
Age                0
Graduated          0
Profession         0
Work_Experience    0
Spending_Score     0
Family_Size        0
dtype: int64

## 데이터 확인

In [10]:
y_train['Segmentation'].value_counts()

D    1772
C    1735
A    1628
B    1583
Name: Segmentation, dtype: int64

In [11]:
X_train.describe()

Unnamed: 0,ID,Age,Work_Experience,Family_Size
count,6718.0,6718.0,6718.0,6718.0
mean,463516.571152,43.524263,2.629652,2.841768
std,2566.017254,16.513115,3.404639,1.523319
min,458982.0,18.0,0.0,1.0
25%,461347.25,31.0,0.0,2.0
50%,463566.0,41.0,1.0,2.0
75%,465739.75,53.0,4.0,4.0
max,467974.0,89.0,14.0,9.0


- StandardScaler를 사용한다. 

In [12]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6718 entries, 0 to 6717
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               6718 non-null   int64  
 1   Gender           6718 non-null   object 
 2   Ever_Married     6718 non-null   object 
 3   Age              6718 non-null   int64  
 4   Graduated        6718 non-null   object 
 5   Profession       6718 non-null   object 
 6   Work_Experience  6718 non-null   float64
 7   Spending_Score   6718 non-null   object 
 8   Family_Size      6718 non-null   float64
dtypes: float64(2), int64(2), object(5)
memory usage: 472.5+ KB


In [13]:
X_train['Gender'].value_counts() # OneHot

Male      3702
Female    3016
Name: Gender, dtype: int64

In [14]:
X_train['Ever_Married'].value_counts() # OneHot

Yes    3975
No     2743
Name: Ever_Married, dtype: int64

In [15]:
X_train['Graduated'].value_counts() # OneHot

Yes    4278
No     2440
Name: Graduated, dtype: int64

In [16]:
X_train['Profession'].value_counts() # OneHot

Artist           2211
Healthcare       1089
Entertainment     815
Doctor            594
Engineer          586
Executive         509
Lawyer            503
Marketing         233
Homemaker         178
Name: Profession, dtype: int64

In [17]:
X_train['Spending_Score'].value_counts() # Ordinal

Low        4029
Average    1677
High       1012
Name: Spending_Score, dtype: int64

## ID 제거 

In [18]:
X_train_id = X_train.pop("ID")
X_test_id = X_test.pop("ID")

X_train_id.shape, X_test.shape

((6718,), (2178, 8))

## 컬럼 추출하기

In [19]:
import numpy as np
cat_cols = list(X_train.select_dtypes(exclude=np.number).columns)
cat_cols.remove("Spending_Score")
cat_cols

['Gender', 'Ever_Married', 'Graduated', 'Profession']

In [20]:
int_cols = list(X_train.select_dtypes(include=np.number).columns)
int_cols

['Age', 'Work_Experience', 'Family_Size']

## 데이터셋 분리

In [21]:
from sklearn.model_selection import train_test_split
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train['Segmentation'], 
    stratify = y_train['Segmentation'], 
    test_size = 0.3, 
    random_state=42
)

X_tr.shape, X_val.shape, y_tr.shape, y_val.shape

((4702, 8), (2016, 8), (4702,), (2016,))

## 모델 학습

In [37]:
# preprocessing
from sklearn.preprocessing import StandardScaler 
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

# modeling
from sklearn.pipeline import Pipeline 
from sklearn.compose import ColumnTransformer
from lightgbm import LGBMClassifier 
from sklearn.model_selection import RandomizedSearchCV

# metrics
from sklearn.metrics import f1_score

# utils
from sklearn.utils.fixes import loguniform

param_grid = {
    "clf__learning_rate": loguniform(0.001, 0.1), 
    "clf__n_estimators": np.arange(50, 200),
    "clf__max_depth": np.arange(3, 30, 3), 
    "clf__num_leaves" : np.arange(20, 50), 
    "clf__min_split_gain" : np.arange(0, 1.1, 0.1), ## called gamma
    "clf__subsample" : np.arange(0.6, 1.0, 0.1)
}

column_transformer = ColumnTransformer([
    ("scaler", StandardScaler(), int_cols), 
    ("ohc_encoder", OneHotEncoder(), cat_cols), 
    ("ord_encoder", OrdinalEncoder(categories=[["Low", "Average", "High"]]), ['Spending_Score'])
], remainder = "passthrough")

pipeline = Pipeline([
    ("preprocessor", column_transformer), 
    ("clf", LGBMClassifier(random_state=42, objective='multiclass', num_class=4))
])


random_search = RandomizedSearchCV(
    estimator = pipeline, 
    param_distributions = param_grid, 
    n_iter = 10, 
    scoring="f1_macro", 
    cv=5, 
    verbose=3, 
    n_jobs=-1
)

random_search.fit(X_tr, y_tr)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('preprocessor',
                                              ColumnTransformer(remainder='passthrough',
                                                                transformers=[('scaler',
                                                                               StandardScaler(),
                                                                               ['Age',
                                                                                'Work_Experience',
                                                                                'Family_Size']),
                                                                              ('ohc_encoder',
                                                                               OneHotEncoder(),
                                                                               ['Gender',
                                                                           

In [36]:
import sklearn
sklearn.metrics.SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'top_k_accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_wei

In [38]:
def get_scores(model, X_tr, X_val, y_tr, y_val):
    y_tr_pred = model.predict(X_tr)
    y_val_pred = model.predict(X_val)
    tr_score = a(y_tr, y_tr_pred, average='macro')
    val_score = f1_score(y_val, y_val_pred, average='macro')
    return f'train : {round(tr_score, 3)}, valid : {round(val_score, 4)}'
get_scores(random_search, X_tr, X_val, y_tr, y_val)

'train : 0.572, valid : 0.5149'

In [40]:
best_params = random_search.best_params_
best_score = random_search.best_score_
print('Best Parameters:', best_params)
print('Best Score:', best_score)

Best Parameters: {'clf__learning_rate': 0.024587303804950753, 'clf__max_depth': 6, 'clf__min_split_gaina': 0.7000000000000001, 'clf__n_estimators': 69, 'clf__num_leaves': 29, 'clf__subsample': 0.7999999999999999}
Best Score: 0.5127940391918597


## 결과 제출

In [43]:
final_preds = random_search.predict(X_test)
result = pd.DataFrame({"ID" : X_test_id, "pred": final_preds})
result.head()

Unnamed: 0,ID,pred
0,458989,A
1,458994,D
2,459000,B
3,459003,C
4,459005,B
