# 기출 4회_작업형2
- 자동차 회사는 새로운 전략을 수립하기 위해 4개의 시장으로 세분화했습니다.
- 기존 고객 분류 자료를 바탕으로 신규 고객이 어떤 분류에 속할지 예측해주세요!


- 예측할 값(y): "Segmentation" (1,2,3,4)
- 평가: Macro f1-score
- data: train.csv, test.csv
- 제출 형식:
~~~
ID,Segmentation
458989,1
458994,2
459000,3
459003,4
~~~

## 답안 제출 참고
- 아래 예시 코드에서 예측 결과 변수(pred)와 수험번호(파일명)를 본인에 맞게 변경하여 활용
- pd.DataFrame({'ID': test.ID, 'Segmentation': pred}).to_csv('003000000.csv', index=False)

## 노트북 구분
- basic: 수치형 데이터만 활용 -> 학습 및 test데이터 예측
- intermediate: 범주형 데이터도 활용 -> 학습 및 test데이터 예측
- advanced: 학습 및 교차 검증(모델 평가) -> 하이퍼파라미터 튜닝 -> test데이터 예측

## 📌 My Code

In [30]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

# Load the data
train = pd.read_csv("https://raw.githubusercontent.com/lovedlim/inf/refs/heads/main/p4/4_2/train.csv")
test = pd.read_csv("https://raw.githubusercontent.com/lovedlim/inf/refs/heads/main/p4/4_2/test.csv")

# 1. Inspect the data
# Missing values: none
# Categorical: ['Gender', 'Ever_Married', 'Graduated', 'Profession', 'Spending_Score', 'Var_1']
# target: 'Segmentation'
# print(train.info())
# print(test.info())
# print(train.select_dtypes('object').columns)
# print(train.shape, test.shape)

# 2. Preprocessing
# Stack the train features on top of test so both are encoded together and
# end up with exactly the same dummy columns.
X = train.drop(['Segmentation'], axis=1)
y = train['Segmentation']
X_full = pd.concat([X, test], axis=0)
X_full = X_full.drop(['ID'], axis=1)  # ID is an identifier, not a feature
# print(X_full.shape)


# Missing values => none.
# print(X_full.isna().sum())

# Numeric columns => left as they are.


# Categorical columns => one-hot encoding
X_full = pd.get_dummies(X_full)
# print(X_full.shape)

# 3. Split back into train / test
# The concatenated index contains duplicate labels, so slice by position.
n_train = train.shape[0]
X_train = X_full.iloc[:n_train]
X_test = X_full.iloc[n_train:]
print(X_train.shape, X_test.shape)

# random_state makes the hold-out split repeatable; stratify=y keeps the
# four segment classes in the same proportions in both parts, which gives a
# steadier macro-F1 estimate.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y, test_size=0.2, random_state=0, stratify=y
)

print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

# 4. Train and validate
# Seeded so the forest (and therefore the score and the submission) is reproducible.
model = RandomForestClassifier(random_state=0)
model.fit(X_train, y_train)
y_val_pred = model.predict(X_val)
# print(y_val_pred.shape)

# 5. Evaluate with the competition metric (macro-averaged F1)
f1_macro_score = f1_score(y_val, y_val_pred, average='macro')
print(f1_macro_score)
# help(sklearn.metrics.f1_score)

# 6. Save the result
y_pred = model.predict(X_test)
result = pd.DataFrame({'ID': test['ID'], 'Segmentation': y_pred})
result.to_csv('result4_2.csv', index=False)

# Read the file back to confirm the submission format
check = pd.read_csv('result4_2.csv')
print(check)

(6665, 28) (2154, 28)
(5332, 28) (1333, 28) (5332,) (1333,)
0.4940375859305005
          ID  Segmentation
0     458989             2
1     458994             3
2     459000             3
3     459003             3
4     459005             1
...      ...           ...
2149  467950             1
2150  467954             4
2151  467958             1
2152  467961             3
2153  467968             4

[2154 rows x 2 columns]


# 🍭 basic 단계 🍭  
- 목표: 수치형 데이터만이라도 활용해 제출하자!!!👍

## EDA

In [None]:
# Check the data dimensions (rows, columns) of train and test
train.shape, test.shape

((6665, 11), (2154, 10))

In [None]:
# Look at the first rows of train
train.head()

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,462809,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4,4
1,466315,Female,Yes,67,Yes,Engineer,1.0,Low,1.0,Cat_6,2
2,461735,Male,Yes,67,Yes,Lawyer,0.0,High,2.0,Cat_6,2
3,461319,Male,Yes,56,No,Artist,0.0,Average,2.0,Cat_6,3
4,460156,Male,No,32,Yes,Healthcare,1.0,Low,3.0,Cat_6,3


In [None]:
# Look at the first rows of test
test.head()

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1
0,458989,Female,Yes,36,Yes,Engineer,0.0,Low,1.0,Cat_6
1,458994,Male,Yes,37,Yes,Healthcare,8.0,Average,4.0,Cat_6
2,459000,Male,Yes,59,No,Executive,11.0,High,2.0,Cat_6
3,459003,Male,Yes,47,Yes,Doctor,0.0,High,5.0,Cat_4
4,459005,Male,Yes,61,Yes,Doctor,5.0,Low,3.0,Cat_6


In [None]:
# Check the class distribution of the target
train['Segmentation'].value_counts()

Unnamed: 0_level_0,count
Segmentation,Unnamed: 1_level_1
4,1757
3,1720
1,1616
2,1572


In [None]:
# Count missing values per column (train)
train.isnull().sum()

Unnamed: 0,0
ID,0
Gender,0
Ever_Married,0
Age,0
Graduated,0
Profession,0
Work_Experience,0
Spending_Score,0
Family_Size,0
Var_1,0


In [None]:
# Count missing values per column (test)
test.isnull().sum()

Unnamed: 0,0
ID,0
Gender,0
Ever_Married,0
Age,0
Graduated,0
Profession,0
Work_Experience,0
Spending_Score,0
Family_Size,0
Var_1,0


In [None]:
# Check column dtypes and non-null counts
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6665 entries, 0 to 6664
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               6665 non-null   int64  
 1   Gender           6665 non-null   object 
 2   Ever_Married     6665 non-null   object 
 3   Age              6665 non-null   int64  
 4   Graduated        6665 non-null   object 
 5   Profession       6665 non-null   object 
 6   Work_Experience  6665 non-null   float64
 7   Spending_Score   6665 non-null   object 
 8   Family_Size      6665 non-null   float64
 9   Var_1            6665 non-null   object 
 10  Segmentation     6665 non-null   int64  
dtypes: float64(2), int64(3), object(6)
memory usage: 572.9+ KB


## 전처리

In [None]:
# Detach the target (y, label): pop() removes 'Segmentation' from train and returns it
target = train.pop('Segmentation')
target

Unnamed: 0,Segmentation
0,4
1,2
2,2
3,3
4,3
...,...
6660,2
6661,4
6662,4
6663,2


In [None]:
# Detach the test IDs for the submission file; pop() also removes the column from test
test_ID = test.pop('ID')

In [None]:
# Keep only the numeric feature columns (train)
# Numeric columns in the raw data: ['ID', 'Age', 'Work_Experience', 'Family_Size', 'Segmentation']
# ID and the target are left out of the feature list.
num_cols = ['Age', 'Work_Experience', 'Family_Size']
train = train[num_cols]
train.head(2)

Unnamed: 0,Age,Work_Experience,Family_Size
0,22,1.0,4.0
1,67,1.0,1.0


In [None]:
# Keep the same numeric feature columns (test)
test = test[num_cols]
test.head(2)

Unnamed: 0,Age,Work_Experience,Family_Size
0,36,0.0,1.0
1,37,8.0,4.0


## model 학습 및 예측

In [None]:
# Choose the model, fit it on the numeric features and predict the test set
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=0)
pred = rf.fit(train, target).predict(test)  # fit() returns the fitted estimator
pred

array([2, 3, 3, ..., 4, 3, 1])

In [None]:
# Predictions -> submission DataFrame
# Template: pd.DataFrame({'ID': test.ID, 'Segmentation': pred}).to_csv('003000000.csv', index=False)

submission_columns = {'ID': test_ID, 'Segmentation': pred}
submit = pd.DataFrame(submission_columns)
submit

Unnamed: 0,ID,Segmentation
0,458989,2
1,458994,3
2,459000,3
3,459003,3
4,459005,2
...,...,...
2149,467950,1
2150,467954,4
2151,467958,4
2152,467961,3


In [None]:
# Write the submission file without the index column
submit.to_csv("submission.csv", index=False)
# Score: 0.30477

# 🍭 intermediate 단계 🍭
- 목표: 범주형(카테고리)데이터 활용하기

In [None]:
# 라이브러리 불러오기
import pandas as pd

In [None]:
# Reload the raw data (the basic section overwrote train/test with numeric-only frames)
train = pd.read_csv("https://raw.githubusercontent.com/lovedlim/inf/refs/heads/main/p4/4_2/train.csv")
test = pd.read_csv("https://raw.githubusercontent.com/lovedlim/inf/refs/heads/main/p4/4_2/test.csv")

## EDA

In [None]:
# Look at the first rows of train
train.head()

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,462809,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4,4
1,466315,Female,Yes,67,Yes,Engineer,1.0,Low,1.0,Cat_6,2
2,461735,Male,Yes,67,Yes,Lawyer,0.0,High,2.0,Cat_6,2
3,461319,Male,Yes,56,No,Artist,0.0,Average,2.0,Cat_6,3
4,460156,Male,No,32,Yes,Healthcare,1.0,Low,3.0,Cat_6,3


In [None]:
# Check column dtypes and non-null counts
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6665 entries, 0 to 6664
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               6665 non-null   int64  
 1   Gender           6665 non-null   object 
 2   Ever_Married     6665 non-null   object 
 3   Age              6665 non-null   int64  
 4   Graduated        6665 non-null   object 
 5   Profession       6665 non-null   object 
 6   Work_Experience  6665 non-null   float64
 7   Spending_Score   6665 non-null   object 
 8   Family_Size      6665 non-null   float64
 9   Var_1            6665 non-null   object 
 10  Segmentation     6665 non-null   int64  
dtypes: float64(2), int64(3), object(6)
memory usage: 572.9+ KB


In [None]:
# Summarise the object (string) columns: count, number of unique values, most frequent value
train.describe(include="O")

Unnamed: 0,Gender,Ever_Married,Graduated,Profession,Spending_Score,Var_1
count,6665,6665,6665,6665,6665,6665
unique,2,2,2,9,3,7
top,Male,Yes,Yes,Artist,Low,Cat_6
freq,3677,3944,4249,2192,3999,4476


## 전처리

In [None]:
# One-hot encode the categorical columns
train = pd.get_dummies(train)
test = pd.get_dummies(test)

# The two frames are encoded independently, so a category that occurs in only
# one of them would give train and test different dummy columns, and the model
# needs identical features in identical order at predict time. Re-align test
# to train's feature columns (everything except the target): categories missing
# from test become all-False columns, categories unseen in train are dropped.
feature_cols = train.columns.drop('Segmentation')
test = test.reindex(columns=feature_cols, fill_value=False)

In [None]:
# Check the dtypes again after one-hot encoding
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6665 entries, 0 to 6664
Data columns (total 30 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ID                        6665 non-null   int64  
 1   Age                       6665 non-null   int64  
 2   Work_Experience           6665 non-null   float64
 3   Family_Size               6665 non-null   float64
 4   Segmentation              6665 non-null   int64  
 5   Gender_Female             6665 non-null   bool   
 6   Gender_Male               6665 non-null   bool   
 7   Ever_Married_No           6665 non-null   bool   
 8   Ever_Married_Yes          6665 non-null   bool   
 9   Graduated_No              6665 non-null   bool   
 10  Graduated_Yes             6665 non-null   bool   
 11  Profession_Artist         6665 non-null   bool   
 12  Profession_Doctor         6665 non-null   bool   
 13  Profession_Engineer       6665 non-null   bool   
 14  Professi

In [None]:
# Detach the target (y, label): pop() removes 'Segmentation' from train and returns it
target = train.pop('Segmentation')
target

Unnamed: 0,Segmentation
0,4
1,2
2,2
3,3
4,3
...,...
6660,2
6661,4
6662,4
6663,2


In [None]:
# ID is an identifier, not a feature: remove it from the training columns
train = train.drop(columns="ID")
train.head(1)

Unnamed: 0,Age,Work_Experience,Family_Size,Gender_Female,Gender_Male,Ever_Married_No,Ever_Married_Yes,Graduated_No,Graduated_Yes,Profession_Artist,...,Spending_Score_Average,Spending_Score_High,Spending_Score_Low,Var_1_Cat_1,Var_1_Cat_2,Var_1_Cat_3,Var_1_Cat_4,Var_1_Cat_5,Var_1_Cat_6,Var_1_Cat_7
0,22,1.0,4.0,False,True,True,False,True,False,False,...,False,False,True,False,False,False,True,False,False,False


In [None]:
# Detach the test IDs for the submission file; pop() also removes the column from test
test_ID = test.pop('ID')
test_ID

Unnamed: 0,ID
0,458989
1,458994
2,459000
3,459003
4,459005
...,...
2149,467950
2150,467954
2151,467958
2152,467961


## model 학습 및 예측

In [None]:
# Choose the model, fit it on the one-hot encoded features and predict the test set
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=0)
pred = rf.fit(train, target).predict(test)  # fit() returns the fitted estimator
pred

array([1, 3, 3, ..., 2, 3, 4])

In [None]:
# Predictions -> submission DataFrame
# Template: pd.DataFrame({'ID': test.ID, 'Segmentation': pred}).to_csv('003000000.csv', index=False)

submission_columns = {'ID': test_ID, 'Segmentation': pred}
submit = pd.DataFrame(submission_columns)
submit

Unnamed: 0,ID,Segmentation
0,458989,1
1,458994,3
2,459000,3
3,459003,3
4,459005,1
...,...,...
2149,467950,1
2150,467954,4
2151,467958,2
2152,467961,3


In [None]:
# Write the submission file without the index column
submit.to_csv("submission.csv", index=False)
# Score: 0.30381

# 🍭 advanced 단계 🍭
- 목표: 교차검증 및 평가 후 제출하기

In [None]:
# Reload the raw data (the previous section overwrote train/test with encoded frames)
train = pd.read_csv("https://raw.githubusercontent.com/lovedlim/inf/refs/heads/main/p4/4_2/train.csv")
test = pd.read_csv("https://raw.githubusercontent.com/lovedlim/inf/refs/heads/main/p4/4_2/test.csv")

In [None]:
# Categorical (object-dtype) columns
# Obtained with: train.select_dtypes(include='object').columns
# ['Gender', 'Ever_Married', 'Graduated', 'Profession', 'Spending_Score','Var_1']
cat_cols = ['Gender', 'Ever_Married', 'Graduated', 'Profession', 'Spending_Score','Var_1']

In [None]:
## Label encoding (train)
## Each categorical column is replaced by its integer category code:
## Series.astype('category').cat.codes
for col in cat_cols:
    train[col] = train[col].astype('category').cat.codes
train

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,462809,1,0,22,0,5,1.0,2,4.0,3,4
1,466315,0,1,67,1,2,1.0,2,1.0,5,2
2,461735,1,1,67,1,7,0.0,1,2.0,5,2
3,461319,1,1,56,0,0,0.0,0,2.0,5,3
4,460156,1,0,32,1,5,1.0,2,3.0,5,3
...,...,...,...,...,...,...,...,...,...,...,...
6660,463002,1,1,41,1,0,0.0,1,5.0,5,2
6661,464685,1,0,35,0,4,3.0,2,4.0,3,4
6662,465406,0,0,33,1,5,1.0,2,1.0,5,4
6663,467299,0,0,27,1,5,1.0,2,4.0,5,2


In [None]:
## cat.codes numbers the categories in alphabetical (sorted) order, as the category list below shows
test['Profession'].astype('category').cat.categories

Index(['Artist', 'Doctor', 'Engineer', 'Entertainment', 'Executive',
       'Healthcare', 'Homemaker', 'Lawyer', 'Marketing'],
      dtype='object')

In [None]:
## Label encoding (test): same per-column conversion as applied to train
## NOTE(review): the codes come from test's own categories, so they agree with
## train's codes only if both frames hold the same set of values in every
## categorical column -- verify before relying on it.
for col in cat_cols:
    test[col] = test[col].astype('category').cat.codes
test

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1
0,458989,0,1,36,1,2,0.0,2,1.0,5
1,458994,1,1,37,1,5,8.0,0,4.0,5
2,459000,1,1,59,0,4,11.0,1,2.0,5
3,459003,1,1,47,1,1,0.0,1,5.0,3
4,459005,1,1,61,1,1,5.0,2,3.0,5
...,...,...,...,...,...,...,...,...,...,...
2149,467950,0,0,35,1,3,1.0,2,2.0,5
2150,467954,1,0,29,0,5,9.0,2,4.0,5
2151,467958,0,0,35,1,1,1.0,2,1.0,5
2152,467961,1,1,47,1,4,1.0,1,5.0,3


In [None]:
# Separate the ID and target columns from the features
target = train.pop('Segmentation')  # pop() removes the column from train and returns it
train = train.drop("ID", axis=1)    # ID is not used as a feature
test_ID = test.pop('ID')            # kept for the submission file

In [None]:
# Model selection
# Hyperparameters being tuned: max_depth, n_estimators
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=0, max_depth=7, n_estimators=500)

In [None]:
# Model selection
# NOTE(review): this cell repeats the preceding cell verbatim; running it only
# rebuilds the same unfitted estimator.
# Hyperparameters being tuned: max_depth, n_estimators
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=0, max_depth=7, n_estimators=500)

In [None]:
# Fit on the full training set, then predict the test set
pred = rf.fit(train, target).predict(test)  # fit() returns the fitted estimator
pred

array([1, 3, 2, ..., 1, 2, 4])

In [None]:
# Predictions -> submission DataFrame -> CSV
# Template: pd.DataFrame({'ID': test.ID, 'Segmentation': pred}).to_csv('003000000.csv', index=False)

submission_columns = {'ID': test_ID, 'Segmentation': pred}
submit = pd.DataFrame(submission_columns)
submit.to_csv("submission.csv", index=False)
# Score: 0.32046