### 백화점 고객의 1년 간 구매 데이터를 활용해
- 데이터 전처리
- Feature Engineering
- 모델링 (분류 알고리즘 사용)
- 하이퍼파라미터 튜닝 (초매개변수 최적화)
- 모형 앙상블
- csv제출

### 유의사항
- 수험번호.csv 파일이 만들어지도록 코드를 제출함
- 제출한 모델의 성능은 ROC-AUC 평가지표에 따라 채점함

### 데이터 출처 및 연결
- data 출처: https://www.dataq.or.kr/ - 공지사항 - 759번 제2회 빅데이터분석기사 실기 안내 - 첨부파일

### 데이터셋 업로드
- 데이터셋 프라이빗 업로드 : https://youtu.be/BZlEQ5JwLiA
    - Datasets - new dataset - (drag&drop) - Create / 반드시 Private
- 작업형2 예시: https://youtu.be/_GIBVt5-khk

- 아래 코드는 베이스라인 예시입니다

# 라이브러리 및 데이터 불러오기

In [1]:
# 라이브러리
import pandas as pd

In [2]:
# Load the training features, training labels and test features.
path = "../input/d/datasets/chai97/department-store-customer/"
# The two feature files are read as EUC-KR; per the original note, the
# encoding argument is not required in the Goorm IDE environment.
X = pd.read_csv(f"{path}X_train.csv", encoding="euc-kr")
y = pd.read_csv(f"{path}y_train.csv")
test = pd.read_csv(f"{path}X_test.csv", encoding="euc-kr")

# 간단EDA

In [3]:
# EDA: shapes of the training features, training labels and test features
X.shape, y.shape, test.shape

((3500, 10), (3500, 2), (2482, 10))

In [4]:
# Preview the first rows of the training features
X.head()

Unnamed: 0,cust_id,총구매액,최대구매액,환불금액,주구매상품,주구매지점,내점일수,내점당구매건수,주말방문비율,구매주기
0,0,68282840,11264000,6860000.0,기타,강남점,19,3.894737,0.527027,17
1,1,2136000,2136000,300000.0,스포츠,잠실점,2,1.5,0.0,1
2,2,3197000,1639000,,남성 캐주얼,관악점,2,2.0,0.0,1
3,3,16077620,4935000,,기타,광주점,18,2.444444,0.318182,16
4,4,29050000,24000000,,보석,본 점,2,1.5,0.0,85


In [5]:
# Preview the first rows of the training labels
y.head()

Unnamed: 0,cust_id,gender
0,0,0
1,1,0
2,2,1
3,3,1
4,4,0


In [6]:
# Check missing values: number of nulls per column of the training features
X.isnull().sum()

cust_id       0
총구매액          0
최대구매액         0
환불금액       2295
주구매상품         0
주구매지점         0
내점일수          0
내점당구매건수       0
주말방문비율        0
구매주기          0
dtype: int64

In [7]:
# Descriptive statistics of the X_train data
X.describe()

Unnamed: 0,cust_id,총구매액,최대구매액,환불금액,내점일수,내점당구매건수,주말방문비율,구매주기
count,3500.0,3500.0,3500.0,1205.0,3500.0,3500.0,3500.0,3500.0
mean,1749.5,91919250.0,19664240.0,24078220.0,19.253714,2.834963,0.307246,20.958286
std,1010.507298,163506500.0,31992350.0,47464530.0,27.174942,1.912368,0.289752,24.748682
min,0.0,-52421520.0,-2992000.0,5600.0,1.0,1.0,0.0,0.0
25%,874.75,4747050.0,2875000.0,2259000.0,2.0,1.666667,0.027291,4.0
50%,1749.5,28222700.0,9837000.0,7392000.0,8.0,2.333333,0.25641,13.0
75%,2624.25,106507900.0,22962500.0,24120000.0,25.0,3.375,0.44898,28.0
max,3499.0,2323180000.0,706629000.0,563753000.0,285.0,22.083333,1.0,166.0


In [8]:
# Summary (count / unique / top / freq) of the object-dtype columns of X_train
X.describe(include='object')

Unnamed: 0,주구매상품,주구매지점
count,3500,3500
unique,42,24
top,기타,본 점
freq,595,1077


In [9]:
# Descriptive statistics of the X_test data
test.describe()

Unnamed: 0,cust_id,총구매액,최대구매액,환불금액,내점일수,내점당구매건수,주말방문비율,구매주기
count,2482.0,2482.0,2482.0,871.0,2482.0,2482.0,2482.0,2482.0
mean,4740.5,101027500.0,21770480.0,25547160.0,19.516922,2.819388,0.293812,20.28606
std,716.636007,173213200.0,35049190.0,59440740.0,25.973972,1.75455,0.2826,24.108756
min,3500.0,-37440000.0,-37440000.0,10000.0,1.0,1.0,0.0,0.0
25%,4120.25,5076868.0,2884350.0,2414000.0,2.0,1.75,0.023456,4.0
50%,4740.5,30516860.0,10752500.0,8100000.0,9.0,2.430952,0.25,13.0
75%,5360.75,126425500.0,26277000.0,22280900.0,26.75,3.375,0.423566,27.0
max,5981.0,2861238000.0,593225000.0,871514400.0,222.0,15.875,1.0,177.0


In [10]:
# NOTE(review): this repeats the earlier X.describe(include='object') cell. As it
# follows the X_test summary, test.describe(include='object') was probably
# intended here -- confirm and re-run.
X.describe(include='object')

Unnamed: 0,주구매상품,주구매지점
count,3500,3500
unique,42,24
top,기타,본 점
freq,595,1077


In [11]:
# Check the label values: how many rows fall into each gender class
y['gender'].value_counts()

0    2184
1    1316
Name: gender, dtype: int64

# 데이터 전처리

In [12]:
# Missing-value handling: replace every NaN with 0 in both frames.
# In the training data only the refund amount (환불금액) has missing values.
X, test = (frame.fillna(0) for frame in (X, test))

In [13]:
# Remove the customer ID from the training features so it is not used as a feature.
X = X.drop(columns=['cust_id'])
# Take the ID out of the test features as well, keeping it for the submission file.
cust_id = test.pop('cust_id')

# 피처엔지니어링

In [14]:
# Label Encoding: map each categorical column to integer codes.
from sklearn.preprocessing import LabelEncoder

cols = ['주구매상품', '주구매지점']
for col in cols:
    le = LabelEncoder()
    # Fit on the union of the train and test values. Fitting on the training
    # column alone makes transform() raise ValueError as soon as the test set
    # contains a category that never occurs in training. When the test
    # categories are a subset of the training ones the codes are unchanged.
    le.fit(pd.concat([X[col], test[col]]))
    X[col] = le.transform(X[col])
    test[col] = le.transform(test[col])

X.head()

Unnamed: 0,총구매액,최대구매액,환불금액,주구매상품,주구매지점,내점일수,내점당구매건수,주말방문비율,구매주기
0,68282840,11264000,6860000.0,5,0,19,3.894737,0.527027,17
1,2136000,2136000,300000.0,21,19,2,1.5,0.0,1
2,3197000,1639000,0.0,6,1,2,2.0,0.0,1
3,16077620,4935000,0.0,5,2,18,2.444444,0.318182,16
4,29050000,24000000,0.0,15,8,2,1.5,0.0,85


# 모델링 & 하이퍼파라미터 튜닝

In [15]:
# Modelling & hyperparameter tuning & ensemble
# Baseline: a random forest with default hyperparameters.
from sklearn.ensemble import RandomForestClassifier

# Fix the seed so that the fitted forest, and therefore the submitted
# probabilities, are the same on every run.
model = RandomForestClassifier(random_state=42)
model.fit(X, y['gender'])
# This is accuracy measured on the training data itself; it is not the
# ROC-AUC metric the submission is graded on.
print(model.score(X, y['gender']))
# Class-probability matrix for the test set (one column per class).
predictions = model.predict_proba(test)

0.9994285714285714


In [16]:
import xgboost as xgb

# Declare the model (default hyperparameters)
model2 = xgb.XGBClassifier() 

# Train the model
model2.fit(X,y['gender']) 
# Accuracy on the training data (not the ROC-AUC used for grading)
print(model2.score(X, y['gender']))

# Predict on the test set
# NOTE(review): predict() yields class labels rather than probabilities, and
# y_pred is reassigned by the LightGBM cell; the submission is built from the
# random forest's `predictions` only, so this result is currently unused.
y_pred = model2.predict(test)




0.9702857142857143


In [17]:
import lightgbm as lgbm

# Declare the model (default hyperparameters)
model3 = lgbm.LGBMClassifier()

# Train the model and report accuracy on the training data
# (not the ROC-AUC used for grading)
model3.fit(X, y['gender'])
print(model3.score(X, y['gender']))

# Predict class labels for the test set
y_pred = model3.predict(test)

0.8825714285714286


In [18]:
# Second column of the random forest's probability matrix; this is the value
# written to the 'gender' column of the submission
predictions[:,1]

array([0.4 , 0.27, 0.12, ..., 0.48, 0.46, 0.48])

In [19]:
# Build the submission frame: customer ID plus the second-column probability
# from the random forest.
gender_proba = predictions[:, 1]
output = pd.DataFrame({'cust_id': cust_id, 'gender': gender_proba})

In [20]:
# Preview the first rows of the submission frame
output.head()

Unnamed: 0,cust_id,gender
0,3500,0.4
1,3501,0.27
2,3502,0.12
3,3503,0.58
4,3504,0.37


In [21]:
# Write the submission without the index column. The notes at the top require
# the file to be named after the exam ID; "123456789" is presumably a placeholder.
output.to_csv("123456789.csv", index=False)

# csv확인

In [22]:
# Read the file back to verify what was written
pd.read_csv("123456789.csv")

Unnamed: 0,cust_id,gender
0,3500,0.40
1,3501,0.27
2,3502,0.12
3,3503,0.58
4,3504,0.37
...,...,...
2477,5977,0.67
2478,5978,0.49
2479,5979,0.48
2480,5980,0.46
