# 컴피티션 규칙
- 1등팀 10만원
- 2등팀 5만원
- 3등부터는 고생 많았어요
- 외부데이터 사용 금지
- 데이터 유출 금지
- 최종 선택한 점수에 대한 파일은 다음과 같이 제출할것
    - 구매기록 데이터에서 특성 추출하는 파일
    - 모델링(특성선택,스케일링 등과같은 전처리 및 학습및 예측파일 생성) 하는 파일
    - 팀원들의 테스트데이터에 대한 예측파일을 앙상블하는 파일 
- 9월23일 12:30(PM) 까지 제출 가능
- 컴피티션 접속 주소
    - https://www.kaggle.com/t/9f93417de0e0445893436faa5ad82774

- category_encoders 설치

In [1]:
!pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting category_encoders
  Downloading category_encoders-2.5.0-py2.py3-none-any.whl (69 kB)
[K     |████████████████████████████████| 69 kB 2.5 MB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.5.0


- 구글드라이브 마운트

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


- 데이터 경로및 시드고정

In [3]:
DATA_PATH = "/content/drive/MyDrive/01-python/data/2022_ML_project/data/"
SEED = 42

- 라이브러리 

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import category_encoders as ce

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import SelectPercentile

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

- 데이터 불러오기

In [5]:
ft_train = pd.read_csv(f"{DATA_PATH}ft_train.csv") # 학습 데이터
target_train = pd.read_csv(f"{DATA_PATH}target_train.csv") # 학습데이터 정답

ft_test = pd.read_csv(f"{DATA_PATH}ft_test.csv") # 테스트 데이터
sample_submission = pd.read_csv(f"{DATA_PATH}sample_submission.csv") # 제출파일

- 테스트 데이터 피쳐추가

## pd.pivot_table 을 이용해서 피쳐 추가하기

- 각 고객번호에 대하여 중분류별 구매 개수 

In [6]:
# train_tmp = pd.pivot_table(ft_train,index="고객번호",columns="중분류수",values="총구매액",aggfunc="count",fill_value=0).reset_index()
# ft_train = ft_train.merge(train_tmp,how="left",on="고객번호")

- 위에 피쳐에 경우 테스트데이터에 추가시 주의해야한다.
- 고객별로 중분류에 대한 구매 종류가 다르기 때문에 train에는 없는 중분류가 카운팅 되거나 train 에는 있지만 test에는 있는 중분류가 카운팅 될수도 있다.
- 학습데이터와 테스트데이터에 피쳐는 동일해야한다.

- 테스트 데이터 특성 추출

In [7]:
# test_tmp = pd.pivot_table(ft_test,index="고객번호",columns="중분류수",values="총구매액",aggfunc="count",fill_value=0).reset_index()
# test_tmp.head()

- 학습데이터에서 뽑아낸 특성 컬럼을 기준으로 테스트데이터도 맞춰줘야함

In [8]:
# for col in train_tmp.columns:
#     if col not in test_tmp.columns:# 학습피쳐에는 있으나 테스트피쳐에는 없는 컬럼
#         test_tmp[col] = 0 # 0으로 채우기

# # 학습피쳐와 컬럼및 컬럼 순서 맞추기
# test_tmp = test_tmp[train_tmp.columns]

In [9]:
# ft_test = ft_test.merge(test_tmp,how="left",on="고객번호")

In [10]:
ft_train.shape , ft_test.shape

((2554, 498), (946, 498))

- 결측치 채우기

In [11]:
ft_train.shape , ft_test.shape

((2554, 498), (946, 498))

In [12]:
# me_tr = ft_train.median()
# me_ts = ft_test.median()

In [13]:
ft_train = ft_train.fillna(0)
ft_test = ft_test.fillna(0)

In [14]:
ft_train.isnull().sum().sum() , ft_test.isnull().sum().sum()

(0, 0)

- 범주형 인코딩하기

In [15]:
ft_train.select_dtypes("object").nunique()

주구매지점      24
주구매중분류    169
주구매대분류     40
월방문최빈달    117
주환불대분류    345
dtype: int64

In [16]:
cols = ["주구매지점","주구매대분류","월방문최빈달","주환불대분류"]
enc = ce.one_hot.OneHotEncoder() 

ft_train = pd.concat([
    ft_train,
    enc.fit_transform(ft_train[cols])
],axis=1).drop(columns=cols)

ft_test = pd.concat([
    ft_test,
    enc.transform(ft_test[cols])
],axis=1).drop(columns=cols)

In [17]:
cols = ["주구매중분류"]
enc = ce.count.CountEncoder()

ft_train = pd.concat([
    ft_train,
    enc.fit_transform(ft_train[cols]).add_suffix("_cnt")
],axis=1).drop(columns=cols)

ft_test = pd.concat([
    ft_test,
    enc.transform(ft_test[cols]).add_suffix("_cnt")
],axis=1).drop(columns=cols)

- 학습 데이터 , 정답 데이터, 테스트 데이터 

- 스케일링

In [22]:
ft_train == np.inf

Unnamed: 0,고객번호,내점일수,구매주기,주말방문비율,주구매요일,봄구매비율,여름구매비율,가을구매비율,겨울구매비율,일평균구매건수,...,주환불대분류_337,주환불대분류_338,주환불대분류_339,주환불대분류_340,주환불대분류_341,주환불대분류_342,주환불대분류_343,주환불대분류_344,주환불대분류_345,주구매중분류_cnt
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2549,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2550,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2551,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2552,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [None]:
mask = (ft_train == np.inf).sum() > 0
(ft_train == np.inf).sum()[mask]
mask2 = ft_train["지표"] == np.inf
ft_train.loc[mask2,"지표"] = 0
mask = (ft_test == np.inf).sum() > 0
(ft_test == np.inf).sum()[mask]
mask2 = ft_test["지표"] == np.inf
ft_test.loc[mask2,"지표"] = 0

In [None]:
train = ft_train.iloc[:,1:]
target = target_train.iloc[:,1]

test = ft_test.iloc[:,1:]

In [None]:
cv = KFold(n_splits=5, shuffle=True, random_state=SEED)

In [None]:
# cv_scores = [] 
# for p in tqdm(range(5,96,1)): 
#     select = SelectPercentile(percentile=p)   
#     select.fit(train,target) 
#     model = LogisticRegression(random_state=SEED) 
#     scores = cross_val_score(model,select.transform(train),target,cv = cv ,scoring='roc_auc',n_jobs = -1)
#     cv_scores.append( [p,scores.mean()] ) 

# cv_scores = np.array(cv_scores) 
# idx = np.argmax(cv_scores[:,1]) 
# best_score = cv_scores[idx] 
# best_score

In [None]:
# select = SelectPercentile(percentile=best_score[0])
# select.fit(train,target)
# train = select.transform(train)
# test = select.transform(test)
# train.shape , test.shape

In [None]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(random_state=SEED)
model.fit(train,target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=42)

In [None]:
from sklearn.model_selection import KFold

cv = KFold(n_splits=5, shuffle=True, random_state=SEED)

model_list = []
score_list = []
for tri,vai in cv.split(train):
    # 학습데이터
    x_train = train.iloc[tri]
    y_train = target.iloc[tri]
    # 검증데이터
    x_valid = train.iloc[vai]
    y_valid = target.iloc[vai]
    
    # 모델 학습
    model = XGBClassifier(random_state=SEED,n_estimators=1000)
    model.fit(x_train,y_train , eval_set = [(x_valid,y_valid)],early_stopping_rounds=100) #early_stopping_rounds 50 회 이상 개선이없으면 n_estimators(부스팅) 을 멈추겠다,eval_set 부스팅될떄마다 검증셋 검증

    pred = model.predict_proba(x_valid)[:,1] 
    score = roc_auc_score(y_valid,pred) 
    score_list.append(score) 
    model_list.append(model) 

score_list

[0]	validation_0-error:0.305284
Will train until validation_0-error hasn't improved in 100 rounds.
[1]	validation_0-error:0.303327
[2]	validation_0-error:0.311155
[3]	validation_0-error:0.317025
[4]	validation_0-error:0.303327
[5]	validation_0-error:0.293542
[6]	validation_0-error:0.295499
[7]	validation_0-error:0.30137
[8]	validation_0-error:0.291585
[9]	validation_0-error:0.295499
[10]	validation_0-error:0.291585
[11]	validation_0-error:0.287671
[12]	validation_0-error:0.283757
[13]	validation_0-error:0.283757
[14]	validation_0-error:0.289628
[15]	validation_0-error:0.287671
[16]	validation_0-error:0.291585
[17]	validation_0-error:0.293542
[18]	validation_0-error:0.291585
[19]	validation_0-error:0.287671
[20]	validation_0-error:0.287671
[21]	validation_0-error:0.287671
[22]	validation_0-error:0.283757
[23]	validation_0-error:0.2818
[24]	validation_0-error:0.2818
[25]	validation_0-error:0.279843
[26]	validation_0-error:0.279843
[27]	validation_0-error:0.285714
[28]	validation_0-error:

[0.762678262028522,
 0.750137154290509,
 0.7507479058635819,
 0.7486990316843423,
 0.725734256661423]

In [None]:
model_list = [
    XGBClassifier(random_state=SEED), 
    LogisticRegression(random_state=SEED),
    RandomForestClassifier(random_state=SEED),
    MLPClassifier(random_state=SEED),
    LGBMClassifier(random_state=SEED)
]
for model in tqdm(model_list):
    model.fit(x_train,y_train)

  0%|          | 0/5 [00:00<?, ?it/s]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
# pred_list = [] 
# for model in model_list:
#     pred = model.predict_proba(test)[:,1] 
#     # score = roc_auc_score(y_valid,pred)
#     # print(score) # 모델들의 성능 보기 비교해서 불필요한 모델은 걸래내자
#     pred_list.append(pred)
    
# # pred = np.mean(pred_list,axis=0) 



In [None]:
from itertools import combinations

In [None]:

best_score = 0 
for i in range(2, len(model_list)): # 2~model_list 만큼 비교하겠다.
    for models in combinations(model_list, i):

        preds = [ model.predict_proba(x_valid)[:,1] for model in models]
        pred = np.mean(preds,axis=0) 
        score = roc_auc_score(y_valid,pred) 
        if best_score < score: 
            best_score = score 
            best_models = models
best_score



0.7743143964580529

In [None]:
best_models

(XGBClassifier(random_state=42),
 LogisticRegression(random_state=42),
 RandomForestClassifier(random_state=42))

In [None]:
pred_list =[]
for model in best_models:
    pred = model.predict_proba(test)[:,1]
    pred_list.append(pred)
    # for j in len(best_models[i]):
    #     print(best_models[i][j])




In [None]:
pred = np.mean(pred_list,axis=0)

In [None]:
sample_submission["target"] = pred
sample_submission

Unnamed: 0,고객번호,target
0,10001,0.602531
1,10070,0.448243
2,10075,0.325954
3,10086,0.358779
4,10128,0.609759
...,...,...
941,49903,0.503832
942,49918,0.158814
943,49937,0.476924
944,49949,0.296403


In [None]:
sample_submission.to_csv("submit.csv",index=False)