### Import library

In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb

### Load data

In [2]:
df = pd.read_csv('train.csv')

In [3]:
df.head()

Unnamed: 0,ID,Click,F01,F02,F03,F04,F05,F06,F07,F08,...,F30,F31,F32,F33,F34,F35,F36,F37,F38,F39
0,TRAIN_00000000,1,NSLHFNS,AVKQTCL,DTZFPRW,114.0,ISVXFVA,1,PQZBVMG,LPYPUNA,...,NZGEZLW,GTISJWW,380.0,2.0,AXQFZWC,IRUDRFB,,TFJMLCZ,0.0,AURZYDY
1,TRAIN_00000001,0,VGIVWZQ,LSUSMVO,PQGWFJZ,26.0,NFRVLWS,43,IMPIGJT,MIGYEEG,...,NZGEZLW,GTISJWW,466.0,1.0,DRVVDHZ,IRUDRFB,19.0,AUGTURV,0.0,LUZRMLU
2,TRAIN_00000002,0,JCDXFYU,PILDDJU,IAGJDOH,119.0,LFPUEOV,0,FFUTIRZ,OFKQGTY,...,VHXETCF,KHZNEZF,197.0,0.0,QMOULXS,IRUDRFB,8.0,ZVSTLNM,0.0,MHBRSQK
3,TRAIN_00000003,1,PSMFWTP,ZYAVJHP,,15.0,ATQPZSJ,26,ZDTZNSB,THBWWCD,...,IVIRTPR,GTISJWW,8640.0,0.0,IZLJUJS,IRUDRFB,14.0,ZBSRLCQ,0.0,GAZBSSZ
4,TRAIN_00000004,0,SLCRICD,QPQWGXA,,13.0,CHZGJZR,20,PQZBVMG,MIGYEEG,...,NZGEZLW,WHSRKIM,41774.0,0.0,BHBIZCL,IRUDRFB,13.0,QHYLSBX,0.0,QTATWAY


In [4]:
df2 = pd.read_csv('test.csv')

In [5]:
# 공통 레이블 인코더 객체 생성
from sklearn.preprocessing import LabelEncoder
label_encoders = {}

object_features = ['F01', 'F02', 'F03','F05','F07','F08','F09','F10','F12','F13','F15','F16','F17','F20','F21','F22','F23','F25','F26','F28','F30','F31','F34','F35','F37'] 

# 훈련 데이터셋 처리
for feature in object_features:
    top_categories = df[feature].value_counts().nlargest(5).index
    df[feature + '_reduced'] = df[feature].apply(lambda x: x if x in top_categories else 'Other')
    
    # 레이블 인코더 저장 및 적용
    le = LabelEncoder()
    df[feature + '_encoded'] = le.fit_transform(df[feature + '_reduced'])
    label_encoders[feature] = le  # 레이블 인코더 저장

# 검증 데이터셋 처리
for feature in object_features:
    df2[feature + '_reduced'] = df2[feature].apply(lambda x: x if x in label_encoders[feature].classes_ else 'Other')
    df2[feature + '_encoded'] = label_encoders[feature].transform(df2[feature + '_reduced'])  # 저장된 레이블 인코더 사용

# 데이터 타입 및 결측값 처리
for df_temp in [df, df2]:
    df_temp.fillna('NAN', inplace=True)
    float_columns = df_temp.select_dtypes(include=['float64']).columns
    df_temp[float_columns] = df_temp[float_columns].astype('int64')
    object_columns = df_temp.select_dtypes(include=['object']).columns
    df_temp[object_columns] = df_temp[object_columns].astype('category')


  df_temp.fillna('NAN', inplace=True)
  df_temp.fillna('NAN', inplace=True)


In [6]:
train = df[['Click','F01_encoded', 'F02_encoded', 'F03_encoded','F04','F05_encoded','F07_encoded','F08_encoded','F09_encoded','F10_encoded','F11','F12_encoded','F13_encoded','F14','F15_encoded','F16_encoded','F17_encoded','F18','F19','F20_encoded','F21_encoded','F22_encoded','F23_encoded','F24','F25_encoded','F26_encoded','F27','F28_encoded','F30_encoded','F31_encoded','F34_encoded','F35_encoded','F36','F37_encoded']] 
test = df2[['F01_encoded', 'F02_encoded', 'F03_encoded','F04','F05_encoded','F07_encoded','F08_encoded','F09_encoded','F10_encoded','F11','F12_encoded','F13_encoded','F14','F15_encoded','F16_encoded','F17_encoded','F18','F19','F20_encoded','F21_encoded','F22_encoded','F23_encoded','F24','F25_encoded','F26_encoded','F27','F28_encoded','F30_encoded','F31_encoded','F34_encoded','F35_encoded','F36','F37_encoded']] 

### Train

In [7]:
model = lgb.LGBMClassifier(objective='binary')
model.fit(train.drop('Click',axis=1),train['Click'],eval_metric='AUC')

[LightGBM] [Info] Number of positive: 5569860, number of negative: 23035531
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.169098 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4642
[LightGBM] [Info] Number of data points in the train set: 28605391, number of used features: 33
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.194714 -> initscore=-1.419668
[LightGBM] [Info] Start training from score -1.419668


### Infer

In [8]:
pred = model.predict_proba(test)

### Submit

In [9]:
sample_submission = pd.read_csv('sample_submission.csv')
sample_submission

sample_submission['Click'] = pred[:,1]
sample_submission

sample_submission.to_csv('lgbm_v2.csv', index=False)