In [16]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
# Load the labelled training table and the test table from the local data directory.
train_df = pd.read_csv("./data/train1.csv")
test_df = pd.read_csv("./data/test1.csv")

In [3]:
# Remove the ID column from the training frame so it is not picked up as a model feature.
train_df = train_df.drop(columns=["ID"])

In [4]:
# Show which columns remain in the training frame after ID was dropped.
train_df.columns

Index(['ac_ctr_diff', 'hsp_avg_optt_bilg_isamt_s',
       'hsp_avg_surop_bilg_isamt_s', 'fds_cust_yn', 'hspz_dys_s', 'inamt_nvcd',
       'hsp_avg_diag_bilg_isamt_s', 'blrs_cd', 'dsas_ltwt_gcd',
       'dsas_avg_diag_bilg_isamt_s', 'dsas_acd_rst_dcd', 'base_ym', 'kcd_gcd',
       'hsp_avg_hspz_bilg_isamt_s', 'mtad_cntr_yn', 'surop_blcnt_s',
       'dsas_avg_optt_bilg_isamt_s', 'isrd_age_dcd', 'hspz_blcnt_s',
       'dsas_avg_surop_bilg_isamt_s', 'dsas_avg_hspz_bilg_isamt_s',
       'bilg_isamt_s', 'target'],
      dtype='object')

In [5]:
# Columns that are converted to pandas "category" dtype before modelling.
category_list = [
    "dsas_ltwt_gcd",
    "kcd_gcd",
    "dsas_acd_rst_dcd",
    "blrs_cd",
    "isrd_age_dcd",
    "mtad_cntr_yn",
    "fds_cust_yn",
    "inamt_nvcd",
    "ac_ctr_diff",
]

In [6]:
# Cast every listed column to "category" dtype, applying the same mapping to both frames.
category_dtypes = {column: "category" for column in category_list}
train_df = train_df.astype(category_dtypes)
test_df = test_df.astype(category_dtypes)

In [7]:
# Replace base_ym with a small integer `month` column (base_ym minus 201900).
# NOTE(review): presumably base_ym is a YYYYMM integer from 2019, so the result is the
# calendar month — confirm against the data dictionary.
month_values = train_df["base_ym"] - 201900
train_df["month"] = month_values
train_df = train_df.drop(columns=["base_ym"])

In [8]:
# LightGBM training parameters shared by cross-validation and the final fit.
params = {
    "learning_rate": 0.01,
    "objective": "multiclass",
    "num_leaves": 60,
    "max_depth": -1,
    "num_class": 3,
    # No built-in metric is requested here; scoring comes from the `feval` passed at train time.
    "metric": "custom",
    "verbosity": -1,
    "bagging_fraction": 0.9,
    "feature_fraction": 0.9,
    "bagging_freq": 5,
}

In [9]:
def create_dataset(data, dep_var="target"):
    """Wrap a DataFrame in a ``lgb.Dataset``.

    Args:
        data: DataFrame holding the feature columns plus the label column.
        dep_var: Name of the label column; every other column becomes a feature.

    Returns:
        ``lgb.Dataset`` whose ``categorical_feature`` lists the feature columns
        that have pandas "category" dtype.
    """
    labels = data[dep_var]
    features = data.drop(columns=[dep_var])
    categorical_columns = list(features.select_dtypes(include="category").columns)
    return lgb.Dataset(features, labels, categorical_feature=categorical_columns)

In [10]:
def f1(pred, data):
    """Custom LightGBM evaluation function: macro-averaged F1 on predicted classes.

    Args:
        pred: Raw multiclass scores handed over by LightGBM. Either a flat 1-D
            array grouped by class first (all rows of class 0, then class 1, ...),
            as LightGBM releases before 4.0 pass it, or a 2-D
            ``(n_samples, n_classes)`` array, as LightGBM 4.0+ passes it.
        data: The ``lgb.Dataset`` being evaluated; only its labels are read.

    Returns:
        Tuple ``("f1_score", value, True)`` — metric name, metric value, and the
        flag telling LightGBM that higher is better.
    """
    true = data.get_label()
    scores = np.asarray(pred)
    if scores.ndim == 1:
        # Flat class-major layout: one row per class, then transpose to (n_samples, n_classes).
        # The class count is inferred from the label count instead of being hard-coded.
        scores = scores.reshape(-1, len(true)).T
    predicted_class = scores.argmax(axis=1)
    return "f1_score", f1_score(true, predicted_class, average="macro"), True

In [11]:
def ts_split(data, start_month, target_month):
    """Split a frame into a training window and a single held-out month.

    Args:
        data: DataFrame with a ``month`` column.
        start_month: First month (inclusive) of the training window.
        target_month: Held-out month; the training window ends just before it.

    Returns:
        ``(trainset, testset)`` — rows with ``start_month <= month < target_month``
        and rows with ``month == target_month``, both without the ``month`` column.
    """
    months = data.month
    in_train_window = (months < target_month) & (months >= start_month)
    is_target_month = months == target_month

    trainset = data.loc[in_train_window].drop(columns="month")
    testset = data.loc[is_target_month].drop(columns="month")
    return trainset, testset

In [12]:
def train(trainset, testset, params):
    """Train one LightGBM model with early stopping on a validation set.

    Args:
        trainset: ``lgb.Dataset`` to fit on.
        testset: ``lgb.Dataset`` used as the only validation set.
        params: LightGBM parameter dict.

    Returns:
        ``(test_score, model)`` — the best validation ``f1_score`` seen during
        training, rounded to 4 decimals, and the trained booster.

    NOTE(review): ``verbose_eval``, ``early_stopping_rounds`` and ``evals_result``
    are keyword arguments of ``lgb.train`` in LightGBM 3.x; 4.0 replaced them with
    callbacks — pin the version or migrate before upgrading.
    """
    history = {}
    booster = lgb.train(
        params,
        trainset,
        num_boost_round=3000,
        valid_sets=testset,
        feval=f1,
        early_stopping_rounds=100,
        evals_result=history,
        verbose_eval=False,
    )
    # The single validation set is recorded under LightGBM's default name "valid_0".
    best_f1 = max(history["valid_0"]['f1_score'])
    return round(best_f1, 4), booster

In [13]:
class LightGBM:
    """Time-series cross-validation driver built on ts_split / create_dataset / train."""

    def tscv(self, data, interval, params=params):
        """Run month-by-month validation and print a score table.

        Each target month from 2 to 11 is held out in turn and a model is
        trained on the months before it.

        Args:
            data: DataFrame with a ``month`` column and the label column expected
                by ``create_dataset``.
            interval: Number of months in the training window. ``0`` means an
                expanding window that always starts at month 1; otherwise the
                window starts at ``target_month - interval`` and folds whose
                window would start before month 1 are skipped.
            params: LightGBM parameter dict forwarded to ``train``.

        Returns:
            List of per-fold validation scores, empty when no fold could be
            formed for this ``interval``.
        """
        scores = []

        row_format = "{:^15}|{:^15}|{:^15}|"
        print(row_format.format("start_month","target_month", "f1_score"))
        print(row_format.format("=" * 15, "=" * 15, "=" * 15))

        for target_month in range(2, 12):
            start_month = 1 if interval == 0 else target_month - interval
            if start_month < 1:
                continue
            train_rows, valid_rows = ts_split(data, start_month, target_month)
            if train_rows.empty or valid_rows.empty:
                # A month missing from the data would otherwise reach LightGBM as an empty dataset.
                print(row_format.format(start_month, target_month, "no data"))
                continue
            score, _ = train(create_dataset(train_rows), create_dataset(valid_rows), params)
            scores.append(score)
            print(row_format.format(start_month, target_month, score))

        if scores:
            print(f"CV score: {np.round(np.mean(scores), 4)} Std: {np.round(np.std(scores), 4)}")
        else:
            # np.mean([]) would print nan and raise a RuntimeWarning; say what happened instead.
            print(f"CV score: n/a (no fold available for interval={interval})")
        return scores

In [14]:
# Instantiate the cross-validation helper; the class keeps no state, so one instance is reused.
LG = LightGBM()

In [17]:
# Sweep the training-window length: 0 is the expanding window, 1-10 are fixed-length windows.
# tscv() only evaluates target months 2-11 and needs target_month - interval >= 1, so
# intervals of 11 or more can never form a fold and are left out of the sweep.
for interval in range(0, 11):
    LG.tscv(train_df, interval)

  start_month  | target_month  |   f1_score    |
       1       |       2       |    0.7821     |
       1       |       3       |    0.7394     |
       1       |       4       |    0.7416     |
       1       |       5       |    0.7469     |
       1       |       6       |    0.7395     |
       1       |       7       |    0.7585     |
       1       |       8       |    0.7474     |


KeyboardInterrupt: 

In [None]:
# Preview the first rows of the prepared training frame before the final fit.
train_df.head()

In [None]:
# Fit the final model on every labelled row. `month` is dropped so the feature set is the
# same one the cross-validated models saw (ts_split removes `month` from each fold).
# NOTE(review): num_boost_round is not set, so LightGBM's default applies, whereas
# cross-validation allowed up to 3000 rounds with early stopping — confirm the round budget.
lgbtrain = create_dataset(train_df.drop(columns=["month"]))
model = lgb.train(
    params,
    lgbtrain,
)

In [None]:
# Predict on exactly the columns the model was trained on, in the same order. Passing
# test_df as-is would hand LightGBM whatever extra or differently ordered columns it holds.
feature_columns = model.feature_name()
test_features = test_df
if "month" in feature_columns and "month" not in test_df.columns:
    # Mirror the training-side derivation when the model was fitted with `month`.
    # Assumes test_df still carries base_ym — TODO confirm.
    test_features = test_df.assign(month=test_df["base_ym"] - 201900)
pred = model.predict(test_features[feature_columns])
pred

In [None]:
# Turn per-class scores into class labels. `pred` is rebound here, so guard against the
# cell being run twice: argmax over already-converted labels would silently yield all zeros.
class_scores = np.asarray(pred)
if class_scores.ndim != 2:
    raise ValueError(
        "`pred` does not hold a 2-D score matrix; re-run the prediction cell before this one"
    )
pred = list(class_scores.argmax(axis=1))
set(pred)

In [None]:
# Print how many test rows were assigned to each predicted class (one count per line).
for label in set(pred):
    rows_with_label = pred.count(label)
    print(rows_with_label)

In [None]:
# Build the submission frame with columns ID, target.
# Use the test set's own ID column when it has one, so predictions stay tied to the rows
# they were made for; otherwise fall back to positional IDs 0..n-1.
if "ID" in test_df.columns:
    submission_ids = test_df["ID"].to_numpy()
else:
    submission_ids = np.arange(len(pred))
result = pd.DataFrame({"ID": submission_ids, "target": pred})
result

In [None]:
# Write the submission file to the working directory, without the DataFrame index.
result.to_csv("second.csv", index=False)

In [None]:
# Plot the final model's feature importances on a 10x10 figure.
lgb.plot_importance(model, figsize=(10,10))