In [1]:
import os
import pickle
import sys
import warnings

# Make the parent directory importable so that the `src` package resolves.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import numpy as np
import pandas as pd

warnings.filterwarnings('ignore')

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from sklearn.metrics import classification_report, roc_auc_score, f1_score
from sklearn.model_selection import StratifiedKFold

from src.data_processor.phase_2.prob1.v1 import Phase2Prob1FeatureProcessor

In [2]:
# Location of the raw training parquet file. NOTE(review): this is an absolute,
# machine-specific path; anyone else running the notebook must adjust it.
RAW_TRAIN_PATH = "D:/Data/MLOPS_2023/data_phase-2/phase-2/prob-1/raw_train.parquet"
df = pd.read_parquet(RAW_TRAIN_PATH)

In [3]:
# (rows, columns) of the frame exactly as loaded, before any de-duplication.
df.shape

(61841, 42)

In [4]:
# Remove rows that are identical across every column, renumber the index from 0,
# and display the resulting (rows, columns).
df = df.drop_duplicates().reset_index(drop=True)
df.shape

(43405, 42)

In [5]:
# Drop every row whose 41 feature values occur more than once in the frame.
# Exact duplicates over all columns were removed in the previous cell, so rows that
# still share all features presumably differ only in `label` (conflicting labels).
#
# The index must be positional (0..n-1) because the collected index labels are
# later used with `.iloc`.
df = df.reset_index(drop=True)

feature_cols = [f'feature{i}' for i in range(1, 42)]  # 'feature1' .. 'feature41'

# One row per distinct feature combination; column 0 holds the list of row indices
# that share that combination.
x = df.groupby(feature_cols).apply(lambda group: list(group.index)).reset_index()
a = x[x[0].apply(lambda row_ids: len(row_ids) > 1)]

# Flatten the per-group lists into one list of row indices to discard.
blacklist = [row_id for row_ids in a[0].values for row_id in row_ids]

# Built once, after the blacklist is complete. Previously this assignment lived inside
# the loop, so with no conflicting groups `index` was never defined (NameError on a
# fresh kernel) or, worse, a stale value from an earlier run of this cell was reused.
index = np.array(blacklist, dtype=np.int64)
not_index = np.setdiff1d(df.index.to_numpy(), index)

df = df.iloc[not_index].reset_index(drop=True)
df['label'].value_counts()

label
1    21787
0    21480
Name: count, dtype: int64

In [6]:
# (rows, columns) after the duplicate-feature rows were removed in the previous cell.
df.shape

(43267, 42)

In [11]:
# Keyword arguments handed to LGBMClassifier when the cross-validation models are built.
HyperParameters = dict(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=7,
    colsample_bytree=0.5,
    subsample=0.8,
    reg_alpha=8,
    reg_lambda=32,
    random_state=42,
)
# Independent shallow copy, so edits to `hyper_parameters` leave the template untouched.
hyper_parameters = dict(HyperParameters)

In [12]:
# 5-fold stratified cross-validation of an LGBMClassifier on the processed features.
# Produces: `models` (one fitted model per fold), `scores` (per-fold ROC AUC) and
# `oofs` (out-of-fold predicted probability of class 1 for every row).
processor = Phase2Prob1FeatureProcessor()
# NOTE(review): the processor is fitted on the whole frame before the folds are split.
# If it learns statistics from the data, validation rows influence training — confirm.
new_df = processor.fit_transform(df)

kfold = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
FEATURES = processor.data_features['features']
categorical = processor.data_features['categorical_features']
TARGET = 'label'

print(new_df.shape, df.shape)
# Features come from `new_df` and labels from `df`, both selected by position,
# so the two frames must have the same number of rows.
assert len(new_df) == len(df), "processor changed the row count; labels would be misaligned"

models = []
scores = []
oofs = np.zeros(df.shape[0])
for i, (train_idx, valid_idx) in enumerate(kfold.split(new_df, df[TARGET])):
    X_train = new_df.iloc[train_idx][FEATURES]
    X_valid = new_df.iloc[valid_idx][FEATURES]
    y_train = df.iloc[train_idx][TARGET]
    y_valid = df.iloc[valid_idx][TARGET]

    model = LGBMClassifier(**hyper_parameters)
    # Early stopping and periodic logging are passed as callbacks: the
    # `early_stopping_rounds=` / `verbose=` keywords of fit() were removed in LightGBM 4.0.
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              eval_metric=["logloss", "auc"],
              categorical_feature=categorical,
              callbacks=[early_stopping(stopping_rounds=50),
                         log_evaluation(period=50)])
    # Alternative CatBoost experiment, kept for reference:
    # model = CatBoostClassifier(iterations=100, learning_rate=0.05, max_depth=7, subsample=0.8, random_state=42,eval_metric='AUC',class_weights={1:1.5,0:1})
    # model.fit(X_train, y_train,
    #           cat_features=categorical,
    #           eval_set=[(X_train, y_train), (X_valid, y_valid)])
    models.append(model)

    y_pred_proba = model.predict_proba(X_valid)[:, 1]
    y_pred = (y_pred_proba > 0.5).astype(int)  # hard labels at a 0.5 threshold
    oofs[valid_idx] = y_pred_proba

    fold_auc = roc_auc_score(y_valid, y_pred_proba)
    print(f"fold {i} : {fold_auc}")
    print(classification_report(y_valid, y_pred))
    scores.append(fold_auc)

# Mean and standard deviation of the per-fold ROC AUC.
print(np.mean(scores), np.std(scores))

(43267, 41) (43267, 42)
[50]	training's binary_logloss: 0.182853	training's auc: 0.9809	valid_1's binary_logloss: 0.190932	valid_1's auc: 0.978629
[100]	training's binary_logloss: 0.162769	training's auc: 0.983826	valid_1's binary_logloss: 0.174908	valid_1's auc: 0.980656
fold 0 : 0.9806564138983679
              precision    recall  f1-score   support

           0       0.90      0.93      0.91      4296
           1       0.93      0.90      0.91      4358

    accuracy                           0.91      8654
   macro avg       0.91      0.91      0.91      8654
weighted avg       0.91      0.91      0.91      8654

[50]	training's binary_logloss: 0.18396	training's auc: 0.980831	valid_1's binary_logloss: 0.189307	valid_1's auc: 0.979284
[100]	training's binary_logloss: 0.164492	training's auc: 0.983579	valid_1's binary_logloss: 0.174081	valid_1's auc: 0.980771
fold 1 : 0.9807708783606509
              precision    recall  f1-score   support

           0       0.91      0.91      

In [11]:
# Refit on the full training set with the parameters of the first CV model, then pickle it.
gmodel = LGBMClassifier(**models[0].get_params())
# The training data doubles as the eval set here, so the logged metrics are
# training metrics only. Logging is a callback because the `verbose=` keyword
# of fit() was removed in LightGBM 4.0.
gmodel.fit(new_df[FEATURES], df[TARGET],
           eval_set=[(new_df[FEATURES], df[TARGET])],
           eval_metric=["logloss", "auc"],
           categorical_feature=categorical,
           callbacks=[log_evaluation(period=50)])

checkpoint_path = '../checkpoints/phase-2/prob-1/v1.pkl'
# Create the target folder first so a missing directory cannot fail the save after training.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
with open(checkpoint_path, 'wb') as file:
    pickle.dump(gmodel, file)

[50]	valid_0's binary_logloss: 0.182324	valid_0's auc: 0.980857
[100]	valid_0's binary_logloss: 0.163481	valid_0's auc: 0.983564


In [12]:
# Feature importances of the final model, highest score first.
feature_importance = pd.DataFrame({
    'feature': gmodel.feature_name_,
    'score': gmodel.feature_importances_,
})
feature_importance.sort_values('score', ascending=False)

Unnamed: 0,feature,score
25,feature26,302
6,feature7,283
10,feature11,165
34,feature35,133
26,feature27,128
7,feature8,114
29,feature30,99
14,feature15,96
11,feature12,92
0,feature1,83
