<a href="https://colab.research.google.com/github/jeffrey82221/cc_fraud_delection/blob/main/FraudDetectionTrainModulized_v3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Functions 

In [3]:
import copy
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from sklearn.metrics import recall_score, precision_score, precision_recall_curve
from sklearn.model_selection import train_test_split
############################ Preprocessing ###################################
def extend_with_log_scale_features(data, log_scale_feature_list):
  c_data = copy.copy(data)
  for f_name in log_scale_feature_list:
    c_data[f_name + '_LOG_SCALE'] = np.log10(data[f_name])
  return c_data
def extend_with_null_or_not_features(data, has_null_feature_list):
  c_data = copy.copy(data)
  for f_name in has_null_feature_list:
    c_data[f_name + '_NULL_OR_NOT'] = data[f_name].isna().astype(int)
  return c_data
def extend_with_detailed_time(data, weekday = True, hour = True):
  '''
  Add WEEKDAY and HOUR and convert DATETIME into strptime format. 
  '''
  c_data = copy.copy(data)
  c_data["DATETIME"] = c_data["DATETIME"].apply(lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M:%S"))
  if weekday:
    c_data["WEEKDAY"] = c_data["DATETIME"].apply(lambda x: x.weekday() + 1)
  if hour:
    c_data["HOUR"] = c_data["DATETIME"].apply(lambda x: x.hour + 1)
  return c_data 
### Features calculated from current and previous transaction 
def extend_with_same_shop_features(data, max_time_shift = 5, pivot_feature = 'CHID'):
  # CHID: 卡人ID
  # CANO: 交易卡號
  c_data = copy.copy(data)
  assert max_time_shift > 2
  def identical_shop_index(data, time_shift, pivot_feature):
    df = copy.copy(data)
    df["shift"] = df.groupby([pivot_feature])['MCHNO'].shift(time_shift)
    name = "MCHNO" + '_SAME' + str(time_shift)
    df[name] = (df["MCHNO"] == df['shift']).astype(int)
    df[name][df['MCHNO'].isna()] = -1
    df[name][df['shift'].isna()] = -1
    df = df.drop("shift", 1)
    return df
  for time_shift in range(1, max_time_shift + 1):
    print("add shop identical index between current and " + str(time_shift) + "th-last transaction")
    c_data = identical_shop_index(c_data, time_shift, pivot_feature)
  return c_data
def extend_with_same_MCC(data, max_time_shift = 5, pivot_feature = 'CHID'):
  # CHID: 卡人ID
  # CANO: 交易卡號
  c_data = copy.copy(data)
  assert max_time_shift > 2
  def identical_MCC_index(data, time_shift, pivot_feature):
    df = copy.copy(data)
    df["shift"] = df.groupby([pivot_feature])['MCC'].shift(time_shift)
    name = "MCC" + '_SAME' + str(time_shift)
    df[name] = (df["MCC"] == df['shift']).astype(int)
    df = df.drop("shift", 1)
    return df
  for time_shift in range(1, max_time_shift + 1):
    print("add MCC identical index between current and " + str(time_shift) + "th-last transaction")
    c_data = identical_MCC_index(c_data, time_shift, pivot_feature)
  return c_data
def extend_with_same_STOCN(data, max_time_shift = 5, pivot_feature = 'CHID'):
  # CHID: 卡人ID
  # CANO: 交易卡號
  c_data = copy.copy(data)
  assert max_time_shift > 2
  def identical_STOCN_index(data, time_shift, pivot_feature):
    df = copy.copy(data)
    df["shift"] = df.groupby([pivot_feature])['STOCN'].shift(time_shift)
    name = "STOCN" + '_SAME' + str(time_shift)
    df[name] = (df["STOCN"] == df['shift']).astype(int)
    df = df.drop("shift", 1)
    return df
  for time_shift in range(1, max_time_shift + 1):
    print("add STOCN identical index between current and " + str(time_shift) + "th-last transaction")
    c_data = identical_STOCN_index(c_data, time_shift, pivot_feature)
  return c_data
def extend_with_same_FLAM1(data, max_time_shift = 5, pivot_feature = 'CHID'):
  # CHID: 卡人ID
  # CANO: 交易卡號
  c_data = copy.copy(data)
  assert max_time_shift > 2
  def identical_FLAM1_index(data, time_shift, pivot_feature):
    df = copy.copy(data)
    df["shift"] = df.groupby([pivot_feature])['FLAM1'].shift(time_shift)
    name = "FLAM1" + '_DIFF' + str(time_shift)
    df[name] = (df["FLAM1"] - df['shift']).fillna(0)
    df = df.drop("shift", 1)
    return df
  for time_shift in range(1, max_time_shift + 1):
    print("add FLAM1 identical index between current and " + str(time_shift) + "th-last transaction")
    c_data = identical_FLAM1_index(c_data, time_shift, pivot_feature)
  return c_data
def extend_with_same_class_between_transactions(data, f_name, max_time_shift = 5, pivot_feature = 'CHID'):
  # CHID: 卡人ID
  # CANO: 交易卡號
  c_data = copy.copy(data)
  assert max_time_shift > 2
  def identical_MCC_index(data, time_shift, pivot_feature):
    df = copy.copy(data)
    df["shift"] = df.groupby([pivot_feature])[f_name].shift(time_shift)
    name = f_name + '_SAME' + str(time_shift)
    df[name] = (df[f_name] == df['shift']).astype(int)
    df = df.drop("shift", 1)
    return df
  for time_shift in range(1, max_time_shift + 1):
    print("add " + f_name + " identical index between current and " + str(time_shift) + "th-last transaction")
    c_data = identical_MCC_index(c_data, time_shift, pivot_feature)
  return c_data
def extend_with_strang_weekday_transaction_change(data, max_time_shift = 5, pivot_feature = 'CHID'):
  # CHID: 卡人ID
  # CANO: 交易卡號
  c_data = copy.copy(data)
  assert max_time_shift > 2
  def strange_week_index(data, time_shift, pivot_feature):
    df = copy.copy(data)
    df["shift"] = df.groupby([pivot_feature])['WEEKDAY'].shift(time_shift)
    name = 'WEEKLY_TRANS' + '_STRANGE' + str(time_shift)
    df[name] = ((df['WEEKDAY']!=6) & (df['WEEKDAY']!=7) & ((df['shift']==6)|(df['shift']==7))).astype(int)
    df = df.drop("shift", 1)
    return df
  for time_shift in range(1, max_time_shift + 1):
    print("add " + 'WEEKDAY' + " identical index between current and " + str(time_shift) + "th-last transaction")
    c_data = strange_week_index(c_data, time_shift, pivot_feature)
  return c_data
def overall_preprocessing(train_data):
  has_null_feature_list = [
    "AVAILABLE_LIMIT_AMT",
    "BONUS_POINTS",
    "CURRENT_CASH_ADV_AMT",
    "CURRENT_FEE",
    "CURRENT_INSTALLMENT_PURCH_AMT",
    "CURRENT_PURCH_AMT",
    "LST_CYCLE_UNPAID_BAL"
    ]
  tmp_data = extend_with_null_or_not_features(train_data, has_null_feature_list)

  log_scale_feature_list = [
    'BNSPT',
    'FLAM1',
    'ACCT_VINTAGE',
    'AVAILABLE_LIMIT_AMT',
    'BONUS_POINTS',
    'CREDIT_LIMIT_AMT',
    'CREDIT_REVOLVING_RATE',
    'CREDIT_USE_RATE',
    'CURRENT_CASH_ADV_AMT',
    'CURRENT_FEE',
    'CURRENT_INSTALLMENT_BAL',
    'CURRENT_INSTALLMENT_PURCH_AMT',
    'CURRENT_PURCH_AMT',
    'LST_CYCLE_UNPAID_BAL',
    'REVOLVING_AMT'
  ]
  tmp_data = extend_with_log_scale_features(tmp_data, log_scale_feature_list)
  tmp_data = extend_with_detailed_time(tmp_data, 
    weekday = True, hour = True)
  tmp_data = extend_with_time_difference_features(tmp_data, 
    max_time_shift = 20, pivot_feature = 'CHID')
  tmp_data = extend_with_same_FLAM1(tmp_data, 
    max_time_shift = 20, pivot_feature = 'CHID')
  tmp_data = extend_with_strang_weekday_transaction_change(tmp_data, 
    max_time_shift = 5, pivot_feature = 'CHID')
  for class_name in ['ECFG', 'PAY_TYPE', 'CONTP', 'ETYMD', 'STOCN', 'SCITY', 'APPFG', 'MCC', 'MCHNO', 'FALLBACK_IND']:
    tmp_data = extend_with_same_class_between_transactions(tmp_data, class_name,
      max_time_shift = 20, pivot_feature = 'CHID')
    
  tmp_data = preprocessing(tmp_data)
  return tmp_data
def extend_with_time_difference_features(data, max_time_shift = 5, pivot_feature = 'CHID'):
  # CHID: 卡人ID
  # CANO: 交易卡號
  c_data = copy.copy(data)
  assert max_time_shift > 2
  def date_diff(data, time_shift, pivot_feature):
    df = copy.copy(data)
    df["shift"] = df.groupby([pivot_feature])["DATETIME"].shift(time_shift)
    name = 'DATETIME' + '_DIF' + str(time_shift)
    df[name] = (df["DATETIME"] - df['shift']).dt.total_seconds().fillna(0)
    # 
    df = df.drop("shift", 1)
    return df
  for time_shift in range(1, max_time_shift + 1):
    print("add time difference between current and " + str(time_shift) + "th-last transaction")
    c_data = date_diff(c_data, time_shift, pivot_feature)
  return c_data

def preprocess_null_values(data):
  # 將空值填補
  c_data = copy.copy(data)
  c_data[
        c_data.select_dtypes(include=['object']).columns
      ] = c_data[
        c_data.select_dtypes(include=['object']).columns
      ].fillna("NULL")
  c_data[
      c_data.select_dtypes(include=['float64', 'int64']).columns
    ] = c_data[
      c_data.select_dtypes(include=['float64', 'int64']).columns
    ].fillna(-1)
  return c_data


def encode_labels(data):
  #將object欄位使用Label Encoder
  c_data = copy.copy(data)
  labelencoder = LabelEncoder()
  obj_col = c_data.select_dtypes(include=['object']).columns.to_list()
  for col in obj_col:
    c_data[col] = labelencoder.fit_transform(c_data[col])
  return c_data
def preprocessing(data):
  r_data = preprocess_null_values(data)
  return encode_labels(r_data)
############################ Training Preprocess ############################
def resample(data, sampling_rate=0.7, sample_type='downsample'):
  # note that testing data should not be re-sampled. 
  assert sample_type == 'downsample' or sample_type == 'upsample'
  c_data = copy.copy(data) 
  #將資料切分為train&test
  if sample_type == 'downsample': 
    df_fraud = c_data[c_data["FRAUD_IND"] == 1]
    df_not_fraud = c_data[c_data["FRAUD_IND"] != 1].sample(frac=sampling_rate, random_state=42)
  elif sample_type == 'upsample':
    df_fraud = c_data[c_data["FRAUD_IND"] == 1].sample(frac=1./sampling_rate, replace = True, random_state=42)
    df_not_fraud = c_data[c_data["FRAUD_IND"] != 1]
  df_train = pd.concat([df_fraud, df_not_fraud], 0)
  return df_train

def create_X(data, drop_list = []):
  if drop_list:
    return data.drop(drop_list, 1)
  else:
    return data

def create_X_y(data, drop_list = ['FRAUD_IND']):
  X = data.drop(drop_list, 1)
  y = data["FRAUD_IND"]
  return X,y

############################ Model Build ####################################
def train_lgb(x_train, x_test, y_train, y_test, max_depth = 8, learning_rate = 0.05, n_estimators = 1000):
  # n_estimators: number of trees 
  lgb_train = lgb.Dataset(x_train, y_train)
  lgb_test = lgb.Dataset(x_test, y_test)
  params = {
      "boosting_type": "gbdt",
      "objective": "binary",
      "metric": "binary_logloss",
      "max_depth": max_depth,
      "learning_rate": learning_rate,
      "n_estimators": n_estimators,
  }
  trained_model = lgb.train(
      params,
      lgb_train,
      num_boost_round=5000,
      valid_sets=[lgb_train, lgb_test],
      early_stopping_rounds=30,
      verbose_eval=50
  )
  return trained_model
##### Get Result Generated from Model #####################################
def evaluate(clf, x_test, y_test):
  y_pred = clf.predict(x_test)
  precision, recall, threshold = precision_recall_curve(y_test, y_pred)
  performance = {"precision": precision[0:-1],
                "recall": recall[0:-1],
                "threshold": threshold
                }
  performance["f1"] = 2 * (performance["precision"] * performance["recall"]) / (performance["precision"] + performance["recall"])
  performance = pd.DataFrame(performance)
  thr = performance[performance["f1"] == max(performance["f1"])]["threshold"].values[0]
  recall = performance[performance["f1"] == max(performance["f1"])]["recall"].values[0]
  precision = performance[performance["f1"] == max(performance["f1"])]["precision"].values[0]
  print("Recall Score:", round(recall,4))
  print("Precision Score:", round(precision,4))
  f1 = 2 * (precision * recall) / (precision + recall)
  print("F1 Score:", round(f1,4))
  print("Threshold: ", round(thr,4))
def get_important_feature_table(clf, x_train):
  importance = {
  "col": np.array(x_train.columns),
  "imp": lgb.Booster.feature_importance(clf)
  }
  df_imp = pd.DataFrame(importance).sort_values(by='imp', ascending=False)
  return df_imp


def expend_by_onehot_encoded_features(input_data):
  '''
  features_to_be_onehot_encoded = [
    "CONTP", 
    "ETYMD", 
    #"STOCN", 
    "PAY_TYPE", 
    "CATP1", 
    "CUORG", 
    "TSCFG", 
    #"EDU_CODE", 
    #"INCOME_RANGE_CODE", 
    "OCUP_CODE", 
    "POSITION_CODE", 
    ]
  '''
  features_to_be_onehot_encoded = [
    "CONTP", 
    "ETYMD", 
    #"STOCN", 
    "PAY_TYPE", 
    "CATP1", 
    "CUORG", 
    "TSCFG", 
    #"EDU_CODE", 
    #"INCOME_RANGE_CODE", 
    "OCUP_CODE", 
    "POSITION_CODE", 
    ]
  data = copy.copy(input_data)
  for f_name in features_to_be_onehot_encoded:
    data[f_name] = data[f_name].fillna('NULL').astype(str)
    classes = list(set(data[f_name]))
    for c in classes:
      data[f_name + "_" + c] = (data[f_name] == c).astype(int)
  return data
def overall_preprocessing(train_data):
  '''has_null_feature_list = [
    "AVAILABLE_LIMIT_AMT",
    "BONUS_POINTS",
    "CURRENT_CASH_ADV_AMT",
    "CURRENT_FEE",
    "CURRENT_INSTALLMENT_PURCH_AMT",
    "CURRENT_PURCH_AMT",
    "LST_CYCLE_UNPAID_BAL"
    ]
  tmp_data = extend_with_null_or_not_features(train_data, has_null_feature_list)
  print("after extend_with_null_or_not_features:",tmp_data.shape[1])'''
  
  tmp_data = expend_by_onehot_encoded_features(train_data)
  print("after expend_by_onehot_encoded_features: ", tmp_data.shape[1])
  '''log_scale_feature_list = [
    'BNSPT',
    'FLAM1',
    'ACCT_VINTAGE',
    'AVAILABLE_LIMIT_AMT',
    'BONUS_POINTS',
    'CREDIT_LIMIT_AMT',
    'CREDIT_REVOLVING_RATE',
    'CREDIT_USE_RATE',
    'CURRENT_CASH_ADV_AMT',
    'CURRENT_FEE',
    'CURRENT_INSTALLMENT_BAL',
    'CURRENT_INSTALLMENT_PURCH_AMT',
    'CURRENT_PURCH_AMT',
    'LST_CYCLE_UNPAID_BAL',
    'REVOLVING_AMT'
  ]
  tmp_data = extend_with_log_scale_features(tmp_data, log_scale_feature_list)
  print("after extend_with_log_scale_features: ", tmp_data.shape[1])'''
  tmp_data = extend_with_detailed_time(tmp_data, 
    weekday = True, hour = True)
  print("after extend_with_detailed_time: ", tmp_data.shape[1])
  tmp_data = extend_with_time_difference_features(tmp_data, 
    max_time_shift = 10, pivot_feature = 'CHID')
  print("after extend_with_time_difference_features: ", tmp_data.shape[1])
  tmp_data = extend_with_same_FLAM1(tmp_data, 
    max_time_shift = 10, pivot_feature = 'CHID')
  print("after extend_with_same_FLAM1: ", tmp_data.shape[1])
  '''tmp_data = extend_with_strang_weekday_transaction_change(tmp_data, 
    max_time_shift = 10, pivot_feature = 'CHID')
  print("after extend_with_strang_weekday_transaction_change: ", tmp_data.shape[1])'''
  for class_name in ['ECFG', 'PAY_TYPE', 'CONTP', 'ETYMD', 'STOCN', 'SCITY', 'APPFG', 'MCC', 'MCHNO', 'FALLBACK_IND']:
    tmp_data = extend_with_same_class_between_transactions(tmp_data, class_name,
      max_time_shift = 3, pivot_feature = 'CHID')
  print("after extend_with_same_class_between_transactions: ", tmp_data.shape[1])
  tmp_data = preprocessing(tmp_data)
  print("after preprocessing: ", tmp_data.shape[1])
  return tmp_data
def model_train_and_evaluate(tmp_data, sampling_rate, n_estimators):
  resampled_train_data = resample(tmp_data, 
      sampling_rate=sampling_rate, sample_type='upsample')
  X, y = create_X_y(resampled_train_data, 
    drop_list = list(set(["FRAUD_IND", "TXKEY", "DATETIME", "CANO", "CHID", "ACQIC", "MCHNO", "AGE"]))
  )
  val_percentage = 0.33
  x_train, x_test, y_train, y_test = train_test_split(X, y, 
    test_size=val_percentage, shuffle=True, random_state=42)
  clf = train_lgb(x_train, x_test, y_train, y_test, 
    max_depth = 8, learning_rate = 0.05, n_estimators = n_estimators)
  evaluate(clf, x_test, y_test)
  return clf, x_train
'''for threshold in [0.9991, 0.9993, 0.9995, 0.9997]: # 0.5, 0.7, 0.9, 0.99, 0.999
  y_result = (y_pred > threshold).astype(int).T
  result_table = pd.DataFrame([test_data['TXKEY'], y_result]).T
  result_table.columns = ['TXKEY', 'FRAUD_IND']
  result_table.set_index('TXKEY')
  imb_ratio = result_table['FRAUD_IND'].mean()
  print("imbalance rate of test data:", imb_ratio)
  file_name = 'tmp_submission_th_'+str(threshold)+"_imr_"+str(imb_ratio)+'.csv'
  result_table.to_csv(file_name, mode = 'w', index= False)
  print(file_name+ ' saved.')'''
'''for threshold in [0.9991, 0.9993, 0.9995, 0.9997]: # 0.5, 0.7, 0.9, 0.99, 0.999
  y_result = (y_pred > threshold).astype(int).T
  result_table = pd.DataFrame([test_data['TXKEY'], y_result]).T
  result_table.columns = ['TXKEY', 'FRAUD_IND']
  result_table.set_index('TXKEY')
  imb_ratio = result_table['FRAUD_IND'].mean()
  print("imbalance rate of test data:", imb_ratio)
  file_name = 'tmp_submission_th_'+str(threshold)+"_imr_"+str(imb_ratio)+'.csv'
  result_table.to_csv(file_name, mode = 'w', index= False)
  print(file_name+ ' saved.')'''

def save_submition_file_with_optimal_threshold(test_data, y_pred, imb_ratio=0.006, new_parameter = None):
  def calculate_threshold_from_imb_ratio(imb_ratio):
    sorted_y_pred = copy.copy(y_pred)
    sorted_y_pred.sort()
    sorted_y_pred = sorted_y_pred[::-1]
    threshold = sorted_y_pred[int(len(y_pred) * imb_ratio)]
    return threshold
  threshold = calculate_threshold_from_imb_ratio(imb_ratio)
  y_result = (y_pred > threshold).astype(int).T
  result_table = pd.DataFrame([test_data['TXKEY'], y_result]).T
  result_table.columns = ['TXKEY', 'FRAUD_IND']
  result_table.set_index('TXKEY')
  imb_ratio = result_table['FRAUD_IND'].mean()
  print("imbalance rate of test data:", imb_ratio)
  if new_parameter:
    file_name = str(new_parameter) + 'tmp_submission_th_'+str(threshold)+"_imr_"+str(imb_ratio)+'.csv'
  else:
    file_name = 'tmp_submission_th_'+str(threshold)+"_imr_"+str(imb_ratio)+'.csv'
  result_table.to_csv(file_name, mode = 'w', index= False)
  print(file_name+ ' saved.')

# First Run (for selecting unimportant features) 

In [None]:
from google.colab import drive
drive.mount('/content/drive')
#匯入資料
train_data = pd.read_csv('/content/drive/MyDrive/智金輪習Kaggle/train.csv')
test_data = "先不給你們"
#查看資料筆數
print("shape of train data:" , train_data.shape)
#print("shape of test data:" , test_data.shape)
# add AGE 
# remove weekday and hour 
tmp_train_data = extend_with_detailed_time(train_data, 
  weekday = False, hour = False)
preprocessed_train_data = preprocessing(tmp_train_data)
resampled_train_data = resample(preprocessed_train_data, 
  sampling_rate=0.7, sample_type='downsample')
X, y = create_X_y(resampled_train_data, 
  drop_list = ["FRAUD_IND", "TXKEY", "DATETIME", "CANO", "CHID", "ACQIC", "MCHNO", "AGE"])
val_percentage = 0.33
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=val_percentage, 
  shuffle=True, random_state=42)
clf = train_lgb(x_train, x_test, y_train, y_test, 
  max_depth = 8, learning_rate = 0.05, n_estimators = 1000)
evaluate(clf, x_test, y_test)
important_feature_table = get_important_feature_table(clf, x_train)
important_feature_table.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
shape of train data: (533202, 59)




Training until validation scores don't improve for 30 rounds.
[50]	training's binary_logloss: 0.162201	valid_1's binary_logloss: 0.162768
[100]	training's binary_logloss: 0.13457	valid_1's binary_logloss: 0.13604
[150]	training's binary_logloss: 0.12292	valid_1's binary_logloss: 0.125427
[200]	training's binary_logloss: 0.115362	valid_1's binary_logloss: 0.118842
[250]	training's binary_logloss: 0.109307	valid_1's binary_logloss: 0.113742
[300]	training's binary_logloss: 0.103638	valid_1's binary_logloss: 0.108953
[350]	training's binary_logloss: 0.0989313	valid_1's binary_logloss: 0.105065
[400]	training's binary_logloss: 0.0948469	valid_1's binary_logloss: 0.101827
[450]	training's binary_logloss: 0.0910325	valid_1's binary_logloss: 0.0986804
[500]	training's binary_logloss: 0.0874282	valid_1's binary_logloss: 0.0957317
[550]	training's binary_logloss: 0.0840682	valid_1's binary_logloss: 0.0930062
[600]	training's binary_logloss: 0.081057	valid_1's binary_logloss: 0.0906978
[650]	tra

Unnamed: 0,col,imp
27,CC_VINTAGE,2721
0,MCC,2437
10,SCITY,1885
8,FLAM1,1810
37,BONUS_POINTS,1545


# Best in v2 (Strategy 8)

In [None]:
def overall_preprocessing(train_data):
  has_null_feature_list = [
    "AVAILABLE_LIMIT_AMT",
    "BONUS_POINTS",
    "CURRENT_CASH_ADV_AMT",
    "CURRENT_FEE",
    "CURRENT_INSTALLMENT_PURCH_AMT",
    "CURRENT_PURCH_AMT",
    "LST_CYCLE_UNPAID_BAL"
    ]
  tmp_data = extend_with_null_or_not_features(train_data, has_null_feature_list)

  log_scale_feature_list = [
    'BNSPT',
    'FLAM1',
    'ACCT_VINTAGE',
    'AVAILABLE_LIMIT_AMT',
    'BONUS_POINTS',
    'CREDIT_LIMIT_AMT',
    'CREDIT_REVOLVING_RATE',
    'CREDIT_USE_RATE',
    'CURRENT_CASH_ADV_AMT',
    'CURRENT_FEE',
    'CURRENT_INSTALLMENT_BAL',
    'CURRENT_INSTALLMENT_PURCH_AMT',
    'CURRENT_PURCH_AMT',
    'LST_CYCLE_UNPAID_BAL',
    'REVOLVING_AMT'
  ]
  tmp_data = extend_with_log_scale_features(tmp_data, log_scale_feature_list)
  tmp_data = extend_with_detailed_time(tmp_data, 
    weekday = True, hour = True)
  tmp_data = extend_with_time_difference_features(tmp_data, 
    max_time_shift = 20, pivot_feature = 'CHID')
  tmp_data = extend_with_same_FLAM1(tmp_data, 
    max_time_shift = 20, pivot_feature = 'CHID')
  tmp_data = extend_with_strang_weekday_transaction_change(tmp_data, 
    max_time_shift = 20, pivot_feature = 'CHID')
  for class_name in ['ECFG', 'PAY_TYPE', 'CONTP', 'ETYMD', 'STOCN', 'SCITY', 'APPFG', 'MCC', 'MCHNO', 'FALLBACK_IND']:
    tmp_data = extend_with_same_class_between_transactions(tmp_data, class_name,
      max_time_shift = 5, pivot_feature = 'CHID')
    
  tmp_data = preprocessing(tmp_data)
  return tmp_data

In [None]:
from google.colab import drive
drive.mount('/content/drive')
#匯入資料
train_data = pd.read_csv('/content/drive/MyDrive/智金輪習Kaggle/train.csv')
#查看資料筆數
print("shape of train data:" , train_data.shape)
tmp_data = overall_preprocessing(train_data)

shape of train data: (533202, 59)


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


add time difference between current and 1th-last transaction
add time difference between current and 2th-last transaction
add time difference between current and 3th-last transaction
add time difference between current and 4th-last transaction
add time difference between current and 5th-last transaction
add time difference between current and 6th-last transaction
add time difference between current and 7th-last transaction
add time difference between current and 8th-last transaction
add time difference between current and 9th-last transaction
add time difference between current and 10th-last transaction
add time difference between current and 11th-last transaction
add time difference between current and 12th-last transaction
add time difference between current and 13th-last transaction
add time difference between current and 14th-last transaction
add time difference between current and 15th-last transaction
add time difference between current and 16th-last transaction
add time differen

## training parameter tuning 

In [None]:
import warnings
warnings.filterwarnings("ignore")
for sampling_rate in [0.05]:
  for n_estimators in [800, 1600]:
    print(sampling_rate, n_estimators)
    model_train_and_evaluate(tmp_data, sampling_rate, n_estimators)

## Model Training 

In [None]:
import warnings
warnings.filterwarnings("ignore")
clf, x_train = model_train_and_evaluate(tmp_data, 0.1, 800)

Training until validation scores don't improve for 30 rounds.
[50]	training's binary_logloss: 0.104635	valid_1's binary_logloss: 0.10486
[100]	training's binary_logloss: 0.0583369	valid_1's binary_logloss: 0.0589735
[150]	training's binary_logloss: 0.046379	valid_1's binary_logloss: 0.0470974
[200]	training's binary_logloss: 0.0407136	valid_1's binary_logloss: 0.0415769
[250]	training's binary_logloss: 0.0364612	valid_1's binary_logloss: 0.0374831
[300]	training's binary_logloss: 0.033114	valid_1's binary_logloss: 0.0342815
[350]	training's binary_logloss: 0.0299493	valid_1's binary_logloss: 0.0312621
[400]	training's binary_logloss: 0.0275395	valid_1's binary_logloss: 0.028957
[450]	training's binary_logloss: 0.0250833	valid_1's binary_logloss: 0.0266414
[500]	training's binary_logloss: 0.023127	valid_1's binary_logloss: 0.0248117
[550]	training's binary_logloss: 0.0214406	valid_1's binary_logloss: 0.023288
[600]	training's binary_logloss: 0.0197817	valid_1's binary_logloss: 0.0217171

## Generate Testing Result 

In [None]:
from google.colab import drive
drive.mount('/content/drive')
#匯入資料
test_data = pd.read_csv('/content/drive/MyDrive/智金輪習Kaggle/test.csv')
#查看資料筆數
print("shape of test data:" , test_data.shape)
tmp_data = overall_preprocessing(test_data)
X = create_X(tmp_data, 
  drop_list = list(set(["TXKEY", "DATETIME", "CANO", "CHID", "ACQIC", "MCHNO", "AGE"]
  ))
)
y_pred = clf.predict(X)
save_submition_file_with_optimal_threshold(y_pred, imb_ratio=0.006)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
shape of test data: (472335, 58)
add time difference between current and 1th-last transaction
add time difference between current and 2th-last transaction
add time difference between current and 3th-last transaction
add time difference between current and 4th-last transaction
add time difference between current and 5th-last transaction
add time difference between current and 6th-last transaction
add time difference between current and 7th-last transaction
add time difference between current and 8th-last transaction
add time difference between current and 9th-last transaction
add time difference between current and 10th-last transaction
add time difference between current and 11th-last transaction
add time difference between current and 12th-last transaction
add time difference between current and 13th-last transaction
add time difference between current and 1

# Strategy 9: Add One-Hot Encoding 

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Model Training 

In [None]:
import warnings
warnings.filterwarnings("ignore")
clf, x_train = model_train_and_evaluate(
    overall_preprocessing(
        pd.read_csv('/content/drive/MyDrive/智金輪習Kaggle/train.csv')
        ), 0.1, 800)

after expend_by_onehot_encoded_features:  149
after extend_with_detailed_time:  151
add time difference between current and 1th-last transaction
add time difference between current and 2th-last transaction
add time difference between current and 3th-last transaction
add time difference between current and 4th-last transaction
add time difference between current and 5th-last transaction
add time difference between current and 6th-last transaction
add time difference between current and 7th-last transaction
add time difference between current and 8th-last transaction
add time difference between current and 9th-last transaction
add time difference between current and 10th-last transaction
after extend_with_time_difference_features:  161
add FLAM1 identical index between current and 1th-last transaction
add FLAM1 identical index between current and 2th-last transaction
add FLAM1 identical index between current and 3th-last transaction
add FLAM1 identical index between current and 4th-last 

## Generate Testing Result 

In [None]:
from google.colab import drive
drive.mount('/content/drive')
test_data = pd.read_csv('/content/drive/MyDrive/智金輪習Kaggle/test.csv')
y_pred = clf.predict(
    create_X(
        overall_preprocessing(
          test_data
        ), 
        drop_list = list(set(["TXKEY", "DATETIME", "CANO", "CHID", "ACQIC", "MCHNO", "AGE"]))
    )
  )
save_submition_file_with_optimal_threshold(test_data, y_pred, imb_ratio=0.006)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
after expend_by_onehot_encoded_features:  148
after extend_with_detailed_time:  150
add time difference between current and 1th-last transaction
add time difference between current and 2th-last transaction
add time difference between current and 3th-last transaction
add time difference between current and 4th-last transaction
add time difference between current and 5th-last transaction
add time difference between current and 6th-last transaction
add time difference between current and 7th-last transaction
add time difference between current and 8th-last transaction
add time difference between current and 9th-last transaction
add time difference between current and 10th-last transaction
after extend_with_time_difference_features:  160
add FLAM1 identical index between current and 1th-last transaction
add FLAM1 identical index between current and 2th-last trans

# Strategy 10: Increase n_estimate 

In [17]:
import warnings
warnings.filterwarnings("ignore")
from google.colab import drive
drive.mount('/content/drive')
for n_estimate in [1000, 1600, 2000, 3200]:
  clf, _ = model_train_and_evaluate(
      overall_preprocessing(
          pd.read_csv('/content/drive/MyDrive/智金輪習Kaggle/train.csv')
          ), 0.1, n_estimate)
  test_data = pd.read_csv('/content/drive/MyDrive/智金輪習Kaggle/test.csv')
  y_pred = clf.predict(
      create_X(
          overall_preprocessing(
            test_data
          ), 
          drop_list = list(set(["TXKEY", "DATETIME", "CANO", "CHID", "ACQIC", "MCHNO", "AGE"]))
      )
    )
  save_submition_file_with_optimal_threshold(test_data, y_pred, imb_ratio=0.006, new_parameter=n_estimate)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
after expend_by_onehot_encoded_features:  149
after extend_with_detailed_time:  151
add time difference between current and 1th-last transaction
add time difference between current and 2th-last transaction
add time difference between current and 3th-last transaction
add time difference between current and 4th-last transaction
add time difference between current and 5th-last transaction
add time difference between current and 6th-last transaction
add time difference between current and 7th-last transaction
add time difference between current and 8th-last transaction
add time difference between current and 9th-last transaction
add time difference between current and 10th-last transaction
after extend_with_time_difference_features:  161
add FLAM1 identical index between current and 1th-last transaction
add FLAM1 identical index between current and 2th-last trans

## n_estimate=1000 is the best 

# Strategy 11: Change Feature Count 

In [7]:
def drop_features(input_data, remaining_feature_count):
  data = copy.copy(input_data)
  assert remaining_feature_count < data.shape[1]
  drop_count = data.shape[1] - remaining_feature_count
  dropped_features = pd.read_csv('/content/drive/MyDrive/智金輪習Kaggle/imb_table.csv').col.tolist()[-(drop_count):]
  for f_name in dropped_features:
    try:
      del data[f_name]
    except:
      print(f_name, "does not exist to be deleted")
  return data
def overall_preprocessing(train_data, remaining_feature_count = 65):
  '''has_null_feature_list = [
    "AVAILABLE_LIMIT_AMT",
    "BONUS_POINTS",
    "CURRENT_CASH_ADV_AMT",
    "CURRENT_FEE",
    "CURRENT_INSTALLMENT_PURCH_AMT",
    "CURRENT_PURCH_AMT",
    "LST_CYCLE_UNPAID_BAL"
    ]
  tmp_data = extend_with_null_or_not_features(train_data, has_null_feature_list)
  print("after extend_with_null_or_not_features:",tmp_data.shape[1])'''
  
  tmp_data = expend_by_onehot_encoded_features(train_data)
  print("after expend_by_onehot_encoded_features: ", tmp_data.shape[1])
  '''log_scale_feature_list = [
    'BNSPT',
    'FLAM1',
    'ACCT_VINTAGE',
    'AVAILABLE_LIMIT_AMT',
    'BONUS_POINTS',
    'CREDIT_LIMIT_AMT',
    'CREDIT_REVOLVING_RATE',
    'CREDIT_USE_RATE',
    'CURRENT_CASH_ADV_AMT',
    'CURRENT_FEE',
    'CURRENT_INSTALLMENT_BAL',
    'CURRENT_INSTALLMENT_PURCH_AMT',
    'CURRENT_PURCH_AMT',
    'LST_CYCLE_UNPAID_BAL',
    'REVOLVING_AMT'
  ]
  tmp_data = extend_with_log_scale_features(tmp_data, log_scale_feature_list)
  print("after extend_with_log_scale_features: ", tmp_data.shape[1])'''
  tmp_data = extend_with_detailed_time(tmp_data, 
    weekday = True, hour = True)
  print("after extend_with_detailed_time: ", tmp_data.shape[1])
  tmp_data = extend_with_time_difference_features(tmp_data, 
    max_time_shift = 10, pivot_feature = 'CHID')
  print("after extend_with_time_difference_features: ", tmp_data.shape[1])
  tmp_data = extend_with_same_FLAM1(tmp_data, 
    max_time_shift = 10, pivot_feature = 'CHID')
  print("after extend_with_same_FLAM1: ", tmp_data.shape[1])
  '''tmp_data = extend_with_strang_weekday_transaction_change(tmp_data, 
    max_time_shift = 10, pivot_feature = 'CHID')
  print("after extend_with_strang_weekday_transaction_change: ", tmp_data.shape[1])'''
  for class_name in ['ECFG', 'PAY_TYPE', 'CONTP', 'ETYMD', 'STOCN', 'SCITY', 'APPFG', 'MCC', 'MCHNO', 'FALLBACK_IND']:
    tmp_data = extend_with_same_class_between_transactions(tmp_data, class_name,
      max_time_shift = 3, pivot_feature = 'CHID')
  print("after extend_with_same_class_between_transactions: ", tmp_data.shape[1])
  tmp_data = preprocessing(tmp_data)
  print("after preprocessing: ", tmp_data.shape[1])
  tmp_data = drop_features(tmp_data, remaining_feature_count)
  print("after feature_dropped: ", tmp_data.shape[1])
  return tmp_data

In [10]:
import warnings
warnings.filterwarnings("ignore")
from google.colab import drive
drive.mount('/content/drive')
for remaining_feature_count in [120, 90, 60, 30]:
  clf, x_train = model_train_and_evaluate(
      overall_preprocessing(
          pd.read_csv('/content/drive/MyDrive/智金輪習Kaggle/train.csv'), remaining_feature_count = remaining_feature_count
          ), 0.1, 200)
  test_data = pd.read_csv('/content/drive/MyDrive/智金輪習Kaggle/test.csv')
  y_pred = clf.predict(
      create_X(
          overall_preprocessing(
            test_data, remaining_feature_count = remaining_feature_count
          ), 
          drop_list = list(set(["TXKEY", "DATETIME", "CANO", "CHID", "ACQIC", "MCHNO", "AGE"]))
      )
    )
  save_submition_file_with_optimal_threshold(test_data, y_pred, imb_ratio=0.006, new_parameter=remaining_feature_count)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
after expend_by_onehot_encoded_features:  149
after extend_with_detailed_time:  151
add time difference between current and 1th-last transaction
add time difference between current and 2th-last transaction
add time difference between current and 3th-last transaction
add time difference between current and 4th-last transaction
add time difference between current and 5th-last transaction
add time difference between current and 6th-last transaction
add time difference between current and 7th-last transaction
add time difference between current and 8th-last transaction
add time difference between current and 9th-last transaction
add time difference between current and 10th-last transaction
after extend_with_time_difference_features:  161
add FLAM1 identical index between current and 1th-last transaction
add FLAM1 identical index between current and 2th-last trans

KeyboardInterrupt: ignored

In [None]:
import warnings
warnings.filterwarnings("ignore")
from google.colab import drive
drive.mount('/content/drive')
for n_estimate in [400, 800, 1000]:
  clf, x_train = model_train_and_evaluate(
      overall_preprocessing(
          pd.read_csv('/content/drive/MyDrive/智金輪習Kaggle/train.csv'), remaining_feature_count = 90
          ), 0.1, n_estimate)
  test_data = pd.read_csv('/content/drive/MyDrive/智金輪習Kaggle/test.csv')
  y_pred = clf.predict(
      create_X(
          overall_preprocessing(
            test_data, remaining_feature_count = 90
          ), 
          drop_list = list(set(["TXKEY", "DATETIME", "CANO", "CHID", "ACQIC", "MCHNO", "AGE"]))
      )
    )
  save_submition_file_with_optimal_threshold(test_data, y_pred, imb_ratio=0.006, new_parameter=n_estimate)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
after expend_by_onehot_encoded_features:  149
after extend_with_detailed_time:  151
add time difference between current and 1th-last transaction
add time difference between current and 2th-last transaction
add time difference between current and 3th-last transaction
add time difference between current and 4th-last transaction
add time difference between current and 5th-last transaction
add time difference between current and 6th-last transaction
add time difference between current and 7th-last transaction
add time difference between current and 8th-last transaction
add time difference between current and 9th-last transaction
add time difference between current and 10th-last transaction
after extend_with_time_difference_features:  161
add FLAM1 identical index between current and 1th-last transaction
add FLAM1 identical index between current and 2th-last trans

## Run the best one again 

## Regenerate important table 

## Strategy 12: increase feature by increasing max_shift_count