<a href="https://colab.research.google.com/github/jeffrey82221/cc_fraud_delection/blob/main/FraudDetectionTrainModulized_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Packages 

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from sklearn.metrics import recall_score, precision_score, precision_recall_curve
from sklearn.model_selection import train_test_split

# Functions

In [37]:
import copy
############################ Preprocessing ###################################
def extend_with_detailed_time(data, weekday = True, hour = True):
  '''
  Add WEEKDAY and HOUR and convert DATETIME into strptime format. 
  '''
  c_data = copy.copy(data)
  c_data["DATETIME"] = c_data["DATETIME"].apply(lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M:%S"))
  if weekday:
    c_data["WEEKDAY"] = c_data["DATETIME"].apply(lambda x: x.weekday() + 1)
  if hour:
    c_data["HOUR"] = c_data["DATETIME"].apply(lambda x: x.hour + 1)
  return c_data 

def extend_with_time_difference_features(data, max_time_shift = 5, pivot_feature = 'CHID'):
  # CHID: 卡人ID
  # CANO: 交易卡號
  c_data = copy.copy(data)
  assert max_time_shift > 2
  def date_diff(data, time_shift, pivot_feature):
    df = copy.copy(data)
    df["shift"] = df.groupby([pivot_feature])["DATETIME"].shift(time_shift)
    name = pivot_feature + '_DIF' + str(time_shift)
    df[name] = (df["DATETIME"] - df['shift']).dt.total_seconds().fillna(0)
    # 
    df = df.drop("shift", 1)
    return df
  for time_shift in range(1, max_time_shift + 1):
    print("add time difference between current and " + str(time_shift) + "th-last transaction")
    c_data = date_diff(c_data, time_shift, pivot_feature)
  return c_data

def preprocess_null_values(data):
  # 將空值填補
  c_data = copy.copy(data)
  c_data[
        c_data.select_dtypes(include=['object']).columns
      ] = c_data[
        c_data.select_dtypes(include=['object']).columns
      ].fillna("NULL")
  c_data[
      c_data.select_dtypes(include=['float64', 'int64']).columns
    ] = c_data[
      c_data.select_dtypes(include=['float64', 'int64']).columns
    ].fillna(-1)
  return c_data


def encode_labels(data):
  #將object欄位使用Label Encoder
  c_data = copy.copy(data)
  labelencoder = LabelEncoder()
  obj_col = c_data.select_dtypes(include=['object']).columns.to_list()
  for col in obj_col:
      c_data[col] = labelencoder.fit_transform(c_data[col])
  return c_data
def preprocessing(data):
  r_data = preprocess_null_values(data)
  return encode_labels(r_data)
############################ Training Preprocess ############################
def resample(data, sampling_rate=0.7, sample_type='downsample'):
  # note that testing data should not be re-sampled. 
  assert sample_type == 'downsample' or sample_type == 'upsample'
  c_data = copy.copy(data) 
  #將資料切分為train&test
  if sample_type == 'downsample': 
    df_fraud = c_data[c_data["FRAUD_IND"] == 1]
    df_not_fraud = c_data[c_data["FRAUD_IND"] != 1].sample(frac=sampling_rate, random_state=42)
  elif sample_type == 'upsample':
    df_fraud = c_data[c_data["FRAUD_IND"] == 1].sample(frac=1./sampling_rate, replace = True, random_state=42)
    df_not_fraud = c_data[c_data["FRAUD_IND"] != 1]
  df_train = pd.concat([df_fraud, df_not_fraud], 0)
  return df_train

def create_X(data, drop_list = []):
  if drop_list:
    return data.drop(drop_list, 1)
  else:
    return data

def create_X_y(data, drop_list = ['FRAUD_IND']):
  X = data.drop(drop_list, 1)
  y = data["FRAUD_IND"]
  return X,y

############################ Model Build ####################################
def train_lgb(x_train, x_test, y_train, y_test, max_depth = 8, learning_rate = 0.05, n_estimators = 1000):
  # n_estimators: number of trees 
  lgb_train = lgb.Dataset(x_train, y_train)
  lgb_test = lgb.Dataset(x_test, y_test)
  params = {
      "boosting_type": "gbdt",
      "objective": "binary",
      "metric": "binary_logloss",
      "max_depth": max_depth,
      "learning_rate": learning_rate,
      "n_estimators": n_estimators,
  }
  trained_model = lgb.train(
      params,
      lgb_train,
      num_boost_round=5000,
      valid_sets=[lgb_train, lgb_test],
      early_stopping_rounds=30,
      verbose_eval=50
  )
  return trained_model
##### Get Result Generated from Model #####################################
def evaluate(clf, x_test, y_test):
  y_pred = clf.predict(x_test)
  precision, recall, threshold = precision_recall_curve(y_test, y_pred)
  performance = {"precision": precision[0:-1],
                "recall": recall[0:-1],
                "threshold": threshold
                }
  performance["f1"] = 2 * (performance["precision"] * performance["recall"]) / (performance["precision"] + performance["recall"])
  performance = pd.DataFrame(performance)
  thr = performance[performance["f1"] == max(performance["f1"])]["threshold"].values[0]
  recall = performance[performance["f1"] == max(performance["f1"])]["recall"].values[0]
  precision = performance[performance["f1"] == max(performance["f1"])]["precision"].values[0]
  print("Recall Score:", recall)
  print("Precision Score:", precision)
  print("F1 Score:", 2 * (precision * recall) / (precision + recall))
  print("Threshold: ", thr)
def get_important_feature_table(clf, x_train):
  importance = {
  "col": np.array(x_train.columns),
  "imp": lgb.Booster.feature_importance(clf)
  }
  df_imp = pd.DataFrame(importance).sort_values(by='imp', ascending=False)
  return df_imp

# First Run (for selecting unimportant features) 

In [3]:
from google.colab import drive
drive.mount('/content/drive')
#匯入資料
train_data = pd.read_csv('/content/drive/MyDrive/智金輪習Kaggle/train.csv')
test_data = "先不給你們"
#查看資料筆數
print("shape of train data:" , train_data.shape)
#print("shape of test data:" , test_data.shape)
# add AGE 
# remove weekday and hour 
tmp_train_data = extend_with_detailed_time(train_data, 
  weekday = False, hour = False)
preprocessed_train_data = preprocessing(tmp_train_data)
resampled_train_data = resample(preprocessed_train_data, 
  sampling_rate=0.7, sample_type='downsample')
X, y = create_X_y(resampled_train_data, 
  drop_list = ["FRAUD_IND", "TXKEY", "DATETIME", "CANO", "CHID", "ACQIC", "MCHNO", "AGE"])
val_percentage = 0.33
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=val_percentage, 
  shuffle=True, random_state=42)
clf = train_lgb(x_train, x_test, y_train, y_test, 
  max_depth = 8, learning_rate = 0.05, n_estimators = 1000)
evaluate(clf, x_test, y_test)
important_feature_table = get_important_feature_table(clf, x_train)
important_feature_table.head()

Mounted at /content/drive
shape of train data: (533202, 59)




Training until validation scores don't improve for 30 rounds.
[50]	training's binary_logloss: 0.162201	valid_1's binary_logloss: 0.162768
[100]	training's binary_logloss: 0.13457	valid_1's binary_logloss: 0.13604
[150]	training's binary_logloss: 0.12292	valid_1's binary_logloss: 0.125427
[200]	training's binary_logloss: 0.115362	valid_1's binary_logloss: 0.118842
[250]	training's binary_logloss: 0.109307	valid_1's binary_logloss: 0.113742
[300]	training's binary_logloss: 0.103638	valid_1's binary_logloss: 0.108953
[350]	training's binary_logloss: 0.0989313	valid_1's binary_logloss: 0.105065
[400]	training's binary_logloss: 0.0948469	valid_1's binary_logloss: 0.101827
[450]	training's binary_logloss: 0.0910325	valid_1's binary_logloss: 0.0986804
[500]	training's binary_logloss: 0.0874282	valid_1's binary_logloss: 0.0957317
[550]	training's binary_logloss: 0.0840682	valid_1's binary_logloss: 0.0930062
[600]	training's binary_logloss: 0.081057	valid_1's binary_logloss: 0.0906978
[650]	tra

Unnamed: 0,col,imp
27,CC_VINTAGE,2721
0,MCC,2437
10,SCITY,1885
8,FLAM1,1810
37,BONUS_POINTS,1545


# Best Run in v1

In [4]:
from google.colab import drive
drive.mount('/content/drive')
#匯入資料
train_data = pd.read_csv('/content/drive/MyDrive/智金輪習Kaggle/train.csv')
test_data = "先不給你們"
#查看資料筆數
print("shape of train data:" , train_data.shape)
#print("shape of test data:" , test_data.shape)
tmp_train_data = extend_with_detailed_time(train_data, 
  weekday = True, hour = True)
train_tmp_data = extend_with_time_difference_features(tmp_train_data, 
  max_time_shift = 20, pivot_feature = 'CHID')
preprocessed_train_data = preprocessing(train_tmp_data)
resampled_train_data = resample(preprocessed_train_data, 
  sampling_rate=0.7, sample_type='downsample')
removed_unimportant_feature_count = 5
X, y = create_X_y(resampled_train_data, 
  drop_list = list(set(["FRAUD_IND", "TXKEY", "DATETIME", "CANO", "CHID", "ACQIC", "MCHNO", "AGE"] + \
  important_feature_table.set_index('col').index[-(removed_unimportant_feature_count):].tolist()))
)
val_percentage = 0.33
x_train, x_test, y_train, y_test = train_test_split(X, y, 
  test_size=val_percentage, shuffle=True, random_state=42)
clf = train_lgb(x_train, x_test, y_train, y_test, 
  max_depth = 8, learning_rate = 0.05, n_estimators = 1000)
evaluate(clf, x_test, y_test)
#important_feature_table = get_important_feature_table(clf, x_train)
#important_feature_table

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
shape of train data: (533202, 59)
add time difference between current and 1th-last transaction
add time difference between current and 2th-last transaction
add time difference between current and 3th-last transaction
add time difference between current and 4th-last transaction
add time difference between current and 5th-last transaction
add time difference between current and 6th-last transaction
add time difference between current and 7th-last transaction
add time difference between current and 8th-last transaction
add time difference between current and 9th-last transaction
add time difference between current and 10th-last transaction
add time difference between current and 11th-last transaction
add time difference between current and 12th-last transaction
add time difference between current and 13th-last transaction
add time difference between current and 



Training until validation scores don't improve for 30 rounds.
[50]	training's binary_logloss: 0.083666	valid_1's binary_logloss: 0.0839237
[100]	training's binary_logloss: 0.0522189	valid_1's binary_logloss: 0.0540179
[150]	training's binary_logloss: 0.0434371	valid_1's binary_logloss: 0.0461507
[200]	training's binary_logloss: 0.0390603	valid_1's binary_logloss: 0.0424071
[250]	training's binary_logloss: 0.0351947	valid_1's binary_logloss: 0.0391872
[300]	training's binary_logloss: 0.0323212	valid_1's binary_logloss: 0.036856
[350]	training's binary_logloss: 0.0296248	valid_1's binary_logloss: 0.0347479
[400]	training's binary_logloss: 0.027224	valid_1's binary_logloss: 0.0327181
[450]	training's binary_logloss: 0.0251559	valid_1's binary_logloss: 0.0310547
[500]	training's binary_logloss: 0.0229929	valid_1's binary_logloss: 0.0294841
[550]	training's binary_logloss: 0.0211482	valid_1's binary_logloss: 0.0281075
[600]	training's binary_logloss: 0.0193928	valid_1's binary_logloss: 0.02

## Tuning Threshold 

In [None]:
y_pred = clf.predict(X)
tolerance = 0.05
boundary = (0., 1.)
train_imbalance_rate = train_data['FRAUD_IND'].mean()
print("imbalance rate of train data:", train_imbalance_rate)
threshold = 0.5
y_result = (y_pred > threshold).astype(int).T
result_table = pd.DataFrame([test_data['TXKEY'], y_result]).T
result_table.columns = ['TXKEY', 'FRAUD_IND']
imbalance_rate = result_table['FRAUD_IND'].mean()
print("imbalance rate of test data:", imbalance_rate)
while np.abs(train_imbalance_rate-imbalance_rate) >= tolerance:
  print('threshold:', threshold)
  y_result = (y_pred > threshold).astype(int).T
  result_table = pd.DataFrame([test_data['TXKEY'], y_result]).T
  result_table.columns = ['TXKEY', 'FRAUD_IND']
  imbalance_rate = result_table['FRAUD_IND'].mean()
  print("imbalance rate of test data:", imbalance_rate)
  if imbalance_rate > train_imbalance_rate:
    threshold = (boundary[1] + threshold)/2.
    boundary[0] = threshold
  else:
    threshold = (boundary[0] + threshold)/2.
    boundary[1] = threshold
  print("boundary",boundary)


## Generate Result 

In [6]:
y_pred = clf.predict(X)
threshold = 0.97
y_result = (y_pred > threshold).astype(int).T
result_table = pd.DataFrame([test_data['TXKEY'], y_result]).T
result_table.columns = ['TXKEY', 'FRAUD_IND']
print("imbalance rate of test data:", result_table['FRAUD_IND'].mean())
result_table.to_csv('tmp_submission.csv')

TypeError: ignored

# Strategy 1

## add log scale 

In [13]:
def extend_with_log_scale_features(data, log_scale_feature_list):
  c_data = copy.copy(data)
  for f_name in log_scale_feature_list:
    c_data[f_name + '_LOG_SCALE'] = np.log10(data[f_name])
  return c_data

In [14]:
from google.colab import drive
drive.mount('/content/drive')
#匯入資料
train_data = pd.read_csv('/content/drive/MyDrive/智金輪習Kaggle/train.csv')
test_data = "先不給你們"
#查看資料筆數
print("shape of train data:" , train_data.shape)
#print("shape of test data:" , test_data.shape)

log_scale_feature_list = [
  'BNSPT',
  'FLAM1',
  'ACCT_VINTAGE',
  'AVAILABLE_LIMIT_AMT',
  'BONUS_POINTS',
  'CREDIT_LIMIT_AMT',
  'CREDIT_REVOLVING_RATE',
  'CREDIT_USE_RATE',
  'CURRENT_CASH_ADV_AMT',
  'CURRENT_FEE',
  'CURRENT_INSTALLMENT_BAL',
  'CURRENT_INSTALLMENT_PURCH_AMT',
  'CURRENT_PURCH_AMT',
  'LST_CYCLE_UNPAID_BAL',
  'REVOLVING_AMT'
]
tmp_train_data = extend_with_log_scale_features(train_data, log_scale_feature_list)
tmp_train_data = extend_with_detailed_time(tmp_train_data, 
  weekday = True, hour = True)
train_tmp_data = extend_with_time_difference_features(tmp_train_data, 
  max_time_shift = 20, pivot_feature = 'CHID')
preprocessed_train_data = preprocessing(train_tmp_data)
resampled_train_data = resample(preprocessed_train_data, 
  sampling_rate=0.7, sample_type='downsample')
removed_unimportant_feature_count = 5
X, y = create_X_y(resampled_train_data, 
  drop_list = list(set(["FRAUD_IND", "TXKEY", "DATETIME", "CANO", "CHID", "ACQIC", "MCHNO", "AGE"] + \
  important_feature_table.set_index('col').index[-(removed_unimportant_feature_count):].tolist()))
)
val_percentage = 0.33
x_train, x_test, y_train, y_test = train_test_split(X, y, 
  test_size=val_percentage, shuffle=True, random_state=42)
clf = train_lgb(x_train, x_test, y_train, y_test, 
  max_depth = 8, learning_rate = 0.05, n_estimators = 1000)
evaluate(clf, x_test, y_test)
#important_feature_table = get_important_feature_table(clf, x_train)
#important_feature_table

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
shape of train data: (533202, 59)


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


add time difference between current and 1th-last transaction
add time difference between current and 2th-last transaction
add time difference between current and 3th-last transaction
add time difference between current and 4th-last transaction
add time difference between current and 5th-last transaction
add time difference between current and 6th-last transaction
add time difference between current and 7th-last transaction
add time difference between current and 8th-last transaction
add time difference between current and 9th-last transaction
add time difference between current and 10th-last transaction
add time difference between current and 11th-last transaction
add time difference between current and 12th-last transaction
add time difference between current and 13th-last transaction
add time difference between current and 14th-last transaction
add time difference between current and 15th-last transaction
add time difference between current and 16th-last transaction
add time differen



Training until validation scores don't improve for 30 rounds.
[50]	training's binary_logloss: 0.0834672	valid_1's binary_logloss: 0.0836714
[100]	training's binary_logloss: 0.0522038	valid_1's binary_logloss: 0.0537736
[150]	training's binary_logloss: 0.0434702	valid_1's binary_logloss: 0.0459496
[200]	training's binary_logloss: 0.0393683	valid_1's binary_logloss: 0.0424598
[250]	training's binary_logloss: 0.0356321	valid_1's binary_logloss: 0.0393477
[300]	training's binary_logloss: 0.0325958	valid_1's binary_logloss: 0.0368766
[350]	training's binary_logloss: 0.0299755	valid_1's binary_logloss: 0.0347561
[400]	training's binary_logloss: 0.0270696	valid_1's binary_logloss: 0.0324058
[450]	training's binary_logloss: 0.024545	valid_1's binary_logloss: 0.0303967
[500]	training's binary_logloss: 0.0225986	valid_1's binary_logloss: 0.0289356
[550]	training's binary_logloss: 0.0208548	valid_1's binary_logloss: 0.0276641
[600]	training's binary_logloss: 0.0191038	valid_1's binary_logloss: 0.

## check if the log-scale version features have larger importance 

In [16]:
important_feature_table_with_log_features = get_important_feature_table(clf, x_train)
important_feature_table_with_log_features = important_feature_table_with_log_features.set_index('col')
for f_name in log_scale_feature_list:
  if f_name not in important_feature_table.set_index('col').index[
              -(removed_unimportant_feature_count):].tolist():
    linear_importance_score = important_feature_table_with_log_features.loc[f_name]['imp']
    log_importance_score = important_feature_table_with_log_features.loc[f_name+'_LOG_SCALE']['imp']
    print(f_name, linear_importance_score, log_importance_score)
    if log_importance_score > linear_importance_score:
      print('log-scale is better')

## remove unimportant log-scale feature and run again 

In [27]:
from google.colab import drive
drive.mount('/content/drive')
#匯入資料
train_data = pd.read_csv('/content/drive/MyDrive/智金輪習Kaggle/train.csv')
test_data = "先不給你們"
#查看資料筆數
print("shape of train data:" , train_data.shape)
#print("shape of test data:" , test_data.shape)

log_scale_feature_list = [
  'CREDIT_USE_RATE',
  'REVOLVING_AMT'
]
tmp_train_data = extend_with_log_scale_features(train_data, log_scale_feature_list)
tmp_train_data = extend_with_detailed_time(tmp_train_data, 
  weekday = True, hour = True)
train_tmp_data = extend_with_time_difference_features(tmp_train_data, 
  max_time_shift = 20, pivot_feature = 'CHID')
preprocessed_train_data = preprocessing(train_tmp_data)
resampled_train_data = resample(preprocessed_train_data, 
  sampling_rate=0.7, sample_type='downsample')
removed_unimportant_feature_count = 5
X, y = create_X_y(resampled_train_data, 
  drop_list = list(set(["FRAUD_IND", "TXKEY", "DATETIME", "CANO", "CHID", "ACQIC", "MCHNO", "AGE"] + \
  important_feature_table.set_index('col').index[-(removed_unimportant_feature_count):].tolist()))
)
val_percentage = 0.33
x_train, x_test, y_train, y_test = train_test_split(X, y, 
  test_size=val_percentage, shuffle=True, random_state=42)
clf = train_lgb(x_train, x_test, y_train, y_test, 
  max_depth = 8, learning_rate = 0.05, n_estimators = 1000)
evaluate(clf, x_test, y_test)
#important_feature_table = get_important_feature_table(clf, x_train)
#important_feature_table

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
shape of train data: (533202, 59)


  result = getattr(ufunc, method)(*inputs, **kwargs)


add time difference between current and 1th-last transaction
add time difference between current and 2th-last transaction
add time difference between current and 3th-last transaction
add time difference between current and 4th-last transaction
add time difference between current and 5th-last transaction
add time difference between current and 6th-last transaction
add time difference between current and 7th-last transaction
add time difference between current and 8th-last transaction
add time difference between current and 9th-last transaction
add time difference between current and 10th-last transaction
add time difference between current and 11th-last transaction
add time difference between current and 12th-last transaction
add time difference between current and 13th-last transaction
add time difference between current and 14th-last transaction
add time difference between current and 15th-last transaction
add time difference between current and 16th-last transaction
add time differen



Training until validation scores don't improve for 30 rounds.
[50]	training's binary_logloss: 0.0838535	valid_1's binary_logloss: 0.0840345
[100]	training's binary_logloss: 0.0523608	valid_1's binary_logloss: 0.0539582
[150]	training's binary_logloss: 0.0434902	valid_1's binary_logloss: 0.0460947
[200]	training's binary_logloss: 0.0390921	valid_1's binary_logloss: 0.0422317
[250]	training's binary_logloss: 0.0355646	valid_1's binary_logloss: 0.0392506
[300]	training's binary_logloss: 0.0323527	valid_1's binary_logloss: 0.0366978
[350]	training's binary_logloss: 0.0297353	valid_1's binary_logloss: 0.0346748
[400]	training's binary_logloss: 0.0271154	valid_1's binary_logloss: 0.0325232
[450]	training's binary_logloss: 0.0248799	valid_1's binary_logloss: 0.0307638
[500]	training's binary_logloss: 0.0227364	valid_1's binary_logloss: 0.0291772
[550]	training's binary_logloss: 0.0209168	valid_1's binary_logloss: 0.0278375
[600]	training's binary_logloss: 0.0194334	valid_1's binary_logloss: 0

Without selection of log-scale feature 
Recall Score: 0.9848148441836492
Precision Score: 0.9814683104156788
F1 Score: 0.983138729467643
Threshold:  0.48441210277113395

With selection of log-scale feature 
Recall Score: 0.9824359685988423
Precision Score: 0.9836449525624231
F1 Score: 0.9830400888659672
Threshold:  0.5309037539994582

## conclusion: should try both above cases 

# Strategy 2: add NULL-OR-NOT factor

In [31]:
def extend_with_null_or_not_features(data, has_null_feature_list):
  c_data = copy.copy(data)
  for f_name in has_null_feature_list:
    c_data[f_name + '_NULL_OR_NOT'] = data[f_name].isna().astype(int)
  return c_data

In [33]:
from google.colab import drive
drive.mount('/content/drive')
#匯入資料
train_data = pd.read_csv('/content/drive/MyDrive/智金輪習Kaggle/train.csv')
test_data = "先不給你們"
#查看資料筆數
print("shape of train data:" , train_data.shape)
#print("shape of test data:" , test_data.shape)
has_null_feature_list = [
   "AVAILABLE_LIMIT_AMT",
   "BONUS_POINTS",
   "CURRENT_CASH_ADV_AMT",
   "CURRENT_FEE",
   "CURRENT_INSTALLMENT_PURCH_AMT",
   "CURRENT_PURCH_AMT",
   "LST_CYCLE_UNPAID_BAL"
  ]
tmp_train_data = extend_with_null_or_not_features(train_data, has_null_feature_list)


log_scale_feature_list = [
  'BNSPT',
  'FLAM1',
  'ACCT_VINTAGE',
  'AVAILABLE_LIMIT_AMT',
  'BONUS_POINTS',
  'CREDIT_LIMIT_AMT',
  'CREDIT_REVOLVING_RATE',
  'CREDIT_USE_RATE',
  'CURRENT_CASH_ADV_AMT',
  'CURRENT_FEE',
  'CURRENT_INSTALLMENT_BAL',
  'CURRENT_INSTALLMENT_PURCH_AMT',
  'CURRENT_PURCH_AMT',
  'LST_CYCLE_UNPAID_BAL',
  'REVOLVING_AMT'
]
tmp_train_data = extend_with_log_scale_features(tmp_train_data, log_scale_feature_list)
tmp_train_data = extend_with_detailed_time(tmp_train_data, 
  weekday = True, hour = True)
train_tmp_data = extend_with_time_difference_features(tmp_train_data, 
  max_time_shift = 20, pivot_feature = 'CHID')
preprocessed_train_data = preprocessing(train_tmp_data)
resampled_train_data = resample(preprocessed_train_data, 
  sampling_rate=0.7, sample_type='downsample')
removed_unimportant_feature_count = 5
X, y = create_X_y(resampled_train_data, 
  drop_list = list(set(["FRAUD_IND", "TXKEY", "DATETIME", "CANO", "CHID", "ACQIC", "MCHNO", "AGE"] + \
  important_feature_table.set_index('col').index[-(removed_unimportant_feature_count):].tolist()))
)
val_percentage = 0.33
x_train, x_test, y_train, y_test = train_test_split(X, y, 
  test_size=val_percentage, shuffle=True, random_state=42)
clf = train_lgb(x_train, x_test, y_train, y_test, 
  max_depth = 8, learning_rate = 0.05, n_estimators = 1000)
evaluate(clf, x_test, y_test)
#important_feature_table = get_important_feature_table(clf, x_train)
#important_feature_table

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
shape of train data: (533202, 59)


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


add time difference between current and 1th-last transaction
add time difference between current and 2th-last transaction
add time difference between current and 3th-last transaction
add time difference between current and 4th-last transaction
add time difference between current and 5th-last transaction
add time difference between current and 6th-last transaction
add time difference between current and 7th-last transaction
add time difference between current and 8th-last transaction
add time difference between current and 9th-last transaction
add time difference between current and 10th-last transaction
add time difference between current and 11th-last transaction
add time difference between current and 12th-last transaction
add time difference between current and 13th-last transaction
add time difference between current and 14th-last transaction
add time difference between current and 15th-last transaction
add time difference between current and 16th-last transaction
add time differen



Training until validation scores don't improve for 30 rounds.
[50]	training's binary_logloss: 0.0834672	valid_1's binary_logloss: 0.0836714
[100]	training's binary_logloss: 0.0522038	valid_1's binary_logloss: 0.0537736
[150]	training's binary_logloss: 0.0434702	valid_1's binary_logloss: 0.0459496
[200]	training's binary_logloss: 0.0393683	valid_1's binary_logloss: 0.0424598
[250]	training's binary_logloss: 0.0356321	valid_1's binary_logloss: 0.0393477
[300]	training's binary_logloss: 0.0325958	valid_1's binary_logloss: 0.0368766
[350]	training's binary_logloss: 0.0299755	valid_1's binary_logloss: 0.0347561
[400]	training's binary_logloss: 0.0270696	valid_1's binary_logloss: 0.0324058
[450]	training's binary_logloss: 0.024545	valid_1's binary_logloss: 0.0303967
[500]	training's binary_logloss: 0.0225986	valid_1's binary_logloss: 0.0289356
[550]	training's binary_logloss: 0.0208548	valid_1's binary_logloss: 0.0276641
[600]	training's binary_logloss: 0.0191038	valid_1's binary_logloss: 0.

In [34]:
important_feature_table_with_null_features = get_important_feature_table(clf, x_train)
important_feature_table_with_null_features = important_feature_table_with_null_features.set_index('col')
for f_name in has_null_feature_list:
  if f_name not in important_feature_table.set_index('col').index[
              -(removed_unimportant_feature_count):].tolist():
    linear_importance_score = important_feature_table_with_null_features.loc[f_name]['imp']
    null_importance_score = important_feature_table_with_null_features.loc[f_name+'_LOG_SCALE']['imp']
    print(f_name, linear_importance_score, null_importance_score)
    if null_importance_score > linear_importance_score:
      print('null feature is better')

AVAILABLE_LIMIT_AMT 616 454
BONUS_POINTS 771 370
CURRENT_CASH_ADV_AMT 55 3
CURRENT_FEE 802 273
CURRENT_INSTALLMENT_PURCH_AMT 331 201
CURRENT_PURCH_AMT 747 558
LST_CYCLE_UNPAID_BAL 109 88


## Generate Testing Result 

In [38]:
from google.colab import drive
drive.mount('/content/drive')
#匯入資料
test_data = pd.read_csv('/content/drive/MyDrive/智金輪習Kaggle/test.csv')
#查看資料筆數
#print("shape of test data:" , test_data.shape)
has_null_feature_list = [
   "AVAILABLE_LIMIT_AMT",
   "BONUS_POINTS",
   "CURRENT_CASH_ADV_AMT",
   "CURRENT_FEE",
   "CURRENT_INSTALLMENT_PURCH_AMT",
   "CURRENT_PURCH_AMT",
   "LST_CYCLE_UNPAID_BAL"
  ]
tmp_data = extend_with_null_or_not_features(test_data, has_null_feature_list)
log_scale_feature_list = [
  'BNSPT',
  'FLAM1',
  'ACCT_VINTAGE',
  'AVAILABLE_LIMIT_AMT',
  'BONUS_POINTS',
  'CREDIT_LIMIT_AMT',
  'CREDIT_REVOLVING_RATE',
  'CREDIT_USE_RATE',
  'CURRENT_CASH_ADV_AMT',
  'CURRENT_FEE',
  'CURRENT_INSTALLMENT_BAL',
  'CURRENT_INSTALLMENT_PURCH_AMT',
  'CURRENT_PURCH_AMT',
  'LST_CYCLE_UNPAID_BAL',
  'REVOLVING_AMT'
]
tmp_data = extend_with_log_scale_features(tmp_data, log_scale_feature_list)
tmp_data = extend_with_detailed_time(tmp_data, 
  weekday = True, hour = True)
tmp_data = extend_with_time_difference_features(tmp_data, 
  max_time_shift = 20, pivot_feature = 'CHID')
preprocessed_data = preprocessing(tmp_data)
removed_unimportant_feature_count = 5
X = create_X(preprocessed_data, 
  drop_list = list(set(["TXKEY", "DATETIME", "CANO", "CHID", "ACQIC", "MCHNO", "AGE"] + \
  important_feature_table.set_index('col').index[-(removed_unimportant_feature_count):].tolist()))
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


add time difference between current and 1th-last transaction
add time difference between current and 2th-last transaction
add time difference between current and 3th-last transaction
add time difference between current and 4th-last transaction
add time difference between current and 5th-last transaction
add time difference between current and 6th-last transaction
add time difference between current and 7th-last transaction
add time difference between current and 8th-last transaction
add time difference between current and 9th-last transaction
add time difference between current and 10th-last transaction
add time difference between current and 11th-last transaction
add time difference between current and 12th-last transaction
add time difference between current and 13th-last transaction
add time difference between current and 14th-last transaction
add time difference between current and 15th-last transaction
add time difference between current and 16th-last transaction
add time differen

In [46]:
def get_best_threshold(tolerance = 0.05, boundary = (0., 1.)):
  y_pred = clf.predict(X)
  train_imbalance_rate = train_data['FRAUD_IND'].mean()
  print("imbalance rate of train data:", train_imbalance_rate)
  threshold = 0.5
  y_result = (y_pred > threshold).astype(int).T
  result_table = pd.DataFrame([test_data['TXKEY'], y_result]).T
  result_table.columns = ['TXKEY', 'FRAUD_IND']
  imbalance_rate = result_table['FRAUD_IND'].mean()
  print("imbalance rate of test data:", imbalance_rate)
  while np.abs(train_imbalance_rate-imbalance_rate) >= tolerance:
    print('threshold:', threshold)
    y_result = (y_pred > threshold).astype(int).T
    result_table = pd.DataFrame([test_data['TXKEY'], y_result]).T
    result_table.columns = ['TXKEY', 'FRAUD_IND']
    imbalance_rate = result_table['FRAUD_IND'].mean()
    print("imbalance rate of test data:", imbalance_rate)
    if imbalance_rate > train_imbalance_rate:
      threshold = (boundary[1] + threshold)/2.
      boundary = threshold, boundary[1]
    else:
      threshold = (boundary[0] + threshold)/2.
      boundary = boundary[0], threshold
    print("boundary",boundary)
  return threshold

In [None]:
threshold = get_best_threshold(tolerance = 0.05, boundary = (0., 1.))

In [41]:
threshold = 0.96875
y_result = (y_pred > threshold).astype(int).T
result_table = pd.DataFrame([test_data['TXKEY'], y_result]).T
result_table.columns = ['TXKEY', 'FRAUD_IND']
print("imbalance rate of test data:", result_table['FRAUD_IND'].mean())
result_table.to_csv('tmp_submission.csv')

imbalance rate of test data: 0.1362740427874284


## performance: 0.03998

# Strategy 3 : Add same shop feature or not (Not Effect)

In [42]:
def extend_with_same_shop_features(data, max_time_shift = 5, pivot_feature = 'CHID'):
  # CHID: 卡人ID
  # CANO: 交易卡號
  c_data = copy.copy(data)
  assert max_time_shift > 2
  def identical_shop_index(data, time_shift, pivot_feature):
    df = copy.copy(data)
    df["shift"] = df.groupby([pivot_feature])['MCHNO'].shift(time_shift)
    name = pivot_feature + '_SAME' + str(time_shift)
    df[name] = (df["MCHNO"] == df['shift']).astype(int)
    df[name][df['MCHNO'].isna()] = -1
    df[name][df['shift'].isna()] = -1
    df = df.drop("shift", 1)
    return df
  for time_shift in range(1, max_time_shift + 1):
    print("add shop identical index between current and " + str(time_shift) + "th-last transaction")
    c_data = identical_shop_index(c_data, time_shift, pivot_feature)
  return c_data

In [43]:
from google.colab import drive
drive.mount('/content/drive')
#匯入資料
train_data = pd.read_csv('/content/drive/MyDrive/智金輪習Kaggle/train.csv')
test_data = "先不給你們"
#查看資料筆數
print("shape of train data:" , train_data.shape)
#print("shape of test data:" , test_data.shape)
has_null_feature_list = [
   "AVAILABLE_LIMIT_AMT",
   "BONUS_POINTS",
   "CURRENT_CASH_ADV_AMT",
   "CURRENT_FEE",
   "CURRENT_INSTALLMENT_PURCH_AMT",
   "CURRENT_PURCH_AMT",
   "LST_CYCLE_UNPAID_BAL"
  ]
tmp_train_data = extend_with_null_or_not_features(train_data, has_null_feature_list)


log_scale_feature_list = [
  'BNSPT',
  'FLAM1',
  'ACCT_VINTAGE',
  'AVAILABLE_LIMIT_AMT',
  'BONUS_POINTS',
  'CREDIT_LIMIT_AMT',
  'CREDIT_REVOLVING_RATE',
  'CREDIT_USE_RATE',
  'CURRENT_CASH_ADV_AMT',
  'CURRENT_FEE',
  'CURRENT_INSTALLMENT_BAL',
  'CURRENT_INSTALLMENT_PURCH_AMT',
  'CURRENT_PURCH_AMT',
  'LST_CYCLE_UNPAID_BAL',
  'REVOLVING_AMT'
]
tmp_train_data = extend_with_log_scale_features(tmp_train_data, log_scale_feature_list)
tmp_train_data = extend_with_detailed_time(tmp_train_data, 
  weekday = True, hour = True)
train_tmp_data = extend_with_time_difference_features(tmp_train_data, 
  max_time_shift = 20, pivot_feature = 'CHID')

train_tmp_data = extend_with_same_shop_features(tmp_train_data, 
  max_time_shift = 20, pivot_feature = 'CHID')
preprocessed_train_data = preprocessing(train_tmp_data)
resampled_train_data = resample(preprocessed_train_data, 
  sampling_rate=0.7, sample_type='downsample')
removed_unimportant_feature_count = 5
X, y = create_X_y(resampled_train_data, 
  drop_list = list(set(["FRAUD_IND", "TXKEY", "DATETIME", "CANO", "CHID", "ACQIC", "MCHNO", "AGE"] + \
  important_feature_table.set_index('col').index[-(removed_unimportant_feature_count):].tolist()))
)
val_percentage = 0.33
x_train, x_test, y_train, y_test = train_test_split(X, y, 
  test_size=val_percentage, shuffle=True, random_state=42)
clf = train_lgb(x_train, x_test, y_train, y_test, 
  max_depth = 8, learning_rate = 0.05, n_estimators = 1000)
evaluate(clf, x_test, y_test)
#important_feature_table = get_important_feature_table(clf, x_train)
#important_feature_table

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
shape of train data: (533202, 59)


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


add time difference between current and 1th-last transaction
add time difference between current and 2th-last transaction
add time difference between current and 3th-last transaction
add time difference between current and 4th-last transaction
add time difference between current and 5th-last transaction
add time difference between current and 6th-last transaction
add time difference between current and 7th-last transaction
add time difference between current and 8th-last transaction
add time difference between current and 9th-last transaction
add time difference between current and 10th-last transaction
add time difference between current and 11th-last transaction
add time difference between current and 12th-last transaction
add time difference between current and 13th-last transaction
add time difference between current and 14th-last transaction
add time difference between current and 15th-last transaction
add time difference between current and 16th-last transaction
add time differen

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


add shop identical index between current and 2th-last transaction
add shop identical index between current and 3th-last transaction
add shop identical index between current and 4th-last transaction
add shop identical index between current and 5th-last transaction
add shop identical index between current and 6th-last transaction
add shop identical index between current and 7th-last transaction
add shop identical index between current and 8th-last transaction
add shop identical index between current and 9th-last transaction
add shop identical index between current and 10th-last transaction
add shop identical index between current and 11th-last transaction
add shop identical index between current and 12th-last transaction
add shop identical index between current and 13th-last transaction
add shop identical index between current and 14th-last transaction
add shop identical index between current and 15th-last transaction
add shop identical index between current and 16th-last transaction
add



Training until validation scores don't improve for 30 rounds.
[50]	training's binary_logloss: 0.0966026	valid_1's binary_logloss: 0.097329
[100]	training's binary_logloss: 0.065091	valid_1's binary_logloss: 0.0667404
[150]	training's binary_logloss: 0.0537046	valid_1's binary_logloss: 0.0563106
[200]	training's binary_logloss: 0.0470863	valid_1's binary_logloss: 0.0504664
[250]	training's binary_logloss: 0.0422039	valid_1's binary_logloss: 0.0463074
[300]	training's binary_logloss: 0.0382488	valid_1's binary_logloss: 0.0430045
[350]	training's binary_logloss: 0.0349299	valid_1's binary_logloss: 0.0403005
[400]	training's binary_logloss: 0.0322256	valid_1's binary_logloss: 0.0381258
[450]	training's binary_logloss: 0.0297539	valid_1's binary_logloss: 0.0362427
[500]	training's binary_logloss: 0.0274873	valid_1's binary_logloss: 0.0344338
[550]	training's binary_logloss: 0.0256312	valid_1's binary_logloss: 0.0330677
[600]	training's binary_logloss: 0.0239068	valid_1's binary_logloss: 0.0

# Strategy 4 : Upsample

In [44]:
from google.colab import drive
drive.mount('/content/drive')
#匯入資料
train_data = pd.read_csv('/content/drive/MyDrive/智金輪習Kaggle/train.csv')
test_data = "先不給你們"
#查看資料筆數
print("shape of train data:" , train_data.shape)
#print("shape of test data:" , test_data.shape)
has_null_feature_list = [
   "AVAILABLE_LIMIT_AMT",
   "BONUS_POINTS",
   "CURRENT_CASH_ADV_AMT",
   "CURRENT_FEE",
   "CURRENT_INSTALLMENT_PURCH_AMT",
   "CURRENT_PURCH_AMT",
   "LST_CYCLE_UNPAID_BAL"
  ]
tmp_train_data = extend_with_null_or_not_features(train_data, has_null_feature_list)


log_scale_feature_list = [
  'BNSPT',
  'FLAM1',
  'ACCT_VINTAGE',
  'AVAILABLE_LIMIT_AMT',
  'BONUS_POINTS',
  'CREDIT_LIMIT_AMT',
  'CREDIT_REVOLVING_RATE',
  'CREDIT_USE_RATE',
  'CURRENT_CASH_ADV_AMT',
  'CURRENT_FEE',
  'CURRENT_INSTALLMENT_BAL',
  'CURRENT_INSTALLMENT_PURCH_AMT',
  'CURRENT_PURCH_AMT',
  'LST_CYCLE_UNPAID_BAL',
  'REVOLVING_AMT'
]
tmp_train_data = extend_with_log_scale_features(tmp_train_data, log_scale_feature_list)
tmp_train_data = extend_with_detailed_time(tmp_train_data, 
  weekday = True, hour = True)
train_tmp_data = extend_with_time_difference_features(tmp_train_data, 
  max_time_shift = 20, pivot_feature = 'CHID')
preprocessed_train_data = preprocessing(train_tmp_data)
resampled_train_data = resample(preprocessed_train_data, 
  sampling_rate=0.7, sample_type='upsample')
removed_unimportant_feature_count = 5
X, y = create_X_y(resampled_train_data, 
  drop_list = list(set(["FRAUD_IND", "TXKEY", "DATETIME", "CANO", "CHID", "ACQIC", "MCHNO", "AGE"] + \
  important_feature_table.set_index('col').index[-(removed_unimportant_feature_count):].tolist()))
)
val_percentage = 0.33
x_train, x_test, y_train, y_test = train_test_split(X, y, 
  test_size=val_percentage, shuffle=True, random_state=42)
clf = train_lgb(x_train, x_test, y_train, y_test, 
  max_depth = 8, learning_rate = 0.05, n_estimators = 1000)
evaluate(clf, x_test, y_test)
#important_feature_table = get_important_feature_table(clf, x_train)
#important_feature_table

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
shape of train data: (533202, 59)


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


add time difference between current and 1th-last transaction
add time difference between current and 2th-last transaction
add time difference between current and 3th-last transaction
add time difference between current and 4th-last transaction
add time difference between current and 5th-last transaction
add time difference between current and 6th-last transaction
add time difference between current and 7th-last transaction
add time difference between current and 8th-last transaction
add time difference between current and 9th-last transaction
add time difference between current and 10th-last transaction
add time difference between current and 11th-last transaction
add time difference between current and 12th-last transaction
add time difference between current and 13th-last transaction
add time difference between current and 14th-last transaction
add time difference between current and 15th-last transaction
add time difference between current and 16th-last transaction
add time differen



Training until validation scores don't improve for 30 rounds.
[50]	training's binary_logloss: 0.0823547	valid_1's binary_logloss: 0.082804
[100]	training's binary_logloss: 0.0510114	valid_1's binary_logloss: 0.0525342
[150]	training's binary_logloss: 0.0424765	valid_1's binary_logloss: 0.0444875
[200]	training's binary_logloss: 0.0382781	valid_1's binary_logloss: 0.0405338
[250]	training's binary_logloss: 0.0348107	valid_1's binary_logloss: 0.0374528
[300]	training's binary_logloss: 0.0316671	valid_1's binary_logloss: 0.0345926
[350]	training's binary_logloss: 0.0291227	valid_1's binary_logloss: 0.0323767
[400]	training's binary_logloss: 0.0267736	valid_1's binary_logloss: 0.030258
[450]	training's binary_logloss: 0.0243756	valid_1's binary_logloss: 0.0282147
[500]	training's binary_logloss: 0.0223505	valid_1's binary_logloss: 0.0264626
[550]	training's binary_logloss: 0.0206306	valid_1's binary_logloss: 0.0249432
[600]	training's binary_logloss: 0.0192743	valid_1's binary_logloss: 0.0

## Generate Testing Result 

In [49]:
from google.colab import drive
drive.mount('/content/drive')
#匯入資料
test_data = pd.read_csv('/content/drive/MyDrive/智金輪習Kaggle/test.csv')
#查看資料筆數
#print("shape of test data:" , test_data.shape)
has_null_feature_list = [
   "AVAILABLE_LIMIT_AMT",
   "BONUS_POINTS",
   "CURRENT_CASH_ADV_AMT",
   "CURRENT_FEE",
   "CURRENT_INSTALLMENT_PURCH_AMT",
   "CURRENT_PURCH_AMT",
   "LST_CYCLE_UNPAID_BAL"
  ]
tmp_data = extend_with_null_or_not_features(test_data, has_null_feature_list)
log_scale_feature_list = [
  'BNSPT',
  'FLAM1',
  'ACCT_VINTAGE',
  'AVAILABLE_LIMIT_AMT',
  'BONUS_POINTS',
  'CREDIT_LIMIT_AMT',
  'CREDIT_REVOLVING_RATE',
  'CREDIT_USE_RATE',
  'CURRENT_CASH_ADV_AMT',
  'CURRENT_FEE',
  'CURRENT_INSTALLMENT_BAL',
  'CURRENT_INSTALLMENT_PURCH_AMT',
  'CURRENT_PURCH_AMT',
  'LST_CYCLE_UNPAID_BAL',
  'REVOLVING_AMT'
]
tmp_data = extend_with_log_scale_features(tmp_data, log_scale_feature_list)
tmp_data = extend_with_detailed_time(tmp_data, 
  weekday = True, hour = True)
tmp_data = extend_with_time_difference_features(tmp_data, 
  max_time_shift = 20, pivot_feature = 'CHID')
preprocessed_data = preprocessing(tmp_data)
removed_unimportant_feature_count = 5
X = create_X(preprocessed_data, 
  drop_list = list(set(["TXKEY", "DATETIME", "CANO", "CHID", "ACQIC", "MCHNO", "AGE"] + \
  important_feature_table.set_index('col').index[-(removed_unimportant_feature_count):].tolist()))
)
### Threshold Tuning ########################################################
tolerance = 0.01
boundary = (0., 1.)
threshold = 0.5

y_pred = clf.predict(X)
train_imbalance_rate = train_data['FRAUD_IND'].mean()
print("imbalance rate of train data:", train_imbalance_rate)

y_result = (y_pred > threshold).astype(int).T
result_table = pd.DataFrame([test_data['TXKEY'], y_result]).T
result_table.columns = ['TXKEY', 'FRAUD_IND']
imbalance_rate = result_table['FRAUD_IND'].mean()
print("imbalance rate of test data:", imbalance_rate)
while np.abs(train_imbalance_rate-imbalance_rate) >= tolerance:
  print('threshold:', threshold)
  y_result = (y_pred > threshold).astype(int).T
  result_table = pd.DataFrame([test_data['TXKEY'], y_result]).T
  result_table.columns = ['TXKEY', 'FRAUD_IND']
  imbalance_rate = result_table['FRAUD_IND'].mean()
  print("imbalance rate of test data:", imbalance_rate)
  if imbalance_rate > train_imbalance_rate:
    threshold = (boundary[1] + threshold)/2.
    boundary = threshold, boundary[1]
  else:
    threshold = (boundary[0] + threshold)/2.
    boundary = boundary[0], threshold
  print("boundary",boundary)
### Generate CSV ########################################################
threshold = 0.96875
y_result = (y_pred > threshold).astype(int).T
result_table = pd.DataFrame([test_data['TXKEY'], y_result]).T
result_table.columns = ['TXKEY', 'FRAUD_IND']
print("imbalance rate of test data:", result_table['FRAUD_IND'].mean())
result_table.to_csv('tmp_submission.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


add time difference between current and 1th-last transaction
add time difference between current and 2th-last transaction
add time difference between current and 3th-last transaction
add time difference between current and 4th-last transaction
add time difference between current and 5th-last transaction
add time difference between current and 6th-last transaction
add time difference between current and 7th-last transaction
add time difference between current and 8th-last transaction
add time difference between current and 9th-last transaction
add time difference between current and 10th-last transaction
add time difference between current and 11th-last transaction
add time difference between current and 12th-last transaction
add time difference between current and 13th-last transaction
add time difference between current and 14th-last transaction
add time difference between current and 15th-last transaction
add time difference between current and 16th-last transaction
add time differen

## performance: 0.04284

# Strategy 5: Add same shop type, same country, or same MCC, FLAM1

In [61]:
def extend_with_same_MCC(data, max_time_shift = 5, pivot_feature = 'CHID'):
  # CHID: 卡人ID
  # CANO: 交易卡號
  c_data = copy.copy(data)
  assert max_time_shift > 2
  def identical_MCC_index(data, time_shift, pivot_feature):
    df = copy.copy(data)
    df["shift"] = df.groupby([pivot_feature])['MCC'].shift(time_shift)
    name = pivot_feature + '_SAME' + str(time_shift)
    df[name] = (df["MCC"] == df['shift']).astype(int)
    df = df.drop("shift", 1)
    return df
  for time_shift in range(1, max_time_shift + 1):
    print("add MCC identical index between current and " + str(time_shift) + "th-last transaction")
    c_data = identical_MCC_index(c_data, time_shift, pivot_feature)
  return c_data
def extend_with_same_STOCN(data, max_time_shift = 5, pivot_feature = 'CHID'):
  # CHID: 卡人ID
  # CANO: 交易卡號
  c_data = copy.copy(data)
  assert max_time_shift > 2
  def identical_STOCN_index(data, time_shift, pivot_feature):
    df = copy.copy(data)
    df["shift"] = df.groupby([pivot_feature])['STOCN'].shift(time_shift)
    name = pivot_feature + '_SAME' + str(time_shift)
    df[name] = (df["STOCN"] == df['shift']).astype(int)
    df = df.drop("shift", 1)
    return df
  for time_shift in range(1, max_time_shift + 1):
    print("add STOCN identical index between current and " + str(time_shift) + "th-last transaction")
    c_data = identical_STOCN_index(c_data, time_shift, pivot_feature)
  return c_data
def extend_with_same_FLAM1(data, max_time_shift = 5, pivot_feature = 'CHID'):
  # CHID: 卡人ID
  # CANO: 交易卡號
  c_data = copy.copy(data)
  assert max_time_shift > 2
  def identical_FLAM1_index(data, time_shift, pivot_feature):
    df = copy.copy(data)
    df["shift"] = df.groupby([pivot_feature])['FLAM1'].shift(time_shift)
    name = pivot_feature + '_DIFF' + str(time_shift)
    df[name] = (df["FLAM1"] - df['shift']).fillna(0)
    df = df.drop("shift", 1)
    return df
  for time_shift in range(1, max_time_shift + 1):
    print("add FLAM1 identical index between current and " + str(time_shift) + "th-last transaction")
    c_data = identical_FLAM1_index(c_data, time_shift, pivot_feature)
  return c_data

In [64]:
from google.colab import drive
drive.mount('/content/drive')
#匯入資料
train_data = pd.read_csv('/content/drive/MyDrive/智金輪習Kaggle/train.csv')
test_data = "先不給你們"
#查看資料筆數
print("shape of train data:" , train_data.shape)
#print("shape of test data:" , test_data.shape)
has_null_feature_list = [
   "AVAILABLE_LIMIT_AMT",
   "BONUS_POINTS",
   "CURRENT_CASH_ADV_AMT",
   "CURRENT_FEE",
   "CURRENT_INSTALLMENT_PURCH_AMT",
   "CURRENT_PURCH_AMT",
   "LST_CYCLE_UNPAID_BAL"
  ]
tmp_train_data = extend_with_null_or_not_features(train_data, has_null_feature_list)

log_scale_feature_list = [
  'BNSPT',
  'FLAM1',
  'ACCT_VINTAGE',
  'AVAILABLE_LIMIT_AMT',
  'BONUS_POINTS',
  'CREDIT_LIMIT_AMT',
  'CREDIT_REVOLVING_RATE',
  'CREDIT_USE_RATE',
  'CURRENT_CASH_ADV_AMT',
  'CURRENT_FEE',
  'CURRENT_INSTALLMENT_BAL',
  'CURRENT_INSTALLMENT_PURCH_AMT',
  'CURRENT_PURCH_AMT',
  'LST_CYCLE_UNPAID_BAL',
  'REVOLVING_AMT'
]
tmp_train_data = extend_with_log_scale_features(tmp_train_data, log_scale_feature_list)
tmp_train_data = extend_with_detailed_time(tmp_train_data, 
  weekday = True, hour = True)
train_tmp_data = extend_with_time_difference_features(tmp_train_data, 
  max_time_shift = 20, pivot_feature = 'CHID')
train_tmp_data = extend_with_same_MCC(tmp_train_data, 
  max_time_shift = 5, pivot_feature = 'CHID')
train_tmp_data = extend_with_same_STOCN(tmp_train_data, 
  max_time_shift = 5, pivot_feature = 'CHID')
train_tmp_data = extend_with_same_FLAM1(tmp_train_data, 
  max_time_shift = 5, pivot_feature = 'CHID')
preprocessed_train_data = preprocessing(train_tmp_data)
resampled_train_data = resample(preprocessed_train_data, 
  sampling_rate=0.14, sample_type='upsample')
removed_unimportant_feature_count = 5
X, y = create_X_y(resampled_train_data, 
  drop_list = list(set(["FRAUD_IND", "TXKEY", "DATETIME", "CANO", "CHID", "ACQIC", "MCHNO", "AGE"] + \
  important_feature_table.set_index('col').index[-(removed_unimportant_feature_count):].tolist()))
)
val_percentage = 0.33
x_train, x_test, y_train, y_test = train_test_split(X, y, 
  test_size=val_percentage, shuffle=True, random_state=42)
clf = train_lgb(x_train, x_test, y_train, y_test, 
  max_depth = 8, learning_rate = 0.1, n_estimators = 3000)
evaluate(clf, x_test, y_test)
#important_feature_table = get_important_feature_table(clf, x_train)
#important_feature_table

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
shape of train data: (533202, 59)


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


add time difference between current and 1th-last transaction
add time difference between current and 2th-last transaction
add time difference between current and 3th-last transaction
add time difference between current and 4th-last transaction
add time difference between current and 5th-last transaction
add time difference between current and 6th-last transaction
add time difference between current and 7th-last transaction
add time difference between current and 8th-last transaction
add time difference between current and 9th-last transaction
add time difference between current and 10th-last transaction
add time difference between current and 11th-last transaction
add time difference between current and 12th-last transaction
add time difference between current and 13th-last transaction
add time difference between current and 14th-last transaction
add time difference between current and 15th-last transaction
add time difference between current and 16th-last transaction
add time differen



Training until validation scores don't improve for 30 rounds.
[50]	training's binary_logloss: 0.117393	valid_1's binary_logloss: 0.11786
[100]	training's binary_logloss: 0.0900356	valid_1's binary_logloss: 0.0916258
[150]	training's binary_logloss: 0.0760189	valid_1's binary_logloss: 0.0782397
[200]	training's binary_logloss: 0.0658633	valid_1's binary_logloss: 0.0686938
[250]	training's binary_logloss: 0.0578633	valid_1's binary_logloss: 0.0611055
[300]	training's binary_logloss: 0.0516989	valid_1's binary_logloss: 0.0552256
[350]	training's binary_logloss: 0.0469727	valid_1's binary_logloss: 0.0508441
[400]	training's binary_logloss: 0.0425604	valid_1's binary_logloss: 0.0467101
[450]	training's binary_logloss: 0.0389268	valid_1's binary_logloss: 0.0433301
[500]	training's binary_logloss: 0.0357765	valid_1's binary_logloss: 0.0404164
[550]	training's binary_logloss: 0.0330955	valid_1's binary_logloss: 0.0379361
[600]	training's binary_logloss: 0.0303492	valid_1's binary_logloss: 0.03

## Generate Testing Result 

In [65]:
from google.colab import drive
drive.mount('/content/drive')
#匯入資料
test_data = pd.read_csv('/content/drive/MyDrive/智金輪習Kaggle/test.csv')
#查看資料筆數
#print("shape of test data:" , test_data.shape)
has_null_feature_list = [
   "AVAILABLE_LIMIT_AMT",
   "BONUS_POINTS",
   "CURRENT_CASH_ADV_AMT",
   "CURRENT_FEE",
   "CURRENT_INSTALLMENT_PURCH_AMT",
   "CURRENT_PURCH_AMT",
   "LST_CYCLE_UNPAID_BAL"
  ]
tmp_data = extend_with_null_or_not_features(test_data, has_null_feature_list)
log_scale_feature_list = [
  'BNSPT',
  'FLAM1',
  'ACCT_VINTAGE',
  'AVAILABLE_LIMIT_AMT',
  'BONUS_POINTS',
  'CREDIT_LIMIT_AMT',
  'CREDIT_REVOLVING_RATE',
  'CREDIT_USE_RATE',
  'CURRENT_CASH_ADV_AMT',
  'CURRENT_FEE',
  'CURRENT_INSTALLMENT_BAL',
  'CURRENT_INSTALLMENT_PURCH_AMT',
  'CURRENT_PURCH_AMT',
  'LST_CYCLE_UNPAID_BAL',
  'REVOLVING_AMT'
]
tmp_data = extend_with_log_scale_features(tmp_data, log_scale_feature_list)
tmp_data = extend_with_detailed_time(tmp_data, 
  weekday = True, hour = True)
tmp_data = extend_with_time_difference_features(tmp_data, 
  max_time_shift = 20, pivot_feature = 'CHID')
tmp_data = extend_with_same_MCC(tmp_data, 
  max_time_shift = 5, pivot_feature = 'CHID')
tmp_data = extend_with_same_STOCN(tmp_data, 
  max_time_shift = 5, pivot_feature = 'CHID')
tmp_data = extend_with_same_FLAM1(tmp_data, 
  max_time_shift = 5, pivot_feature = 'CHID')
preprocessed_data = preprocessing(tmp_data)
removed_unimportant_feature_count = 5
X = create_X(preprocessed_data, 
  drop_list = list(set(["TXKEY", "DATETIME", "CANO", "CHID", "ACQIC", "MCHNO", "AGE"] + \
  important_feature_table.set_index('col').index[-(removed_unimportant_feature_count):].tolist()))
)
### Threshold Tuning ########################################################
tolerance = 0.01
boundary = (0.9, 0.99)
threshold = 0.95

y_pred = clf.predict(X)
train_imbalance_rate = train_data['FRAUD_IND'].mean()
print("imbalance rate of train data:", train_imbalance_rate)

y_result = (y_pred > threshold).astype(int).T
result_table = pd.DataFrame([test_data['TXKEY'], y_result]).T
result_table.columns = ['TXKEY', 'FRAUD_IND']
imbalance_rate = result_table['FRAUD_IND'].mean()
print("imbalance rate of test data:", imbalance_rate)
while np.abs(train_imbalance_rate-imbalance_rate) >= tolerance and boundary[0]<boundary[1]:
  print('threshold:', threshold)
  y_result = (y_pred > threshold).astype(int).T
  result_table = pd.DataFrame([test_data['TXKEY'], y_result]).T
  result_table.columns = ['TXKEY', 'FRAUD_IND']
  imbalance_rate = result_table['FRAUD_IND'].mean()
  print("imbalance rate of test data:", imbalance_rate)
  if imbalance_rate > train_imbalance_rate:
    threshold = (boundary[1] + threshold)/2.
    boundary = threshold, boundary[1]
  else:
    threshold = (boundary[0] + threshold)/2.
    boundary = boundary[0], threshold
  print("boundary",boundary)
### Generate CSV ########################################################
threshold = threshold
y_result = (y_pred > threshold).astype(int).T
result_table = pd.DataFrame([test_data['TXKEY'], y_result]).T
result_table.columns = ['TXKEY', 'FRAUD_IND']
print("imbalance rate of test data:", result_table['FRAUD_IND'].mean())
result_table.to_csv('tmp_submission.csv')
print('csv saved.')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


add time difference between current and 1th-last transaction
add time difference between current and 2th-last transaction
add time difference between current and 3th-last transaction
add time difference between current and 4th-last transaction
add time difference between current and 5th-last transaction
add time difference between current and 6th-last transaction
add time difference between current and 7th-last transaction
add time difference between current and 8th-last transaction
add time difference between current and 9th-last transaction
add time difference between current and 10th-last transaction
add time difference between current and 11th-last transaction
add time difference between current and 12th-last transaction
add time difference between current and 13th-last transaction
add time difference between current and 14th-last transaction
add time difference between current and 15th-last transaction
add time difference between current and 16th-last transaction
add time differen

# Strategy 6: Add same ECFG, PAY_TYPE, CONTP, ETYMD



In [88]:
def extend_with_same_class_between_transactions(data, f_name, max_time_shift = 5, pivot_feature = 'CHID'):
  # CHID: 卡人ID
  # CANO: 交易卡號
  c_data = copy.copy(data)
  assert max_time_shift > 2
  def identical_MCC_index(data, time_shift, pivot_feature):
    df = copy.copy(data)
    df["shift"] = df.groupby([pivot_feature])[f_name].shift(time_shift)
    name = pivot_feature + '_SAME' + str(time_shift)
    df[name] = (df[f_name] == df['shift']).astype(int)
    df = df.drop("shift", 1)
    return df
  for time_shift in range(1, max_time_shift + 1):
    print("add " + f_name + " identical index between current and " + str(time_shift) + "th-last transaction")
    c_data = identical_MCC_index(c_data, time_shift, pivot_feature)
  return c_data


In [None]:
from google.colab import drive
drive.mount('/content/drive')
#匯入資料
train_data = pd.read_csv('/content/drive/MyDrive/智金輪習Kaggle/train.csv')
test_data = "先不給你們"
#查看資料筆數
print("shape of train data:" , train_data.shape)
#print("shape of test data:" , test_data.shape)
has_null_feature_list = [
   "AVAILABLE_LIMIT_AMT",
   "BONUS_POINTS",
   "CURRENT_CASH_ADV_AMT",
   "CURRENT_FEE",
   "CURRENT_INSTALLMENT_PURCH_AMT",
   "CURRENT_PURCH_AMT",
   "LST_CYCLE_UNPAID_BAL"
  ]
tmp_train_data = extend_with_null_or_not_features(train_data, has_null_feature_list)

log_scale_feature_list = [
  'BNSPT',
  'FLAM1',
  'ACCT_VINTAGE',
  'AVAILABLE_LIMIT_AMT',
  'BONUS_POINTS',
  'CREDIT_LIMIT_AMT',
  'CREDIT_REVOLVING_RATE',
  'CREDIT_USE_RATE',
  'CURRENT_CASH_ADV_AMT',
  'CURRENT_FEE',
  'CURRENT_INSTALLMENT_BAL',
  'CURRENT_INSTALLMENT_PURCH_AMT',
  'CURRENT_PURCH_AMT',
  'LST_CYCLE_UNPAID_BAL',
  'REVOLVING_AMT'
]
tmp_train_data = extend_with_log_scale_features(tmp_train_data, log_scale_feature_list)
tmp_train_data = extend_with_detailed_time(tmp_train_data, 
  weekday = True, hour = True)
train_tmp_data = extend_with_time_difference_features(tmp_train_data, 
  max_time_shift = 20, pivot_feature = 'CHID')
train_tmp_data = extend_with_same_MCC(tmp_train_data, 
  max_time_shift = 5, pivot_feature = 'CHID')
train_tmp_data = extend_with_same_STOCN(tmp_train_data, 
  max_time_shift = 5, pivot_feature = 'CHID')
train_tmp_data = extend_with_same_FLAM1(tmp_train_data, 
  max_time_shift = 5, pivot_feature = 'CHID')
# ECFG, PAY_TYPE, CONTP, ETYMD
train_tmp_data = extend_with_same_class_between_transactions(tmp_train_data, 'ECFG',
  max_time_shift = 20, pivot_feature = 'CHID')
train_tmp_data = extend_with_same_class_between_transactions(tmp_train_data, 'PAY_TYPE', 
  max_time_shift = 20, pivot_feature = 'CHID')
train_tmp_data = extend_with_same_class_between_transactions(tmp_train_data, 'CONTP', 
  max_time_shift = 20, pivot_feature = 'CHID')
train_tmp_data = extend_with_same_class_between_transactions(tmp_train_data, 'ETYMD', 
  max_time_shift = 20, pivot_feature = 'CHID')

preprocessed_train_data = preprocessing(train_tmp_data)
resampled_train_data = resample(preprocessed_train_data, 
  sampling_rate=0.14, sample_type='upsample')
removed_unimportant_feature_count = 5
X, y = create_X_y(resampled_train_data, 
  drop_list = list(set(["FRAUD_IND", "TXKEY", "DATETIME", "CANO", "CHID", "ACQIC", "MCHNO", "AGE"] + \
  important_feature_table.set_index('col').index[-(removed_unimportant_feature_count):].tolist()))
)
val_percentage = 0.33
x_train, x_test, y_train, y_test = train_test_split(X, y, 
  test_size=val_percentage, shuffle=True, random_state=42)
clf = train_lgb(x_train, x_test, y_train, y_test, 
  max_depth = 8, learning_rate = 0.1, n_estimators = 3000)
evaluate(clf, x_test, y_test)
#important_feature_table = get_important_feature_table(clf, x_train)
#important_feature_table
# performance F1: 0.9984

## generate testing result 

In [None]:
from google.colab import drive
drive.mount('/content/drive')
#匯入資料
test_data = pd.read_csv('/content/drive/MyDrive/智金輪習Kaggle/test.csv')
#查看資料筆數
#print("shape of test data:" , test_data.shape)
has_null_feature_list = [
   "AVAILABLE_LIMIT_AMT",
   "BONUS_POINTS",
   "CURRENT_CASH_ADV_AMT",
   "CURRENT_FEE",
   "CURRENT_INSTALLMENT_PURCH_AMT",
   "CURRENT_PURCH_AMT",
   "LST_CYCLE_UNPAID_BAL"
  ]
tmp_data = extend_with_null_or_not_features(test_data, has_null_feature_list)
log_scale_feature_list = [
  'BNSPT',
  'FLAM1',
  'ACCT_VINTAGE',
  'AVAILABLE_LIMIT_AMT',
  'BONUS_POINTS',
  'CREDIT_LIMIT_AMT',
  'CREDIT_REVOLVING_RATE',
  'CREDIT_USE_RATE',
  'CURRENT_CASH_ADV_AMT',
  'CURRENT_FEE',
  'CURRENT_INSTALLMENT_BAL',
  'CURRENT_INSTALLMENT_PURCH_AMT',
  'CURRENT_PURCH_AMT',
  'LST_CYCLE_UNPAID_BAL',
  'REVOLVING_AMT'
]
tmp_data = extend_with_log_scale_features(tmp_data, log_scale_feature_list)
tmp_data = extend_with_detailed_time(tmp_data, 
  weekday = True, hour = True)
tmp_data = extend_with_time_difference_features(tmp_data, 
  max_time_shift = 20, pivot_feature = 'CHID')
tmp_data = extend_with_same_MCC(tmp_data, 
  max_time_shift = 5, pivot_feature = 'CHID')
tmp_data = extend_with_same_STOCN(tmp_data, 
  max_time_shift = 5, pivot_feature = 'CHID')
tmp_data = extend_with_same_FLAM1(tmp_data, 
  max_time_shift = 5, pivot_feature = 'CHID')

# ECFG, PAY_TYPE, CONTP, ETYMD
tmp_data = extend_with_same_class_between_transactions(tmp_data, 'ECFG',
  max_time_shift = 20, pivot_feature = 'CHID')
tmp_data = extend_with_same_class_between_transactions(tmp_data, 'PAY_TYPE', 
  max_time_shift = 20, pivot_feature = 'CHID')
tmp_data = extend_with_same_class_between_transactions(tmp_data, 'CONTP', 
  max_time_shift = 20, pivot_feature = 'CHID')
tmp_data = extend_with_same_class_between_transactions(tmp_data, 'ETYMD', 
  max_time_shift = 20, pivot_feature = 'CHID')

preprocessed_data = preprocessing(tmp_data)
removed_unimportant_feature_count = 5
X = create_X(preprocessed_data, 
  drop_list = list(set(["TXKEY", "DATETIME", "CANO", "CHID", "ACQIC", "MCHNO", "AGE"] + \
  important_feature_table.set_index('col').index[-(removed_unimportant_feature_count):].tolist()))
)
### Threshold Tuning ########################################################
'''tolerance = 0.01
boundary = (0., 1.)
threshold = 0.5

y_pred = clf.predict(X)
train_imbalance_rate = train_data['FRAUD_IND'].mean()
print("imbalance rate of train data:", train_imbalance_rate)

y_result = (y_pred > threshold).astype(int).T
result_table = pd.DataFrame([test_data['TXKEY'], y_result]).T
result_table.columns = ['TXKEY', 'FRAUD_IND']
imbalance_rate = result_table['FRAUD_IND'].mean()
print("imbalance rate of test data:", imbalance_rate)
while np.abs(train_imbalance_rate-imbalance_rate) >= tolerance and boundary[0]<boundary[1]:
  print('threshold:', threshold)
  y_result = (y_pred > threshold).astype(int).T
  result_table = pd.DataFrame([test_data['TXKEY'], y_result]).T
  result_table.columns = ['TXKEY', 'FRAUD_IND']
  imbalance_rate = result_table['FRAUD_IND'].mean()
  print("imbalance rate of test data:", imbalance_rate)
  if imbalance_rate > train_imbalance_rate:
    threshold = (boundary[1] + threshold)/2.
    boundary = threshold, boundary[1]
  else:
    threshold = (boundary[0] + threshold)/2.
    boundary = boundary[0], threshold
  print("boundary",boundary)'''
### Generate CSV ########################################################
y_pred = clf.predict(X)
threshold = 0.998
y_result = (y_pred > threshold).astype(int).T
result_table = pd.DataFrame([test_data['TXKEY'], y_result]).T
result_table.columns = ['TXKEY', 'FRAUD_IND']
print("imbalance rate of test data:", result_table['FRAUD_IND'].mean())
result_table.to_csv('tmp_submission.csv')
print('csv saved.')

## Performance: 0.05558

# Strategy 7: Re-tune threshold 

In [None]:
y_pred = clf.predict(X)
threshold = 0.99985
y_result = (y_pred > threshold).astype(int).T
result_table = pd.DataFrame([test_data['TXKEY'], y_result]).T
result_table.columns = ['TXKEY', 'FRAUD_IND']
print("imbalance rate of test data:", result_table['FRAUD_IND'].mean())
result_table.to_csv('tmp_submission.csv')
print('csv saved.')

# Strategy 8: Add same 'ECFG', 'PAY_TYPE', 'CONTP', 'ETYMD', 'STOCN', 'SCITY', 'APPFG', 'MCC' and Add weekday fraud factor 

In [90]:
def extend_with_strang_weekday_transaction_change(data, max_time_shift = 5, pivot_feature = 'CHID'):
  # CHID: 卡人ID
  # CANO: 交易卡號
  c_data = copy.copy(data)
  assert max_time_shift > 2
  def strange_week_index(data, time_shift, pivot_feature):
    df = copy.copy(data)
    df["shift"] = df.groupby([pivot_feature])['WEEKDAY'].shift(time_shift)
    name = pivot_feature + '_SAME' + str(time_shift)
    df[name] = ((df['WEEKDAY']!=6) & (df['WEEKDAY']!=7) & ((df['shift']==6)|(df['shift']==7))).astype(int)
    df = df.drop("shift", 1)
    return df
  for time_shift in range(1, max_time_shift + 1):
    print("add " + 'WEEKDAY' + " identical index between current and " + str(time_shift) + "th-last transaction")
    c_data = strange_week_index(c_data, time_shift, pivot_feature)
  return c_data
def overall_preprocessing(train_data):
  has_null_feature_list = [
    "AVAILABLE_LIMIT_AMT",
    "BONUS_POINTS",
    "CURRENT_CASH_ADV_AMT",
    "CURRENT_FEE",
    "CURRENT_INSTALLMENT_PURCH_AMT",
    "CURRENT_PURCH_AMT",
    "LST_CYCLE_UNPAID_BAL"
    ]
  tmp_data = extend_with_null_or_not_features(train_data, has_null_feature_list)

  log_scale_feature_list = [
    'BNSPT',
    'FLAM1',
    'ACCT_VINTAGE',
    'AVAILABLE_LIMIT_AMT',
    'BONUS_POINTS',
    'CREDIT_LIMIT_AMT',
    'CREDIT_REVOLVING_RATE',
    'CREDIT_USE_RATE',
    'CURRENT_CASH_ADV_AMT',
    'CURRENT_FEE',
    'CURRENT_INSTALLMENT_BAL',
    'CURRENT_INSTALLMENT_PURCH_AMT',
    'CURRENT_PURCH_AMT',
    'LST_CYCLE_UNPAID_BAL',
    'REVOLVING_AMT'
  ]
  tmp_data = extend_with_log_scale_features(tmp_data, log_scale_feature_list)
  tmp_data = extend_with_detailed_time(tmp_data, 
    weekday = True, hour = True)
  tmp_data = extend_with_time_difference_features(tmp_data, 
    max_time_shift = 20, pivot_feature = 'CHID')
  tmp_data = extend_with_same_FLAM1(tmp_data, 
    max_time_shift = 20, pivot_feature = 'CHID')
  tmp_data = extend_with_strang_weekday_transaction_change(tmp_data, 
    max_time_shift = 5, pivot_feature = 'CHID')
  for class_name in ['ECFG', 'PAY_TYPE', 'CONTP', 'ETYMD', 'STOCN', 'SCITY', 'APPFG', 'MCC', 'MCHNO', 'FALLBACK_IND']:
    tmp_data = extend_with_same_class_between_transactions(tmp_data, class_name,
      max_time_shift = 20, pivot_feature = 'CHID')
    
  tmp_data = preprocessing(tmp_data)
  return tmp_data

In [91]:
from google.colab import drive
drive.mount('/content/drive')
#匯入資料
train_data = pd.read_csv('/content/drive/MyDrive/智金輪習Kaggle/train.csv')
test_data = "先不給你們"
#查看資料筆數
print("shape of train data:" , train_data.shape)
#print("shape of test data:" , test_data.shape)

tmp_data = overall_preprocessing(train_data)
resampled_train_data = resample(tmp_data, 
  sampling_rate=0.14, sample_type='upsample')
removed_unimportant_feature_count = 5
X, y = create_X_y(resampled_train_data, 
  drop_list = list(set(["FRAUD_IND", "TXKEY", "DATETIME", "CANO", "CHID", "ACQIC", "MCHNO", "AGE"] + \
  important_feature_table.set_index('col').index[-(removed_unimportant_feature_count):].tolist()))
)
val_percentage = 0.33
x_train, x_test, y_train, y_test = train_test_split(X, y, 
  test_size=val_percentage, shuffle=True, random_state=42)
clf = train_lgb(x_train, x_test, y_train, y_test, 
  max_depth = 8, learning_rate = 0.1, n_estimators = 3000)
evaluate(clf, x_test, y_test)
#important_feature_table = get_important_feature_table(clf, x_train)
#important_feature_table

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
shape of train data: (533202, 59)


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


add time difference between current and 1th-last transaction
add time difference between current and 2th-last transaction
add time difference between current and 3th-last transaction
add time difference between current and 4th-last transaction
add time difference between current and 5th-last transaction
add time difference between current and 6th-last transaction
add time difference between current and 7th-last transaction
add time difference between current and 8th-last transaction
add time difference between current and 9th-last transaction
add time difference between current and 10th-last transaction
add time difference between current and 11th-last transaction
add time difference between current and 12th-last transaction
add time difference between current and 13th-last transaction
add time difference between current and 14th-last transaction
add time difference between current and 15th-last transaction
add time difference between current and 16th-last transaction
add time differen



Training until validation scores don't improve for 30 rounds.
[50]	training's binary_logloss: 0.0658336	valid_1's binary_logloss: 0.0669881
[100]	training's binary_logloss: 0.0472499	valid_1's binary_logloss: 0.0489691
[150]	training's binary_logloss: 0.0381733	valid_1's binary_logloss: 0.0403189
[200]	training's binary_logloss: 0.0320485	valid_1's binary_logloss: 0.0346059
[250]	training's binary_logloss: 0.0271291	valid_1's binary_logloss: 0.0299395
[300]	training's binary_logloss: 0.0230709	valid_1's binary_logloss: 0.0260649
[350]	training's binary_logloss: 0.0199578	valid_1's binary_logloss: 0.0232195
[400]	training's binary_logloss: 0.0172758	valid_1's binary_logloss: 0.0207946
[450]	training's binary_logloss: 0.0150305	valid_1's binary_logloss: 0.0186554
[500]	training's binary_logloss: 0.0131036	valid_1's binary_logloss: 0.0168639
[550]	training's binary_logloss: 0.0114064	valid_1's binary_logloss: 0.0158174
[600]	training's binary_logloss: 0.0102802	valid_1's binary_logloss: 0

## Generate Testing Result 

In [None]:
from google.colab import drive
drive.mount('/content/drive')
#匯入資料
test_data = pd.read_csv('/content/drive/MyDrive/智金輪習Kaggle/test.csv')
#查看資料筆數
print("shape of test data:" , test_data.shape)

tmp_data = overall_preprocessing(test_data)

X = create_X(resampled_train_data, 
  drop_list = list(set(["FRAUD_IND", "TXKEY", "DATETIME", "CANO", "CHID", "ACQIC", "MCHNO", "AGE"] + \
  important_feature_table.set_index('col').index[-(removed_unimportant_feature_count):].tolist()))
)
y_pred = clf.predict(X)

In [None]:
threshold = 0.998
y_result = (y_pred > threshold).astype(int).T
result_table = pd.DataFrame([test_data['TXKEY'], y_result]).T
result_table.columns = ['TXKEY', 'FRAUD_IND']
print("imbalance rate of test data:", result_table['FRAUD_IND'].mean())
result_table.to_csv('tmp_submission.csv')
print('csv saved.')

In [71]:
tmp_data = extend_with_detailed_time(train_data, 
  weekday = True, hour = True)

In [75]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()
df3 = pd.DataFrame(ohe.fit_transform(train_data['']).toarray())
df3

Unnamed: 0_level_0,TXKEY,DATETIME,CHID,CANO,MCHNO,ACQIC,MCC,CONTP,ETYMD,ECFG,INSFG,ITERM,BNSFG,BNSPT,FLAM1,STOCN,SCITY,OVRLT,PAY_TYPE,FALLBACK_IND,AGNO,CATP1,CUORG,FEEFG,FEDFG,CATP2,TSCFG,LSCFG,CGDCT,APPFG,SAMFG,ANDFG,AGE,CC_CUST_LEVEL,CC_VINTAGE,EDU_CODE,GENDER_CODE,INCOME_RANGE_CODE,MARITAL_STATUS_CODE,NATION_CODE,OCUP_CODE,POSITION_CODE,ACCT_VINTAGE,AVAILABLE_LIMIT_AMT,BONUS_POINTS,CC_PAY_LEVEL_CODE,CREDIT_LIMIT_AMT,CREDIT_REVOLVING_RATE,CREDIT_USE_RATE,CURRENT_CASH_ADV_AMT,CURRENT_FEE,CURRENT_INSTALLMENT_BAL,CURRENT_INSTALLMENT_PURCH_AMT,CURRENT_PURCH_AMT,CURRENT_VIO_AMT,LST_CYCLE_UNPAID_BAL,REVOLVING_AMT,REVOLVING_INTEREST,FRAUD_IND,HOUR
WEEKDAY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1
1,79577,79577,79577,79577,79577,79577,79577,79577,79577,79577,79577,79577,79577,79577,79577,79577,79577,79577,1590,79577,79577,79577,79577,79577,79577,79577,79577,79577,79577,79577,79577,79577,79577,76253,79577,79577,79577,79577,79577,79577,79577,73668,67787,67787,67787,67776,67787,67787,67787,67787,67787,67787,67787,67787,67787,67787,67787,67787,79577,79577
2,74044,74044,74044,74044,74044,74044,74044,74044,74044,74044,74044,74044,74044,74044,74044,74044,74044,74044,1570,74044,74044,74044,74044,74044,74044,74044,74044,74044,74044,74044,74044,74044,74044,70757,74044,74044,74044,74044,74044,74044,74044,68493,62969,62969,62969,62932,62969,62969,62969,62969,62969,62969,62969,62969,62969,62969,62969,62969,74044,74044
3,75445,75445,75445,75445,75445,75445,75445,75445,75445,75445,75445,75445,75445,75445,75445,75445,75445,75445,1590,75445,75445,75445,75445,75445,75445,75445,75445,75445,75445,75445,75445,75445,75445,72308,75445,75445,75445,75445,75445,75445,75445,69465,64576,64576,64576,64563,64576,64576,64576,64576,64576,64576,64576,64576,64576,64576,64576,64576,75445,75445
4,70868,70868,70868,70868,70868,70868,70868,70868,70868,70868,70868,70868,70868,70868,70868,70868,70868,70868,1624,70868,70868,70868,70868,70868,70868,70868,70868,70868,70868,70868,70868,70868,70868,67838,70868,70868,70868,70868,70868,70868,70868,65602,61955,61955,61955,61949,61955,61955,61955,61955,61955,61955,61955,61955,61955,61955,61955,61955,70868,70868
5,75843,75843,75843,75843,75843,75843,75843,75843,75843,75843,75843,75843,75843,75843,75843,75843,75843,75843,1698,75843,75843,75843,75843,75843,75843,75843,75843,75843,75843,75843,75843,75843,75843,72388,75843,75843,75843,75843,75843,75843,75843,70391,65926,65926,65926,65920,65926,65926,65926,65926,65926,65926,65926,65926,65926,65926,65926,65926,75843,75843
6,79315,79315,79315,79315,79315,79315,79315,79315,79315,79315,79315,79315,79315,79315,79315,79315,79315,79315,2015,79315,79315,79315,79315,79315,79315,79315,79315,79315,79315,79315,79315,79315,79315,75992,79315,79315,79315,79315,79315,79315,79315,73926,68850,68850,68850,68840,68850,68850,68850,68850,68850,68850,68850,68850,68850,68850,68850,68850,79315,79315
7,78110,78110,78110,78110,78110,78110,78110,78110,78110,78110,78110,78110,78110,78110,78110,78110,78110,78110,1776,78110,78110,78110,78110,78110,78110,78110,78110,78110,78110,78110,78110,78110,78110,75025,78110,78110,78110,78110,78110,78110,78110,72802,68355,68355,68355,68349,68355,68355,68355,68355,68355,68355,68355,68355,68355,68355,68355,68355,78110,78110
