In [1]:
import os
import pandas as pd
import numpy as np

In [3]:
def read_csv_file(file_name, index_col = 0):
    current_dir = os.getcwd()
    file_path = os.path.join(current_dir, file_name)
    
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"找不到文件：{file_path}")
    
    return pd.read_csv(file_path, index_col = index_col)

try:
    df_train_raw = read_csv_file('train.csv')
    df_test = read_csv_file('X_test.csv')

    print("raw train data:", df_train_raw.shape)
    print("test data:", df_test.shape)

    # print("Train data info:")
    # print(df_train_raw.info())
    # print("\nTest data info:")
    # print(df_test.info())

    X_train = df_train_raw.drop('label', axis=1)
    y_train = df_train_raw['label'].values.reshape(-1, 1)
    X_test =  df_test
    
    print("--------------------------------")
    print("X_train shape:", X_train.shape)
    print("X_test shape:", X_test.shape)
    print("y_train shape:", y_train.shape)
except FileNotFoundError as e:
    print(f"錯誤：{e}")
except Exception as e:
    print(f"發生未知錯誤：{e}")

raw train data: (11000, 26)
test data: (2100, 25)
--------------------------------
X_train shape: (11000, 25)
X_test shape: (2100, 25)
y_train shape: (11000, 1)


In [7]:
# 步驟 1: 篩選出所有曾經被盜刷的卡片的交易記錄
df_steal = df_train_raw[df_train_raw['cano'].isin(df_train_raw[df_train_raw['label'] == 1]['cano'].unique())]

# 步驟 2: 對每張卡片進行盜刷統計
df_steal_again = df_steal.groupby('cano')['label'].agg(
    fraud_count = 'sum',  # 盜刷次數
    total_count = 'size'  # 總交易次數
).reset_index()

# 步驟 3: 計算每張卡片的盜刷比率
df_steal_again['ratio'] = df_steal_again['fraud_count'] / df_steal_again['total_count']

# 輸出結果
print(f'有盜刷經驗的卡片的總資料筆數: {df_steal.shape}')
print(f'有盜刷經驗的總卡片數: {df_steal_again.shape}')

print("\n前五張卡片的盜刷統計:")
df_steal_again.head()


# 額外分析：盜刷比率最高的10張卡片
# print("\n盜刷比率最高的10張卡片:")
# print(df_steal_again.sort_values('ratio', ascending=False).head(10))

# # 盜刷比率分布
# print("\n盜刷比率的描述性統計:")
# print(df_steal_again['ratio'].describe())

有盜刷經驗的卡片的總資料筆數: (2567, 26)
有盜刷經驗的總卡片數: (731, 4)

前五張卡片的盜刷統計:


Unnamed: 0,cano,fraud_count,total_count,ratio
0,00757fafb1856b35797405c7125dfb3a2a25333d432ba9...,1,1,1.0
1,0088c019c076f46d6d02ad92f88b868d06bdf176756bfd...,1,1,1.0
2,00e9f2e8467b425ee3b3eb0ce767ebb99b70a510b440f5...,1,1,1.0
3,010482e4160d45ca0040bdc49a8b14c69f1f08fae3c6cc...,1,1,1.0
4,0115d48817f7d91450a32b8a60b06eaa667eccfebffa42...,3,3,1.0


In [10]:
# Remove explanatory variables that do not contribute to modeling
# 先把幾個沒啥用的變數先拿掉，
df_raw_train = df_train_raw.drop(['bnsfg','iterm','flbmk','insfg','flam1'],
                                 axis = 1,
                                 inplace = False
                                 )

df_raw_test = df_test_raw.drop(['bnsfg','iterm','flbmk','insfg','flam1'],
                               axis = 1,
                               inplace = False
                               )

print(df_raw_train.shape)
print(df_raw_test.shape)

(11000, 21)
(11000, 21)
