In [7]:
from typing import List, Union

import numpy as np
import pandas as pd
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from deepctr.models import *
from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names

# 深度学习版本

In [275]:
# Read the train CSV and the "testA" CSV into separate frames.
# NOTE(review): both paths are absolute and machine-specific; they must be
# adjusted before this cell can run anywhere else.
train_file = "/Users/HaoShaochun/Yam/FinancialRiskControl/data/train.csv"
data_train = pd.read_csv(train_file)

test_file = "/Users/HaoShaochun/Yam/FinancialRiskControl/data/testA.csv"
data_test = pd.read_csv(test_file)

In [276]:
# Stack the train rows and the test rows into one frame (train first) so the
# preprocessing below is applied to both; ignore_index=True renumbers the rows.
data = pd.concat([data_train, data_test], axis=0, ignore_index=True)

## 数据预处理

In [277]:
# Feature-name groups. Further down, the "obj_*" lists become the sparse
# (embedded) inputs and the "num_*" / "zero_one" lists become the dense inputs.
# List order is significant: it fixes the order of the model's feature columns.

# Flag-style columns (treated as dense inputs later on).
zero_one_feas = ["initialListStatus", "applicationType"]

# Numeric columns that are used as-is (no bucketing).
num_not_bucket_feas = [
    "annualIncome",
    "term",
    "employmentLength",
    "loanAmnt",
    "interestRate",
    "installment",
    "dti",
    "delinquency_2years",
    "ficoRangeLow",
    "ficoRangeHigh",
    "openAcc",
    "pubRec",
    "pubRecBankruptcies",
    "revolBal",
    "revolUtil",
    "totalAcc",
    "earliesCreditLine",
    "grade",
    "subGrade",
]

# Numeric columns that would need bucketing (currently none).
num_need_bucket_feas = []

# Categorical columns that are label-encoded directly.
obj_not_bucket_feas = [
    "homeOwnership",
    "verificationStatus",
    "purpose",
    "n11",
    "n12",
    "regionCode",
    "n0",
    "n1",
    "n2",
    'n4',
    'n5',
    'n6',
    'n7',
    'n8',
    'n9',
    'n10',
    'n13',
    'n14',
]

# Categorical columns that are replaced by derived count/rank columns later.
obj_need_bucket_feas = ["employmentTitle", "postCode", "title"]

In [278]:
def is_float(x):
    """Return True when the number ``x`` is not a whole number.

    Despite the name this does not inspect the type of ``x``: ``2.0`` yields
    False and ``2.5`` yields True.

    Positive and negative infinity are reported as True (they cannot be
    represented as an integer) instead of raising ``OverflowError``.
    NaN is not handled here and still raises ``ValueError``; callers are
    expected to filter missing values first.
    """
    try:
        whole_part = int(x)
    except OverflowError:
        # int() cannot convert +/-inf; such a value is certainly not integral.
        return True
    # A non-zero remainder means x carries a fractional component.
    return bool(whole_part - x)

def all_data_is_float(df: pd.DataFrame, feature: str):
    """Tell whether column ``feature`` of ``df`` holds any non-integral value.

    Note the name is misleading: the result is True as soon as ONE distinct
    non-missing value has a fractional part (as judged by ``is_float``), and
    False when every non-missing value is a whole number or the column has no
    non-missing values at all.
    """
    distinct_values = df[feature].unique()
    # Missing entries are skipped; any() stops at the first fractional value.
    return any(is_float(value) for value in distinct_values if not pd.isna(value))

def drop_given_features(df: pd.DataFrame, feature_list: List[str]) -> pd.DataFrame:
    """Return a copy of ``df`` without the columns named in ``feature_list``.

    The input frame is left untouched. A name that is not a column of ``df``
    makes pandas raise ``KeyError``.
    """
    trimmed = df.drop(labels=feature_list, axis=1)
    return trimmed

def drop_uniquevalue_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` without columns that have at most one distinct value.

    ``Series.nunique()`` ignores missing values, so a column that is entirely
    missing, or that holds a single value plus missing entries, is dropped too.
    """
    constant_columns = []
    for name in df.columns:
        if df[name].nunique() <= 1:
            constant_columns.append(name)
    return df.drop(columns=constant_columns)

def convert_num_to_obj(x: Union[int, float]):
    """Turn a number into the string form of its integer part.

    Missing values (anything ``pd.isna`` accepts) are returned unchanged.
    The fractional part is truncated toward zero, e.g. ``3.7 -> "3"``.
    """
    if not pd.isna(x):
        return str(int(x))
    return x

def convert_float_to_int(x: float):
    """Truncate ``x`` to an ``int``; any missing value becomes ``np.nan``."""
    return np.nan if pd.isna(x) else int(x)

def deal_employmentLength(x):
    """Convert an employment-length label into a number of years.

    ``"10+ years"`` maps to 10 and ``"< 1 year"`` maps to 0; every other
    non-missing label is read from its first character (``"3 years" -> 3``).
    Missing values become ``np.nan``.
    """
    if pd.isna(x):
        return np.nan
    if x == "10+ years":
        return 10
    if x == "< 1 year":
        return 0
    # Remaining labels are expected to start with a single digit.
    return int(x[0])
    
# Columns removed outright before modelling.
insignificant_feas = [
    "n2.1",
    "n2.2",
    "n2.3",
    "issueDate",
]
# Ordinal encoding of the letter grade: A -> 10, B -> 20, ..., G -> 70.
grade_dct = {letter: 10 * position for position, letter in enumerate("ABCDEFG", start=1)}

In [279]:
# Drop constant columns first, then the columns listed in insignificant_feas.
data = drop_given_features(drop_uniquevalue_features(data), insignificant_feas)

# Letter grade -> 10, 20, ..., 70 via grade_dct.
data['grade'] = data['grade'].map(grade_dct)
# Sub-grade: numeric value of its first character (the letter) plus its second
# character read as a digit, e.g. a value starting with "B3" -> 20 + 3 = 23.
data["subGrade"] = data["subGrade"].map(lambda sub: grade_dct.get(sub[0]) + int(sub[1]))
# Employment-length label -> number of years (NaN stays NaN).
data["employmentLength"] = data["employmentLength"].map(deal_employmentLength)
# Keep only the last four characters, parsed as an integer (presumably the year).
data["earliesCreditLine"] = data["earliesCreditLine"].map(lambda raw: int(raw[-4:]))

In [281]:
# Column groups, one per imputation strategy.
ave_feas = ["dti", "revolUtil"]
ave_int_feas = ["employmentLength", "pubRecBankruptcies"]
mode_feas = ["employmentTitle", "postCode", "title"]
n_feas =["n0", "n1", "n2", 'n4', 'n5', 'n6', 'n7', 'n8', 'n9', 'n10', 'n11', 'n12', 'n13', 'n14']

# Mean imputation.
column_means = data[ave_feas].mean()
data[ave_feas] = data[ave_feas].fillna(column_means)

# Mean imputation with the mean truncated to an integer.
truncated_means = data[ave_int_feas].mean().apply(int)
data[ave_int_feas] = data[ave_int_feas].fillna(truncated_means)

# Most-frequent-value imputation (first mode when there are ties).
for fea in mode_feas:
    most_common = data[fea].mode()[0]
    data[fea] = data[fea].fillna(most_common)

# The n* columns get the sentinel -1.0 for "missing".
for fea in n_feas:
    data[fea] = data[fea].fillna(-1.0)

In [282]:
# Every column except the target; these are the columns whose dtype is
# downcast in the next cell.
features = [column for column in data.columns if column != "isDefault"]

In [283]:
# Shrink each feature column to a narrower dtype to save memory.
#
# * Columns containing at least one fractional value become float32 when their
#   range fits, otherwise float64. float16 is deliberately NOT a candidate:
#   a range check alone does not protect precision, and float16 keeps only
#   about three significant decimal digits, which silently alters values.
# * Columns whose non-missing values are all whole numbers become the
#   narrowest signed integer type whose range strictly contains [min, max].
#   Integer dtypes cannot hold NaN, so a column that still has missing values
#   is left unchanged instead of making astype() raise.
for fea in features:
    c_min = data[fea].min()
    c_max = data[fea].max()
    if all_data_is_float(data, fea):
        if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
            data[fea] = data[fea].astype(np.float32)
        else:
            data[fea] = data[fea].astype(np.float64)
    elif data[fea].isna().any():
        # Whole-number column with gaps: keep its current (NaN-capable) dtype.
        continue
    else:
        # Candidates are ordered narrowest first; the first fit wins.
        for int_dtype in (np.int8, np.int16, np.int32, np.int64):
            limits = np.iinfo(int_dtype)
            if c_min > limits.min and c_max < limits.max:
                data[fea] = data[fea].astype(int_dtype)
                break

In [284]:
# Replace each high-cardinality column by two derived columns:
#   <fea>_cnts - number of rows sharing the same value of <fea>
#   <fea>_rank - descending rank of the row's 'id' within that group
# The raw column is then dropped and both derived names are registered in
# obj_not_bucket_feas (mutated in place).
for fea in obj_need_bucket_feas:
    # Build the grouping once and reuse it for both aggregates.
    ids_by_value = data.groupby([fea])['id']
    group_sizes = ids_by_value.transform('count')
    id_ranks = ids_by_value.rank(ascending=False).astype(int)

    data[fea+'_cnts'] = group_sizes
    data[fea+'_rank'] = id_ranks
    data = data.drop(columns=[fea])
    obj_not_bucket_feas.extend([fea + '_cnts', fea + '_rank'])

In [285]:
# sparse_features is the very same list object as obj_not_bucket_feas (an
# alias, not a copy); dense_features is a new list: numeric columns followed
# by the flag columns.
sparse_features = obj_not_bucket_feas
dense_features = [*num_not_bucket_feas, *zero_one_feas]

In [286]:
# Sanity check: columns of `data` that belong to neither feature list.
set(data.columns) - set(sparse_features + dense_features)

{'id', 'isDefault'}

In [287]:
# Sanity check: listed features that are not columns of `data` (should be empty).
set(sparse_features + dense_features) - set(data.columns)

set()

In [288]:
# Name of the target column.
label = "isDefault"

In [289]:
# Label-encode every sparse feature in place; a fresh encoder is fitted per
# column, so after the loop `lbe` only refers to the last column's encoder.
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])
# Rescale all dense features to [0, 1] with a single scaler fitted on `data`
# (which holds the train and the test rows together).
mms = MinMaxScaler(feature_range=(0, 1))
data[dense_features] = mms.fit_transform(data[dense_features])

In [292]:
# One SparseFeat per sparse feature: vocabulary size is the number of distinct
# values in the column, each embedded in 4 dimensions.
sparse_columns = [
    SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4)
    for feat in sparse_features
]
# One 1-dimensional DenseFeat per dense feature.
dense_columns = [DenseFeat(feat, 1) for feat in dense_features]
fixlen_feature_columns = sparse_columns + dense_columns

# The linear part and the DNN part share the same column definitions.
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [295]:
# Rows with a target value form the training frame; rows without one form the
# frame to predict on. Both get a fresh 0..n-1 index.
has_label = data[label].notnull()
train = data[has_label].reset_index(drop=True)
test = data[~has_label].reset_index(drop=True)

In [296]:
# Model inputs are dicts mapping each feature name to that column (a Series)
# of the corresponding frame.
train_model_input = {name:train[name] for name in feature_names}
test_model_input = {name:test[name] for name in feature_names}

In [315]:
# Build a DeepFM binary classifier; the same feature columns are passed for
# the linear part and for the DNN part. Optimised with Adam on binary
# cross-entropy, which is also tracked as the only metric.
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
model.compile("adam", "binary_crossentropy",
              metrics=['binary_crossentropy'], )

# Train with 20% of the training rows held out for validation.
# NOTE(review): the training log recorded under this cell shows val_loss
# rising from 0.4538 (epoch 1) to 1.6139 (epoch 8) while the training loss
# keeps falling, i.e. the model overfits almost immediately. Consider far
# fewer epochs or early stopping before using it for predictions.
# NOTE(review): no random seed is set in the visible cells, so results are
# presumably not reproducible between runs -- confirm.
history = model.fit(train_model_input, train[label].values,
                    batch_size=256, epochs=30, verbose=2, validation_split=0.2, )

Epoch 1/30
2500/2500 - 46s - loss: 0.4570 - binary_crossentropy: 0.4541 - val_loss: 0.4538 - val_binary_crossentropy: 0.4494
Epoch 2/30
2500/2500 - 46s - loss: 0.3579 - binary_crossentropy: 0.3455 - val_loss: 0.5262 - val_binary_crossentropy: 0.5060
Epoch 3/30
2500/2500 - 49s - loss: 0.2312 - binary_crossentropy: 0.2110 - val_loss: 1.1081 - val_binary_crossentropy: 1.0879
Epoch 4/30
2500/2500 - 51s - loss: 0.1475 - binary_crossentropy: 0.1288 - val_loss: 1.3713 - val_binary_crossentropy: 1.3536
Epoch 5/30
2500/2500 - 50s - loss: 0.1123 - binary_crossentropy: 0.0963 - val_loss: 1.5539 - val_binary_crossentropy: 1.5388
Epoch 6/30
2500/2500 - 50s - loss: 0.1002 - binary_crossentropy: 0.0863 - val_loss: 1.5111 - val_binary_crossentropy: 1.4968
Epoch 7/30
2500/2500 - 52s - loss: 0.1104 - binary_crossentropy: 0.0963 - val_loss: 1.5207 - val_binary_crossentropy: 1.5046
Epoch 8/30
2500/2500 - 51s - loss: 0.1247 - binary_crossentropy: 0.1084 - val_loss: 1.6139 - val_binary_crossentropy: 1.5952


In [316]:
# Score the unlabelled rows with the model in its current (last-trained) state.
pred_ans = model.predict(test_model_input, batch_size=256)

In [338]:
# Write the predictions as a two-column CSV (id, isDefault) with ten decimals.
# NOTE(review): the output path is absolute and machine-specific.
test_out_file = "/Users/HaoShaochun/Yam/FinancialRiskControl/data/testA_result_DL_V1.0.csv"

# Flatten the prediction array to one value per test row.
test['isDefault'] = pred_ans.ravel()
submission = test[['id','isDefault']]
submission.to_csv(test_out_file, index=False, float_format="%.10f")

In [323]:
# Print NumPy floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)