# 환경 세팅

In [None]:
!pip install category_encoders

Installing collected packages: category-encoders
Successfully installed category-encoders-2.5.1.post0


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import numpy as np
from sklearn.metrics import f1_score

# 데이터 불러오기

In [None]:
# Modelling data (split by week_num into train/validation below) and its companion test file.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/drive/Shareddrives/빅콘테스트/221009/1009_train.csv",
        "/content/drive/Shareddrives/빅콘테스트/221009/1009_test.csv",
    )
)
# Frame that the final predictions are produced for.
real_test = pd.read_csv("/preprocessed_test.csv")

In [None]:
# Independent snapshot of the prediction frame as loaded; its columns are reused later
# when the prediction tables are assembled.
test_fin = real_test.copy(deep=True)

In [None]:
# Time-based hold-out: weeks before 18 are used for fitting, week 18 onward for validation.
before_week_18 = train['week_num'] < 18
from_week_18 = train['week_num'] >= 18
df_train = train.loc[before_week_18].reset_index(drop=True)
df_val = train.loc[from_week_18].reset_index(drop=True)
train, val = df_train.copy(), df_val.copy()

In [None]:
# Cast the identifier columns of the training split to strings.
for id_col in ('bank_id', 'product_id', 'user_id'):
    train[id_col] = train[id_col].astype(str)

In [None]:
# Cast the identifier columns of the validation split to strings.
for id_col in ('bank_id', 'product_id', 'user_id'):
    val[id_col] = val[id_col].astype(str)

In [None]:
# Cast the identifier columns of the test frame to strings.
for id_col in ('bank_id', 'product_id', 'user_id'):
    test[id_col] = test[id_col].astype(str)

In [None]:
# Cast the identifier columns of the prediction frame to strings.
for id_col in ('bank_id', 'product_id', 'user_id'):
    real_test[id_col] = real_test[id_col].astype(str)

# 전처리
- standard scaling
- onehot encoding, ordinal encoding

In [None]:
# Feature names of the training frame, split by dtype; the target is removed from the numeric list.
numerical_feats = [col for col, col_dtype in train.dtypes.items() if col_dtype != "object"]
numerical_feats.remove('is_applied')
print("Number of Numerical features: ", len(numerical_feats))

categorical_feats = [col for col, col_dtype in train.dtypes.items() if col_dtype == "object"]
print("Number of Categorical features: ", len(categorical_feats))

# Same breakdown for the prediction frame.
test_numerical_feats = [col for col, col_dtype in real_test.dtypes.items() if col_dtype != "object"]
test_numerical_feats.remove('is_applied')
print("Number of Numerical features for test: ", len(test_numerical_feats))

test_categorical_feats = [col for col, col_dtype in real_test.dtypes.items() if col_dtype == "object"]
print("Number of Categorical features for test: ", len(test_categorical_feats))

In [None]:
# Columns that are ordinal-encoded.
ordinal_cate = [
    'bank_id',
    'product_id',
    'user_id',
]
# Columns that are one-hot encoded.
onehot_cate = [
    'income_type',
    'employment_type',
    'houseown_type',
    'purpose',
]

In [None]:
# OrdinalEncoder: convert the categorical id columns to integer codes.
from category_encoders.ordinal import OrdinalEncoder

# `cols` has to be passed by keyword: the first positional parameter of the
# category_encoders OrdinalEncoder is `verbose`, so OrdinalEncoder(ordinal_cate)
# left `cols` unset and stored the column list as the verbosity setting.
encoder = OrdinalEncoder(cols=ordinal_cate)
# The mapping is learned from the training split only and reused for every other frame.
train[ordinal_cate] = encoder.fit_transform(train[ordinal_cate], train['is_applied'])
val[ordinal_cate] = encoder.transform(val[ordinal_cate])
test[ordinal_cate] = encoder.transform(test[ordinal_cate])
real_test[ordinal_cate] = encoder.transform(real_test[ordinal_cate])

In [None]:
from sklearn.preprocessing import OneHotEncoder

# The encoder is fitted on the training split only; handle_unknown='ignore' lets the
# other frames be transformed even when they contain categories not seen during fit.
# The default sparse result is densified explicitly, which avoids the `sparse=` /
# `sparse_output=` keyword that differs between scikit-learn releases.
OH_encoder = OneHotEncoder(handle_unknown='ignore')
OH_encoder.fit(train[onehot_cate])


def _one_hot_frame(frame):
    """Return the one-hot indicator columns for ``frame[onehot_cate]``.

    The result carries ``frame``'s index so it can be concatenated column-wise, and its
    columns are named by the fitted encoder. ``get_feature_names_out`` is used because
    the older ``get_feature_names`` method is no longer available in recent
    scikit-learn releases.
    """
    dense = OH_encoder.transform(frame[onehot_cate]).toarray()
    return pd.DataFrame(
        dense,
        index=frame.index,
        columns=OH_encoder.get_feature_names_out(onehot_cate),
    )


OH_cols_train = _one_hot_frame(train)
OH_cols_valid = _one_hot_frame(val)
OH_cols_test = _one_hot_frame(test)
OH_cols_real_test = _one_hot_frame(real_test)

# Everything except the raw categorical columns that were just encoded.
num_X_train = train.drop(onehot_cate, axis=1)
num_X_valid = val.drop(onehot_cate, axis=1)
num_X_test = test.drop(onehot_cate, axis=1)
num_X_real_test = real_test.drop(onehot_cate, axis=1)

# Replace each frame with its remaining columns followed by the indicator columns.
train = pd.concat([num_X_train, OH_cols_train], axis=1)
val = pd.concat([num_X_valid, OH_cols_valid], axis=1)
test = pd.concat([num_X_test, OH_cols_test], axis=1)
real_test = pd.concat([num_X_real_test, OH_cols_real_test], axis=1)

In [None]:
# Standardise the numeric features; the scaler is fitted on the training split only
# and then applied to every frame, the training split included.
scaler = StandardScaler()
scaler.fit(train[numerical_feats])
for frame in (train, val, test, real_test):
    frame[numerical_feats] = scaler.transform(frame[numerical_feats])

## 전처리한 데이터 다시 불러오기

In [None]:
# Re-read the splits from disk; these replace the in-memory train/val/real_test frames.
train, val, real_test = (
    pd.read_csv(csv_path)
    for csv_path in ("/train.csv", "/val.csv", "/real_test.csv")
)

In [None]:
# Separate the label column from the features for each frame.
target = 'is_applied'
X_train = train.drop(target, axis=1)
X_valid = val.drop(target, axis=1)
y_train = train[target]
y_valid = val[target]
X_real_test = real_test.drop(target, axis=1)

# LGBM

In [None]:
from sklearn.model_selection import cross_val_score

# Both names are used below but were not imported anywhere in the notebook, so the
# cell raised a NameError on a fresh kernel.
import optuna
from lightgbm import LGBMClassifier


def LGBM_objective(trial):
    """Optuna objective: validation F1 of an LGBMClassifier with trial-sampled settings.

    Parameters
    ----------
    trial
        Optuna trial used to sample ``max_depth``, ``max_leaf_nodes`` and ``n_estimators``.

    Returns
    -------
    F1 score of the fitted model on the validation split (X_valid / y_valid).
    """
    max_depth = trial.suggest_int('max_depth', 1, 10)
    # NOTE(review): presumably LightGBM treats max_leaf_nodes as an alias of num_leaves —
    # confirm for the installed version.
    max_leaf_nodes = trial.suggest_int('max_leaf_nodes', 2, 1000)
    n_estimators = trial.suggest_int('n_estimators', 100, 500)

    model = LGBMClassifier(max_depth = max_depth, max_leaf_nodes = max_leaf_nodes,n_estimators = n_estimators,n_jobs=2,random_state=25)

    # These two columns are excluded from the model inputs.
    non_feature_cols = ['application_id', 'loanapply_insert_time']
    model.fit(X_train.drop(non_feature_cols, axis=1), y_train)
    y_pred = model.predict(X_valid.drop(non_feature_cols, axis=1))

    # scikit-learn's signature is f1_score(y_true, y_pred); the arguments were swapped.
    score = f1_score(y_valid, y_pred)

    return score

#Execute optuna and set hyperparameters
LGMB_study = optuna.create_study(direction='maximize')
LGMB_study.optimize(LGBM_objective, n_trials=10)

#Create an instance with tuned hyperparameters
optimized_LGMB = LGBMClassifier(max_depth = LGMB_study.best_params['max_depth'], max_leaf_nodes = LGMB_study.best_params['max_leaf_nodes'],
                                      n_estimators = LGMB_study.best_params['n_estimators'],n_jobs=2,random_state=25)

In [None]:
# Fit the tuned model on the training split without the two excluded columns.
optimized_LGMB.fit(X_train.drop(columns=['application_id', 'loanapply_insert_time']), y_train)

LGBMClassifier(max_depth=9, max_leaf_nodes=327, n_estimators=125, n_jobs=2,
               random_state=25)

In [None]:
# X_test / y_test were used here without being defined in any cell, so this cell
# raised a NameError on a fresh run. They are derived from `test`, selecting exactly the
# training columns so the feature layout matches what the model was fitted on.
# NOTE(review): `test` is the in-memory frame from the preprocessing cells, whereas
# train/val/real_test were re-read from CSV, and this assumes `test` carries the label
# column — confirm both against the preprocessing run.
X_test = test.drop(target, axis=1)[X_train.columns]
y_test = test[target]

pred = optimized_LGMB.predict(X_test.drop(['application_id', 'loanapply_insert_time'], axis=1))
# scikit-learn's signature is f1_score(y_true, y_pred).
score_test = f1_score(y_test, pred)
print(score_test)

## test 예측

In [None]:
import joblib

# Load the saved LightGBM model from the shared drive.
# NOTE: joblib.load unpickles the file, so only load artefacts from a trusted source.
lgbm_model_path = "/content/drive/Shareddrives/빅콘테스트/데이터분석분야_퓨처스부문_이달의사원_추가제출파일/추가제출파일/Raw 데이터/lgbm.pkl"
optimized_LGBM = joblib.load(lgbm_model_path)

In [None]:
# Hard labels and class probabilities for the prediction frame.
lgbm_real_features = X_real_test.drop(columns=['application_id', 'loanapply_insert_time'])
pred = optimized_LGBM.predict(lgbm_real_features)
pred_prob = optimized_LGBM.predict_proba(lgbm_real_features)

pred

In [None]:
# Split the probability output into one list per class column, one entry per row of test_fin.
pred_lgbm_0 = [pred_prob[row_idx][0] for row_idx in range(len(test_fin))]
pred_lgbm_1 = [pred_prob[row_idx][1] for row_idx in range(len(test_fin))]

In [None]:
# Prediction table: selected columns of the test_fin snapshot plus both class probabilities.
prediction_df = test_fin['application_id'].to_frame()
for kept_col in ('loanapply_insert_time', 'bank_id', 'product_id', 'loan_limit', 'loan_rate'):
    prediction_df[kept_col] = test_fin[kept_col]
prediction_df['pred_lgb0'] = pred_lgbm_0
prediction_df['pred_lgb1'] = pred_lgbm_1

prediction_df

In [None]:
# Sum of the predicted labels.
np.sum(pred)

In [None]:
# Put the rows into a fixed order and renumber the index.
sort_keys = ['application_id', 'loanapply_insert_time', 'bank_id', 'product_id', 'loan_limit']
prediction_df = prediction_df.sort_values(by=sort_keys).reset_index(drop=True)

# The ensemble section reads "/pred_lgbm.csv", but no cell wrote that file (and
# prediction_df is overwritten by the random-forest section), so the LightGBM
# probabilities are exported here with the same column layout as the random-forest export.
prediction_df[['application_id', 'product_id', 'pred_lgb0', 'pred_lgb1']].to_csv("/pred_lgbm.csv", index=False)

prediction_df

# RF

## 추가 전처리

In [None]:
# Feature selection for the random forest. The original note says to leave out
# first_bank and product_type for now; neither appears in the drop list below.
rf_dropped_cols = [
    'employment_type_계약직', 'income_type_EARNEDINCOME2', 'income_type_PRIVATEBUSINESS',
    'purpose_사업자금', 'purpose_기타', 'income_type_FREELANCER',
    'income_type_OTHERINCOME', 'purpose_전월세보증금', 'employment_type_일용직',
    'income_type_PRACTITIONER', 'purpose_투자', 'purpose_주택구입', 'foreign',
    'purpose_자동차구입', 'personal_rehabilitation_type', 'houseown_type_배우자',
]
# The same columns are removed in place from each of the three frames.
for frame in (train, val, real_test):
    frame.drop(labels=rf_dropped_cols, axis=1, inplace=True)

In [None]:
# Rebuild the feature/label split from the column-reduced frames.
target = 'is_applied'
X_train = train.drop(target, axis=1)
X_valid = val.drop(target, axis=1)
y_train = train[target]
y_valid = val[target]
X_real_test = real_test.drop(target, axis=1)

## 모델링

In [None]:
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier

# Fit a random forest on the training split and report its F1 on the validation split.
model = RandomForestClassifier(n_estimators=100, random_state=42, min_samples_leaf=5)
rf_non_features = ['application_id', 'loanapply_insert_time']
model.fit(X_train.drop(columns=rf_non_features), y_train)
model_pred = model.predict(X_valid.drop(columns=rf_non_features))
rf_valid_f1 = f1_score(y_valid, model_pred)
print(f'F1 Score: {rf_valid_f1:.6f}')

F1 Score: 0.502221


In [None]:
import joblib

# Persist the fitted random forest to disk.
joblib.dump(value=model, filename="/RF.pkl")

In [None]:
# X_test / y_test were used here without being defined in any cell. They are derived
# from `test`, selecting exactly the (column-reduced) training columns so the features
# match what the forest was fitted on even though `test` itself was not reduced.
# NOTE(review): `test` is the in-memory frame from the preprocessing cells, whereas
# train/val/real_test were re-read from CSV, and this assumes `test` carries the label
# column — confirm both against the preprocessing run.
X_test = test.drop(target, axis=1)[X_train.columns]
y_test = test[target]

rf_test_features = X_test.drop(['application_id', 'loanapply_insert_time'], axis=1)
test_pred = model.predict(rf_test_features)
test_pred_prob = model.predict_proba(rf_test_features)
print(f'F1 Score: {f1_score(y_test, test_pred):.6f}')

F1 Score: 0.433371


In [None]:
# Display the hard class predictions of the random forest on the test split.
test_pred

array([1., 1., 0., ..., 1., 1., 1.])

## test 예측

In [None]:
import joblib

# Load the saved random forest from the shared drive; this rebinds `model`.
# NOTE: joblib.load unpickles the file, so only load artefacts from a trusted source.
rf_model_path = "/content/drive/Shareddrives/빅콘테스트/데이터분석분야_퓨처스부문_이달의사원_추가제출파일/추가제출파일/Raw 데이터/RF.pkl"
model = joblib.load(rf_model_path)

In [None]:
# Hard labels and class probabilities of the random forest for the prediction frame.
rf_real_features = X_real_test.drop(columns=['application_id', 'loanapply_insert_time'])
pred = model.predict(rf_real_features)
pred_prob = model.predict_proba(rf_real_features)
pred

array([0., 0., 0., ..., 0., 0., 0.])

In [None]:
# Sum of the predicted labels.
np.sum(pred)

226647.0

In [None]:
# Split the probability output into one list per class column, one entry per row of test_fin.
pred_rf_0 = [pred_prob[row_idx][0] for row_idx in range(len(test_fin))]
pred_rf_1 = [pred_prob[row_idx][1] for row_idx in range(len(test_fin))]

In [None]:
# Prediction table: selected columns of the test_fin snapshot plus both class probabilities.
prediction_df = test_fin['application_id'].to_frame()
for kept_col in ('loanapply_insert_time', 'bank_id', 'product_id', 'loan_limit', 'loan_rate'):
    prediction_df[kept_col] = test_fin[kept_col]
prediction_df['pred_rf0'] = pred_rf_0
prediction_df['pred_rf1'] = pred_rf_1

prediction_df

In [None]:
# Put the rows into a fixed order and renumber the index.
rf_sort_keys = ['application_id', 'loanapply_insert_time', 'bank_id', 'product_id', 'loan_limit']
prediction_df = prediction_df.sort_values(by=rf_sort_keys)
prediction_df = prediction_df.reset_index(drop=True)
prediction_df

In [None]:
# Export the random-forest probabilities. The path is written with a single leading
# slash so it matches the "/pred_rf.csv" that the ensemble section reads back.
prediction_df[['application_id', 'product_id','pred_rf0', 'pred_rf1']].to_csv("/pred_rf.csv", index=False)

# 앙상블

In [None]:
# LightGBM class probabilities exported earlier.
pred_lgbm = pd.read_csv(filepath_or_buffer="/pred_lgbm.csv")

In [None]:
# Random-forest class probabilities exported earlier.
pred_rf = pd.read_csv(filepath_or_buffer="/pred_rf.csv")

In [None]:
# Display the LightGBM prediction table.
pred_lgbm

In [None]:
# Display the random-forest prediction table.
pred_rf

In [None]:
# The random-forest columns are copied by index alignment, so both tables must hold
# the same rows in the same order. A differing row count would otherwise go unnoticed
# and leave NaN probabilities behind, so it is rejected explicitly.
if len(pred_lgbm) != len(pred_rf):
    raise ValueError(
        f"pred_lgbm has {len(pred_lgbm)} rows but pred_rf has {len(pred_rf)}; "
        "the two prediction tables cannot be combined row by row"
    )
pred_lgbm['pred_rf0'] = pred_rf['pred_rf0']
pred_lgbm['pred_rf1'] = pred_rf['pred_rf1']

In [None]:
# Weighted blend of the two models' class probabilities: 0.6 LightGBM, 0.4 random forest.
lgbm_weight, rf_weight = 0.6, 0.4
pred_lgbm['fin_0'] = pred_lgbm['pred_lgb0'] * lgbm_weight + pred_lgbm['pred_rf0'] * rf_weight
pred_lgbm['fin_1'] = pred_lgbm['pred_lgb1'] * lgbm_weight + pred_lgbm['pred_rf1'] * rf_weight

In [None]:
# Display the table including the blended fin_0 / fin_1 columns.
pred_lgbm

In [None]:
# Final label: 1 where the blended class-1 probability is at least the class-0 one,
# otherwise 0 (rows where neither comparison holds also end up as 0).
class_one_wins = pred_lgbm['fin_0'] <= pred_lgbm['fin_1']
pred_lgbm['class'] = class_one_wins.astype('int64')

In [None]:
# Count how many rows were assigned to each final class.
pred_lgbm['class'] .value_counts()