# ロジスティック回帰：Logistic Regression
　投稿結果:0.90882のサンプルプログラム
# 実際にサブミットしてみる。

## Import

In [161]:
import numpy as np
import pandas as pd
import math

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans


import seaborn as sns
import matplotlib.pyplot as plt

## Setting

In [162]:
# Directory prefix used when reading the input CSV files
input_path = "../input_data/"

# Default matplotlib figure size
plt.rcParams.update({"figure.figsize": (15.0, 10.0)})

# Upper limit on the number of columns pandas prints for a frame
pd.options.display.max_columns = 50

## Read Data

In [163]:
# Load the training and test CSV files with identical parsing options
# (comma separated, first row is the header, double quote as quote char).
train = pd.read_csv(
    input_path + "bank/train.csv",
    sep=",",
    header=0,
    quotechar="\"",
)
test = pd.read_csv(
    input_path + "bank/test.csv",
    sep=",",
    header=0,
    quotechar="\"",
)

## Feature Engineering
 age, balance, duration, campaign, pdays を加工し, cluster・campaign_date・job_month_te を追加したうえで, 変数を選択.

In [208]:
def log_FE(data, target_encoding=None):
    """Build the feature frame fed to the logistic-regression model.

    The input frame is copied, so ``data`` itself is never modified.

    Steps performed:
      * standardise ``age`` and ``balance`` and assign each row to one of
        three KMeans clusters (``cluster``),
      * clip ``balance`` to [0, 5704], ``campaign`` to at most 10,
        ``pdays`` to [0, 365] and ``duration`` to [30, 2000] (then log),
      * replace ``age`` by ``(age - 50) ** 2``,
      * derive ``campaign_date`` as the absolute distance between the
        day-of-year of ``month``/``day`` and day 183,
      * optionally add ``job_month_te`` by mapping ``"<job>_<month>"``
        through ``target_encoding``.

    NOTE: the scaler and KMeans are re-fit on every call, so cluster ids
    produced for different frames (e.g. train vs. test) are not guaranteed
    to refer to the same groups.

    Args:
        data: DataFrame containing at least the columns balance, age,
            month, day, campaign, duration, pdays, job, housing, loan,
            education, marital, contact and poutcome.
        target_encoding: Optional mapping from ``"<job>_<month>"`` to an
            encoded value. ``None`` or an empty mapping skips the
            ``job_month_te`` feature.

    Returns:
        A DataFrame with the numeric features (balance, cluster, age,
        duration, campaign, campaign_date, pdays), then ``job_month_te``
        when an encoding was supplied, then the categorical columns.
        ``job_month_te`` is NaN for job/month pairs missing from the
        mapping.
    """
    # ``None`` replaces the former mutable ``{}`` default; both mean
    # "no target encoding".
    if target_encoding is None:
        target_encoding = {}

    data1 = data.copy()
    scaler = StandardScaler()

    # Standardised copies are only used as the clustering inputs.
    data1["balance_z"] = scaler.fit_transform(data1[['balance']])
    data1["age_z"] = scaler.fit_transform(data1[['age']])

    kmeans = KMeans(n_clusters=3, random_state=77, n_init='auto')
    data1['cluster'] = kmeans.fit_predict(data1[["age_z", "balance_z"]])

    # A fixed year (2024) is appended only so that month/day can be parsed
    # into a day-of-year value.
    data1["day_in_year"] = pd.to_datetime(
        data1["month"] + " " + data1["day"].astype(str) + " 2024",
        format="%b %d %Y",
    ).dt.dayofyear

    # Clip after clustering so the clusters are built from the raw balance.
    # NOTE(review): 5704 is presumably a percentile of the training
    # balance - confirm where it comes from.
    data1['balance'] = data1['balance'].clip(lower=0, upper=5704)

    data1["campaign"] = data1["campaign"].clip(upper=10)

    # The lower bound of 30 keeps the logarithm finite and positive.
    data1["duration"] = np.log(data1["duration"].clip(lower=30, upper=2000))

    # Squared distance from age 50.
    data1["age"] = (data1["age"] - 50) ** 2

    # Distance from day 183 (presumably "mid-year" - confirm).
    data1["campaign_date"] = (data1["day_in_year"] - 183).abs()

    data1["pdays"] = data1["pdays"].clip(lower=0, upper=365)

    numeric_cols = ['balance', 'cluster', 'age', 'duration', 'campaign',
                    'campaign_date', 'pdays']
    categorical_cols = ['housing', 'loan', 'job', 'education', 'marital',
                        'contact', 'month', 'poutcome']

    encoded_cols = []
    if len(target_encoding) > 0:
        data1['job_month'] = data1['job'] + '_' + data1['month']
        data1['job_month_te'] = data1['job_month'].map(target_encoding)
        encoded_cols.append('job_month_te')

    # Only select job_month_te when it was actually created; selecting it
    # unconditionally raised KeyError whenever no encoding was passed.
    return data1[numeric_cols + encoded_cols + categorical_cols]

## Check Correlation

In [209]:
# Mean of the target y for every "<job>_<month>" combination in the
# training data, stored as a plain dict for use as a target encoding.
train1 = train.assign(job_month=train['job'] + '_' + train['month'])
job_month_mean = train1.groupby('job_month')['y'].mean()
te_map = job_month_mean.to_dict()
del job_month_mean


In [211]:

# Engineer the training features, then show the Spearman rank correlation
# between the numeric columns.
train1 = log_FE(train, target_encoding=te_map)
train1.corr(numeric_only=True, method="spearman")


Unnamed: 0,balance,cluster,age,duration,campaign,campaign_date,pdays,job_month_te
balance,1.0,0.184151,-0.04554,0.041749,-0.026801,0.090904,0.068704,0.135693
cluster,0.184151,1.0,-0.698207,-0.024309,0.016844,-0.010168,-0.021187,0.11193
age,-0.04554,-0.698207,1.0,0.066108,-0.057291,0.052949,0.043073,0.02426
duration,0.041749,-0.024309,0.066108,1.0,-0.109557,0.021166,0.031625,0.021169
campaign,-0.026801,0.016844,-0.057291,-0.109557,1.0,-0.130045,-0.110402,-0.035643
campaign_date,0.090904,-0.010168,0.052949,0.021166,-0.130045,1.0,0.291403,0.224872
pdays,0.068704,-0.021187,0.043073,0.031625,-0.110402,0.291403,1.0,0.104753
job_month_te,0.135693,0.11193,0.02426,0.021169,-0.035643,0.224872,0.104753,1.0


## Make Dummy Vars
 カテゴリ変数をダミー変数化.

In [212]:
# One-hot encode the categorical columns; drop_first=True keeps k-1 dummy
# columns for a categorical variable with k levels.
train1 = pd.get_dummies(data=train1, drop_first=True)
train1

Unnamed: 0,balance,cluster,age,duration,campaign,campaign_date,pdays,job_month_te,housing_yes,loan_yes,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown,education_secondary,education_tertiary,education_unknown,marital_married,marital_single,contact_telephone,contact_unknown,month_aug,month_dec,month_feb,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_other,poutcome_success,poutcome_unknown
0,1756,0,121,6.844815,1,89,0,0.101911,True,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
1,1443,1,1,5.147494,10,134,0,0.120690,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True
2,436,0,196,6.340359,1,79,365,0.263975,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,474,1,169,6.047372,1,158,0,0.263158,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True
4,354,0,361,6.218600,1,62,9,0.263975,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27123,1455,0,64,5.631212,2,150,269,0.092742,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False
27124,719,0,256,5.891644,3,47,0,0.060000,True,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,True,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,True
27125,49,0,196,6.003887,1,54,0,0.051138,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,True
27126,209,0,289,4.330733,4,42,0,0.082192,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True


## Hold Out
　精度確認のためHold Outを行う.

In [213]:
# Hold-out split.
# Every column of train1 is used as an explanatory variable and train["y"]
# is the target. 20% of the rows go to the validation set, and the seed is
# fixed so the split is reproducible.
train_x, valid_x, train_y, valid_y = train_test_split(
    train1,
    train["y"],
    test_size=0.2,
    random_state=77,
)

## Make Model : Hold Out

In [214]:
# Logistic regression for the hold-out check, using the liblinear solver
Logi_model_ho = LogisticRegression(solver="liblinear")

# Fit on the training part of the split
Logi_model_ho.fit(X=train_x, y=train_y)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'liblinear'
,max_iter,100


## Predict and Check AUC

In [215]:
# Score the validation rows with the second predict_proba column and
# report the ROC AUC against the held-out targets.
valid_proba = Logi_model_ho.predict_proba(valid_x)
pred = valid_proba[:, 1]
del valid_proba
roc_auc_score(y_true=valid_y, y_score=pred)

0.9117658888083372

## Make Model

In [216]:
# Final model: same configuration as the hold-out model
Logi_model = LogisticRegression(solver="liblinear")

# Fit on the complete training set
Logi_model.fit(X=train1, y=train["y"])

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'liblinear'
,max_iter,100


## Predict Test
trainデータと同様に特徴量を加工する

In [219]:
# Apply the same feature engineering as for the training data. Job/month
# pairs that are absent from te_map come back as NaN and are set to 0.
test1 = log_FE(test, te_map).fillna(0)

# Encode every category level, then align to the training design matrix
# (train1 holds the dummy-encoded training features the model was fit on).
# Encoding the test set independently with drop_first=True could drop a
# different level or yield a different column set/order than in training;
# reindexing guarantees identical columns, with 0 for levels that do not
# occur in the test data.
test_x = pd.get_dummies(test1).reindex(columns=train1.columns, fill_value=0)

In [220]:
# Submission frame: the test ids plus the second predict_proba column
# from the final model.
test_proba = Logi_model.predict_proba(test_x)
submit = test[["id"]].assign(pred=test_proba[:, 1])
del test_proba

In [221]:
# Write the submission file: comma separated, without the index column
# and without a header row.
submit.to_csv(
    "../submit/submit_logi_20250929_5.csv",
    sep=",",
    index=False,
    header=False,
)