# Preprocessing

In [1]:
# warningの無視
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Standard library (dateutil is a third-party package)
import pickle
import datetime
from dateutil.relativedelta import relativedelta

# Core data libraries
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# Preprocessing
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Project-local helper modules
from self_lib import tips
from self_lib import doggie_tail as d_

# IPython magic: render matplotlib figures inline in the notebook
%matplotlib inline

In [3]:
# Show up to 100 columns and 100 rows when a DataFrame is displayed
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

### version setting

In [4]:
mode = "pre"  # run mode: "pre" (df_train.csv) or "test" (df_test_x.csv)

In [5]:
# Directory the preprocessed pickle is written to.
# Both modes currently share the same directory; the file name used when
# saving differs per mode.
if mode=="pre":
    filepath="../data/pre/"
elif mode=="test":
    filepath="../data/pre/"
else:
    # Fail here instead of printing and continuing with `filepath` undefined.
    raise ValueError(f'unknown mode {mode!r}: expected "pre" or "test"')

### Input

In [6]:
# Load the column definitions (column name -> description).
# NOTE: pickle.load can execute code embedded in the file; only load trusted files.
with open('../_regulation/data_def.pickle', 'rb') as def_file:
    d_def = pickle.load(def_file)

In [7]:
# Load the raw CSV that belongs to the selected mode.
raw_csv_by_mode = {
    "pre": '../data/raw/df_train.csv',
    "test": '../data/raw/df_test_x.csv',
}
if mode not in raw_csv_by_mode:
    # Fail here instead of printing and leaving `raw_data` undefined, which
    # would only surface as a NameError in a later cell.
    raise ValueError(f'unknown mode {mode!r}: expected "pre" or "test"')

raw_data = pd.read_csv(
    raw_csv_by_mode[mode],
    encoding='shift-Jis',
    index_col=0,  # first CSV column becomes the index
    # Zip and SystemCode are codes, not quantities: read them as strings
    dtype={"Zip": str, "SystemCode": str},
)

In [8]:
# Work on an independent copy so that raw_data stays untouched
data = raw_data.copy(deep=True)

In [9]:
# Display the column definitions (column name -> description)
d_def

ID                                                               融資番号
Borrower                                債務法人の名前（特定を避けるため関連のない文字列に変換済）
City                                                     債務法人の所在地（都市）
State                                                     債務法人の所在地（州）
Zip                                                    債務法人の所在地（郵便番号）
Bank                                      融資銀行名（特定を避けるため関連のない文字列に変換済）
BankState                                                 融資銀行の所在地（州）
SystemCode                                                産業分類システムコード
ApprovalDate                                               支援機関による承認日
ApprovalFY                                                支援機関による承認年度
Term                                                          融資期間（月）
Employees                                                  債務法人による雇用数
ExistNew                                          1 = 既存の事業, 2 = 新規事業
CreateJob                                                    創出された職種数
RetainedJob         

In [10]:
# Summarise `data` with the project helper d_.d (self_lib.doggie_tail).
# The rendered output lists count / missing / missing_per / unique / top / freq
# and numeric statistics per column.
d_.d(data)

Unnamed: 0,Borrower,City,State,Zip,Bank,BankState,SystemCode,ApprovalDate,ApprovalFY,Term,Employees,ExistNew,CreateJob,RetainedJob,FranchiseCode,UrbanRural,RevLineCr,LowLoan,DisbursementDate,DisbursementGross,BalanceGross,LoanAmount,GuaranteedLoan,LoanStatus
count,255477,255471,255473,255477,255049,255048,255477,255477,255477,255477.00,255477.00,255431.00,255477.00,255477.00,255477.00,255477.00,254117,254729,254876,255477,255477,255477,255477,255477.00
missing,0,6,4,0,428,429,0,0,0,0.00,0.00,46.00,0.00,0.00,0.00,0.00,1360,748,601,0,0,0,0,0.00
missing_per,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.00
unique,255477,20085,51,24373,4598,54,1264,8646,91,,,,,,,,9,7,5284,47102,5,10318,17655,
top,cvesxqshxkwfxet,LOS ANGELES,CA,10001,kcjyktbecc,CA,0,7-Jul-93,2005,,,,,,,,N,N,31-Jul-95,"$50,000.00",$0.00,"$50,000.00","$25,000.00",
freq,1,3441,37942,278,26362,33690,56781,311,19749,,,,,,,,120274,221768,3020,12787,255473,20596,14844,
mean,,,,,,,,,,111.43,11.17,1.28,8.56,10.85,1695.70,0.76,,,,,,,,0.19
std,,,,,,,,,,79.48,75.02,0.45,239.67,239.85,10129.08,0.64,,,,,,,,0.39
min,,,,,,,,,,0.00,0.00,0.00,0.00,0.00,0.00,0.00,,,,,,,,0.00
25%,,,,,,,,,,60.00,2.00,1.00,0.00,0.00,1.00,0.00,,,,,,,,0.00


In [11]:
# Preview the first five rows
data.head(n=5)

Unnamed: 0_level_0,Borrower,City,State,Zip,Bank,BankState,SystemCode,ApprovalDate,ApprovalFY,Term,Employees,ExistNew,CreateJob,RetainedJob,FranchiseCode,UrbanRural,RevLineCr,LowLoan,DisbursementDate,DisbursementGross,BalanceGross,LoanAmount,GuaranteedLoan,LoanStatus
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
9568103009,aaaavgcebciyrso,LONGVIEW,TX,75601,jhcuxowaay,IL,621320,23-Sep-96,1996,84,6,1.0,0,0,1,0,0,N,31-Dec-96,"$100,000.00",$0.00,"$100,000.00","$50,000.00",0
7208134000,aaaavxsbhbgwzum,COEUR D'ALENE,ID,83814,neifwowons,ID,541511,24-Feb-04,2004,26,1,1.0,0,1,1,1,Y,N,31-Mar-04,"$47,719.00",$0.00,"$35,000.00","$17,500.00",0
8283143001,aaabwzkeytthskq,SEELEY LAKE,MT,59868,jwwxcwcjos,MT,0,21-Apr-95,1995,180,2,1.0,0,0,1,0,N,N,31-Jul-95,"$110,000.00",$0.00,"$110,000.00","$82,500.00",0
7653414010,aaadbmusvfrhphi,MILFORD (CENSUS NAME FOR MILFO,NH,3055,ajkgixblfd,RI,561622,29-Jul-04,2004,84,6,1.0,1,7,1,1,0,N,31-Aug-04,"$55,000.00",$0.00,"$100,000.00","$50,000.00",0
9066434010,aaadmbsqmlwccxz,COLUMBIA,MS,39429,mvyccmuael,LA,484110,25-Aug-05,2005,60,1,1.0,1,0,1,2,0,N,30-Nov-05,"$50,000.00",$0.00,"$50,000.00","$25,000.00",0


In [12]:
# Convert ApprovalDate (strings such as "23-Sep-96") to datetime64.
# NOTE(review): the year has only two digits, so the century is chosen by the
# parser - confirm that old records do not end up in the future.
data['ApprovalDate'] = data['ApprovalDate'].pipe(pd.to_datetime)

In [13]:
# data[df_date['ApprovalFY'].isin(['1975'])].sort_values(by='ApprovalDate')
# data[data['ApprovalFY'].isin(['2004'])].sort_values(by='ApprovalDate')

→ ApprovalFY と ApprovalDate の対応が 1976 年でずれているため、ApprovalDate のみを採用する。  
  ApprovalDate は年・月・日に分割して使用する。

In [14]:
# Split ApprovalDate into integer year / month / day columns
data = data.assign(
    Approval_Y=data['ApprovalDate'].dt.year.astype(int),
    Approval_M=data['ApprovalDate'].dt.month.astype(int),
    Approval_D=data['ApprovalDate'].dt.day.astype(int),
)
# The source columns are kept for now (drop is disabled):
# data=data.drop(['ApprovalDate','ApprovalFY'],axis=1)

In [15]:
# Parse DisbursementDate, fall back to ApprovalDate where it is missing, then
# split it into integer year / month / day columns.
#
# errors='coerce' turns unparseable values into NaT so they take the same
# fallback as missing values. errors='ignore' would hand back the unparsed
# column on failure, breaking the .dt accessor below, and is deprecated in
# recent pandas.
# fillna replaces the chained assignment data[col][mask] = ..., which may write
# to a temporary object and is not guaranteed to update `data`.
data['DisbursementDate'] = (
    pd.to_datetime(data['DisbursementDate'], errors='coerce')
    # to_datetime leaves an already-parsed ApprovalDate unchanged, so this cell
    # no longer depends on the ApprovalDate conversion having run first
    .fillna(pd.to_datetime(data['ApprovalDate']))
)
data['DisbursementDate_Y'] = data['DisbursementDate'].dt.year.astype(int)
data['DisbursementDate_M'] = data['DisbursementDate'].dt.month.astype(int)
data['DisbursementDate_D'] = data['DisbursementDate'].dt.day.astype(int)
# The source column is kept for now (drop is disabled):
# data=data.drop('DisbursementDate',axis=1)

In [16]:
# Strip the "$" sign and the thousands separators, then convert to int
l_money = ['DisbursementGross', 'BalanceGross', 'LoanAmount', 'GuaranteedLoan']

for money in l_money:
    data[money] = (
        data[money]
        # regex=False: "$" and "," are meant literally. "$" is an end-of-string
        # anchor in a regex, and the default of `regex` differs between
        # pandas versions.
        .str.replace("$", "", regex=False)
        .str.replace(",", "", regex=False)
        # Values look like "50000.00" at this point: parse as float first,
        # then truncate to int
        .astype(float)
        .astype(int)
    )

In [17]:
# Save the preprocessed frame; the file name depends on the mode.
# Any other mode writes nothing.
if mode in ('pre', 'test'):
    pickle_name = 'pre_data.pickle' if mode == 'pre' else 'pre_test_data.pickle'
    data.to_pickle(filepath + pickle_name)

In [3]:
# Reload both preprocessed pickles from ../data/pre/.
# NOTE(review): the save cell writes only the file for the current `mode`
# ("pre" -> pre_data.pickle, "test" -> pre_test_data.pickle), so the notebook
# must have been run once in each mode before both files exist.
data_pre=pd.read_pickle("../data/pre/pre_data.pickle")
data_test=pd.read_pickle("../data/pre/pre_test_data.pickle")

In [7]:
# data_pre.head()

In [6]:
# data_test.head()