In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

## 1) Feature 추가

### 1.인구 밀집 지역 소속 여부
#### : 인구 밀집 지역(Region_Code 28)에 소속되어 있으면 'main', 아니면 'notmain'

In [5]:
# Flag customers whose Region_Code is 28 (the region this notebook treats as
# the densely populated one): 'main' for that region, 'notmain' otherwise.
# A vectorized comparison replaces the per-row loop, which compared positional
# counters against index labels and was therefore only correct when `df`
# carried a default 0..n-1 index.
df['Population'] = np.where(df['Region_Code'] == 28, 'main', 'notmain')

### 2. 기본 보험 등록자
#### : 연간 납부하는 보험료가 최소인 집단은 기본적인 보험만 가입한 것으로 취급. (기본: 'basic', 그 외: 'option')

In [6]:
# Mark customers paying an Annual_Premium of exactly 2630 as 'basic' and
# everyone else as 'option'.
# NOTE(review): 2630 is presumably the minimum premium in this dataset - confirm.
# Vectorized so the result does not depend on `df` having a default 0..n-1
# index (the old loop tested positional counters against index labels).
df['Basic_Annual'] = np.where(df['Annual_Premium'] == 2630, 'basic', 'option')

### 3. 보험 수혜자
#### : 사전에 보험에 가입했고, 자동차 파손 이력이 있으면 보험 수혜자로 취급 (Yes: 'benefit', No: 'not_benefit')

In [7]:
# 'benefit' when the customer was previously insured AND reports vehicle
# damage; 'not_benefit' in every other case.
df['Beneficiary'] = np.where(
    df['Previously_Insured'].eq(1) & df['Vehicle_Damage'].eq('Yes'),
    'benefit',
    'not_benefit',
)

### 4. 사고 위험도
#### : 자동차 연식이 적은데 사고 경험이 있으면 운전 습관이 위험하다고 판단.
#### -> 사고 위험이 높으면 보험에 대한 필요성 높음

In [8]:
# Accident-risk level derived from damage history and vehicle age:
#   'high' - damaged although the vehicle is less than one year old
#   'low'  - no damage although the vehicle is more than two years old
#   'mid'  - every other combination
# np.select evaluates the conditions in order for the whole column at once.
# This removes the row-by-row df.loc[i, ...] loop, which required index labels
# 0..n-1 and wrote strings into a column first created with the integer 1000.
df['Danger'] = np.select(
    [
        (df['Vehicle_Damage'] == 'Yes') & (df['Vehicle_Age'] == '< 1 Year'),
        (df['Vehicle_Damage'] == 'No') & (df['Vehicle_Age'] == '> 2 Years'),
    ],
    ['high', 'low'],
    default='mid',
)

### 5. 사고 위험도2
#### : 보험을 가입하지 않았는데 사고 이력 있음.

In [9]:
# 'high' when the customer had vehicle damage while NOT previously insured;
# 'low' otherwise.
df['N_Danger'] = np.where(
    df['Previously_Insured'].eq(0) & df['Vehicle_Damage'].eq('Yes'),
    'high',
    'low',
)

### 6. 연령별 사고 경험 비율
#### : 연령대에 따른 사고 유경험자 비율

In [10]:
# Share of customers with vehicle damage ('Yes') inside each age band, rounded
# to 4 decimals and broadcast back to every row of that band.
#
# Fixes compared with the previous loop-based version:
#   * the rate is now explicitly the share of Vehicle_Damage == 'Yes'; it used
#     to be read positionally from value_counts() ([1] for three bands, [0] for
#     one), i.e. the minority/majority share regardless of which label that was;
#   * age 25 belongs to the first band both when computing and when assigning
#     the rate (it was assigned with `< 25` and fell through to the 50+ rate);
#   * no row-by-row df.loc[i, ...] writes, so a non-default index is fine.
# The last band is "everything else", so it also covers ages above 85.
age_band = pd.Series(
    np.select(
        [
            df['Age'] <= 25,
            df['Age'].between(26, 36),
            df['Age'].between(37, 49),
        ],
        ['~25', '26~36', '37~49'],
        default='50~',
    ),
    index=df.index,
)
is_damaged = df['Vehicle_Damage'].eq('Yes')
rate_by_band = is_damaged.groupby(age_band).mean().round(4)

# Per-band rates under their original names (NaN if a band has no rows).
r1, r2, r3, r4 = (
    rate_by_band.get(band, np.nan) for band in ('~25', '26~36', '37~49', '50~')
)

df['Age_damaged'] = age_band.map(rate_by_band)

### 7. 주요 채널 grouping
#### : 채널 변수 내의 각 채널 중 빈도가 높은 채널을 주요 채널로 설정

In [12]:
# 'main_ch' for policy sales channels 152, 26 and 124 (the channels this
# notebook treats as the most frequent ones), 'notmain_ch' for all others.
# isin() replaces the row loop, which needed index labels 0..n-1 and wrote
# strings into a column first created with an integer placeholder.
df['Main_Channel'] = np.where(
    df['Policy_Sales_Channel'].isin([152, 26, 124]),
    'main_ch',
    'notmain_ch',
)

### 8. 연령대 별 주요 채널 grouping
##### : 연령대 별로 빈도 수가 높은 채널이 있어 grouping + 36살을 기준으로 연령대를 크게 두 파트로 나눔

In [13]:
# Channel grouping by the age segment each channel is associated with:
#   'main_over'  - channels 26 and 124
#   'main_under' - channels 152 and 160
#   'channel'    - any other channel
# Vectorized with np.select; the previous row loop required index labels
# 0..n-1 and wrote strings into an integer placeholder column.
df['Age_Channel'] = np.select(
    [
        df['Policy_Sales_Channel'].isin([26, 124]),
        df['Policy_Sales_Channel'].isin([152, 160]),
    ],
    ['main_over', 'main_under'],
    default='channel',
)

### 9. 연령대
##### : 연령대에 따라 특징들이 많이 잡혀 연령대 변수 추가

In [14]:
# Age band label per customer: '~25', '26~36', '37~49', and '50~' for
# everything that matches none of the first three conditions.
# Same band conditions as before, evaluated column-wise instead of through a
# row loop that required index labels 0..n-1 and an integer placeholder column.
df['Age_group'] = np.select(
    [
        df['Age'] <= 25,
        df['Age'].between(26, 36),
        df['Age'].between(37, 49),
    ],
    ['~25', '26~36', '37~49'],
    default='50~',
)

### 10. Young & Rich
#### : Previously_Insured와 Response가 모두 1인 경우 젊고 돈이 많은 것으로 확인

In [15]:
# Rows with Previously_Insured == 1 and Response == 1 get 'Not_YR', all other
# rows get 'YR'. Vectorized; the old row loop required index labels 0..n-1 and
# wrote strings into an integer placeholder column.
# NOTE(review): the section text describes the Previously_Insured == 1 and
# Response == 1 group as the "young & rich" one, so these two labels look
# swapped. They are kept unchanged here so existing outputs stay comparable.
# NOTE(review): this feature is built from Response; if Response is the
# prediction target, the feature leaks the label - confirm before modeling.
df['Young_Rich'] = np.where(
    (df['Previously_Insured'] == 1) & (df['Response'] == 1),
    'Not_YR',
    'YR',
)

In [None]:
# Remove the identifier column and persist the feature-augmented frame.
# errors='ignore' keeps the cell re-runnable: the previous in-place drop
# raised KeyError on a second run because 'id' was already gone.
df = df.drop(columns=['id'], errors='ignore')
df.to_csv('var_df.csv', index=False)

## 2 ) 서브 모델링 데이터 생성

In [None]:
# Rebind `df` to the dataset stored in origin_df.csv; the frame that was saved
# to var_df.csv above is no longer reachable through `df` after this line.
df = pd.read_csv('origin_df.csv')

In [76]:
# Split the original data on vehicle age: rows with Vehicle_Age '> 2 Years'
# are saved as the "balance" subset, all remaining rows as the "imbalance"
# subset (same rule the var_* files are built with).
is_old_vehicle = df['Vehicle_Age'] == '> 2 Years'

model_b = df[is_old_vehicle]
model_b.to_csv("origin_balance.csv", index = False)

# origin_imbalance.csv is read by a later cell but was never written anywhere
# in this notebook, so a fresh run failed on the missing file.
df[~is_old_vehicle].to_csv("origin_imbalance.csv", index = False)

In [77]:
# Load var_df.csv (the file written by the feature-engineering section) into
# `df1`, leaving `df` untouched.
df1 = pd.read_csv('var_df.csv')

In [78]:
# Every row whose Vehicle_Age is anything other than '> 2 Years' goes into the
# "imbalance" subset of the feature-augmented data.
model_imb = df1.loc[df1['Vehicle_Age'].ne('> 2 Years')]
model_imb.to_csv("var_imbalance.csv", index=False)

In [79]:
# Rows with Vehicle_Age '> 2 Years' form the "balance" subset of the
# feature-augmented data.
# NOTE(review): the variable is still called model_imb although it now holds
# the balance subset; the name is kept only to leave notebook state unchanged.
model_imb = df1.loc[df1['Vehicle_Age'].eq('> 2 Years')]
model_imb.to_csv("var_balance.csv", index=False)

## 3 ) 데이터 전처리

In [None]:
def data_split(df, test_size=0.3, random_state=42):
    """Split ``df`` into train and test frames, class by class on ``target``.

    Rows with ``target == 0`` and rows with ``target == 1`` are split
    separately with the same ``test_size``, so both outputs keep roughly the
    class ratio of the input. Rows whose target is neither 0 nor 1 are not
    included in either output. Each output is shuffled, re-indexed from 0 and
    has ``target`` as its last column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data containing a ``target`` column.
    test_size : float or int, default 0.3
        Forwarded to ``train_test_split`` for each class.
    random_state : int or None, default 42
        Seed used for both the per-class splits and the final shuffles, so
        repeated runs produce the same files. Pass ``None`` to get a different
        split on every call (the behaviour before this parameter existed).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(df_train, df_test)``.
    """
    # Keep the historical column layout: all features first, target last.
    ordered_cols = [col for col in df.columns if col != "target"] + ["target"]

    train_parts, test_parts = [], []
    for label in (0, 1):
        class_rows = df.loc[df["target"] == label, ordered_cols]
        part_train, part_test = train_test_split(
            class_rows, test_size=test_size, random_state=random_state
        )
        train_parts.append(part_train)
        test_parts.append(part_test)

    # Shuffle so the two classes are interleaved, then discard the old index.
    df_train = (
        pd.concat(train_parts, axis=0)
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )
    df_test = (
        pd.concat(test_parts, axis=0)
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )

    return df_train, df_test

In [None]:
def LabelEncoding(df):
    """Return a copy of ``df`` with every object-dtype column label-encoded.

    Each object column ``col`` is replaced by an integer column named
    ``'encode_' + col`` appended after the remaining columns. The input frame
    is left unmodified (the previous version added the ``encode_*`` columns to
    the caller's frame before dropping the originals from its own copy).

    Note: a fresh ``LabelEncoder`` is fitted on the values present in ``df``,
    so frames encoded in separate calls only share the same codes when they
    contain the same set of categories.
    """
    object_cols = df.select_dtypes('object').columns.tolist()

    # drop() returns a new frame, so the assignments below never touch `df`.
    encoded = df.drop(columns=object_cols)
    for col in object_cols:
        encoded['encode_' + col] = LabelEncoder().fit_transform(df[col])

    return encoded

In [None]:
def OnehotEncoding(df):
    """Return ``df`` with every object-dtype column one-hot encoded.

    The first level of each column is dropped (``drop_first=True``). Dummy
    columns are named ``encode_<column>_<level>``. The previous shared prefix
    ``'encode'`` produced ``encode_<level>`` only, so two source columns with a
    common level (for example 'low' in both Danger and N_Danger) yielded
    duplicate, indistinguishable column names.

    Note: the dummy columns depend on the levels present in ``df``, so frames
    encoded in separate calls only share the same columns when they contain
    the same set of categories.
    """
    object_cols = df.select_dtypes('object').columns.tolist()
    # One prefix per encoded column keeps every dummy name unique.
    prefixes = ['encode_' + col for col in object_cols]
    df = pd.get_dummies(data=df, columns=object_cols, prefix=prefixes, drop_first=True)

    return df

In [None]:
def outlier(df):
    """Return ``df`` without rows where Driving_License is 0 and Vehicle_Damage is 'No'.

    A new frame is returned and the input is left untouched; the original
    index labels of the surviving rows are preserved. Filtering with a boolean
    mask (instead of dropping by index label in place) also removes exactly
    the matching rows when index labels are duplicated.
    """
    no_license = df['Driving_License'] == 0
    no_damage = df['Vehicle_Damage'] == 'No'

    # copy() so callers can add columns to the result without pandas warning
    # about writing to a slice of the input.
    return df.loc[~(no_license & no_damage)].copy()

In [None]:
def train_ver(df_tr, ver, name):
    """Preprocess a training frame as variant ``ver`` and write it to CSV.

    ver == 1    : outlier() then LabelEncoding()   -> name + "1.csv"
    ver == 2    : LabelEncoding() only             -> name + "2.csv"
    ver == 3    : outlier() then OnehotEncoding()  -> name + "3.csv"
    otherwise   : OnehotEncoding() only            -> name + "4.csv"

    Nothing is returned; the CSV is written without the index.
    """
    # Pick the recipe first, then run the shared tail once.
    if ver == 1:
        remove_rows, encode, suffix = True, LabelEncoding, "1.csv"
    elif ver == 2:
        remove_rows, encode, suffix = False, LabelEncoding, "2.csv"
    elif ver == 3:
        remove_rows, encode, suffix = True, OnehotEncoding, "3.csv"
    else:
        remove_rows, encode, suffix = False, OnehotEncoding, "4.csv"

    prepared = outlier(df_tr) if remove_rows else df_tr
    encode(prepared).to_csv(name + suffix, index=False)

In [None]:
def test_ver(df_test, ver, name):
    """Encode a test frame as variant ``ver`` and write it to CSV.

    Versions 1 and 2 use LabelEncoding(), versions 3 and 4 use
    OnehotEncoding(); no rows are removed from test data. The output file is
    ``name + str(ver) + ".csv"`` without the index. Any other ``ver`` does
    nothing.
    """
    if ver in (1, 2):
        encode = LabelEncoding
    elif ver in (3, 4):
        encode = OnehotEncoding
    else:
        return

    encode(df_test).to_csv(name + str(ver) + ".csv", index=False)

In [None]:
# Main run: no added features, imbalanced subset.
df = pd.read_csv('origin_imbalance.csv')
df_train, df_test = data_split(df)

# Write all four preprocessing versions; every version starts from fresh
# copies of the same train/test split.
for i in (1, 2, 3, 4):
    df_tr = df_train.copy()
    df_ts = df_test.copy()
    train_ver(df_tr, i, 'oi_train')
    test_ver(df_ts, i, 'oi_test')

In [None]:
# Main run: added features, imbalanced subset.
df = pd.read_csv('var_imbalance.csv')
df_train, df_test = data_split(df)

# Write all four preprocessing versions; every version starts from fresh
# copies of the same train/test split.
for i in (1, 2, 3, 4):
    df_tr = df_train.copy()
    df_ts = df_test.copy()
    train_ver(df_tr, i, 'vi_train')
    test_ver(df_ts, i, 'vi_test')

In [None]:
# Main run: no added features, balanced subset.
df = pd.read_csv('origin_balance.csv')

# Plain 70/30 split with a fixed seed (data_split is not used for this subset).
y = df['target']
X = df.drop(columns='target')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Put the target back as the last column of each split.
df_train = pd.concat([X_train, y_train], axis=1)
df_test = pd.concat([X_test, y_test], axis=1)

# Write all four preprocessing versions from fresh copies of the split.
for i in (1, 2, 3, 4):
    df_tr = df_train.copy()
    df_ts = df_test.copy()
    train_ver(df_tr, i, 'ob_train')
    test_ver(df_ts, i, 'ob_test')

In [None]:
# Main run: added features, balanced subset.
df = pd.read_csv('var_balance.csv')

# Plain 70/30 split with a fixed seed (data_split is not used for this subset).
y = df['target']
X = df.drop(columns='target')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Put the target back as the last column of each split.
df_train = pd.concat([X_train, y_train], axis=1)
df_test = pd.concat([X_test, y_test], axis=1)

# Write all four preprocessing versions from fresh copies of the split.
for i in (1, 2, 3, 4):
    df_tr = df_train.copy()
    df_ts = df_test.copy()
    train_ver(df_tr, i, 'vb_train')
    test_ver(df_ts, i, 'vb_test')

In [None]:
# Main run: no added features, full dataset (no sub-modeling split).
df = pd.read_csv('origin_df.csv')
df_train, df_test = data_split(df)

# Write all four preprocessing versions; every version starts from fresh
# copies of the same train/test split.
for i in (1, 2, 3, 4):
    df_tr = df_train.copy()
    df_ts = df_test.copy()
    train_ver(df_tr, i, 'of_train')
    test_ver(df_ts, i, 'of_test')

In [None]:
# Main run: added features, full dataset (no sub-modeling split).
df = pd.read_csv('var_df.csv')
df_train, df_test = data_split(df)

# Write all four preprocessing versions; every version starts from fresh
# copies of the same train/test split.
for i in (1, 2, 3, 4):
    df_tr = df_train.copy()
    df_ts = df_test.copy()
    train_ver(df_tr, i, 'vf_train')
    test_ver(df_ts, i, 'vf_test')