In [None]:
import pandas as pd
import numpy as np
import os
import gc
from datetime import datetime, timedelta
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
import warnings
warnings.simplefilter('ignore')

In [None]:
# Notebook display settings: show every column and row, and up to 200
# characters per cell. The fully qualified 'display.*' keys are required:
# on pandas >= 1.4 the short forms 'max_columns' / 'max_rows' also match
# 'styler.render.*' options and raise OptionError (pattern matched multiple keys).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 200)

## 1、数据前处理

In [None]:
# Locate the data folder relative to the working directory: '1_Data' is a
# sibling of the current directory (i.e. it lives in the parent folder).
FilePath = os.path.abspath(os.curdir)
SuperPath = os.path.split(FilePath)[0]
DocPath = os.path.join(SuperPath, '1_Data')
print(FilePath, SuperPath, sep='\n')

In [None]:
# Read the raw train and test CSV files from the data folder.
test_csv = os.path.join(DocPath, 'testA.csv')
train_csv = os.path.join(DocPath, 'train.csv')
test = pd.read_csv(test_csv)
train = pd.read_csv(train_csv)

In [None]:
# Summarise the test frame: column dtypes, non-null counts and memory usage.
test.info()

In [None]:
# Keep the target column together with its row id before it is dropped from train.
label = train.loc[:, ['id', 'isDefault']]

In [None]:
# Check whether the target classes are balanced.
label['isDefault'].value_counts()

In [None]:
# Remove the target so train has the same feature columns as test.
train = train.drop(columns='isDefault')

In [None]:
# Stack train on top of test so the following cells transform both together.
# DataFrame.append was removed in pandas 2.0; pd.concat is the direct
# replacement and, like append's default, keeps the original row labels.
df = pd.concat([train, test])

In [None]:
# (rows, columns) of the combined train + test frame.
df.shape

In [None]:
# Preview the first rows of the combined frame ('id' is still a regular column here).
df.head(5)

In [None]:
# Use the record id as the row index so it is not treated as a feature column.
df = df.set_index('id')

In [None]:
# Preview again, now that 'id' has been moved into the index.
df.head(5)

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Distinct values of 'n10'; any NaN still shows up here because missing
# values are only filled in a later cell.
df['n10'].unique()

In [None]:
# Replace every missing value, in every column, with the sentinel -1.
# The employmentLength clean-up further down special-cases this sentinel
# through its str(x) == '-1' branch.
df = df.fillna(-1)

## 2、特征工程构建

In [None]:
# List the available columns before building features.
df.columns

In [None]:
import re

In [None]:
# Reformat the field: first inspect the raw employmentLength labels.
df['employmentLength'].value_counts()

In [None]:
# Rewrite the special employment-length labels into '<number> <unit>' form so
# the leading number can be split off afterwards. Values without an entry in
# the table are passed through unchanged.
_employment_length_labels = {
    '< 1 year': '0 year',
    '10+ years': '10 year',
    '-1': '-1 years',  # str() of the -1 sentinel written by the earlier fillna(-1)
}
df['employmentLength'] = df['employmentLength'].apply(
    lambda x: _employment_length_labels.get(str(x), x)
)

In [None]:
# Verify the relabelled employmentLength values.
df['employmentLength'].value_counts()

In [None]:
# Keep only the text before the first space (the number part of '<number> <unit>').
# The result is still a string, e.g. '10' or '-1'.
df['employmentLength'] = df['employmentLength'].map(
    lambda length_label: str(length_label).partition(' ')[0]
)

In [None]:
def _credit_line_year(raw):
    """Return the second '-'-separated field of str(raw).

    Presumably the raw value looks like 'Mon-YYYY', making this the year —
    TODO confirm against the data. Raises IndexError when there is no '-'.
    """
    return str(raw).split('-')[1]


df['earliesCreditLine_year'] = df['earliesCreditLine'].map(_credit_line_year)

In [None]:
# df['earliesCreditLine_month'] = df['earliesCreditLine'].apply(lambda x:str(x).split('-')[0])

In [None]:
# Distribution of the extracted earliesCreditLine_year values.
df['earliesCreditLine_year'].value_counts()

In [None]:
# Grade encoding: turn the letter grade into an ordinal number, 'A' -> 1 ... 'G' -> 7.
# Values outside A-G become NaN.
df['grade'] = df['grade'].map(dict(zip('ABCDEFG', range(1, 8))))

In [None]:
# Letter grade -> ordinal rank ('A' -> 1 ... 'G' -> 7), used by subGrade_map.
grade_map = {letter: rank for rank, letter in enumerate('ABCDEFG', start=1)}

In [None]:
def subGrade_map(x):
    """Encode a two-character sub-grade such as 'A1' as a single integer.

    The first character is looked up in ``grade_map`` and the second is parsed
    as an integer; the result is ``grade_rank * 5 + digit - 1`` (so 'A1' -> 5).

    Raises ValueError if ``x`` does not unpack into exactly two items or the
    second one is not an integer, and KeyError if the first one is not a key
    of ``grade_map``.
    """
    letter, digit = x
    return grade_map[letter] * 5 + int(digit) - 1

In [None]:
# Replace every sub-grade label by its integer code from subGrade_map.
df['subGrade'] = df['subGrade'].apply(subGrade_map)

In [None]:
# Distribution of the encoded sub-grades.
df['subGrade'].value_counts()

In [None]:
# Parse issueDate into datetimes using an explicit 'YYYY-MM-DD' format
# rather than letting pandas guess the layout.
df['issueDate'] = pd.to_datetime(df['issueDate'], format='%Y-%m-%d')

In [None]:
# Reference date from which issueDate offsets are measured.
_reference_day, _day_format = '2007-06-01', '%Y-%m-%d'
starttime = datetime.strptime(_reference_day, _day_format)

In [None]:
# Whole days elapsed between the reference date and each issue date.
elapsed = df['issueDate'] - starttime
df['issueDate_Dt'] = elapsed.dt.days

In [None]:
# Preview the new feature. Only the head is shown: the notebook sets the
# max-rows display option to None, so a bare Series here would render every
# row of the combined train + test frame.
df['issueDate_Dt'].head()

In [None]:
# The raw date columns are no longer needed once their derived features exist.
df = df.drop(columns=['issueDate', 'earliesCreditLine'])

## 2、对变量进行分类

1. 分类变量
2. 数值变量
3. 时间序列

In [None]:
# (rows, columns) after the date features were added and the raw date columns dropped.
df.shape

### 2.1类别特征

In [None]:
# policyCode is dropped because it has only one category (per the original note).
df = df.drop(columns=['policyCode'])

In [None]:
# cate_feature = ['grade','subGrade','employmentTitle','verificationStatus',
#                 'purpose','postCode','homeOwnership','regionCode','applicationType','title']

In [None]:
# for f in cate_feature:
#     print(f,'类型数:',df[f].nunique())

In [None]:
# df = pd.get_dummies(df, columns=['grade','subGrade','verificationStatus','purpose','homeOwnership','regionCode','applicationType'],
#                     drop_first=True)

In [None]:
# Columns treated as categorical (used for the count features below).
cate_features = [
    'applicationType',
    'employmentLength',
    'employmentTitle',
    'grade',
    'homeOwnership',
    'initialListStatus',
    'postCode',
    'purpose',
    'regionCode',
    'subGrade',
    'title',
    'verificationStatus',
]

# Columns treated as numeric.
# NOTE(review): 'employmentLength' and 'subGrade' appear in both lists — confirm this is intended.
dense_features = [
    'annualIncome',
    'delinquency_2years',
    'dti',
    'employmentLength',
    'ficoRangeHigh',
    'ficoRangeLow',
    'installment',
    'interestRate',
    'loanAmnt',
    'openAcc',
    'pubRec',
    'pubRecBankruptcies',
    'revolBal',
    'revolUtil',
    'subGrade',
    'term',
    'totalAcc',
]

In [None]:
# (rows, columns) before the count features are added.
df.shape

In [None]:
# Frequency encoding: for each categorical column add '<column>_cnt', the
# number of rows that share the row's value in that column.
for f in tqdm(cate_features):
    rows_per_value = df.groupby([f])[f].transform('count')
    df['{}_cnt'.format(f)] = rows_per_value

In [None]:
# Pairwise frequency encoding: for every ordered pair of distinct categorical
# columns add '<f1>_<f2>', the number of rows sharing the row's (f1, f2)
# value combination. '<f1>_<f2>' and '<f2>_<f1>' therefore hold the same values.
for f1 in tqdm(cate_features):
    for f2 in cate_features:
        if f1 == f2:
            continue
        # Count one of the grouping keys themselves, so this cell does not
        # depend on a loop variable leaked from another cell.
        df['{}_{}'.format(f1, f2)] = df.groupby([f1, f2])[f1].transform('count')

In [None]:
# (rows, columns) after the single-column and pairwise count features were added.
df.shape