In [1]:
import os
import pandas as pd
import numpy as np
import time
import datetime
import missingno as msno
import pickle
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore") ##忽略警告
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['font.sans-serif']=['SimHei']   # 用黑体显示中文
matplotlib.rcParams['axes.unicode_minus']=False     # 正常显示负号

def data_read(data_path,file_name):
    df = pd.read_csv(os.path.join(data_path,file_name),delim_whitespace = True,header = None)
    ##变量重命名
    columns = ['status_account','duration','credit_history','purpose','amount','svaing_account',
               'present_emp','income_rate','personal_status','other_debtors','residence_info',
               'property','age','inst_plans','housing','num_credits','job','dependents',
               'telephone','foreign_worker','target']
    df.columns = columns
    df.target = df.target -1
    return df

In [2]:
df = data_read('/Users/wanggaojie/PycharmProjects/IntelligentRiskControl/code/chapter4/data','german.csv')

In [3]:
data_train,data_test = train_test_split(df,test_size=0.2,random_state=0,stratify=df.target)

In [4]:
var_no_order = ['credit_history','purpose', 'personal_status', 'other_debtors',
                 'inst_plans', 'housing', 'job','telephone', 'foreign_worker']
var_order = ['duration','income_rate','age','num_credits','dependents',]

In [15]:
data_train[var_no_order].head(5)
data_train[var_no_order].loc[0:5,:]

Unnamed: 0,credit_history,purpose,personal_status,other_debtors,inst_plans,housing,job,telephone,foreign_worker


In [7]:
enc = OneHotEncoder(dtype='int',sparse=False).fit(data_train[var_no_order])
enc.transform(data_train[var_no_order])

array([[0, 0, 0, ..., 0, 1, 0],
       [1, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 1, 1, 0],
       ...,
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 1, ..., 1, 1, 0]])

In [7]:
def onehot_encode(df,data_path_1,flag='train'):
    df = df.reset_index(drop=True)
    ##判断数据集是否存在缺失值
    if sum(df.isnull().any()) > 0 :
        numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        var_numerics = df.select_dtypes(include=numerics).columns
        var_str = [ i for i in df.columns if i not in  var_numerics ]
        ##数据类型的缺失值用-77777填补
        if len(var_numerics) > 0:
            df.loc[:,var_numerics] = df[var_numerics].fillna(-7777)
        ##字符串类型的缺失值用NA填补
        if len(var_str) > 0:
            df.loc[:,var_str] = df[var_str].fillna('NA')
            
    if flag == 'train':
        enc = OneHotEncoder(dtype='int').fit(df)
        ##保存编码模型
        save_model = open(os.path.join(data_path_1 ,'onehot.pkl'), 'wb')
        pickle.dump(enc, save_model, 0)
        save_model.close()
        df_return = pd.DataFrame( enc.transform(df).toarray())
        df_return.columns = enc.get_feature_names(df.columns)
        
    elif flag =='test':
        ##测试数据编码
        read_model = open(os.path.join(data_path_1 ,'onehot.pkl'),'rb')
        onehot_model = pickle.load(read_model)
        read_model.close()
        ##如果训练集无缺失值，测试集有缺失值则将该样本删除
        var_range = onehot_model.categories_
        var_name = df.columns
        del_index = []
        for i in range(len(var_range)):
            if 'NA' not in var_range[i]and 'NA' in df[var_name[i]].unique():
                index = np.where( df[var_name[i]] == 'NA')
                del_index.append(index)
            elif -7777 not in var_range[i] and -7777 in df[var_name[i]].unique():
                index = np.where( df[var_name[i]] == -7777)
                del_index.append(index)
        ##删除样本
        if len(del_index) > 0:
            del_index = np.unique(del_index)
            df = df.drop(del_index)
            print('训练集无缺失值，但测试集有缺失值，第{0}条样本被删除'.format(del_index))
        df_return = pd.DataFrame(onehot_model.transform( df).toarray())
        df_return.columns = onehot_model.get_feature_names(df.columns)
        
    elif flag == 'transform':
        ##编码数据值转化为原始变量
        read_model = open(os.path.join(data_path_1,'onehot.pkl'),'rb')
        onehot_model = pickle.load(read_model)
        read_model.close()
        ##逆变换
        df_return = pd.DataFrame( onehot_model.inverse_transform(df) )
        df_return.columns  = np.unique( ['_'.join(i.rsplit('_')[:-1] ) for i in df.columns])
    return df_return

In [9]:
enc = da

In [11]:
result= OneHotEncoder(sparse = False).fit_transform(enc.reshape(-1,1))

In [12]:
result

array([[0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       ...,
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0.]])

In [11]:
trains = onehot_encode(data_train[var_no_order],'/Users/wanggaojie/PycharmProjects/IntelligentRiskControl/code/chapter5/data','train')

In [14]:
trains.head(8)

Unnamed: 0,credit_history_A30,credit_history_A31,credit_history_A32,credit_history_A33,credit_history_A34,purpose_A40,purpose_A41,purpose_A410,purpose_A42,purpose_A43,...,housing_A152,housing_A153,job_A171,job_A172,job_A173,job_A174,telephone_A191,telephone_A192,foreign_worker_A201,foreign_worker_A202
0,0,0,0,0,1,1,0,0,0,0,...,1,0,0,0,1,0,1,0,1,0
1,1,0,0,0,0,0,1,0,0,0,...,1,0,0,0,1,0,0,1,1,0
2,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,1,0,0,1,1,0
3,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,1,0,1,0
4,0,0,0,0,1,1,0,0,0,0,...,1,0,0,1,0,0,1,0,1,0
5,0,0,1,0,0,0,1,0,0,0,...,0,1,0,0,1,0,0,1,1,0
6,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,1,0
7,0,0,1,0,0,0,0,0,0,1,...,1,0,0,0,1,0,0,1,1,0
