In [16]:
from pathlib import Path

import pandas as pd
from tqdm import tqdm

In [17]:
base_info = pd.read_csv('../data/train/base_info.csv')

# Work on a real copy: a plain assignment would only alias the raw frame, so
# every later in-place edit of base_info_clean would silently change base_info.
base_info_clean = base_info.copy()
# Encode the per-row number of missing fields as a feature of its own.
base_info_clean['nan_num'] = base_info.isnull().sum(axis=1)

nums, shapes = base_info_clean.shape  # (row count, column count)

# Drop columns whose share of missing values is 70% or more.
missing_ratio = base_info_clean.isnull().sum() / nums
mostly_missing = missing_ratio[missing_ratio >= 0.70].index
base_info_clean = base_info_clean.drop(columns=mostly_missing)

# Drop constant columns (a single distinct value, NaN counted as a value).
# Testing `nunique() == 0` would only ever match all-NaN columns and would
# never remove a column that holds one repeated category.
distinct_counts = base_info_clean.nunique(dropna=False)
constant_cols = distinct_counts[distinct_counts <= 1].index
base_info_clean = base_info_clean.drop(columns=constant_cols)


In [18]:
# Remaining missing-value count per column after the cleaning step.
base_info_clean.isna().sum()

id                   0
oplocdistrict        0
industryphy          0
industryco           1
dom                  0
opscope              0
enttype              0
enttypeitem       8214
opfrom               0
opto             16040
state                0
orgid                0
jobid                0
adbusign             0
townsign             0
regtype              0
empnum            5250
compform         14234
opform           15865
venind           16428
oploc                0
regcap             191
enttypegb            0
nan_num              0
dtype: int64

In [19]:
# Remove the 'dom' column in place; it is not used as a feature below.
base_info_clean.drop(columns=['dom'], inplace=True)

In [20]:
# Preview the business-scope text. Tokenising it (dropping the parenthesised
# part, then splitting on the delimiters , 、 。 ; ，) is still a TODO, so no
# split is computed here -- its result would simply be thrown away.
print(base_info_clean['opscope'].head(2))
opscope = base_info_clean['opscope']
# The column is dropped for now: how to encode it is undecided, and it looks
# strongly related to the category columns.
base_info_clean.drop('opscope',axis=1,inplace=True)


0    纳米新材料、机械设备、五金配件加工、销售及技术推广服务，道路货物运输。（依法须经批准的项目，...
1                    健身服务。（依法须经批准的项目，经相关部门批准后方可开展经营活动）
Name: opscope, dtype: object


In [21]:
# Date columns: keep only the year of each date; rows with a missing date get -1.
date_cols = ['opfrom','opto']
for col in tqdm(date_cols):
    # Read the source column from base_info_clean (the frame being modified) so
    # this cell does not depend on base_info and base_info_clean being the
    # same object.
    base_info_clean[f'{col}_year'] = pd.to_datetime(base_info_clean[col]).dt.year.fillna(-1)
# The raw date columns are no longer needed once the year has been extracted.
base_info_clean.drop(date_cols,axis=1,inplace=True)

100%|██████████| 2/2 [00:00<00:00, 54.49it/s]


In [22]:
# Categorical columns.
# Give the bare opform codes '01' / '02' their descriptive labels.
opform_labels = {
    '01': '01-以个人财产出资',
    '02': '02-以家庭共有财产作为个人出资',
}
base_info_clean['opform'] = base_info_clean['opform'].replace(opform_labels)

# Columns treated as categorical; crossed and binned columns are appended later.
cat_cols = [
    'oplocdistrict', 'industryphy', 'industryco', 'enttype', 'enttypeitem',
    'state', 'orgid', 'jobid',
    'adbusign', 'townsign', 'regtype',
    'compform', 'opform', 'venind', 'oploc', 'enttypegb',
]


def cross_category(col1, col2):
    """Add a crossed categorical feature built from two existing columns.

    For every row the values of ``col1`` and ``col2`` are formatted into the
    string ``'<value1>_<value2>'``. The result is stored in the module-level
    ``base_info_clean`` frame under the name ``'<col1>_<col2>'`` and that name
    is appended to the module-level ``cat_cols`` list.

    Args:
        col1: Name of the first column in ``base_info_clean``.
        col2: Name of the second column in ``base_info_clean``.
    """
    crossed_name = f'{col1}_{col2}'
    left_values = base_info_clean[col1].values
    right_values = base_info_clean[col2].values
    base_info_clean[crossed_name] = [
        f'{left}_{right}' for left, right in tqdm(zip(left_values, right_values))
    ]
    cat_cols.append(crossed_name)

# Category crosses: each pair below becomes one combined categorical column.
cross_pairs = [
    ('industryphy', 'industryco'),
    ('enttypegb', 'enttypeitem'),
    ('industryphy', 'enttypegb'),
    ('industryphy', 'enttypeitem'),
    ('industryco', 'enttypeitem'),
    ('industryco', 'enttypegb'),
]
for first_col, second_col in cross_pairs:
    cross_category(first_col, second_col)




24865it [00:00, 489883.83it/s]
24865it [00:00, 377990.54it/s]
24865it [00:00, 660933.68it/s]
24865it [00:00, 745625.39it/s]
24865it [00:00, 474198.25it/s]
24865it [00:00, 504566.48it/s]


In [23]:
# Label-encode every categorical column with its integer category code
# (pandas assigns -1 to missing values).
for column_name in tqdm(cat_cols):
    as_category = base_info_clean[column_name].astype('category')
    base_info_clean[column_name] = as_category.cat.codes

100%|██████████| 22/22 [00:00<00:00, 292.98it/s]


In [24]:
 # Preview the first rows of the encoded categorical columns.
 base_info_clean.loc[:, cat_cols].head()


Unnamed: 0,oplocdistrict,industryphy,industryco,enttype,enttypeitem,state,orgid,jobid,adbusign,townsign,...,opform,venind,oploc,enttypegb,industryphy_industryco,enttypegb_enttypeitem,industryphy_enttypegb,industryphy_enttypeitem,industryco_enttypeitem,industryco_enttypegb
0,10,12,199,0,4,1,62,112,0,0,...,-1,-1,108,6,196,6,141,102,520,625
1,9,14,244,16,-1,1,58,61,0,1,...,3,2,1989,52,244,53,205,145,711,863
2,3,17,336,0,4,1,32,422,0,0,...,-1,-1,108,6,339,6,230,164,1020,1228
3,8,11,143,9,18,1,50,321,0,1,...,1,-1,108,39,138,39,133,94,302,347
4,3,17,312,0,2,2,1,346,0,0,...,-1,-1,108,4,314,4,229,163,917,1109


In [25]:
# Bin the numeric features.

# nan_num is an integer count of missing fields per row and is binned by hand
# (the original note lists the observed values as 7..16):
#   <= 9 -> 1, 10 -> 2, 11 -> 3, 12 -> 4, 13 -> 5, >= 14 -> 6
base_info_clean['nan_num_bin'] = 1
for bin_code, lower_bound in enumerate((10, 11, 12, 13, 14), start=2):
    # Later (higher) thresholds overwrite earlier ones, so each row ends up
    # with the code of the highest threshold it reaches.
    base_info_clean.loc[base_info_clean['nan_num'] >= lower_bound, 'nan_num_bin'] = bin_code
# Register the column under its actual name; a misspelt entry in cat_cols
# would make any later base_info_clean[cat_cols] lookup raise KeyError.
cat_cols.append('nan_num_bin')
print("nan_num 分桶完毕 ......... ")

nan_num 分桶完毕 ......... 


In [26]:
# Registered capital (regcap): fill gaps with the median, then cut into
# 6 equal-frequency bins.
regcap_median = base_info_clean['regcap'].median()
base_info_clean['regcap'] = base_info_clean['regcap'].fillna(regcap_median)
# Note: this reorders the rows of the frame by regcap.
base_info_clean = base_info_clean.sort_values(by='regcap')
base_info_clean['regcap_bin'] = pd.qcut(base_info_clean['regcap'], q=6, labels=False)
cat_cols.append('regcap_bin')
print("注册资本 regcap_bin 分桶完毕 ......... ")

注册资本 regcap_bin 分桶完毕 ......... 


In [27]:
# empnum: fill gaps with the median, then cut into up to 5 equal-frequency
# bins (duplicate bin edges are merged, so fewer bins may result).
empnum_median = base_info_clean['empnum'].median()
base_info_clean['empnum'] = base_info_clean['empnum'].fillna(empnum_median)
# Note: this reorders the rows of the frame by empnum.
base_info_clean = base_info_clean.sort_values(by='empnum')
base_info_clean['empnum_bin'] = pd.qcut(
    base_info_clean['empnum'], q=5, labels=False, duplicates='drop'
)
cat_cols.append('empnum_bin')
print("empnum_bin 分桶完毕 ......... ")

empnum_bin 分桶完毕 ......... 


In [31]:
# Number of distinct non-null values per column of the final feature frame.
base_info_clean.nunique(dropna=True)

id                         24865
oplocdistrict                 16
industryphy                   20
industryco                   346
enttype                       17
enttypeitem                   32
state                          6
orgid                         78
jobid                        434
adbusign                       2
townsign                       2
regtype                        3
empnum                        62
compform                       3
opform                        32
venind                         4
oploc                       5351
regcap                      1143
enttypegb                     53
nan_num                       11
opfrom_year                   35
opto_year                     70
industryphy_industryco       348
enttypegb_enttypeitem         54
industryphy_enttypegb        250
industryphy_enttypeitem      179
industryco_enttypeitem      1040
industryco_enttypegb        1254
nan_num_bin                    6
regcap_bin                     6
empnum_bin

In [30]:
# Write the engineered features to disk without the index column. The output
# directory is created first because to_csv does not create missing parent
# directories and would otherwise fail on a fresh checkout.
feature_path = Path('./features/base_info_feature.csv')
feature_path.parent.mkdir(parents=True, exist_ok=True)
base_info_clean.to_csv(feature_path, index=False)