# 企业非法集资风险预测

In [77]:
!ls -R data/

[1m[36mtest[m[m  [1m[36mtrain[m[m

data//test:
entprise_evaluate.csv

data//train:
annual_report_info.csv entprise_info.csv      tax_info.csv
base_info.csv          news_info.csv
change_info.csv        other_info.csv


In [78]:
import pandas as pd
import numpy as np
import gc
from tqdm import tqdm 

def category_info(data):
    for col in tqdm(data.columns):
        print(col, data[col].nunique(dropna=False))

def fill_nan(data):
    for col in tqdm(data.select_dtypes(['float64']).columns):
        data[col] = data[col].fillna(data[col].median())
    return data

## 1. 数据集处理

包含数据集7和8中涉及到的所有企业的基本信息，每一行代表一个企业的基本数据，每一行有33列，其中id列为企业唯一标识，列之间采用“,”分隔符分割。

id:企业唯一标识, oplocdistrict:行政区划代码, industryphy:行业类别代码, industryco:行业细类代码, dom:经营地址, opscope:经营范围, enttype:企业类型, enttypeitem:企业类型小类, opfrom:经营期限起, opto:经营期限止, state:状态, orgid:机构标识, jobid:职位标识, adbusign:是否广告经营, townsign:是否城镇, regtype:主题登记类型, empnum:从业人数, compform:组织形式, parnum:合伙人数, exenum:执行人数, opform:经营方式, ptbusscope:兼营范围, venind:风险行业, enttypeminu:企业类型细类, midpreindcode:中西部优势产业代码, protype:项目类型, oploc:经营场所, regcap:注册资本（金）, reccap:实缴资本, forreccap:实缴资本（外方）, forregcap:注册资本（外方）, congro:投资总额, enttypegb:企业（机构）类型

#### 1.1 读取数据

In [79]:

base_info = pd.read_csv('./data/train/base_info.csv')
print(base_info.shape)



(24865, 33)


## 1.2 base_info 处理

In [80]:
# 查看特征类别信息
category_info(base_info)

 76%|███████▌  | 25/33 [00:00<00:00, 211.75it/s]id 24865
oplocdistrict 16
industryphy 20
industryco 346
dom 23278
opscope 20815
enttype 17
enttypeitem 32
opfrom 6620
opto 5747
state 6
orgid 78
jobid 434
adbusign 2
townsign 2
regtype 3
empnum 63
compform 3
parnum 52
exenum 51
opform 34
ptbusscope 1
venind 4
enttypeminu 27
midpreindcode 1
100%|██████████| 33/33 [00:00<00:00, 219.64it/s]protype 3
oploc 5351
regcap 1144
reccap 598
forreccap 12
forregcap 39
congro 34
enttypegb 53



In [81]:
# 删掉只有一个类别的
base_info = base_info.drop(columns=['ptbusscope'])
base_info = base_info.drop(columns=['midpreindcode'])

# 删除类别数量较多的
base_info = base_info.drop(columns=['opscope'])
base_info = base_info.drop(columns=['dom'])

base_info.head()

Unnamed: 0,id,oplocdistrict,industryphy,industryco,enttype,enttypeitem,opfrom,opto,state,orgid,...,venind,enttypeminu,protype,oploc,regcap,reccap,forreccap,forregcap,congro,enttypegb
0,47645761dc56bb8c5fae00114b768b5d9b6e917c3aec07c4,340223,M,7513.0,1100,1150.0,2019-07-11 00:00:00,,6,340223010010000000,...,,1151.0,,2367b4cac96d8598,50.0,,,,,1151
1,9c7fa510616a683058ce97d0bc768a621cd85ab1e87da2a3,340222,O,8090.0,9600,,2017-09-06,,6,340222060010000000,...,3.0,,,31487d8f256f16bd6244b7251be2ebb27b17bdfd95c8f3...,10.0,,,,,9600
2,59b38c56de3836838082cfcb1a298951abfe15e6940c49ba,340202,R,9053.0,1100,1150.0,2020-09-14 14:46:30,,6,340202010010000000,...,,1151.0,,2367b4cac96d8598,100.0,,,,,1151
3,e9f7b28ec10e047000d16ab79e1b5e6da434a1697cce7818,340221,L,7212.0,4500,4540.0,2015-09-30,,6,340221010010000000,...,,,,2367b4cac96d8598,10.0,,,,,4540
4,f000950527a6feb63ee1ce82bb22ddd1ab8b8fdffa3b91fb,340202,R,8810.0,1100,1130.0,2017-12-01,2067-11-30,7,340200000000000000,...,,,,2367b4cac96d8598,100.0,,,,,1130


In [82]:
# 处理日期，只取年、日
#  opfrom:经营期限起, opto:经营期限止
opfrom = pd.to_datetime(base_info['opfrom'])#转换格式
opto = pd.to_datetime(base_info['opto'])
base_info['opfrom_year'] = opfrom.dt.year
base_info['opfrom_month'] = opfrom.dt.month
base_info['opto_year'] = opto.dt.year
base_info['opto_month'] = opto.dt.month

del base_info['opfrom']
del base_info['opto']

gc.collect()

584

In [83]:

# 类别清理 `object`

for col in tqdm(['industryco', 'enttypeitem', 'compform', 'venind', 
                 'enttypeminu', 'protype']):
    base_info[col] = base_info[col].fillna(-1).astype('int')

100%|██████████| 6/6 [00:00<00:00, 495.46it/s]


In [84]:
# 数字类型缺失值太多, 且部分 columns std 较大, 先考虑用中值填充

for col in tqdm(base_info.select_dtypes(['float64']).columns):
    base_info[col] = base_info[col].fillna(base_info[col].median())

100%|██████████| 10/10 [00:00<00:00, 566.09it/s]


In [85]:
# 数据清理

base_info['opform'] = base_info['opform'].replace('01', '01-以个人财产出资').replace('02', '02-以家庭共有财产作为个人出资')

In [86]:
# 数据比较长尾, label encoding 和 freq 处理
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score

for col in tqdm(['industryphy', 'opform', 'oploc', 'orgid', 'jobid', 'oplocdistrict',
                 'enttypegb', 'industryco', 'enttype', 'enttypeitem']):
    lbl = LabelEncoder()
    base_info[col] = lbl.fit_transform(base_info[col].astype(str))
    vc = base_info[col].value_counts(dropna=True, normalize=True).to_dict()
    base_info[f'{col}_freq'] = base_info[col].map(vc)

100%|██████████| 10/10 [00:00<00:00, 53.39it/s]


In [87]:
del base_info['forreccap']
del base_info['forregcap']
del base_info['protype']
del base_info['congro']


In [88]:
base_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24865 entries, 0 to 24864
Data columns (total 37 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  24865 non-null  object 
 1   oplocdistrict       24865 non-null  int64  
 2   industryphy         24865 non-null  int64  
 3   industryco          24865 non-null  int64  
 4   enttype             24865 non-null  int64  
 5   enttypeitem         24865 non-null  int64  
 6   state               24865 non-null  int64  
 7   orgid               24865 non-null  int64  
 8   jobid               24865 non-null  int64  
 9   adbusign            24865 non-null  int64  
 10  townsign            24865 non-null  int64  
 11  regtype             24865 non-null  int64  
 12  empnum              24865 non-null  float64
 13  compform            24865 non-null  int64  
 14  parnum              24865 non-null  float64
 15  exenum              24865 non-null  float64
 16  opfo

### 1.3 annual_report_info.csv 处理

包含数据集7和8中涉及到的企业的年报基本信息，每一行代表一个企业的年报基本数据，每一行有23列，其中id列为企业唯一标识，列之间采用“,”分隔符分割。
数据格式如下：
[id:企业唯一标识, ANCHEYEAR:年度, STATE:状态, FUNDAM:资金数额, MEMNUM:成员人数, FARNUM:农民人数, ANNNEWMEMNUM:本年度新增成员人数, ANNREDMEMNUM:本年度退出成员人数, EMPNUM:从业人数, EMPNUMSIGN:从业人数是否公示, BUSSTNAME:经营状态名称, COLGRANUM:其中高校毕业生人数经营者, RETSOLNUM:其中退役士兵人数经营者, DISPERNUM:其中残疾人人数经营者, UNENUM:其中下岗失业人数经营者, COLEMPLNUM:其中高校毕业生人数雇员, RETEMPLNUM:其中退役士兵人数雇员, DISEMPLNUM:其中残疾人人数雇员, UNEEMPLNUM:其中下岗失业人数雇员, WEBSITSIGN:是否有网站标志, FORINVESTSIGN:是否有对外投资企业标志, STOCKTRANSIGN:有限责任公司本年度是否发生股东股权转让标志, PUBSTATE:公示状态：1 全部公示，2部分公示,3全部不公示]


In [89]:
annual_report_info = pd.read_csv('./data/train/annual_report_info.csv')
print(annual_report_info.shape)
print(annual_report_info.info())
category_info(annual_report_info)

100%|██████████| 23/23 [00:00<00:00, 976.93it/s](22550, 23)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22550 entries, 0 to 22549
Data columns (total 23 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             22550 non-null  object 
 1   ANCHEYEAR      22550 non-null  float64
 2   STATE          22545 non-null  float64
 3   FUNDAM         5702 non-null   float64
 4   MEMNUM         29 non-null     float64
 5   FARNUM         29 non-null     float64
 6   ANNNEWMEMNUM   29 non-null     float64
 7   ANNREDMEMNUM   29 non-null     float64
 8   EMPNUM         22535 non-null  float64
 9   EMPNUMSIGN     16833 non-null  float64
 10  BUSSTNAME      17680 non-null  object 
 11  COLGRANUM      20041 non-null  float64
 12  RETSOLNUM      20041 non-null  float64
 13  DISPERNUM      20041 non-null  float64
 14  UNENUM         20041 non-null  float64
 15  COLEMPLNUM     20041 non-null  float64
 16  RETEMPLNUM     20041 non-null  flo

> 共有 22550 条数据，但是id只有8937个，说明有大量数据的重复。总共的共有24865个，那么怎么来处理这些特征呢？

In [90]:
# BUSSTNAME,PUBSTATE 进行 one-hot 编码

cols = ['BUSSTNAME','PUBSTATE','STATE','FORINVESTSIGN','STOCKTRANSIGN','WEBSITSIGN']
for col in cols:
    one_hot_code = pd.get_dummies(annual_report_info[col],dtype = float)
    one_hot_code_cols = []
    for i in range(one_hot_code.shape[1]):
        one_hot_code_cols.append('{}_{}'.format(col,i))
    annual_report_info[one_hot_code_cols] = one_hot_code

In [91]:
annual_report_info.dtypes

id                  object
ANCHEYEAR          float64
STATE              float64
FUNDAM             float64
MEMNUM             float64
FARNUM             float64
ANNNEWMEMNUM       float64
ANNREDMEMNUM       float64
EMPNUM             float64
EMPNUMSIGN         float64
BUSSTNAME           object
COLGRANUM          float64
RETSOLNUM          float64
DISPERNUM          float64
UNENUM             float64
COLEMPLNUM         float64
RETEMPLNUM         float64
DISEMPLNUM         float64
UNEEMPLNUM         float64
WEBSITSIGN         float64
FORINVESTSIGN      float64
STOCKTRANSIGN      float64
PUBSTATE           float64
BUSSTNAME_0        float64
BUSSTNAME_1        float64
BUSSTNAME_2        float64
BUSSTNAME_3        float64
PUBSTATE_0         float64
PUBSTATE_1         float64
PUBSTATE_2         float64
STATE_0            float64
STATE_1            float64
FORINVESTSIGN_0    float64
FORINVESTSIGN_1    float64
STOCKTRANSIGN_0    float64
STOCKTRANSIGN_1    float64
WEBSITSIGN_0       float64
W

In [92]:
# 对其中的一些数值指标，取最近一个时间段的数据
annual_report_info.sort_values(['ANCHEYEAR'],ascending=False,inplace= True)
annual_report_info = annual_report_info.groupby('id').head(1)
annual_report_info.shape

(8937, 38)

In [93]:
del annual_report_info['BUSSTNAME']
del annual_report_info['PUBSTATE']
del annual_report_info['STATE']
del annual_report_info['FORINVESTSIGN']
del annual_report_info['STOCKTRANSIGN']
del annual_report_info['WEBSITSIGN']

In [94]:
annual_report_info.describe()

Unnamed: 0,ANCHEYEAR,FUNDAM,MEMNUM,FARNUM,ANNNEWMEMNUM,ANNREDMEMNUM,EMPNUM,EMPNUMSIGN,COLGRANUM,RETSOLNUM,...,PUBSTATE_1,PUBSTATE_2,STATE_0,STATE_1,FORINVESTSIGN_0,FORINVESTSIGN_1,STOCKTRANSIGN_0,STOCKTRANSIGN_1,WEBSITSIGN_0,WEBSITSIGN_1
count,8937.0,2705.0,9.0,9.0,9.0,9.0,8930.0,6225.0,8110.0,8110.0,...,8937.0,8937.0,8937.0,8937.0,8937.0,8937.0,8937.0,8937.0,8937.0,8937.0
mean,2017.787177,70.799383,6.777778,6.777778,0.0,0.0,9.285554,1.844177,0.711961,0.032552,...,0.186192,0.760098,0.001678,0.998322,0.035582,0.648987,0.025064,0.538212,0.032002,0.967103
std,0.589107,2150.248047,2.223611,2.223611,0.0,0.0,141.50862,0.362717,7.465522,0.294942,...,0.389283,0.427047,0.040936,0.040936,0.185257,0.477314,0.156329,0.498566,0.176015,0.178377
min,2015.0,0.0,5.0,5.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2018.0,3.0,5.0,5.0,0.0,0.0,1.0,2.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,2018.0,8.0,6.0,6.0,0.0,0.0,2.0,2.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
75%,2018.0,10.0,7.0,7.0,0.0,0.0,5.0,2.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
max,2018.0,100000.0,11.0,11.0,0.0,0.0,12045.0,2.0,660.0,15.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [95]:
annual_report_info = fill_nan(annual_report_info)
annual_report_info.info()

100%|██████████| 31/31 [00:00<00:00, 1173.05it/s]<class 'pandas.core.frame.DataFrame'>
Int64Index: 8937 entries, 11275 to 7329
Data columns (total 32 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               8937 non-null   object 
 1   ANCHEYEAR        8937 non-null   float64
 2   FUNDAM           8937 non-null   float64
 3   MEMNUM           8937 non-null   float64
 4   FARNUM           8937 non-null   float64
 5   ANNNEWMEMNUM     8937 non-null   float64
 6   ANNREDMEMNUM     8937 non-null   float64
 7   EMPNUM           8937 non-null   float64
 8   EMPNUMSIGN       8937 non-null   float64
 9   COLGRANUM        8937 non-null   float64
 10  RETSOLNUM        8937 non-null   float64
 11  DISPERNUM        8937 non-null   float64
 12  UNENUM           8937 non-null   float64
 13  COLEMPLNUM       8937 non-null   float64
 14  RETEMPLNUM       8937 non-null   float64
 15  DISEMPLNUM       8937 non-null   float64
 16  UNEEMPL

### 1.4  tax_info.csv



In [96]:
tax_info = pd.read_csv('./data/train/tax_info.csv')
category_info(tax_info)

# 只有808个值，缺失的有点多，先放弃掉这个数据集，后面再考虑

100%|██████████| 9/9 [00:00<00:00, 383.03it/s]id 808
START_DATE 91
END_DATE 91
TAX_CATEGORIES 17
TAX_ITEMS 275
TAXATION_BASIS 1666
TAX_RATE 29
DEDUCTION 248
TAX_AMOUNT 4568



### 1.5 change_info.csv

In [97]:


change_info = pd.read_csv('./data/train/change_info.csv')
category_info(change_info)

# 这个暂时不考虑

100%|██████████| 5/5 [00:00<00:00, 64.05it/s]id 8726
bgxmdm 45
bgq 28802
bgh 30501
bgrq 23663



### 1.6 news_info.csv

In [98]:

news_info = pd.read_csv('./data/train/news_info.csv')
category_info(news_info)

news_info['public_date'].replace(r'\d+小时前|\d+分钟前',value = '2020-09-11',regex = True,inplace = True)
news_info.sort_values(['public_date'],ascending=False,inplace= True)
news_info = news_info.groupby('id').head(1)
news_info.head(10)


100%|██████████| 3/3 [00:00<00:00, 473.70it/s]id 927
positive_negtive 3
public_date 2719



Unnamed: 0,id,positive_negtive,public_date
4192,f000950527a6feb6b1cedd99a67ea0c20a3ce2ca11761489,消极,2020-09-11
8698,f000950527a6feb674725f9f9f10e1e248caae28c793b606,积极,2020-09-11
4647,d8071a739aa75a3b04c9a1789bd657ef83e10c02b1a9fef5,积极,2020-09-11
6160,f000950527a6feb63702b1f6c1dabe5ea196d320bbbff425,积极,2020-09-11
1033,f000950527a6feb6bc5d81f4b5b017d5a93a9096d0245eb1,消极,2020-09-11
4346,47645761dc56bb8ce4034b5b3a2d12beb159d10cc4c77f88,中立,2020-09-11
1540,d8071a739aa75a3bb574df9edeb7d152892dc454f3195d2f,中立,2020-09-11
5019,f000950527a6feb6f16d34b2a6b30955cf157698694adc20,消极,2020-09-11
31,f000950527a6feb6a3e5db12921603ab7f44bcfd265bd5b1,积极,2020-09-10
2173,f000950527a6feb60a1c44d64d8bc8d99a38b6614080ef0f,中立,2020-09-08


In [99]:
# 改为one-hot

news_info[['PN_0','PN_1','PN_2']] = pd.get_dummies(news_info['positive_negtive'],dtype = float) 


public_date = pd.to_datetime(news_info['public_date'])#转换格式
news_info['public_date_year'] = public_date.dt.year
news_info['public_date_month'] = public_date.dt.month
news_info.head()

Unnamed: 0,id,positive_negtive,public_date,PN_0,PN_1,PN_2,public_date_year,public_date_month
4192,f000950527a6feb6b1cedd99a67ea0c20a3ce2ca11761489,消极,2020-09-11,0.0,1.0,0.0,2020,9
8698,f000950527a6feb674725f9f9f10e1e248caae28c793b606,积极,2020-09-11,0.0,0.0,1.0,2020,9
4647,d8071a739aa75a3b04c9a1789bd657ef83e10c02b1a9fef5,积极,2020-09-11,0.0,0.0,1.0,2020,9
6160,f000950527a6feb63702b1f6c1dabe5ea196d320bbbff425,积极,2020-09-11,0.0,0.0,1.0,2020,9
1033,f000950527a6feb6bc5d81f4b5b017d5a93a9096d0245eb1,消极,2020-09-11,0.0,1.0,0.0,2020,9


In [100]:
del news_info['public_date']
del news_info['positive_negtive']

In [101]:
news_info.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 927 entries, 4192 to 2650
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 927 non-null    object 
 1   PN_0               927 non-null    float64
 2   PN_1               927 non-null    float64
 3   PN_2               927 non-null    float64
 4   public_date_year   927 non-null    int64  
 5   public_date_month  927 non-null    int64  
dtypes: float64(3), int64(2), object(1)
memory usage: 50.7+ KB


### 1.7 other_info.csv


In [102]:

other_info = pd.read_csv('./data/train/other_info.csv')
category_info(other_info)

100%|██████████| 4/4 [00:00<00:00, 583.86it/s]id 1888
legal_judgment_num 94
brand_num 83
patent_num 115



In [103]:
# 采取累加的方式
other_info = other_info.fillna(0.0)
groud = other_info.groupby('id')
other_info = groud.agg(np.sum)

## 2. merge data

In [104]:
info = pd.merge(base_info,annual_report_info, how = 'left' ,on='id')
info = pd.merge(info,news_info, how='left',on = 'id')
info = pd.merge(info, other_info, how='left', on ='id')

info.head()

Unnamed: 0,id,oplocdistrict,industryphy,industryco,enttype,enttypeitem,state,orgid,jobid,adbusign,...,WEBSITSIGN_0,WEBSITSIGN_1,PN_0,PN_1,PN_2,public_date_year,public_date_month,legal_judgment_num,brand_num,patent_num
0,47645761dc56bb8c5fae00114b768b5d9b6e917c3aec07c4,10,12,198,0,5,6,62,112,0,...,,,,,,,,,,
1,9c7fa510616a683058ce97d0bc768a621cd85ab1e87da2a3,9,14,244,16,0,6,58,61,0,...,0.0,1.0,,,,,,,,
2,59b38c56de3836838082cfcb1a298951abfe15e6940c49ba,3,17,337,0,5,6,32,422,0,...,,,,,,,,,,
3,e9f7b28ec10e047000d16ab79e1b5e6da434a1697cce7818,8,11,140,9,19,6,50,321,0,...,,,,,,,,,,
4,f000950527a6feb63ee1ce82bb22ddd1ab8b8fdffa3b91fb,3,17,312,0,3,7,1,346,0,...,0.0,1.0,,,,,,,,


In [105]:
# 缺失值填充
# public_date_year public_date_month
# 日期类，填充为创建日期

info['public_date_year'].fillna(info['opfrom_year'])
info['public_date_month'].fillna(info['opfrom_month'])

# 类别采用全0填充
cols = ['BUSSTNAME','PUBSTATE','STATE','FORINVESTSIGN','STOCKTRANSIGN','WEBSITSIGN','PN']
nums = [4,3,2,2,2,2,3]
for num, col in zip(nums,cols):
    for i in range(num):
        info['{}_{}'.format(col,i)].fillna(0.0)
    

# 数值类，采用中值填充
info = fill_nan(info)

100%|██████████| 56/56 [00:00<00:00, 857.79it/s]


## 3 训练

### 3.1 数据集划分

In [106]:
 # 数据集7：entprise_info.csv
# 带标注的企业数据。每一行代表一个企业，每一行2列，其中id列为企业唯一标识，label列为标注（1：有非法集资风险，0：无非法集资风险），列之间采用“,”分隔符分割。

# 训练集 id 及标签

entprise_info = pd.read_csv('data/train/entprise_info.csv')

print(entprise_info.shape)
entprise_info.head()

(14865, 2)


Unnamed: 0,id,label
0,59b38c56de3836831ff90a77d892a13523b7494f6ed09ff7,1
1,da8691b210adb3f6be8064e006f220070565db287275ad38,0
2,82750f1b9d122350918121f97c99bf96e11aa24ee91504a9,0
3,f000950527a6feb6b2c6de6f85c1e7438ba5590be931e2ec,0
4,f1c1045b13d1832927e3743e49d2917f2d98424f0849a373,0


In [107]:
# 数据集8（验证集）：entprise_evaluate.csv
# 未标注企业数据。参赛队伍需提交的最终结果数据集，每一行代表一个企业，每一行有 2 列, 其中id列为企业唯一标识，score列为空，列之间采用“,”分隔符分割。

# 测试集 id

entprise_evaluate = pd.read_csv('data/test/entprise_evaluate.csv')

print(entprise_evaluate.shape)
entprise_evaluate.head()

(10000, 2)


Unnamed: 0,id,score
0,82750f1b9d1223508ee329d47e27d35176c93eb9f35e9c1a,
1,f000950527a6feb670cc1c87c2025f3922aaa4a0206a0a33,
2,e9f7b28ec10e04700ef4db75a494f9a1e8e8b09555e6afa1,
3,beb4aaaa89e0a0ae9d77bd5d7665be6342f552f51840cf19,
4,e9f7b28ec10e0470ee4172cec0133b6826c34f27d3dff204,


In [108]:

# 划分训练集和测试集

entprise_evaluate.columns = ['id', 'label']

labels = pd.concat([entprise_info, entprise_evaluate])
df = pd.merge(info, labels, on='id', how='left')

print(df.shape)
df.head()

(24865, 77)


Unnamed: 0,id,oplocdistrict,industryphy,industryco,enttype,enttypeitem,state,orgid,jobid,adbusign,...,WEBSITSIGN_1,PN_0,PN_1,PN_2,public_date_year,public_date_month,legal_judgment_num,brand_num,patent_num,label
0,47645761dc56bb8c5fae00114b768b5d9b6e917c3aec07c4,10,12,198,0,5,6,62,112,0,...,1.0,0.0,0.0,0.0,2019.0,6.0,1.0,0.0,0.0,0.0
1,9c7fa510616a683058ce97d0bc768a621cd85ab1e87da2a3,9,14,244,16,0,6,58,61,0,...,1.0,0.0,0.0,0.0,2019.0,6.0,1.0,0.0,0.0,
2,59b38c56de3836838082cfcb1a298951abfe15e6940c49ba,3,17,337,0,5,6,32,422,0,...,1.0,0.0,0.0,0.0,2019.0,6.0,1.0,0.0,0.0,0.0
3,e9f7b28ec10e047000d16ab79e1b5e6da434a1697cce7818,8,11,140,9,19,6,50,321,0,...,1.0,0.0,0.0,0.0,2019.0,6.0,1.0,0.0,0.0,0.0
4,f000950527a6feb63ee1ce82bb22ddd1ab8b8fdffa3b91fb,3,17,312,0,3,7,1,346,0,...,1.0,0.0,0.0,0.0,2019.0,6.0,1.0,0.0,0.0,0.0


In [109]:
# 缺失值填充

df[[col for col in df.columns if col != 'label']].fillna(-1, inplace=True)

In [110]:

train = df[df.label.notna()]
test = df[df.label.isna()]

print(train.shape, test.shape)

(14865, 77) (10000, 77)


In [111]:
import lightgbm as lgb

ycol = 'label'
feature_names = list(
    filter(lambda x: x not in [ycol, 'id'], train.columns))

model = lgb.LGBMClassifier(objective='binary',
                           boosting_type='gbdt',
                           tree_learner='serial',
                           num_leaves=128,
                           max_depth=8,
                           learning_rate=0.04,
                           n_estimators=100000,
                           subsample=0.8,
                           feature_fraction=0.8,
                           reg_alpha=0.3,
                           reg_lambda=0.5,
                           random_state=2020,
                           is_unbalance=True)


oof = []
prediction = test[['id']]
prediction[f'{ycol}_0'] = 0
prediction[f'{ycol}_1'] = 0
df_importance_list = []

def f1_score_custom(y_true, y_pred):
    y_pred = y_pred.round()
    return 'f1', f1_score(y_true, y_pred), True

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2020)
for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(train[feature_names], train[ycol])):
    X_train = train.iloc[trn_idx][feature_names]
    Y_train = train.iloc[trn_idx][ycol]

    X_val = train.iloc[val_idx][feature_names]
    Y_val = train.iloc[val_idx][ycol]

    print('\nFold_{} Training ================================\n'.format(fold_id+1))
    

    
    lgb_model = model.fit(X_train,
                          Y_train,
                          eval_names=['train', 'valid'],
                          eval_set=[(X_train, Y_train), (X_val, Y_val)],
                          verbose=500,
                          eval_metric=lambda y_true, y_pred: f1_score_custom(y_true, y_pred),
                          early_stopping_rounds=50)

    pred_val = lgb_model.predict_proba(
        X_val, num_iteration=lgb_model.best_iteration_)
    df_oof = train.iloc[val_idx][['id', ycol]].copy()
    df_oof[f'{ycol}_0'] = pred_val[:,0]
    df_oof[f'{ycol}_1'] = pred_val[:,1]
    oof.append(df_oof)

    pred_test = lgb_model.predict_proba(
        test[feature_names], num_iteration=lgb_model.best_iteration_)
    prediction[f'{ycol}_0'] += pred_test[:,0] / kfold.n_splits
    prediction[f'{ycol}_1'] += pred_test[:,1] / kfold.n_splits

    df_importance = pd.DataFrame({
        'column': feature_names,
        'importance': lgb_model.feature_importances_,
    })
    df_importance_list.append(df_importance)

    del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val
    
    
df_importance = pd.concat(df_importance_list)
df_importance = df_importance.groupby(['column'])['importance'].agg(
    'mean').sort_values(ascending=False).reset_index()
df_importance



Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[17]	train's binary_logloss: 0.0976942	train's f1: 0.83974	valid's binary_logloss: 0.097996	valid's f1: 0.852868


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[17]	train's binary_logloss: 0.0977091	train's f1: 0.845361	valid's binary_logloss: 0.101839	valid's f1: 0.834123


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[18]	train's binary_logloss: 0.0949265	train's f1: 0.846682	valid's binary_logloss: 0.0987511	valid's f1: 0.830357


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[17]	train's binary_logloss: 0.0971305	train's f1: 0.85242	valid's binary_logloss: 0.101078	valid's f1: 0.821596


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[17]	train's binary_logloss: 0.0971406	train's f1: 0.84

Unnamed: 0,column,importance
0,industryco,26.6
1,regcap,25.8
2,jobid_freq,20.8
3,reccap,20.2
4,enttypegb_freq,19.8
...,...,...
70,PUBSTATE_2,0.0
71,PUBSTATE_0,0.0
72,PN_2,0.0
73,PN_1,0.0


In [112]:
df_oof = pd.concat(oof)
score = f1_score(df_oof[ycol].astype('int'), 
                 np.argmax(df_oof[['label_0', 'label_1']].values, axis=1).astype('int'))
print('f1:', score)

f1: 0.8355387523629489


In [113]:
sub = prediction[['id', 'label_1']]
sub.columns = ['id', 'score']

sub = pd.merge(entprise_evaluate, sub, on='id', how='left')
sub.drop(['label'], axis=1, inplace=True)

sub.head()

Unnamed: 0,id,score
0,82750f1b9d1223508ee329d47e27d35176c93eb9f35e9c1a,0.045523
1,f000950527a6feb670cc1c87c2025f3922aaa4a0206a0a33,0.51685
2,e9f7b28ec10e04700ef4db75a494f9a1e8e8b09555e6afa1,0.034678
3,beb4aaaa89e0a0ae9d77bd5d7665be6342f552f51840cf19,0.033203
4,e9f7b28ec10e0470ee4172cec0133b6826c34f27d3dff204,0.094375


In [114]:
sub.to_csv(f'./prediction/baseline_{score}.csv', index=False)