# 1 模块导入

In [1]:
# -*- coding: utf-8 -*-
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing  # 预处理模块
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
import time
import joblib
from sklearn.linear_model import Lasso, LassoLarsIC

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Display the value of every top-level expression in a cell (not only the last
# one), so several results can be shown without calling print().
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# 2 数据导入与预处理

### 3.1 数据导入

In [3]:
#定义加载数据集的方法
def load_dataset(path,inputfile):
    """Read one CSV file into a DataFrame.

    The file location is the plain string concatenation ``path + inputfile``,
    so ``path`` has to end with a path separator already.

    path: directory prefix (string).
    inputfile: file name appended to ``path``.
    return: the DataFrame produced by ``pd.read_csv``.
    """
    csv_location=path+inputfile
    frame=pd.read_csv(csv_location)
    return frame

In [4]:
path='data/zombie_enterprise_classification/'

# Labelled (training) tables
base_train,know_train,money_train,year_train=[
    load_dataset(path,csv_name)
    for csv_name in ('base-train.csv','knowledge-train.csv','money-train.csv','year-train.csv')
]

# Unlabelled tables that predictions are produced for
base_test,know_test,money_test,year_test=[
    load_dataset(path,csv_name)
    for csv_name in ('base-test.csv','knowledge-test.csv','money-test.csv','year-test.csv')
]

In [5]:
# Shape of every loaded table, one line per table
print(*(
    '{} {}'.format(label,frame.shape)
    for label,frame in (
        ('base_train:',base_train),
        ('know_train:',know_train),
        ('money_train',money_train),
        ('year_train',year_train),
        ('base_test:',base_test),
        ('know_test:',know_test),
        ('money_test',money_test),
        ('year_test',year_test),
    )
),sep='\n')

base_train: (28519, 9)
know_train: (28519, 4)
money_train (85548, 10)
year_train (85548, 10)
base_test: (7132, 8)
know_test: (7132, 4)
money_test (21396, 10)
year_test (21396, 10)


### 3.2 数据预处理

###### 3.2.1 财报、年报数据的year特征填充

In [6]:
# Per-column missing-value counts of the year_* and money_* tables (train and
# test). The bare `.isna().sum()` expressions are displayed only because
# ast_node_interactivity is set to "all" in an earlier cell.
print('数据集：year_train')
year_train.isna().sum()
print('-'*40)
print('数据集：money_train')
money_train.isna().sum()
print('-'*40)
print('数据集：year_test')
year_test.isna().sum()
print('-'*40)
print('数据集：money_test')
money_test.isna().sum()
print('-'*40)

数据集：year_train


ID           0
year       856
从业人数       805
资产总额       927
负债总额       828
营业总收入      876
主营业务收入     845
利润总额       849
纳税总额       817
所有者权益合计    875
dtype: int64

----------------------------------------
数据集：money_train


ID               0
year           845
债权融资额度         809
债权融资成本         881
股权融资额度         849
股权融资成本         876
内部融资和贸易融资额度    808
内部融资和贸易融资成本    828
项目融资和政策融资额度    862
项目融资和政策融资成本    853
dtype: int64

----------------------------------------
数据集：year_test


ID           0
year       210
从业人数       217
资产总额       223
负债总额       188
营业总收入      211
主营业务收入     211
利润总额       185
纳税总额       198
所有者权益合计    229
dtype: int64

----------------------------------------
数据集：money_test


ID               0
year           228
债权融资额度         223
债权融资成本         211
股权融资额度         208
股权融资成本         213
内部融资和贸易融资额度    196
内部融资和贸易融资成本    213
项目融资和政策融资额度    226
项目融资和政策融资成本    229
dtype: int64

----------------------------------------


In [10]:
# Show every row of the first ID that has a missing `year` value.
# NOTE(review): findIdLackOfYear is defined in the cell that follows this one in
# the notebook, so this cell raises NameError on a fresh top-to-bottom run.
show121=findIdLackOfYear(year_train)
year_train[year_train['ID'].isin([show121[0]])]

Unnamed: 0,ID,year,从业人数,资产总额,负债总额,营业总收入,主营业务收入,利润总额,纳税总额,所有者权益合计
3,990,2015.0,863.0,33760.0,25320.0,145168.0,58067.2,14516.8,0.0,8440.0
111,990,,177.0,16880.0,4220.0,75960.0,30384.0,7596.0,0.0,12660.0
219,990,2017.0,880.0,75960.0,101280.0,372204.0,260542.8,37220.4,0.0,-25320.0


In [7]:
#找到缺失年份的ID
def findIdLackOfYear(df,col_id='ID',col_lack='year'):
    """Return the ``col_id`` values of the rows whose ``col_lack`` is missing.

    An ID appears once per row with a missing value, so the returned numpy
    array may contain duplicates.
    """
    missing_mask=df[col_lack].isna()
    return df.loc[missing_mask,col_id].values
#通过ID填补缺失的年份
def fillYearById(df,na_year_id,col_id='ID',col_lack='year',years=(2015,2016,2017)):
    """Fill a single missing year per ID by elimination.

    For every ID in ``na_year_id`` all rows of that ID are taken from ``df``.
    When exactly one of those rows has a missing ``col_lack`` value, it is set
    to the first entry of ``years`` that does not occur among the ID's other
    rows. IDs that still contain a missing value afterwards (more than one
    year missing, or no candidate year left) are left out of the result.

    df: table with one row per (ID, year); it is not modified.
    na_year_id: iterable of IDs to process; duplicates are processed once.
    col_id: name of the ID column.
    col_lack: name of the column holding the year.
    years: candidate years, tried in order.
    return: DataFrame with all rows of the IDs that are complete after
        filling, keeping the original row index; empty (same columns as
        ``df``) when no ID qualifies.
    """
    filled_groups=[]
    # An ID occurs once per missing row in na_year_id; handle each ID once.
    for idx in pd.unique(np.asarray(na_year_id)):
        # Explicit copy: the assignment below must neither depend on nor
        # write through to `df` (chained assignment is unreliable).
        temp=df[df[col_id].isin([idx])].copy()
        na_mask=temp[col_lack].isna()
        # Only a single missing year can be inferred from the other rows.
        if na_mask.sum()==1:
            absent=next((y for y in years if not temp[col_lack].isin([y]).any()),None)
            if absent is not None:
                temp.loc[na_mask,col_lack]=absent
        # Keep the ID only if no year is missing any more.
        if not temp[col_lack].isna().any():
            filled_groups.append(temp)
    if not filled_groups:
        return df.iloc[0:0].copy()
    # One concat at the end instead of growing a frame inside the loop.
    return pd.concat(filled_groups)
#合并到year数据集
def getFilledYear(df,res,col_id='ID'):
    """Replace the rows of the IDs present in ``res`` with the rows of ``res``.

    df: original table.
    res: rows that supersede the rows of ``df`` with the same ``col_id``.
    col_id: name of the ID column.
    return: rows of ``df`` whose ID is not in ``res``, followed by ``res``.
    """
    untouched=df[~df[col_id].isin(res[col_id].values)]
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    return pd.concat([untouched,res])
#精确填充ID缺失的年份
def fillAndMerge(df,col_id='ID',col_lack='year'):
    """Fill the inferable missing years of ``df`` and return the full table.

    Chains findIdLackOfYear -> fillYearById -> getFilledYear. Rows whose year
    cannot be inferred are returned unchanged (still missing).

    col_id / col_lack are forwarded to the helpers; previously they were
    accepted but ignored, so only the default column names worked.
    """
    na_id=findIdLackOfYear(df,col_id,col_lack)
    fill=fillYearById(df,na_id,col_id,col_lack)
    return getFilledYear(df,fill,col_id)

In [11]:
# Fill the inferable missing years of the training tables
year_train321,money_train321=[fillAndMerge(table) for table in (year_train,money_train)]
# ... and of the test tables
year_test321,money_test321=[fillAndMerge(table) for table in (year_test,money_test)]

In [12]:
# Compare each table's shape before and after the year filling
print(*(
    '{} {}'.format(label,frame.shape)
    for label,frame in (
        ('year_train',year_train),
        ('year_train321',year_train321),
        ('money_train',money_train),
        ('money_train321',money_train321),
    )
),sep='\n')
print('-'*40)
print(*(
    '{} {}'.format(label,frame.shape)
    for label,frame in (
        ('year_test',year_test),
        ('year_test321',year_test321),
        ('money_test',money_test),
        ('money_test321',money_test321),
    )
),sep='\n')

year_train (85548, 10)
year_train321 (85548, 10)
money_train (85548, 10)
money_train321 (85548, 10)
----------------------------------------
year_test (21396, 10)
year_test321 (21396, 10)
money_test (21396, 10)
money_test321 (21396, 10)


填充后删除year特征仍有缺失值的样本
可以看出，year特征仍含有少量缺失值，原因是这部分ID各自缺失了不止1个year值，此时无法精确填充，因此直接删除这部分样本

In [13]:
# Number of rows whose `year` is still missing after filling, per table.
# The bare expressions are displayed because ast_node_interactivity is "all".
print('数据集：year_train321',year_train321.shape)
year_train321['year'].isna().sum()
print('-'*40)
print('数据集：money_train321',money_train321.shape)
money_train321['year'].isna().sum()
print('-'*40)
print('数据集：year_test321',year_test321.shape)
year_test321['year'].isna().sum()
print('-'*40)
print('数据集：money_test321',money_test321.shape)
money_test321['year'].isna().sum()

数据集：year_train321 (85548, 10)


24

----------------------------------------
数据集：money_train321 (85548, 10)


10

----------------------------------------
数据集：year_test321 (21396, 10)


0

----------------------------------------
数据集：money_test321 (21396, 10)


8

In [14]:
# Drop the rows whose `year` is still missing (training tables only; the test
# tables are not touched in this cell)
year_train321=year_train321.dropna(subset=['year'])
money_train321=money_train321.dropna(subset=['year'])
# Missing-value count of `year` after the drop
print('数据集：year_train321',year_train321.shape)
year_train321['year'].isna().sum()
print('-'*40)
print('数据集：money_train321',money_train321.shape)
money_train321['year'].isna().sum()

数据集：year_train321 (85524, 10)


0

----------------------------------------
数据集：money_train321 (85538, 10)


0

###### 3.2.2 拆分数据集
按year特征(2015,2016,2017)拆分财报money和年报year数据集

In [15]:
#定义拆分数据集的方法
def split_dataset_by(df,col='year',id_col='ID'):
    """Split ``df`` into one frame per distinct value of ``col``.

    Each part holds the rows with one value of ``col``; the ``col`` column is
    dropped and every remaining column except ``id_col`` gets the (integer)
    value appended to its name, e.g. ``x`` -> ``x2015``.

    df: DataFrame to split.
    col: column to split on; rows where it is missing end up in no part.
    id_col: key column whose name is kept unchanged.
    return: list of DataFrames, ordered by ascending value of ``col``.
    """
    split_df=[]
    # Distinct non-missing values of `col`, ascending
    col_values=np.sort(df[col].dropna().unique())
    for col_value in col_values:
        col_value=int(col_value)
        suffix=str(col_value)
        part=df[df[col].isin([col_value])].drop([col],axis=1)
        # Rename by name rather than by position, so the result is correct
        # wherever id_col sits in the column order. A plain local variable
        # replaces the former locals()[...] bookkeeping, which is not a
        # supported way to store state inside a function.
        part=part.rename(columns={c:c+suffix for c in part.columns if c!=id_col})
        split_df.append(part)
    return split_df

In [16]:
# Split the four yearly tables (money/year, train/test) into per-year frames
split_money_train,split_year_train,split_money_test,split_year_test=[
    split_dataset_by(table)
    for table in (money_train321,year_train321,money_test321,year_test321)
]

###### 3.2.3 合并拆分数据集

In [17]:
#定义合并数据集的方法
def merge_dataset(dfs,col='ID'):
    """Outer-merge a sequence of DataFrames on a shared key column.

    dfs: iterable of DataFrames, merged left to right.
    col: key column to merge on.
    return: the merged DataFrame; the single input frame itself when ``dfs``
        has one element; an empty DataFrame when ``dfs`` is empty.
    """
    df_res=None
    for df in dfs:
        # Start from the first frame that actually has columns. A frame with
        # columns but zero rows is kept, so its columns are not lost.
        if df_res is None or df_res.shape[1]==0:
            df_res=df
        else:
            # how='outer' keeps the keys that occur in either table
            df_res=pd.merge(df_res,df,on=col,how='outer')
    return pd.DataFrame() if df_res is None else df_res

In [18]:
# Re-join the per-year frames of each table into one wide frame per table
money_train_res,year_train_res,money_test_res,year_test_res=[
    merge_dataset(parts)
    for parts in (split_money_train,split_year_train,split_money_test,split_year_test)
]

In [19]:
# Shapes of the wide (one row per ID) tables
print(*(
    '{} {}'.format(label,frame.shape)
    for label,frame in (
        ('数据集：money_train_res',money_train_res),
        ('数据集：year_train_res',year_train_res),
        ('数据集：money_test_res',money_test_res),
        ('数据集：year_test_res',year_test_res),
    )
),sep='\n')

数据集：money_train_res (28516, 25)
数据集：year_train_res (28516, 25)
数据集：money_test_res (7132, 25)
数据集：year_test_res (7132, 25)


In [20]:
# Join the base, knowledge, money and year tables into one train and one test frame
train=merge_dataset(
    [base_train,know_train,money_train_res,year_train_res],
    col='ID',
)
test=merge_dataset(
    [base_test,know_test,money_test_res,year_test_res],
    col='ID',
)

In [21]:
# Shapes of the merged frames and a preview of the training frame
# (all three expressions are displayed because ast_node_interactivity is "all")
train.shape
test.shape
train.head()

(28519, 60)

(7132, 59)

Unnamed: 0,ID,注册时间,注册资本,行业,区域,企业类型,控制人类型,控制人持股比例,flag,专利,...,纳税总额2016,所有者权益合计2016,从业人数2017,资产总额2017,负债总额2017,营业总收入2017,主营业务收入2017,利润总额2017,纳税总额2017,所有者权益合计2017
0,5986361,2014.0,7090.0,服务业,湖北,有限责任公司,自然人,0.93,0,0.0,...,0.0,7090.0,503.0,85080.0,155980.0,187176.0,149740.8,74870.4,37435.2,-70900.0
1,5991749,2007.0,5940.0,零售业,湖南,合伙企业,企业法人,0.57,0,1.0,...,8553.6,109890.0,514.0,178200.0,344520.0,267300.0,106920.0,80190.0,160380.0,-166320.0
2,5998154,2002.0,9720.0,工业,福建,合伙企业,自然人,0.74,0,1.0,...,10886.4,72900.0,891.0,729000.0,719280.0,1458000.0,729000.0,729000.0,291600.0,
3,5984390,2000.0,4800.0,商业服务业,山东,股份有限公司,,0.9,0,0.0,...,61440.0,156000.0,819.0,326400.0,,163200.0,65280.0,48960.0,65280.0,165600.0
4,5980535,2004.0,4530.0,零售业,广东,农民专业合作社,自然人,0.95,0,0.0,...,65232.0,4530.0,158.0,176670.0,258210.0,494676.0,296805.6,197870.4,0.0,-81540.0


In [22]:
# Give train the same column order as test, with the label `flag` as last column
cols323=test.columns.values.tolist()+['flag']
train323=train[cols323].copy()
test323=test

###### 3.2.4 保存总数据集

In [24]:
# Persist the merged train/test frames as CSV (without the row index).
# NOTE(review): the target directory is not created here; it has to exist already.
path324='data/zombie_enterprise_classification/alldata/'
train323.to_csv(path324+'train324.txt',index=False)
test323.to_csv(path324+'test324.txt',index=False)

### 3.3 特征工程

In [25]:
# Reload the files written by the save cell, so feature engineering can start
# from disk without re-running the preprocessing above
path33='data/zombie_enterprise_classification/alldata/'
train33=pd.read_csv(path33+'train324.txt')
test33=pd.read_csv(path33+'test324.txt')

###### 3.3.1 ID特征处理

In [26]:
# Shapes before the ID column is removed
print(train33.shape,test33.shape,sep='\n')

(28519, 60)
(7132, 59)


In [27]:
# Take the ID column out of both frames (in place) and keep it for the
# submission file; pop() returns the column and removes it from the frame
train_Id = train33.pop('ID')
test_Id = test33.pop('ID')

In [28]:
# Shapes after the ID column has been removed
print(train33.shape,test33.shape,sep='\n')

(28519, 59)
(7132, 58)


###### 3.3.2 数据集连接

In [29]:
# Remember the split point and the labels, then stack train and test so that
# both receive identical feature processing
ntrain = train33.shape[0]
ntest = test33.shape[0]
y_train = train33['flag'].values
all_data = (
    pd.concat((train33, test33))
    .reset_index(drop=True)
    .drop(['flag'], axis=1)
)
print('all_data size is {}'.format(all_data.shape))

all_data size is (35651, 58)


###### 3.3.3 缺失值处理

In [30]:
# Missing-value count per column of the stacked train+test frame
all_data.isna().sum()

注册时间               371
注册资本               366
行业                 323
区域                 331
企业类型               366
控制人类型              387
控制人持股比例            360
专利                 351
商标                 377
著作权                339
债权融资额度2015         369
债权融资成本2015         386
股权融资额度2015         369
股权融资成本2015         369
内部融资和贸易融资额度2015    344
内部融资和贸易融资成本2015    378
项目融资和政策融资额度2015    372
项目融资和政策融资成本2015    345
债权融资额度2016         346
债权融资成本2016         364
股权融资额度2016         343
股权融资成本2016         372
内部融资和贸易融资额度2016    352
内部融资和贸易融资成本2016    349
项目融资和政策融资额度2016    372
项目融资和政策融资成本2016    379
债权融资额度2017         344
债权融资成本2017         369
股权融资额度2017         372
股权融资成本2017         375
内部融资和贸易融资额度2017    335
内部融资和贸易融资成本2017    341
项目融资和政策融资额度2017    371
项目融资和政策融资成本2017    385
从业人数2015           349
资产总额2015           400
负债总额2015           344
营业总收入2015          395
主营业务收入2015         364
利润总额2015           347
纳税总额2015           340
所有者权益合计2015        382
从业人数2016           353
资产总额2016   

In [31]:
#定义填充方法
def fill_with_mode(df):
    """Fill the missing values of every column with that column's mode.

    When a column has several modes, the first one returned by
    ``Series.mode`` is used. A column that consists only of missing values
    has no mode and is left unchanged.

    df: DataFrame to fill; it is modified in place.
    return: the same ``df`` object.
    """
    for col in df.columns:
        modes=df[col].mode()
        if modes.empty:
            # Nothing but missing values: there is no mode to fill with
            continue
        df[col]=df[col].fillna(modes.iloc[0])
    return df

In [32]:
# Fill missing values with the per-column mode.
# fill_with_mode assigns into the frame it receives and returns that same
# frame, so all_data333 and all_data refer to one object.
all_data333=fill_with_mode(all_data)

###### 3.3.4 独热编码

In [33]:
# One-hot encode with pd.get_dummies (default arguments), then preview the result
all_data334 = pd.get_dummies(all_data333)
all_data334.head()

Unnamed: 0,注册时间,注册资本,控制人持股比例,专利,商标,著作权,债权融资额度2015,债权融资成本2015,股权融资额度2015,股权融资成本2015,...,区域_湖北,区域_湖南,区域_福建,企业类型_农民专业合作社,企业类型_合伙企业,企业类型_有限责任公司,企业类型_股份有限公司,企业类型_集体所有制企业,控制人类型_企业法人,控制人类型_自然人
0,2014.0,7090.0,0.93,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,1,0,0,0,1
1,2007.0,5940.0,0.57,1.0,1.0,0.0,0.0,0.0,45144.0,1805.76,...,0,1,0,0,1,0,0,0,1,0
2,2002.0,9720.0,0.74,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,1,0,0,0,0,1
3,2000.0,4800.0,0.9,0.0,0.0,0.0,0.0,0.0,21312.0,852.48,...,0,0,0,0,0,0,1,0,1,0
4,2004.0,4530.0,0.95,0.0,1.0,1.0,0.0,0.0,13952.4,558.096,...,0,0,0,1,0,0,0,0,0,1


###### 3.3.5 标签编码

In [36]:
# Columns to be label-encoded
cols_encode=['注册时间']

In [35]:
# Cast numeric columns that really are categories to string before label encoding.
# The cast targets all_data334, the frame that is label-encoded and used for
# modelling; it used to be applied to `train`, which no later cell reads, so it
# had no effect on the features.
feature=['注册时间']
for i in feature:
    all_data334[i]=all_data334[i].astype(str)

In [38]:
# Map the distinct values of each listed column to consecutive integer codes
for c in cols_encode:
    le=LabelEncoder()
    column_values=list(all_data334[c].values)
    le.fit(column_values)
    all_data334[c]=le.transform(column_values)

In [39]:
# Preview the label-encoded column(s)
all_data334[cols_encode].head()

Unnamed: 0,注册时间
0,14
1,7
2,2
3,0
4,4


###### 3.3.6 重新划分数据集

In [40]:
# Cut the processed frame back into its labelled (first ntrain rows) and
# unlabelled (remaining rows) parts
train326 = all_data334.iloc[:ntrain]
test326 = all_data334.iloc[ntrain:]

###### 3.3.7 特征重要性检测

In [43]:
# Fit a Lasso on the labelled data; its coefficients are read in the next cell
# as a feature-importance indication.
# NOTE(review): the features are not standardised at this point (scaling only
# happens in the modelling section), so coefficient sizes also reflect each
# feature's scale.
lasso=Lasso(alpha=0.001)
lasso.fit(train326,y_train)

Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [44]:
# Lasso coefficient of each feature, sorted from largest to smallest
FI_lasso = pd.DataFrame({"特征重要性":lasso.coef_}, index=train326.columns)
FI_lasso.sort_values("特征重要性",ascending=False)

Unnamed: 0,特征重要性
企业类型_有限责任公司,0.016948
区域_江西,0.003585
行业_工业,0.001465
项目融资和政策融资成本2017,0.000110
项目融资和政策融资成本2015,0.000096
...,...
注册时间,-0.001867
区域_广东,-0.002932
企业类型_合伙企业,-0.004633
著作权,-0.004877


# 4 模型定义、训练、评估 

使用SVM作为模型时，通常采用如下流程：

- 对样本数据进行归一化
- 应用核函数对样本进行映射
- 用cross-validation和grid-search对超参数进行优选
- 用最优参数训练得到模型
- 使用训练好的模型进行预测

In [45]:
# Feature matrix and labels of the labelled data.
# The labels are read from train33['flag'] (the same values y_train was built
# from) instead of from `y_train`: `y_train` is rebound to the 80% split
# below, so re-running this cell would otherwise pair the full X with an
# already-split y and fail.
X=train326.values
y=train33['flag'].values

# Feature matrix of the unlabelled data to predict
X_pre=test326.values

# 80/20 train/validation split with a fixed random seed
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=2020)

# Standardise the features; the scaler is fitted on the training part only
scaler=preprocessing.StandardScaler().fit(X_train)

# Apply the same fitted scaler to the training, validation and unlabelled data
X_train=scaler.transform(X_train)
X_test=scaler.transform(X_test)
X_pre=scaler.transform(X_pre)

# Class counts in the training part (printed label -> flag value: 正例 -> 0, 负例 -> 1)
y_list=y_train.tolist()
print("正例：",y_list.count(0))
print("负例：",y_list.count(1))

正例： 13933
负例： 8882


In [None]:
# Tune C and gamma of the SVM (SVC's default kernel) with a 4-fold
# cross-validated grid search. The description line used to be bare text
# inside the code cell, which made the cell a SyntaxError.
grid = GridSearchCV(SVC(), param_grid={"C":[0.1, 1, 10], "gamma":[1, 0.1, 0.01]}, cv=4)
grid.fit(X_train, y_train)
print("The best parameters are %s with a score of %0.2f" %(grid.best_params_, grid.best_score_))

In [46]:
# Train the SVM with fixed C and gamma and time the fit
clf=SVC(C=10,gamma=0.01)
t1=time.time()
clf.fit(X_train,y_train)
t2=time.time()
print("训练时间： %fs"%(t2-t1))
# n_support_: number of support vectors per class
sv=clf.n_support_
print("支持向量：",sv)

# Support-vector counts, first and second class
print("正例：",sv[0])
print("负例：",sv[1])

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

训练时间： 33.007998s
支持向量： [2296 3329]
正例： 2296
负例： 3329


In [47]:
# Score of the trained classifier on the held-out 20% split
accur=clf.score(X_test,y_test)
print("准确率:",accur)

准确率: 0.9665147265077139


In [48]:
# Persist the trained classifier with joblib.
# NOTE(review): the target directory is not created here; it has to exist already.
joblib.dump(clf,'data//zombie_enterprise_classification/model/svc0513.model')

['data//zombie_enterprise_classification/model/svc0513.model']

### 预测

In [49]:
# Predict the flag of every unlabelled sample (X_pre was scaled with the same scaler)
y_pre=clf.predict(X_pre).tolist()

In [50]:
# Pair every test ID with its predicted flag, cast both columns to int and
# write the result file without the row index
df_res=pd.DataFrame({'ID':test_Id,'flag':y_pre}).astype(int)
df_res.to_csv("data//zombie_enterprise_classification/output/res0513.txt",index=False)
df_res

Unnamed: 0,ID,flag
0,5991927,0
1,5998351,0
2,5992703,0
3,5979231,0
4,5995422,1
...,...,...
7127,80564,1
7128,978515,1
7129,568065,1
7130,1889883,1
