# DATA CLEANING FOR ZFC

In [1]:
%matplotlib inline

# import warnings filter
from warnings import simplefilter
# ignore all future warnings
# NOTE(review): this applies to every FutureWarning raised after this cell,
# including ones coming from the libraries used below, so deprecation
# notices will not appear in any cell output.
simplefilter(action='ignore', category=FutureWarning)

## Tool functions preparation

### Combine base and knowledge data

In [2]:
def combined_base_knowledge_data(base_data, knowledge_data):
    """Join the base and knowledge tables on ``ID`` and impute numeric gaps.

    Parameters
    ----------
    base_data : pandas.DataFrame
        Table with an ``ID`` column; its rows define the result (left join).
    knowledge_data : pandas.DataFrame
        Table with an ``ID`` column; its remaining columns are appended
        after the base columns.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``ID``. Missing values in numeric columns are
        replaced by that column's median; non-numeric columns are left
        untouched, so their missing values remain missing.
    """
    joined = base_data.set_index('ID').join(knowledge_data.set_index('ID'))
    # Restrict the median to numeric columns explicitly: the joined frame
    # also carries text columns, and a plain median() over those fails on
    # recent pandas versions instead of silently skipping them.
    numeric_medians = joined.median(numeric_only=True)
    return joined.fillna(numeric_medians)

### Function definition for year_fill_na

In [3]:
# fill null year 
def year_fill_na(oragin_data, years=(2015, 2016, 2017)):
    """Repair missing ``year`` values per ID, then median-impute what is left.

    For every ID that has at least one row with a missing ``year``, the
    years from ``years`` that the ID does not have yet are assigned, in the
    order given, to that ID's missing-year rows in row order (one row per
    missing year). Afterwards every remaining missing value in a numeric
    column, including any ``year`` that could not be inferred, is replaced
    by that column's median.

    Parameters
    ----------
    oragin_data : pandas.DataFrame
        Long table with ``ID`` and ``year`` columns. It is not modified.
        NOTE(review): row labels are assumed to be unique (e.g. the default
        index produced by ``read_csv``) - confirm for other callers.
    years : iterable of int, optional
        Candidate years, tried in order.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and columns as ``oragin_data``.
    """
    # Work on a copy so the caller's raw table keeps its original gaps.
    filled = oragin_data.copy()

    null_year_mask = filled["year"].isna()
    if null_year_mask.any():
        affected_ids = filled.loc[null_year_mask, "ID"].unique()
        # Only IDs with a missing year need inspecting; a single grouped
        # pass replaces one full-table scan per (year, ID) pair.
        affected_rows = filled.loc[filled["ID"].isin(affected_ids), ["ID", "year"]]
        for _, rows in affected_rows.groupby("ID", sort=False):
            present_years = set(rows["year"].dropna())
            open_labels = list(rows.index[rows["year"].isna()])
            for year in years:
                if not open_labels:
                    break
                if year in present_years:
                    continue
                # The first still-empty row of this ID receives the year.
                filled.loc[open_labels.pop(0), "year"] = year
                present_years.add(year)

    # numeric_only=True keeps the imputation working if a caller passes a
    # frame that also contains non-numeric columns.
    return filled.fillna(filled.median(numeric_only=True))

### Function definition for combined_new_year_money_data

In [4]:
def combined_new_year_money_data(new_year_data, new_money_data):
    """Pair the yearly table with the money table on the (ID, year) key.

    Only key combinations present in both tables are kept (pandas' default
    merge behaviour); the money columns follow the yearly columns.
    """
    join_keys = ['ID', 'year']
    merged = new_year_data.merge(new_money_data, on=join_keys)
    return merged

### Split year_money_data

In [5]:
def split_data(combined_new_year_money_data, years=(2015, 2016, 2017)):
    """Reshape the long (ID, year) table into one wide row per ID.

    For each year the matching rows are indexed by ``ID``, every column is
    suffixed with ``_<year>`` and the now-redundant ``year_<year>`` column
    is dropped. The per-year frames are then merged on ``ID`` one after
    another, so an ID missing from any of the years is absent from the
    result (pandas' default inner merge).

    Parameters
    ----------
    combined_new_year_money_data : pandas.DataFrame
        Long table with ``ID`` and ``year`` columns.
    years : iterable of int, optional
        Years to extract, in output column order.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``ID`` with one block of suffixed columns per year.

    Raises
    ------
    ValueError
        If ``years`` is empty.
    """
    years = list(years)
    if not years:
        raise ValueError("years must contain at least one year")

    wide = None
    for year in years:
        suffix = "_" + str(year)
        # split data with year
        yearly = (
            combined_new_year_money_data
            .loc[combined_new_year_money_data.year == year]
            .set_index('ID')
            .add_suffix(suffix)
            .drop(columns=['year' + suffix])
        )
        # merge data with ID
        wide = yearly if wide is None else pd.merge(wide, yearly, on=['ID'])

    return wide

## Training Data Set

### Load original training data

In [6]:
import pandas as pd

In [7]:
# load training data
# Four raw CSV tables, read with pandas' default options from paths that
# are relative to the notebook's working directory. Later cells join base
# with knowledge on ID, and merge year with money on (ID, year).
base_train_data = pd.read_csv("data/train/base-train.csv")
year_train_data = pd.read_csv("data/train/year-train.csv")
knowledge_train_data = pd.read_csv("data/train/knowledge-train.csv")
money_train_data = pd.read_csv("data/train/money-train.csv")

### base and knowledge data

In [8]:
base_train_data.head()

Unnamed: 0,ID,注册时间,注册资本,行业,区域,企业类型,控制人类型,控制人持股比例,flag
0,5986361,2014.0,7090.0,服务业,湖北,有限责任公司,自然人,0.93,0
1,5991749,2007.0,5940.0,零售业,湖南,合伙企业,企业法人,0.57,0
2,5998154,2002.0,9720.0,工业,福建,合伙企业,自然人,0.74,0
3,5984390,2000.0,4800.0,商业服务业,山东,股份有限公司,,0.9,0
4,5980535,2004.0,4530.0,零售业,广东,农民专业合作社,自然人,0.95,0


In [9]:
base_train_data.describe()

Unnamed: 0,ID,注册时间,注册资本,控制人持股比例,flag
count,28519.0,28230.0,28220.0,28223.0,28519.0
mean,4332423.0,2007.010627,5024.659816,0.754786,0.392721
std,2161092.0,4.3268,2860.157458,0.145008,0.488364
min,28.0,2000.0,100.0,0.51,0.0
25%,2324856.0,2003.0,2530.0,0.63,0.0
50%,5981915.0,2007.0,5010.0,0.75,0.0
75%,5990992.0,2011.0,7490.0,0.88,1.0
max,6000000.0,2014.0,10000.0,1.0,1.0


In [10]:
knowledge_train_data.head()

Unnamed: 0,ID,专利,商标,著作权
0,28,0.0,1.0,1.0
1,230,0.0,0.0,0.0
2,693,0.0,0.0,0.0
3,990,0.0,0.0,0.0
4,1274,0.0,0.0,0.0


In [11]:
knowledge_train_data.describe()

Unnamed: 0,ID,专利,商标,著作权
count,28519.0,28233.0,28216.0,28237.0
mean,4332423.0,0.342507,0.36334,0.371428
std,2161092.0,0.474557,0.48097,0.483195
min,28.0,0.0,0.0,0.0
25%,2324856.0,0.0,0.0,0.0
50%,5981915.0,0.0,0.0,0.0
75%,5990992.0,1.0,1.0,1.0
max,6000000.0,1.0,1.0,1.0


#### combine base and knowledge training data

In [12]:
# Join knowledge onto base by ID and fill numeric gaps with column medians
# (see combined_base_knowledge_data in the tool-function section).
combined_base_knowledge_train_data = combined_base_knowledge_data(base_train_data, knowledge_train_data)
combined_base_knowledge_train_data.head()

Unnamed: 0_level_0,注册时间,注册资本,行业,区域,企业类型,控制人类型,控制人持股比例,flag,专利,商标,著作权
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
5986361,2014.0,7090.0,服务业,湖北,有限责任公司,自然人,0.93,0,0.0,0.0,0.0
5991749,2007.0,5940.0,零售业,湖南,合伙企业,企业法人,0.57,0,1.0,1.0,0.0
5998154,2002.0,9720.0,工业,福建,合伙企业,自然人,0.74,0,1.0,1.0,0.0
5984390,2000.0,4800.0,商业服务业,山东,股份有限公司,,0.9,0,0.0,0.0,0.0
5980535,2004.0,4530.0,零售业,广东,农民专业合作社,自然人,0.95,0,0.0,1.0,1.0


### year and money data

In [13]:
year_train_data.head(10)

Unnamed: 0,ID,year,从业人数,资产总额,负债总额,营业总收入,主营业务收入,利润总额,纳税总额,所有者权益合计
0,28,2015.0,794.0,16400.0,28700.0,72160.0,28864.0,7216.0,0.0,-12300.0
1,230,2015.0,485.0,23520.0,10080.0,115248.0,57624.0,57624.0,0.0,13440.0
2,693,2015.0,534.0,133760.0,125400.0,655424.0,262169.6,196627.2,0.0,8360.0
3,990,2015.0,863.0,33760.0,25320.0,145168.0,58067.2,14516.8,0.0,8440.0
4,1274,2015.0,254.0,74900.0,104325.0,277130.0,110852.0,55426.0,0.0,-29425.0
5,1560,2015.0,491.0,105000.0,98000.0,147000.0,73500.0,29400.0,0.0,7000.0
6,3261,2015.0,799.0,417000.0,822880.0,1751400.0,1401120.0,350280.0,0.0,-405880.0
7,3313,2015.0,784.0,501600.0,986480.0,2156880.0,1294128.0,431376.0,0.0,-484880.0
8,3537,2015.0,647.0,17800.0,13350.0,8900.0,4450.0,2670.0,0.0,4450.0
9,3719,2015.0,369.0,317000.0,465990.0,380400.0,228240.0,152160.0,0.0,-148990.0


In [14]:
year_train_data.describe()

Unnamed: 0,ID,year,从业人数,资产总额,负债总额,营业总收入,主营业务收入,利润总额,纳税总额,所有者权益合计
count,85548.0,84692.0,84743.0,84621.0,84720.0,84672.0,84703.0,84699.0,84731.0,84673.0
mean,4332626.0,2015.99987,510.808421,135008.595502,162335.8,344171.0,206264.4,102793.9,70796.59,-27390.880033
std,2160933.0,0.816398,283.12969,135222.263523,195862.2,441358.5,276952.9,153667.2,158826.1,108355.730296
min,28.0,2015.0,20.0,100.0,0.0,64.0,25.6,7.8,0.0,-828340.0
25%,2325192.0,2015.0,266.0,36300.0,33040.0,58500.0,33300.0,13967.55,0.0,-53130.0
50%,5981916.0,2016.0,512.0,88800.0,91260.0,176400.0,102752.0,45144.0,1240.2,250.0
75%,5990992.0,2017.0,756.0,190080.0,213722.5,449554.5,264321.4,123840.0,66680.4,8900.0
max,5999999.0,2017.0,1000.0,849150.0,1676640.0,4060875.0,2958000.0,1807398.0,2089620.0,429570.0


In [15]:
money_train_data.head()

Unnamed: 0,ID,year,债权融资额度,债权融资成本,股权融资额度,股权融资成本,内部融资和贸易融资额度,内部融资和贸易融资成本,项目融资和政策融资额度,项目融资和政策融资成本
0,28,2015.0,0.0,0.0,0.0,0.0,21648.0,1298.88,0.0,0.0
1,230,2015.0,0.0,0.0,0.0,0.0,0.0,0.0,470.4,28.224
2,693,2015.0,0.0,0.0,0.0,0.0,0.0,0.0,5350.4,321.024
3,990,,0.0,0.0,0.0,0.0,0.0,0.0,675.2,40.512
4,1274,2015.0,0.0,0.0,11085.2,443.408,0.0,0.0,,0.0


In [16]:
money_train_data.describe()

Unnamed: 0,ID,year,债权融资额度,债权融资成本,股权融资额度,股权融资成本,内部融资和贸易融资额度,内部融资和贸易融资成本,项目融资和政策融资额度,项目融资和政策融资成本
count,85548.0,84703.0,84739.0,84667.0,84699.0,84672.0,84740.0,84720.0,84686.0,84695.0
mean,4332626.0,2016.000378,3353.349261,268.717215,5115.09531,204.777317,25962.72,1555.89423,1020.851124,61.231978
std,2160933.0,0.816496,8883.814614,711.04943,17870.10384,715.756218,80257.68,4811.138407,3000.06213,179.87175
min,28.0,2015.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2325192.0,2015.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5981916.0,2016.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,5990992.0,2017.0,0.0,0.0,0.0,0.0,267.3,10.719,41.0,2.52
max,5999999.0,2017.0,84830.0,6786.4,303922.5,12156.9,1215432.0,72925.92,39720.0,2383.2


#### year data fill na

In [17]:
# fill null
new_year_train_data = year_fill_na(year_train_data)

# ID and year deliberately stay as ordinary columns: the later merge on
# ['ID', 'year'] and the per-year filter in split_data read them as columns.
new_year_train_data.head()

Unnamed: 0,ID,year,从业人数,资产总额,负债总额,营业总收入,主营业务收入,利润总额,纳税总额,所有者权益合计
0,28,2015.0,794.0,16400.0,28700.0,72160.0,28864.0,7216.0,0.0,-12300.0
1,230,2015.0,485.0,23520.0,10080.0,115248.0,57624.0,57624.0,0.0,13440.0
2,693,2015.0,534.0,133760.0,125400.0,655424.0,262169.6,196627.2,0.0,8360.0
3,990,2015.0,863.0,33760.0,25320.0,145168.0,58067.2,14516.8,0.0,8440.0
4,1274,2015.0,254.0,74900.0,104325.0,277130.0,110852.0,55426.0,0.0,-29425.0


#### money data fill na

In [18]:
# fill null
new_money_train_data = year_fill_na(money_train_data)

# ID and year stay as ordinary columns for the later merge on ['ID', 'year'].
new_money_train_data.head()

Unnamed: 0,ID,year,债权融资额度,债权融资成本,股权融资额度,股权融资成本,内部融资和贸易融资额度,内部融资和贸易融资成本,项目融资和政策融资额度,项目融资和政策融资成本
0,28,2015.0,0.0,0.0,0.0,0.0,21648.0,1298.88,0.0,0.0
1,230,2015.0,0.0,0.0,0.0,0.0,0.0,0.0,470.4,28.224
2,693,2015.0,0.0,0.0,0.0,0.0,0.0,0.0,5350.4,321.024
3,990,2015.0,0.0,0.0,0.0,0.0,0.0,0.0,675.2,40.512
4,1274,2015.0,0.0,0.0,11085.2,443.408,0.0,0.0,0.0,0.0


#### Merge new year and money data

In [19]:
# Merge new year and money data
# Rows are paired on the (ID, year) key; see combined_new_year_money_data.
combined_new_year_money_train_data = combined_new_year_money_data(new_year_train_data, new_money_train_data)
combined_new_year_money_train_data.head()

Unnamed: 0,ID,year,从业人数,资产总额,负债总额,营业总收入,主营业务收入,利润总额,纳税总额,所有者权益合计,债权融资额度,债权融资成本,股权融资额度,股权融资成本,内部融资和贸易融资额度,内部融资和贸易融资成本,项目融资和政策融资额度,项目融资和政策融资成本
0,28,2015.0,794.0,16400.0,28700.0,72160.0,28864.0,7216.0,0.0,-12300.0,0.0,0.0,0.0,0.0,21648.0,1298.88,0.0,0.0
1,230,2015.0,485.0,23520.0,10080.0,115248.0,57624.0,57624.0,0.0,13440.0,0.0,0.0,0.0,0.0,0.0,0.0,470.4,28.224
2,693,2015.0,534.0,133760.0,125400.0,655424.0,262169.6,196627.2,0.0,8360.0,0.0,0.0,0.0,0.0,0.0,0.0,5350.4,321.024
3,990,2015.0,863.0,33760.0,25320.0,145168.0,58067.2,14516.8,0.0,8440.0,0.0,0.0,0.0,0.0,0.0,0.0,675.2,40.512
4,1274,2015.0,254.0,74900.0,104325.0,277130.0,110852.0,55426.0,0.0,-29425.0,0.0,0.0,11085.2,443.408,0.0,0.0,0.0,0.0


### Merge split data

In [20]:
# Reshape the long (ID, year) table to one row per ID, with one block of
# columns per year suffixed _2015 / _2016 / _2017 (see split_data).
splited_year_money_train_data = split_data(combined_new_year_money_train_data)
splited_year_money_train_data.head()

Unnamed: 0_level_0,从业人数_2015,资产总额_2015,负债总额_2015,营业总收入_2015,主营业务收入_2015,利润总额_2015,纳税总额_2015,所有者权益合计_2015,债权融资额度_2015,债权融资成本_2015,...,纳税总额_2017,所有者权益合计_2017,债权融资额度_2017,债权融资成本_2017,股权融资额度_2017,股权融资成本_2017,内部融资和贸易融资额度_2017,内部融资和贸易融资成本_2017,项目融资和政策融资额度_2017,项目融资和政策融资成本_2017
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
28,794.0,16400.0,28700.0,72160.0,28864.0,7216.0,0.0,-12300.0,0.0,0.0,...,0.0,-77900.0,0.0,0.0,3444.0,137.76,0.0,0.0,0.0,0.0
230,485.0,23520.0,10080.0,115248.0,57624.0,57624.0,0.0,13440.0,0.0,0.0,...,0.0,-70560.0,0.0,0.0,3024.0,120.96,0.0,0.0,0.0,0.0
693,534.0,133760.0,125400.0,655424.0,262169.6,196627.2,0.0,8360.0,0.0,0.0,...,0.0,54340.0,0.0,0.0,32102.4,1284.096,0.0,0.0,0.0,0.0
990,863.0,33760.0,25320.0,145168.0,58067.2,14516.8,0.0,8440.0,0.0,0.0,...,0.0,-25320.0,0.0,0.0,0.0,0.0,111661.2,6699.672,0.0,0.0
1274,254.0,74900.0,104325.0,277130.0,110852.0,55426.0,0.0,-29425.0,0.0,0.0,...,0.0,-205975.0,42800.0,3424.0,0.0,0.0,0.0,0.0,0.0,0.0


## combine all data to train

In [21]:
# Both frames are indexed by ID, so on=['ID'] matches the index level.
# pd.merge defaults to an inner join: IDs missing from either side are
# dropped. NOTE(review): train.describe() further down reports 28516 rows
# while base_train_data.describe() reports 28519 IDs, so a few IDs are
# lost along the inner merges - confirm this is intended.
train = pd.merge(combined_base_knowledge_train_data, 
                 splited_year_money_train_data, 
                 on=['ID'])

In [22]:
train.head()

Unnamed: 0_level_0,注册时间,注册资本,行业,区域,企业类型,控制人类型,控制人持股比例,flag,专利,商标,...,纳税总额_2017,所有者权益合计_2017,债权融资额度_2017,债权融资成本_2017,股权融资额度_2017,股权融资成本_2017,内部融资和贸易融资额度_2017,内部融资和贸易融资成本_2017,项目融资和政策融资额度_2017,项目融资和政策融资成本_2017
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5986361,2014.0,7090.0,服务业,湖北,有限责任公司,自然人,0.93,0,0.0,0.0,...,37435.2,-70900.0,8508.0,680.64,0.0,0.0,0.0,0.0,0.0,0.0
5991749,2007.0,5940.0,零售业,湖南,合伙企业,企业法人,0.57,0,1.0,1.0,...,160380.0,-166320.0,0.0,0.0,0.0,0.0,80190.0,4811.4,0.0,0.0
5998154,2002.0,9720.0,工业,福建,合伙企业,自然人,0.74,0,1.0,1.0,...,291600.0,250.0,0.0,0.0,145800.0,5832.0,0.0,0.0,0.0,0.0
5984390,2000.0,4800.0,商业服务业,山东,股份有限公司,,0.9,0,0.0,0.0,...,65280.0,165600.0,0.0,0.0,0.0,0.0,48960.0,2937.6,0.0,0.0
5980535,2004.0,4530.0,零售业,广东,农民专业合作社,自然人,0.95,0,0.0,1.0,...,0.0,-81540.0,17667.0,1413.36,0.0,0.0,0.0,0.0,0.0,0.0


## Save train

In [23]:
# Persist the assembled training table; the ID index is written as the
# first CSV column (to_csv writes the index by default).
train.to_csv("data/train/train.csv")

In [24]:
train.describe()

Unnamed: 0,注册时间,注册资本,控制人持股比例,flag,专利,商标,著作权,从业人数_2015,资产总额_2015,负债总额_2015,...,纳税总额_2017,所有者权益合计_2017,债权融资额度_2017,债权融资成本_2017,股权融资额度_2017,股权融资成本_2017,内部融资和贸易融资额度_2017,内部融资和贸易融资成本_2017,项目融资和政策融资额度_2017,项目融资和政策融资成本_2017
count,28516.0,28516.0,28516.0,28516.0,28516.0,28516.0,28516.0,28516.0,28516.0,28516.0,...,28516.0,28516.0,28516.0,28516.0,28516.0,28516.0,28516.0,28516.0,28516.0,28516.0
mean,2007.010836,5024.064385,0.754743,0.392692,0.339108,0.359517,0.367794,511.746563,119299.86604,142560.9,...,77943.21,-31054.918642,3647.518095,291.400463,5667.460849,226.713881,28509.57,1712.928322,1115.153493,67.013075
std,4.304831,2844.93878,0.144249,0.488358,0.473415,0.479867,0.482213,281.742345,124479.32416,179471.2,...,170222.4,117106.108417,9425.14597,753.38507,19198.418658,768.471297,86515.85,5202.890113,3229.079861,193.992216
min,2000.0,100.0,0.51,0.0,0.0,0.0,0.0,20.0,100.0,0.0,...,0.0,-828340.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2003.0,2560.0,0.63,0.0,0.0,0.0,0.0,271.0,28840.0,25680.0,...,0.0,-62211.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2007.0,5010.0,0.75,0.0,0.0,0.0,0.0,512.0,77070.0,77097.5,...,1240.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2011.0,7470.0,0.88,1.0,1.0,1.0,1.0,754.0,167410.0,185265.0,...,76704.0,8880.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2014.0,10000.0,1.0,1.0,1.0,1.0,1.0,1000.0,750000.0,1463720.0,...,2010624.0,429570.0,84830.0,6786.4,287287.5,11491.5,1215432.0,72925.92,38930.0,2335.8


## Test Data Set

### Load original testing data

In [25]:
# load testing data
# Same four-table layout as the training set, read with pandas' default
# CSV options from paths relative to the notebook's working directory.
base_test_data = pd.read_csv("data/test/base-test.csv")
year_test_data = pd.read_csv("data/test/year-test.csv")
knowledge_test_data = pd.read_csv("data/test/knowledge-test.csv")
money_test_data = pd.read_csv("data/test/money-test.csv")

### base and knowledge data

In [26]:
# base_test_data.head()
# base_test_data.describe()
# knowledge_test_data.head()

#### combine base and knowledge testing data

In [27]:
# Join knowledge onto base by ID and fill numeric gaps with column medians
# (see combined_base_knowledge_data in the tool-function section).
combined_base_knowledge_test_data = combined_base_knowledge_data(base_test_data, knowledge_test_data)
combined_base_knowledge_test_data.describe()

Unnamed: 0,注册时间,注册资本,控制人持股比例,专利,商标,著作权
count,7132.0,7132.0,7132.0,7132.0,7132.0,7132.0
mean,2007.077257,5039.91447,0.754799,0.342821,0.358385,0.373528
std,4.321929,2839.476208,0.143953,0.474686,0.47956,0.483774
min,2000.0,100.0,0.51,0.0,0.0,0.0
25%,2003.0,2640.0,0.63,0.0,0.0,0.0
50%,2007.0,5040.0,0.75,0.0,0.0,0.0
75%,2011.0,7450.0,0.88,1.0,1.0,1.0
max,2014.0,10000.0,1.0,1.0,1.0,1.0


### year and money data

In [28]:
# NOTE(review): a cell renders only its last expression, so the
# year_test_data summary on the next line is computed but never shown;
# the saved output of this cell contains the money columns only.
year_test_data.describe()
money_test_data.describe()

Unnamed: 0,ID,year,债权融资额度,债权融资成本,股权融资额度,股权融资成本,内部融资和贸易融资额度,内部融资和贸易融资成本,项目融资和政策融资额度,项目融资和政策融资成本
count,21396.0,21168.0,21173.0,21185.0,21188.0,21183.0,21200.0,21183.0,21170.0,21167.0
mean,4332655.0,2016.000661,3243.435885,260.012805,5005.051121,200.682928,27021.9,1614.703645,1036.034955,62.366502
std,2163020.0,0.8164,8750.724756,701.04804,17489.497306,700.604837,83067.95,4972.506102,3064.546443,184.264032
min,429.0,2015.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2331607.0,2015.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5981952.0,2016.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,5990780.0,2017.0,0.0,0.0,0.0,0.0,770.4,41.364,34.7,2.133
max,5999998.0,2017.0,80410.0,6432.8,311400.0,12456.0,1257150.0,75429.0,40970.0,2458.2


#### year data fill na

In [29]:
# fill null
new_year_test_data = year_fill_na(year_test_data)

# Coverage check: compare the number of distinct IDs that have a row for
# each year against the number of distinct IDs in the base table.
print("unique ID count in base data:", base_test_data["ID"].nunique())
print("2015 unique ID count in year data:", new_year_test_data["ID"].loc[new_year_test_data.year==2015].nunique())
print("2016 unique ID count in year data:", new_year_test_data["ID"].loc[new_year_test_data.year==2016].nunique())
print("2017 unique ID count in year data:", new_year_test_data["ID"].loc[new_year_test_data.year==2017].nunique())

unique ID count in base data: 7132
2015 unique ID count in year data: 7132
2016 unique ID count in year data: 7132
2017 unique ID count in year data: 7132


#### money data fill na

In [30]:
# fill null
new_money_test_data = year_fill_na(money_test_data)

# Summary after filling: every column should now report the full row count.
new_money_test_data.describe()

Unnamed: 0,ID,year,债权融资额度,债权融资成本,股权融资额度,股权融资成本,内部融资和贸易融资额度,内部融资和贸易融资成本,项目融资和政策融资额度,项目融资和政策融资成本
count,21396.0,21396.0,21396.0,21396.0,21396.0,21396.0,21396.0,21396.0,21396.0,21396.0
mean,4332655.0,2016.0,3209.631146,257.448648,4956.394801,198.685103,26774.36,1598.629057,1025.091606,61.698997
std,2163020.0,0.816516,8711.231069,698.055589,17411.201243,697.393302,82726.65,4950.288392,3050.157292,183.387577
min,429.0,2015.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2331607.0,2015.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5981952.0,2016.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,5990780.0,2017.0,0.0,0.0,0.0,0.0,256.5,0.8505,0.0,0.0
max,5999998.0,2017.0,80410.0,6432.8,311400.0,12456.0,1257150.0,75429.0,40970.0,2458.2


In [31]:
# Coverage check: distinct IDs per year in the filled money table versus
# distinct IDs in the base table.
money_test_ids = new_money_test_data["ID"]
money_test_year = new_money_test_data.year
print("unique ID count in base data:", base_test_data["ID"].nunique())
print("2015 unique ID count in money data:", money_test_ids[money_test_year == 2015].nunique())
print("2016 unique ID count in money data:", money_test_ids[money_test_year == 2016].nunique())
print("2017 unique ID count in money data:", money_test_ids[money_test_year == 2017].nunique())

unique ID count in base data: 7132
2015 unique ID count in money data: 7132
2016 unique ID count in money data: 7132
2017 unique ID count in money data: 7132


#### Merge new year and money data

In [32]:
# Rows are paired on the (ID, year) key; see combined_new_year_money_data.
combined_new_year_money_test_data = combined_new_year_money_data(new_year_test_data, new_money_test_data)
combined_new_year_money_test_data.head()

Unnamed: 0,ID,year,从业人数,资产总额,负债总额,营业总收入,主营业务收入,利润总额,纳税总额,所有者权益合计,债权融资额度,债权融资成本,股权融资额度,股权融资成本,内部融资和贸易融资额度,内部融资和贸易融资成本,项目融资和政策融资额度,项目融资和政策融资成本
0,429,2015.0,136.0,193400.0,183730.0,502840.0,351988.0,45342.0,0.0,9670.0,19340.0,1547.2,0.0,0.0,0.0,0.0,0.0,0.0
1,727,2015.0,375.0,366240.0,536280.0,402864.0,282004.8,161145.6,0.0,-170040.0,0.0,0.0,32229.12,1289.1648,0.0,0.0,0.0,0.0
2,1137,2015.0,289.0,87200.0,40320.0,81536.0,57075.2,32614.4,0.0,-11200.0,0.0,0.0,0.0,0.0,24460.8,1467.648,0.0,0.0
3,1873,2015.0,889.0,229320.0,222950.0,137592.0,55036.8,55036.8,0.0,6370.0,0.0,0.0,11007.36,440.2944,0.0,0.0,0.0,0.0
4,2260,2015.0,689.0,225750.0,325080.0,1128750.0,564375.0,338625.0,0.0,-99330.0,0.0,0.0,0.0,0.0,0.0,0.0,11287.5,677.25


In [33]:
print("unique ID count in combined data:", combined_new_year_money_test_data["ID"].nunique())

unique ID count in combined data: 7132


### Merge split data

In [34]:
# Reshape the long (ID, year) table to one row per ID, with one block of
# columns per year suffixed _2015 / _2016 / _2017 (see split_data).
splited_year_money_test_data = split_data(combined_new_year_money_test_data)
splited_year_money_test_data.head()

Unnamed: 0_level_0,从业人数_2015,资产总额_2015,负债总额_2015,营业总收入_2015,主营业务收入_2015,利润总额_2015,纳税总额_2015,所有者权益合计_2015,债权融资额度_2015,债权融资成本_2015,...,纳税总额_2017,所有者权益合计_2017,债权融资额度_2017,债权融资成本_2017,股权融资额度_2017,股权融资成本_2017,内部融资和贸易融资额度_2017,内部融资和贸易融资成本_2017,项目融资和政策融资额度_2017,项目融资和政策融资成本_2017
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
429,136.0,193400.0,183730.0,502840.0,351988.0,45342.0,0.0,9670.0,19340.0,1547.2,...,0.0,9670.0,0.0,0.0,79371.36,3174.8544,0.0,0.0,0.0,0.0
727,375.0,366240.0,536280.0,402864.0,282004.8,161145.6,0.0,-170040.0,0.0,0.0,...,0.0,-335720.0,0.0,0.0,100454.4,4018.176,0.0,0.0,0.0,0.0
1137,289.0,87200.0,40320.0,81536.0,57075.2,32614.4,0.0,-11200.0,0.0,0.0,...,0.0,-47040.0,0.0,0.0,0.0,0.0,0.0,0.0,1008.0,60.48
1873,889.0,229320.0,222950.0,137592.0,55036.8,55036.8,0.0,6370.0,0.0,0.0,...,0.0,6370.0,0.0,0.0,0.0,0.0,12612.6,756.756,0.0,0.0
2260,689.0,225750.0,325080.0,1128750.0,564375.0,338625.0,0.0,-99330.0,0.0,0.0,...,1116.0,-18060.0,6321.0,505.68,0.0,0.0,0.0,0.0,0.0,0.0


### merge all data to test

In [35]:
# Both frames are indexed by ID, so on=['ID'] matches the index level.
# pd.merge defaults to an inner join: IDs missing from either side are
# dropped from the final test table.
test = pd.merge(combined_base_knowledge_test_data, 
                 splited_year_money_test_data, 
                 on=['ID'])

In [36]:
test.head()

Unnamed: 0_level_0,注册时间,注册资本,行业,区域,企业类型,控制人类型,控制人持股比例,专利,商标,著作权,...,纳税总额_2017,所有者权益合计_2017,债权融资额度_2017,债权融资成本_2017,股权融资额度_2017,股权融资成本_2017,内部融资和贸易融资额度_2017,内部融资和贸易融资成本_2017,项目融资和政策融资额度_2017,项目融资和政策融资成本_2017
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5991927,2010.0,8790.0,工业,湖南,股份有限公司,企业法人,0.64,0.0,0.0,0.0,...,0.0,-140640.0,0.0,0.0,0.0,0.0,341491.5,20489.49,0.0,0.0
5998351,2005.0,270.0,服务业,山东,股份有限公司,自然人,0.59,1.0,0.0,1.0,...,18662.4,-2835.0,0.0,0.0,0.0,0.0,0.0,0.0,194.4,11.664
5992703,2012.0,230.0,服务业,湖北,农民专业合作社,企业法人,0.52,1.0,1.0,1.0,...,414.0,-1840.0,0.0,0.0,41.4,1.656,0.0,0.0,0.0,0.0
5979231,2003.0,5980.0,商业服务业,广东,合伙企业,企业法人,0.92,0.0,0.0,0.0,...,1116.0,-116610.0,0.0,0.0,0.0,0.0,75348.0,4520.88,0.0,0.0
5995422,2007.0,160.0,工业,山东,有限责任公司,自然人,0.7,0.0,1.0,1.0,...,5904.0,160.0,0.0,0.0,0.0,0.0,8856.0,531.36,0.0,0.0


In [37]:
# Persist the assembled test table; the ID index is written as the first
# CSV column (to_csv writes the index by default).
test.to_csv("data/test/test.csv")