# 1. Data description
This dataset is from the [Hejing Community](https://www.kesci.com/mw/dataset/5e023cd12823a10036af49b4/file). It contains 6,272 records and 21 fields, and the file is about 2.3 MB in size.

In [1]:
import pandas as pd
# Location of the source CSV. This is an absolute path on the author's
# machine; adjust it before running the notebook elsewhere.
csv_path = '/Users/will/Documents/LandislandGithub/DataAnalysis/BrokeCompany/com.csv'
data = pd.read_csv(csv_path)

In [2]:
# Summary statistics for the numeric columns. The recorded output shows that
# live_days contains negative values (min -4616), total_money is populated for
# only 805 of the 6272 rows, and the "Unnamed: 16" .. "Unnamed: 20" columns
# hold no values at all.
data.describe()

Unnamed: 0,bianh,live_days,total_money,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20
count,6272.0,6272.0,805.0,0.0,0.0,0.0,0.0,0.0
mean,314.100128,1304.558036,13048.52,,,,,
std,181.071471,646.994339,194922.5,,,,,
min,1.0,-4616.0,0.0,,,,,
25%,157.0,855.0,300.0,,,,,
50%,314.0,1121.0,650.0,,,,,
75%,471.0,1674.0,3300.0,,,,,
max,628.0,8622.0,5510000.0,,,,,


# 2. Data Visualization & Analysis

## 2.1 Broke companies location distribution

In [40]:
from pyecharts import options as opts
from pyecharts.charts import Map

# Trim surrounding whitespace so that address values differing only by stray
# spaces fall into the same group. The vectorised .str accessor leaves missing
# values as NaN, whereas calling x.strip() on a float NaN raises AttributeError.
data['com_addr'] = data['com_addr'].str.strip()
# Number of records per address value (groupby skips NaN keys by default).
s = data.groupby('com_addr').size()

c = (
    Map()
    .add("Number of broke companies", [*s.items()], "china")
    .set_global_opts(
        title_opts=opts.TitleOpts(title="Location distribution"),
        # Upper bound of the colour scale is fixed at 200.
        visualmap_opts=opts.VisualMapOpts(max_=200),
    )
)
c.render_notebook()

## 2.2 Top 10 industries with most broke companies

In [45]:
from pyecharts import options as opts
from pyecharts.charts import Bar
from pyecharts.faker import Faker

# Count records per industry ('cat') and keep the ten largest groups as an
# ordered {industry: count} mapping (largest first).
per_industry = data.groupby('cat').size()
s = per_industry.sort_values(ascending=False)[:10].to_dict()

industries = list(s)
totals = list(s.values())

# Build the bar chart step by step; each builder call updates the chart in place.
c = Bar()
c.add_xaxis(industries)
c.add_yaxis("Number of broke companies", totals)
c.set_global_opts(
    title_opts=opts.TitleOpts(title="Top 10 industries with most broke companies"),
    legend_opts=opts.LegendOpts(pos_left='70%'),
)
c.render_notebook()

## 2.3 Top 20 in subdivision

In [16]:
# Pick the twenty most frequent sub-categories ('se_cat'), then re-sort them
# ascending -- presumably so the largest bar ends up at the top once the axes
# are swapped by reversal_axis().
per_subcategory = data.groupby('se_cat').size()
top_twenty = per_subcategory.sort_values(ascending=False)[:20]
s = top_twenty.sort_values(ascending=True).to_dict()

subcategories = list(s)
totals = list(s.values())

c = Bar()
c.add_xaxis(subcategories)
c.add_yaxis("Number of broke companies", totals)
c.reversal_axis()  # horizontal bars
c.set_series_opts(label_opts=opts.LabelOpts(position="right"))
c.set_global_opts(title_opts=opts.TitleOpts(title="Top 20 in subdivision"))
c.render_notebook()

## 2.4 Founding and closure years

In [46]:
# Year = first four characters of the date strings (assumes the dates start
# with a 4-digit year). The .str accessor propagates missing dates as NaN,
# whereas slicing a float NaN with x[:4] raises TypeError.
data['born_year'] = data['born_data'].str[:4]
data['death_year'] = data['death_data'].str[:4]

# Per-year record counts, as two frames that share a 'year' key column.
s1 = data.groupby('born_year').size()
s2 = data.groupby('death_year').size()
s1 = pd.DataFrame({'year': s1.index, 'born': s1.values})
s2 = pd.DataFrame({'year': s2.index, 'death': s2.values})

# Outer join: keep years that occur on only one side (for example a year with
# foundings but no closures). An inner join drops such years from the chart
# entirely. No suffixes are needed because 'year', the join key, is the only
# column the two frames have in common.
s = pd.merge(s1, s2, on='year', how='outer')
# A year missing on one side has a count of zero there; restore integer counts
# after the NaN-induced float conversion.
s = s.fillna({'born': 0, 'death': 0})
s[['born', 'death']] = s[['born', 'death']].astype(int)
# Years are compared as strings; for 4-digit years this matches numeric order.
# Sort explicitly so the x axis runs chronologically.
s = s[s['year'] > '2008'].sort_values('year')

c = (
    Bar()
    .add_xaxis(s['year'].to_list())
    .add_yaxis("Number of born companies", s['born'].to_list())
    .add_yaxis("Number of broke companies", s['death'].to_list())
    .set_global_opts(title_opts=opts.TitleOpts(title="Year of broke distribution"))
)
c.render_notebook()

## 2.5 Length of life of companies

In [62]:
def live_year(x):
    """Map a lifetime in days to a coarse age-bucket label.

    A year is counted as 365 days and every bucket's upper bound is exclusive.
    Any value that is not below one of the bounds (ten years or more) falls
    through to the last label.
    """
    # (exclusive upper bound in years, label), checked in ascending order.
    buckets = (
        (1, 'less than 1 year'),
        (2, '1-2 year'),
        (3, '2-3 year'),
        (4, '3-4 year'),
        (5, '4-5 year'),
        (10, '5-10 year'),
    )
    for years, label in buckets:
        if x < 365 * years:
            return label
    return 'more than 10 year'

# Label every record with its lifetime bucket, then count records per label.
lifetime_bucket = data['live_days'].apply(live_year)
s = data.groupby(lifetime_bucket).size()

from pyecharts import options as opts
from pyecharts.charts import Pie

slices = [*s.items()]

c = Pie()
# Pie centre is shifted down to 60% of the chart height.
c.add("", slices, center=["50%", "60%"])
c.set_global_opts(title_opts=opts.TitleOpts(title="Length of life of companies"))
# Slice labels read "<name>: <value>".
# NOTE(review): legend_opts is usually passed to set_global_opts; passing it to
# set_series_opts may not move the legend at all -- confirm the intended effect.
c.set_series_opts(
    label_opts=opts.LabelOpts(formatter="{b}: {c}"),
    legend_opts=opts.LegendOpts(pos_top="30%"),
)

c.render_notebook()

## 2.6 Investor word cloud

In [33]:
from pyecharts import options as opts
from pyecharts.charts import WordCloud
from pyecharts.globals import SymbolType

# Tally how often each investor name occurs. An invest_name cell holds
# '&'-separated names; cells with no value are skipped.
tally = {}
for cell in data['invest_name'].values:
    if pd.isnull(cell):
        continue
    for investor in cell.split('&'):
        if investor in tally:
            tally[investor] += 1
        else:
            tally[investor] = 1

# (name, count) pairs, most frequent first. sorted() is stable, so names with
# equal counts keep their first-seen order.
invest = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)

# Only the 150 most frequent names are drawn.
c = WordCloud()
c.add("", invest[:150], word_size_range=[20, 100], shape=SymbolType.DIAMOND)
c.set_global_opts(title_opts=opts.TitleOpts(title="Investor word cloud"))
c.render_notebook()

## 2.7 Reasons for closure word cloud

In [34]:
# Count how often each whitespace-separated keyword occurs in death_reason;
# cells with no value are skipped.
death_reason = {}
for row in data['death_reason'].values:
    if pd.isnull(row):
        continue
    # split() with no argument splits on runs of whitespace and never yields
    # empty pieces. split(' ') would emit '' for leading, trailing or doubled
    # spaces, and that empty string would be counted and drawn as a word.
    for name in row.split():
        death_reason[name] = death_reason.get(name, 0) + 1
c = (
    WordCloud()
    .add("", [*death_reason.items()], word_size_range=[20, 100], shape=SymbolType.DIAMOND)
    .set_global_opts(title_opts=opts.TitleOpts(title="Reason of broke word cloud"))
)
c.render_notebook()