## 使用python的plotly绘制列线图  
由于算法是自行推演，不好说完全正确，大家自行判断。  
**看点：**  
1. Python没有专业绘制列线图的包，一些研究和尝试，大家可以在此基础上进行修改和完善；  
2. 对于绘制列线图背后的算法的解析，不一定完全正确，大家自行判断；  
3. plotly绘制列线图的方法，比较原始的表现形式，比较容易理解。  


![Image Name](https://cdn.kesci.com/upload/rshdhvxinj.png?imageView2/0/w/960/h/960)  





### 1.读取数据

In [52]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import statsmodels.api as sm
import statsmodels.formula.api as smf
import pandas as pd

In [53]:
#读取文件，用户操作
data=pd.read_csv('data_dev_factor_cleaned_remove_space.csv')
#指定分类变量
data['sex']=data['sex'].astype('category')
data['ejection']=pd.Categorical(data['ejection'],categories=['Poor','Fair','Good'],ordered=True)
data=data.loc[:,['sex','ejection','age','bmi','outcome']]
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3328 entries, 0 to 3327
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   sex       3328 non-null   category
 1   ejection  3328 non-null   category
 2   age       3328 non-null   int64   
 3   bmi       3328 non-null   float64 
 4   outcome   3328 non-null   object  
dtypes: category(2), float64(1), int64(1), object(1)
memory usage: 84.9+ KB


In [54]:
#一个多分类的数据，是列线图中的一类典型的情况
data['ejection'].value_counts()

ejection
Good    2130
Fair     956
Poor     242
Name: count, dtype: int64

In [55]:
data_m=pd.get_dummies(data,columns=['sex','ejection'],drop_first=True,dtype=int)
data_m.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3328 entries, 0 to 3327
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   age            3328 non-null   int64  
 1   bmi            3328 non-null   float64
 2   outcome        3328 non-null   object 
 3   sex_Male       3328 non-null   int32  
 4   ejection_Fair  3328 non-null   int32  
 5   ejection_Good  3328 non-null   int32  
dtypes: float64(1), int32(3), int64(1), object(1)
memory usage: 117.1+ KB


In [56]:
le=LabelEncoder()
data_m['outcome']=le.fit_transform(data_m['outcome'])

In [57]:
data_m.columns

Index(['age', 'bmi', 'outcome', 'sex_Male', 'ejection_Fair', 'ejection_Good'], dtype='object')

### 2. 构建线性模型，列线图需要的是模型的参数，beta。

In [58]:
#构建线性模型，列线图需要的是模型的参数。
#再此之前需要进行incode 
model_logit=smf.logit('outcome~ age +bmi+sex_Male+ejection_Good+ejection_Fair',data_m).fit()

Optimization terminated successfully.
         Current function value: 0.238456
         Iterations 7


In [59]:
model_logit_params=model_logit.params
model_logit_params

Intercept       -2.661101
age              0.032976
bmi             -0.039664
sex_Male        -0.193388
ejection_Good   -1.297959
ejection_Fair   -0.936878
dtype: float64

In [60]:
model_logit_params.values

array([-2.66110094,  0.03297645, -0.03966392, -0.19338821, -1.29795908,
       -0.93687808])

In [61]:
model_logit_params.index.to_list()

['Intercept', 'age', 'bmi', 'sex_Male', 'ejection_Good', 'ejection_Fair']

### 3.构建绘制列线图所需要的数据  
- 绘制线条的数据可能和标签数据并不是同一组数据，只是两者之间有对应关系，这在列线图中是常见的一个操作；  
- 连续变量，二分类和多分类具有不同的处理方式；  
- 下一步可能是需要这部分更加自动化一些。设计：包可能包括2～3函数，一个函数计算所需要的数据，另一个函数进行绘图，如果再有就增加一些标记之类的。

In [63]:
import numpy as np
cols=model_logit_params.index.to_list()
params=model_logit_params.values
#原始数据，用于做数字标签
meta_df=data_m.loc[:,cols[1:]]
meta_df['Intercept']=np.repeat(1,meta_df.shape[0])

#meta数据1，beta与X的乘积,现在没有连续变量和分类变量的区别
for col, beta in zip(cols,params):
    meta_df[col]= [x* beta for x in meta_df[col]]

#合计数据，用于计算预测概率
meta_df['total']=meta_df.sum(axis=1)
meta_df['probability']=[1/(1+np.exp(-z)) for z in meta_df['total']]
meta_df.info()
#之前纠结的一个问题是列线图不考虑intercept？现在倾向于认为最终计算概率的时候是考虑的，但是计算总分的时候没有考虑。

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3328 entries, 0 to 3327
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   age            3328 non-null   float64
 1   bmi            3328 non-null   float64
 2   sex_Male       3328 non-null   float64
 3   ejection_Good  3328 non-null   float64
 4   ejection_Fair  3328 non-null   float64
 5   Intercept      3328 non-null   float64
 6   total          3328 non-null   float64
 7   probability    3328 non-null   float64
dtypes: float64(8)
memory usage: 208.1 KB


In [64]:
meta_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3328 entries, 0 to 3327
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   age            3328 non-null   float64
 1   bmi            3328 non-null   float64
 2   sex_Male       3328 non-null   float64
 3   ejection_Good  3328 non-null   float64
 4   ejection_Fair  3328 non-null   float64
 5   Intercept      3328 non-null   float64
 6   total          3328 non-null   float64
 7   probability    3328 non-null   float64
dtypes: float64(8)
memory usage: 208.1 KB


In [65]:
#将数据转化成绘制列线图需要的数据，上面一部分是后台计算所需要的数据
#求最大数据,即每个变量最大值和最小值之间distances中最大的一个， 这个最大的distance会处理为100， 其它的distance根据比例绘制
ls_max_distance=[]
for col in cols:
    one_distance=np.max(meta_df[col].values)-np.min(meta_df[col].values)
    print(col+':'+f"{one_distance}")
    ls_max_distance.append(one_distance)
    
max_distance=np.max(ls_max_distance)

max_distance

Intercept:0.0
age:2.4402575909399884
bmi:1.8714487519021634
sex_Male:0.19338821024701608
ejection_Good:1.2979590821153464
ejection_Fair:0.9368780777646967


2.4402575909399884

In [66]:
score_df=meta_df.copy()[cols]
for col in cols:
    score_df[col]=(meta_df[col]-meta_df[col].min())*100/max_distance
score_df['total']=score_df.sum(axis=1)
score_df['probability']=meta_df['probability']

In [67]:
score_df.head()# score + original data is visualization data

Unnamed: 0,Intercept,age,bmi,sex_Male,ejection_Good,ejection_Fair,total,probability
0,0.0,63.513514,34.002194,0.0,0.0,38.392589,135.908297,0.038372
1,0.0,71.621622,30.813236,7.92491,53.189429,0.0,163.549196,0.072642
2,0.0,78.378378,52.511606,7.92491,0.0,38.392589,177.207484,0.098545
3,0.0,79.72973,47.019067,0.0,0.0,38.392589,165.141386,0.075303
4,0.0,72.972973,38.902235,0.0,0.0,38.392589,150.267797,0.053611


In [68]:
score_df['age'].max()

100.0

In [78]:
score_df_plus=score_df.copy()

In [79]:
plus_cols=['ejection_Good','ejection_Fair']
value_name='ejection'
score_df_plus[value_name]=score_df_plus[plus_cols].sum(axis=1)
#将df['ejcection']中最大值替换为0
score_df_plus[value_name]=[0 if x==score_df_plus[value_name].max() else x for x in score_df_plus[value_name]]
score_df_plus=score_df_plus.drop(plus_cols,axis=1)
score_df_plus.info()
score_df_plus.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3328 entries, 0 to 3327
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Intercept    3328 non-null   float64
 1   age          3328 non-null   float64
 2   bmi          3328 non-null   float64
 3   sex_Male     3328 non-null   float64
 4   total        3328 non-null   float64
 5   probability  3328 non-null   float64
 6   ejection     3328 non-null   float64
dtypes: float64(7)
memory usage: 182.1 KB


Unnamed: 0,Intercept,age,bmi,sex_Male,total,probability,ejection
0,0.0,63.513514,34.002194,0.0,135.908297,0.038372,38.392589
1,0.0,71.621622,30.813236,7.92491,163.549196,0.072642,53.189429
2,0.0,78.378378,52.511606,7.92491,177.207484,0.098545,38.392589
3,0.0,79.72973,47.019067,0.0,165.141386,0.075303,38.392589
4,0.0,72.972973,38.902235,0.0,150.267797,0.053611,38.392589


In [None]:
#连续变量获得step
#获取原始值的distance，使用score的distance除以实际值的distance,获得刻度线的宽度（step），保证最大和最小值都在线条上，分类变量不需要
#用于绘制刻度间隔step，也就是单位变量（比如age）所代表的分数,即每一岁代表的分数
step_age=score_df_plus['age'].max()/(data_m['age'].max()-data_m['age'].min())
step_bmi=score_df_plus['bmi'].max()/(data_m['bmi'].max()-data_m['bmi'].min())

In [None]:
#分类变量不需要step，需要的是几个点
np.unique(score_df_plus['ejection'])

array([ 0.        , 38.39258942, 53.18942914])

In [134]:
score_df_plus['ejection'].value_counts()



ejection
38.392589    2130
53.189429     956
0.000000      242
Name: count, dtype: int64

In [135]:
score_df_plus.to_csv('score_df_plus.csv')

### 4.绘制列线图
绘图需要两个数据集，1. 原始数据集，这里是data_m,包含数据的原始值，作为标签；2. 评分数据集，这里是score_df,包含计算的得分。


还有第三个数据集，中间数据集meta_df,是系数与变量值的乘积，是计算score_df的中间值，但是包含预测概率，max_distance(是一个常数，要从数据集中获得)，每个变量的meta_df最小值要保留。



In [None]:
import plotly.graph_objects as go

In [None]:
#fig整体布局，三行两列，
# title='Explaination of Linear Model'
fig = go.Figure().set_subplots(rows=3,cols=2,vertical_spacing=0.01,
    horizontal_spacing=0.01,column_widths=[0.1,0.9],
    row_heights=[0.1,0.5,0.2])#
fig.update_yaxes(
    autorange=False,
    visible=False
    )#禁止自动调整刻度，通用
fig.update_xaxes(visible=False)#禁止自动调整刻度，通用
fig.update_yaxes(row=2,range=[-2,4])
fig.update_yaxes(row=3,range=[0,4])
fig.update_yaxes(row=1,range=[2,4])
fig.update_xaxes(row=1,col=2,range=[-5,105])
fig.update_xaxes(row=2,col=2,range=[-5,105])
fig.update_xaxes(row=3,col=2,range=[-5,max(score_df_plus['total'])*1.2])

fig.update_layout(width=1000,height=600,showlegend=False,paper_bgcolor="#ffffff",plot_bgcolor="#ffffff")
#---------------------------------------绘制100标尺--------------------------------------------------#
#python arrange的规则是“前包后不包”，所以要多一位数
fig.add_trace(go.Scatter(mode='lines+markers',y=np.repeat(3,105/5),
                         x=np.arange(0,105,5),
                         marker={'symbol':'142',"color":'blue','size':15},  
              ), row=1,col=2)

#绘制次要刻度
fig.add_trace(go.Scatter(mode='lines+markers',
                         x=np.arange(0,101),
                         y=np.repeat(3,101),#离开x轴的位置
                         marker={'symbol':'142','color':'blue'},
                         ),row=1,col=2)
#绘制数据数字标签，原始数据
fig.add_trace(go.Scatter(mode='text',
                         x=np.arange(0,105,5),
                         y=np.repeat(3.5,105/5),
                         text=np.arange(0,105,5),
                        ),row=1,col=2)
#绘制左侧label,col=1
fig.add_trace(go.Scatter(mode='text',x=[-3],y=[3],text='Points'),row=1,col=1)

#------------------------------------------绘制age---------------------------------------------#

x_age_range = np.arange(0, max(score_df_plus['age']) + 2*step_age, 2*step_age)

# 主刻度
fig.add_trace(go.Scatter(
    mode='lines+markers',
    y=np.repeat(3, len(x_age_range)),
    x=x_age_range,
    marker={'symbol': '142', "color": 'red'}
), row=2, col=2)

# 绘制数据数字标签，原始数据
fig.add_trace(go.Scatter(
    mode='text',
    x=x_age_range,
    y=np.repeat(3.5, len(x_age_range)),
    text=[int(x) for x in np.arange(np.floor(min(data_m['age'])),np.floor(max(data_m['age'])+2),2)][::-1],#1step_age 对应1 岁
), row=2, col=2)

# 绘制左侧label，col=1
fig.add_trace(go.Scatter(
    mode='text',
    x=[-3],
    y=[3],
    text='Age'
), row=2, col=1)

#----------------------------绘制ejection_std------------------------#
x_ejection_range=np.unique(score_df_plus['ejection'])
fig.add_trace(go.Scatter(mode='lines+markers',y=np.repeat(1,len(x_ejection_range)),
                         x=x_ejection_range,
                         marker={'symbol':'142',"color":'red'},  
              ), row=2,col=2)

#绘制数据数字标签，这应该是原始数据才对
fig.add_trace(go.Scatter(mode='text',
                         x=x_ejection_range,#标尺数据
                         y=np.repeat(1.5,len(x_ejection_range)),
                         text=['poor','fair','good'],#原始数据
                        ),row=2,col=2)
#绘制左侧label,col=1
fig.add_trace(go.Scatter(mode='text',x=[-3],y=[1],text='Ejection'),row=2,col=1)

# #-----------------------------------sex--------------------------------------#
x_sex_range=np.unique(score_df_plus['sex_Male'])
fig.add_trace(go.Scatter(mode='lines+markers',y=np.repeat(-1,len(x_sex_range)),
                         x=x_sex_range,
                         marker={'symbol':'142',"color":'red'},  
              ), row=2,col=2)

#绘制数据数字标签，这应该是原始数据才对
fig.add_trace(go.Scatter(mode='text',
                         x=x_sex_range,
                         y=np.repeat(-0.5,len(x_sex_range)),
                         text=['femal','male']
                        ),row=2,col=2)
#绘制左侧label,col=1
fig.add_trace(go.Scatter(mode='text',x=[-3],y=[-1],text='Sex'),row=2,col=1)

# #----------------------------------total score------------------------------------------#
#总分与各个变量得分没有对应关系，是和概率之间有对应关系，所以可以是独立的坐标系
x_total_range=np.arange(score_df_plus['total'].min()-50,max(score_df_plus['total'])+50,20)
fig.add_trace(go.Scatter(mode='lines+markers',y=np.repeat(3,len(x_total_range)),
                         x=x_total_range,#
                         marker={'symbol':'142',"color":'green','size':15},  
              ), row=3,col=2)


#绘制数据数字标签，这应该是原始数据才对
fig.add_trace(go.Scatter(mode='text',
                         x=x_total_range,#标尺数据
                         y=np.repeat(3.5,len(x_total_range)),#s
                         text=x_total_range.round(0)#原始数据
                        ),row=3,col=2)
#绘制左侧label,col=1
fig.add_trace(go.Scatter(mode='text',x=[-3],y=[3],text='Total'),row=3,col=1)


# #----------------------------------------prob-用total划线而标记probality----------------------------------------#
#给定一个概率的列表
prob_range=[0,0.1,0.2,0.4,0.5,0.6,0.8,0.85,0.9]

x_proba_range=[]#取toal_betaX_std的值
proba_text_label=[]
for x in prob_range:
    #＜于0.1的，且距离0.1最近的值
    proba_closest=max(score_df_plus[score_df_plus['probability']<=x]['probability'],default=None)
    if proba_closest is not None: 
        label=round(proba_closest,2)
        value=score_df_plus[score_df_plus['probability']==proba_closest]['total']
        x_proba_range.append(value)
        proba_text_label.append(label)
    
x_proba_range=np.unique(x_proba_range)
print(x_proba_range)
proba_text_label=np.unique(proba_text_label)
fig.add_trace(go.Scatter(mode='lines+markers',y=np.repeat(1,len(x_proba_range)),
                         x=x_proba_range,
                         marker={'symbol':'142',"color":'green'},  
              ), row=3,col=2)

#绘制数据数字标签，这应该是原始数据才对
fig.add_trace(go.Scatter(mode='text',
                         x=x_proba_range,#标尺数据
                         y=np.repeat(1.5,len(x_proba_range)),
                         text=proba_text_label#原始数据
                        ),row=3,col=2)
#绘制左侧label,col=1
fig.add_trace(go.Scatter(mode='text',x=[-3],y=[1],text='prob'),row=3,col=1)


[177.83094751 211.03659832 238.0834983  254.81663423]
