# 概述
EDA及数据可视化，分析单列特征统计分布和特征之间的交互影响，绘制相关的可视化图表。

# 导入工具包

In [None]:
import os

import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

# 导入数据集

In [None]:
# Read the heart-disease CSV into the DataFrame used by the analysis cells that follow.
df = pd.read_csv(filepath_or_buffer='./data/heart.csv')

# 特征两两相关性分析

In [None]:
# Pairwise correlation matrix of the columns; Pearson is pandas' default method,
# spelled out here for clarity.
df.corr(method="pearson")

In [None]:
# Correlation heatmap on a 10x10-inch figure: square cells, each annotated
# with its coefficient rounded to two decimals.
plt.figure(figsize=(10, 10))
sns.heatmap(
    data=df.corr(),
    annot=True,
    fmt='.2f',
    square=True,
)
plt.show()

In [None]:
# IPython-only help query: the trailing `?` displays the signature and
# docstring of sns.heatmap. This is not valid plain-Python syntax.
sns.heatmap?

In [None]:
# Larger correlation heatmap: colormap centred on 0 with the upper colour limit
# at 0.3, thin lines between cells, and a colour bar shrunk to half size.
plt.figure(figsize=(12, 12))
sns.heatmap(
    data=df.corr(),
    vmax=0.3,
    center=0,
    annot=True,
    fmt='.2f',
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.5},
)
plt.tight_layout()
plt.show()

In [None]:
# Grid of pairwise relationships between the DataFrame's columns, with each
# column's own distribution on the diagonal.
sns.pairplot(data=df)
plt.show()

# 单个特征统计分布分析

In [None]:
# Distribution of the `age` column: density-normalised histogram with a KDE curve.
# `sns.distplot` is deprecated since seaborn 0.11 and scheduled for removal;
# `histplot` with these arguments is the replacement recommended by seaborn
# (`cut=3` extends the KDE past the data range the way distplot did).
sns.histplot(df["age"], kde=True, stat="density", kde_kws={"cut": 3})
plt.show()

In [None]:
# Largest value in the `age` column, selected with bracket-style column access.
df["age"].max()

In [None]:
# Largest value in the `age` column, selected with attribute-style column
# access (`df.age` refers to the same column as `df["age"]`).
df.age.max()

In [None]:
# Distinct values present in the `age` column, in order of first appearance.
df["age"].unique()

In [None]:
# Number of rows for each value of the `target` label, most frequent first.
df["target"].value_counts()

In [None]:
# Bar chart of the row count for each `target` value, coloured with the
# "bwr" palette.
sns.countplot(
    data=df,
    x="target",
    palette="bwr",
)
plt.show()

In [None]:
# Bar chart of the row count for each `sex` value; the x-axis label spells
# out the numeric coding used by the plot.
sns.countplot(
    data=df,
    x="sex",
    palette="mako_r",
)
plt.xlabel("Sex (0 = female, 1 = male)")
plt.show()

# 单列特征与标签的关系

## 不同年龄段，患心脏病和不患心脏病患者的分布

In [None]:
# Grouped bar chart: for every age, the number of rows in each `target` class.
pd.crosstab(df.age, df.target).plot(kind="bar", figsize=(20, 6))
plt.title("Heart Disease Frequency for Ages")
plt.xlabel("Age")
plt.ylabel("Frequency")
# savefig does not create missing parent directories, so make sure ./temp
# exists before writing the image (no-op when it is already there).
os.makedirs("./temp", exist_ok=True)
plt.savefig("./temp/heartDiseaseAndAges.png")
plt.show()

## 箱型图和小提琴图

In [None]:
# Box plot of `age` grouped by `target`.
# Attribute-style access works the same way: sns.boxplot(x=df.target, y=df.age)
sns.boxplot(x=df["target"], y=df["age"])
# savefig does not create missing parent directories, so make sure ./temp
# exists before writing the image (no-op when it is already there).
os.makedirs("./temp", exist_ok=True)
plt.savefig("./temp/heartDiseaseAndAges_box.png")
plt.show()

In [None]:
# Violin plot of `age` grouped by `target`.
sns.violinplot(x=df.target, y=df.age)
# savefig does not create missing parent directories, so make sure ./temp
# exists before writing the image (no-op when it is already there).
os.makedirs("./temp", exist_ok=True)
plt.savefig("./temp/heartDiseaseAndAges_violin.png")
plt.show()

## 不同性别，患心脏病和不患心脏病患者的分布

In [None]:
# Grouped bar chart: for each `sex` value, the number of rows in each
# `target` class, drawn in green and red.
pd.crosstab(df.sex, df.target).plot.bar(
    figsize=(15, 6),
    color=["#1CA53B", "#AA1111"],
)
# Titles, axis labels and tick/legend text; these calls are independent of
# one another and all act on the axes created above.
plt.title("Heart Disease Frequency for Sex")
plt.xlabel("Sex (0 = female, 1 = male)")
plt.ylabel("Frequency")
plt.xticks(rotation=0)
plt.legend(["Haven't Disease", "Have Disease"])
plt.show()