数据描述

作者：谢文伟

邮件：jim.xie.cn@outlook.com

主页：https://github.com/jim-xie-cn/ai-cv

In [None]:
#引用以下包做数据处理
import numpy as np
import pandas as pd
from sklearn import preprocessing
from scipy.stats import norm,skewnorm
#引用以下包做数据可视化
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import filterwarnings

In [None]:
# Global notebook configuration: warning policy, number formatting and plot defaults.
filterwarnings('ignore')  # silence all warnings for the rest of the session
pd.set_option('display.float_format',lambda x:'%.2f'%x)  # show floats with 2 decimals instead of scientific notation
sns.set(font_scale=1.5)  # enlarge fonts on seaborn/matplotlib charts
# NOTE: the font settings must come after sns.set(), which resets the rcParams.
plt.rcParams['font.sans-serif']=['SimHei']  # font with CJK glyphs so Chinese labels render
# CJK fonts such as SimHei typically lack the Unicode minus sign (U+2212); fall back
# to the ASCII hyphen so negative tick labels do not render as empty boxes.
plt.rcParams['axes.unicode_minus']=False
plt.style.use({'figure.figsize':(24, 8)})  # default canvas size

# 偏度

In [None]:
# Draw three skew-normal densities side by side: left-skewed, symmetric, right-skewed.
plt.style.use({'figure.figsize':(18, 3)})
fig,axes=plt.subplots(1,3)
# (skew-normal shape parameter, x-axis caption) for each panel, left to right.
panels = [(-3, "左偏"), (0, "正态分布"), (3, "右偏")]
for ax, (a, caption) in zip(axes, panels):
    # Sample the curve between the 1st and 99th percentiles of the distribution.
    x = np.linspace(skewnorm.ppf(0.01, a), skewnorm.ppf(0.99, a), 50)
    y = skewnorm.pdf(x, a)
    ax.plot(x, y)
    ax.fill(x, y, 'b', alpha=0.5)
    # `a` is only the shape parameter, not the skewness itself; report the
    # distribution's actual skewness in the title.
    skewness = float(skewnorm.stats(a, moments='s'))
    ax.set_title("偏度:%.4f" % skewness)
    ax.set_xlabel(caption)

# 峰度

In [None]:
# Draw three zero-mean normal densities with different spreads side by side
# (flat on the left, standard in the middle, sharply peaked on the right).
plt.style.use({'figure.figsize':(18, 3)})
fig,axes=plt.subplots(1,3)
x = np.arange(-10, 10, 0.1)
df1 = pd.DataFrame()
df1['x'] = pd.Series(x)
# (column name, standard deviation, axes index, x-axis caption), kept in the
# original computation order so the DataFrame columns stay x/more/norm/less.
curves = [
    ('more', 0.5, 2, "高尖"),
    ('norm', 1, 1, "近似正态分布"),
    ('less', 1.8, 0, "矮胖"),
]
for column, sigma, position, caption in curves:
    arr = norm.pdf(x, 0, sigma)
    df1[column] = pd.Series(arr)
    ax = axes[position]
    ax.plot(x, df1[column])
    ax.fill(x, df1[column], 'b', alpha=0.5)
    # NOTE: this is the sample kurtosis of the density values evaluated on the
    # grid `x` (how peaked the plotted curve is), not the kurtosis of the
    # underlying normal distribution.
    ax.set_title("峰度:%.4f" % df1[column].kurt())
    ax.set_xlabel(caption)

# 使用Pandas生成常用数据描述

In [None]:
# Generate a random sample from a normal distribution (mean 0, std 0.1, 10000 points).
X = np.random.normal(0,0.1,10000)
ds = pd.Series(X)
# Common summary statistics: count, mean, std, min, quartiles, max.
base_stat = ds.describe()
for item,values in base_stat.items():
    print(item,":",values)
print('Skewness',":",ds.skew())  # skewness
print('Kurtosis',":",ds.kurt())  # kurtosis
print('IQR',":",(base_stat['75%']-base_stat['25%']))  # interquartile range
# Coefficient of variation (std / mean), reusing the values describe() computed.
# NOTE: the sample mean here is close to 0, so this ratio is numerically
# unstable and mainly demonstrates the formula.
print('CV',":",base_stat['std']/base_stat['mean'])
print('Variance',":",ds.var())  # sample variance (same ddof=1 as std)
# Show the data distribution.
print('Distribution',":","正态分布")
# NOTE(review): distplot is deprecated in recent seaborn releases in favour of
# histplot/displot; kept as-is because the installed seaborn version is unknown.
sns.distplot(X,bins=50)