In [1]:
import numpy as np
import pandas as pd

In [2]:
# read data
battery_life = pd.read_excel('../dataset/statistics/2-1.xlsx')
# battry_life.head()
battery_life.describe()

Unnamed: 0,Hours
count,50.0
mean,1257.7
std,277.948553
min,804.0
25%,1002.5
50%,1286.0
75%,1540.25
max,1689.0


In [3]:

# data grouping by cut method of pandas
bins = range(800, 1701, 100)    
battery_life_bins = pd.cut(battery_life['Hours'], bins, right=False)
battery_life_bins = battery_life_bins.value_counts().to_frame(name='value_counts').sort_index()
battery_life_bins

Unnamed: 0,value_counts
"[800, 900)",6
"[900, 1000)",7
"[1000, 1100)",5
"[1100, 1200)",4
"[1200, 1300)",4
"[1300, 1400)",5
"[1400, 1500)",5
"[1500, 1600)",8
"[1600, 1700)",6


In [4]:
# calculate the percentage, upwards cumulative and down the cumulative
battery_life_bins['percentage'] = battery_life_bins['value_counts'] / battery_life_bins['value_counts'].sum()
battery_life_bins['cumsum_up'] = battery_life_bins['value_counts'].cumsum()
battery_life_bins['cumsum_up_perct'] = battery_life_bins['cumsum_up'] / battery_life_bins['value_counts'].sum()

In [5]:
# another method to calculate the percentage, which uses '.loc'
battery_df = pd.DataFrame(battery_life_bins['value_counts'])
battery_df.loc[:, 'percentage'] = battery_df.loc[:, 'value_counts'] / battery_df.loc[:, 'value_counts'].sum()
battery_df.loc[:, 'cumsum_up'] = battery_df.loc[:, 'value_counts'].cumsum()
battery_df.loc[:, 'cumsum_up_perct'] = battery_df.loc[:, 'value_counts'].cumsum() / battery_df.loc[:, 'value_counts'].sum()
battery_df.loc[:, 'cumsum_down'] = battery_df.loc[::-1, 'value_counts'].cumsum()
battery_df.loc[:, 'cumsum_down_penct'] = battery_df.loc[::-1, 'value_counts'].cumsum() / battery_df.loc[:, 'value_counts'].sum()
battery_df

Unnamed: 0,value_counts,percentage,cumsum_up,cumsum_up_perct,cumsum_down,cumsum_down_penct
"[800, 900)",6,0.12,6,0.12,50,1.0
"[900, 1000)",7,0.14,13,0.26,44,0.88
"[1000, 1100)",5,0.1,18,0.36,37,0.74
"[1100, 1200)",4,0.08,22,0.44,32,0.64
"[1200, 1300)",4,0.08,26,0.52,28,0.56
"[1300, 1400)",5,0.1,31,0.62,24,0.48
"[1400, 1500)",5,0.1,36,0.72,19,0.38
"[1500, 1600)",8,0.16,44,0.88,14,0.28
"[1600, 1700)",6,0.12,50,1.0,6,0.12


In [6]:
# distribution and statistics
eval_df = pd.read_excel('../dataset/statistics/2-2.xlsx')
eval_df['Eval'].value_counts()

# matplotlib 
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']    # replace sans-serif font whit SimHei
plt.rcParams['axes.unicode_minus'] = False    # solve the problem of negative axis number
plt.rcParams['savefig.dpi'] = 100    # the quality of image

In [7]:
# description of statistics, mean/median/mode...
df_score = pd.read_excel('../dataset/statistics/2-7.xlsx')
df_score

Unnamed: 0,Score
0,95
1,63
2,78
3,94
4,60
5,96
6,83
7,68
8,88
9,90


In [8]:
# mean/median/mode...
score_statis = pd.DataFrame()
score_statis['mean'] = df_score['Score'].mean()    # 均值
score_statis['median'] = df_score['Score'].median()    # 中位数
score_statis['mode'] = df_score['Score'].mode()    # 众数
score_statis['std'] = df_score['Score'].std()    # 标准差
score_statis['var'] = df_score['Score'].var()    # 方差
score_statis['kurt'] = df_score['Score'].kurt()    # 峰度
score_statis['skew'] = df_score['Score'].skew()    # 偏度
score_statis['max'] = df_score['Score'].max()    # 最大值
score_statis['min'] = df_score['Score'].min()    # 最小值
score_statis['sum'] = df_score['Score'].sum()    # 求和
score_statis['count'] = df_score['Score'].count()    # 样本量
score_statis['se'] = score_statis['std'] / (np.sqrt(score_statis['count']))    # 标准误差 
score_statis['confidence-95%'] = score_statis['se'] * 2    # 置信度(95%)，2倍的标准误差
score_statis['confidence-99.7%'] = score_statis['se'] * 3    # 置信度(99.7%)，3倍的标准误差
score_statis

Unnamed: 0,mean,median,mode,std,var,kurt,skew,max,min,sum,count,se,confidence-95%,confidence-99.7%
0,,,83,12.01085,144.260526,-1.028344,-0.50775,96,60,1621,20,2.685708,5.371416,8.057123
1,,,95,12.01085,144.260526,-1.028344,-0.50775,96,60,1621,20,2.685708,5.371416,8.057123


In [9]:
score_statis['mean_2'] = score_statis['sum'] / score_statis['count']
score_statis

Unnamed: 0,mean,median,mode,std,var,kurt,skew,max,min,sum,count,se,confidence-95%,confidence-99.7%,mean_2
0,,,83,12.01085,144.260526,-1.028344,-0.50775,96,60,1621,20,2.685708,5.371416,8.057123,81.05
1,,,95,12.01085,144.260526,-1.028344,-0.50775,96,60,1621,20,2.685708,5.371416,8.057123,81.05


In [2]:
# description
lamp_life = pd.read_excel('../dataset/statistics/2-10.xlsx')
lamp_life

Unnamed: 0,bins,x,f
0,1000以下,900,2
1,1000-1200,1100,8
2,1200-1400,1300,16
3,1400-1600,1500,35
4,1600-1800,1700,23


In [9]:
# 平均数
lamp_life_mean = (lamp_life['x'] * lamp_life['f']).sum() / lamp_life['f'].sum()

# 标准差
lamp_life_std = np.sqrt((((lamp_life['x'] - lamp_life_mean)**2)*lamp_life['f']).sum() / lamp_life['f'].sum())

# 三阶动差，为计算偏度做准备
lamp_life_three = (((lamp_life['x'] - lamp_life_mean)**3)*lamp_life['f']).sum() / lamp_life['f'].sum()

# 四阶动差，为计算峰度做准备
lamp_life_four = (((lamp_life['x'] - lamp_life_mean)**4)*lamp_life['f']).sum() / lamp_life['f'].sum()

# 偏度
lamp_life_skew = lamp_life_three / (lamp_life_std ** 3)

# 峰度
lamp_life_kurt = lamp_life_four / (lamp_life_std ** 4) - 3

print("均值:{mean}//n 标准差:{std}, 三阶动差:{three}, 四阶动差:{four}, 偏度:{skew}, 峰度:{kurt}"
      .format(mean=lamp_life_mean, std=lamp_life_std, three=lamp_life_three, four=lamp_life_four, skew=lamp_life_skew, kurt=lamp_life_kurt))

均值:1464.2857142857142//n 标准差:202.7447710222652, 三阶动差:-6121720.116618068, 四阶动差:5075925829.515478, 偏度:-0.734555277612486, 峰度:0.0041154496430850784
