# 1次元データの整理

## データの中心の指標

In [1]:
import numpy as np
import pandas as pd

# Limit floating-point output in the Jupyter Notebook to 3 decimal places
%precision 3
# Limit DataFrame display output to 3 decimal places.
# The fully qualified option name is used on purpose: the bare 'precision'
# shorthand is resolved by pattern matching, and in newer pandas it also
# matches 'styler.format.precision', which makes set_option raise OptionError.
pd.set_option('display.precision', 3)

In [2]:
# Load the score table, using the '生徒番号' (student number) column as the row index
csv_path = '../data/ch2_scores_em.csv'
df = pd.read_csv(csv_path, index_col='生徒番号')
# Preview the first 5 rows of df
df.head()

Unnamed: 0_level_0,英語,数学
生徒番号,Unnamed: 1_level_1,Unnamed: 2_level_1
1,42,65
2,69,80
3,56,63
4,41,63
5,57,76


In [3]:
# English ('英語') scores of the first 10 rows, as a NumPy array
scores = np.array(df['英語'].iloc[:10])
scores

array([42, 69, 56, 41, 57, 48, 65, 49, 65, 58])

In [4]:
# Wrap the 10 scores in a DataFrame whose rows are labelled A-J ('生徒' = student)
student_labels = pd.Index(
    ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'],
    name='生徒',
)
scores_df = pd.DataFrame({'点数': scores}, index=student_labels)
scores_df

Unnamed: 0_level_0,点数
生徒,Unnamed: 1_level_1
A,42
B,69
C,56
D,41
E,57
F,48
G,65
H,49
I,65
J,58


In [5]:
# Mean computed by hand: total of the scores divided by the number of scores
scores.sum() / len(scores)

55.000

In [6]:
# The same mean, computed with NumPy's built-in function
np.mean(scores)

55.000

In [7]:
# DataFrame.mean() returns the mean of each column as a Series
scores_df.mean()

点数    55.0
dtype: float64

### 中央値

In [8]:
# Ascending-sorted copy of the scores; `scores` itself is left untouched
sorted_scores = scores.copy()
sorted_scores.sort()
sorted_scores

array([41, 42, 48, 49, 56, 57, 58, 65, 65, 69])

In [9]:
# Median computed by hand from the sorted scores
n = len(sorted_scores)
if n % 2 == 1:
    # Odd count: the single middle element (0-based index n // 2)
    median = sorted_scores[n // 2]
else:
    # Even count: average of the two elements straddling the middle
    m0, m1 = sorted_scores[n // 2 - 1], sorted_scores[n // 2]
    median = (m0 + m1) / 2
median

56.500

In [10]:
# NumPy's median gives the same value as the manual calculation (56.5)
np.median(scores)

56.500

In [11]:
# DataFrame.median() returns the median of each column as a Series
scores_df.median()

点数    56.5
dtype: float64

In [12]:
# Show the first 10 rows of the full table; `scores` was built from the
# first 10 values of its '英語' (English) column
df.head(10)

Unnamed: 0_level_0,英語,数学
生徒番号,Unnamed: 1_level_1,Unnamed: 2_level_1
1,42,65
2,69,80
3,56,63
4,41,63
5,57,76
6,48,60
7,65,81
8,49,66
9,65,78
10,58,82


### 最頻値

In [13]:
# Series.mode() returns the most frequent value(s) as a Series;
# here 1 occurs three times, more often than any other value
pd.Series([1, 1, 1, 2, 2, 3]).mode()

0    1
dtype: int64

In [14]:
# Every value occurs exactly once, so there is no single most frequent value.
# NOTE(review): the recorded output is an empty Series, which looks like older
# pandas behavior; current pandas is expected to return every value instead.
# Confirm against the pandas version in use.
pd.Series([1, 2, 3, 4, 5]).mode()

Series([], dtype: int64)

## データのばらつきの指標

### 分散と標準偏差

#### 偏差

In [15]:
# Deviation: how far each score lies from the mean of all scores
mean = scores.mean()
deviation = scores - mean
deviation

array([-13.,  14.,   1., -14.,   2.,  -7.,  10.,  -6.,  10.,   3.])

In [16]:
# A second set of scores with the same mean (55) but a narrower spread
another_scores = [50, 60, 58, 54, 51, 56, 57, 53, 52, 59]
another_mean = np.mean(another_scores)
# Convert the plain list to an array explicitly before subtracting the mean
another_deviation = np.asarray(another_scores) - another_mean
another_deviation

array([-5.,  5.,  3., -1., -4.,  1.,  2., -2., -3.,  4.])

In [17]:
# The deviations from the mean average out to zero
np.mean(deviation)

0.000

In [18]:
# The same holds for the second data set: its deviations also average to zero
np.mean(another_deviation)

0.000

In [19]:
# New frame holding the scores plus a '偏差' (deviation) column;
# assign() returns a new DataFrame, so scores_df is not modified
summary_df = scores_df.assign(**{'偏差': deviation})
summary_df

Unnamed: 0_level_0,点数,偏差
生徒,Unnamed: 1_level_1,Unnamed: 2_level_1
A,42,-13.0
B,69,14.0
C,56,1.0
D,41,-14.0
E,57,2.0
F,48,-7.0
G,65,10.0
H,49,-6.0
I,65,10.0
J,58,3.0


In [20]:
# Column-wise means: the '偏差' (deviation) column averages to 0
summary_df.mean()

点数    55.0
偏差     0.0
dtype: float64

#### 分散

In [21]:
# Variance computed by hand: the mean of the squared deviations
np.square(deviation).mean()

86.000

In [22]:
# np.var uses ddof=0 by default (divides by n), so it matches the
# mean-of-squared-deviations value computed by hand (86.0)
np.var(scores)

86.000

In [23]:
# pandas' var() uses ddof=1 by default (divides by n - 1, the unbiased
# sample variance), so the result is 860 / 9 = 95.556 rather than the
# 860 / 10 = 86.0 that np.var returns; pass ddof=0 to get the same value
scores_df.var()

点数    95.556
dtype: float64

In [24]:
# Append a '偏差二乗' (squared deviation) column to the summary table;
# re-running the cell simply recomputes the same column
summary_df = summary_df.assign(**{'偏差二乗': deviation ** 2})
summary_df

Unnamed: 0_level_0,点数,偏差,偏差二乗
生徒,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,42,-13.0,169.0
B,69,14.0,196.0
C,56,1.0,1.0
D,41,-14.0,196.0
E,57,2.0,4.0
F,48,-7.0,49.0
G,65,10.0,100.0
H,49,-6.0,36.0
I,65,10.0,100.0
J,58,3.0,9.0


In [25]:
# Column-wise means: the mean of the '偏差二乗' (squared deviation) column
# is the variance (86.0)
summary_df.mean()

点数      55.0
偏差       0.0
偏差二乗    86.0
dtype: float64

#### 標準偏差