In [1]:
import numpy as np
import pandas as pd

# Limit the Jupyter notebook's printed float output to three decimal places
%precision 3
# Limit DataFrame output to three decimal places.
# Use the fully qualified option key: the bare 'precision' shorthand is
# ambiguous on pandas >= 1.4 (it also matches 'styler.format.precision')
# and raises OptionError there.
pd.set_option('display.precision', 3)

# Read the score CSV, using the '生徒番号' (student number) column as the index
df = pd.read_csv('./data/ch2_scores_em.csv', index_col='生徒番号')
# Show the first five rows of df
df.head()


Unnamed: 0_level_0,英語,数学
生徒番号,Unnamed: 1_level_1,Unnamed: 2_level_1
1,42,65
2,69,80
3,56,63
4,41,63
5,57,76


In [2]:
# Take the first ten values of the '英語' (English) column as a NumPy array
scores = np.array(df['英語'])[:10]
scores

array([42, 69, 56, 41, 57, 48, 65, 49, 65, 58])

In [4]:
# Wrap the ten scores in a DataFrame indexed by student labels A through J
scores_df = pd.DataFrame(
    data={'点数': scores},
    index=pd.Index(list('ABCDEFGHIJ'), name='生徒'),
)
scores_df

Unnamed: 0_level_0,点数
生徒,Unnamed: 1_level_1
A,42
B,69
C,56
D,41
E,57
F,48
G,65
H,49
I,65
J,58


In [5]:
# Mean computed by hand: the total of the scores divided by their count
np.sum(scores) / scores.size

55.0

In [6]:
# Mean, second approach: the DataFrame's mean method
scores_df.mean()

点数    55.0
dtype: float64

In [7]:
# Median: start by sorting the scores in ascending order
sorted_scores = np.sort(scores)
sorted_scores

array([41, 42, 48, 49, 56, 57, 58, 65, 65, 69])

In [8]:
# Translate the definition of the median into code:
# an odd-length sample takes its middle element, an even-length sample
# takes the average of the two middle elements.
n = len(sorted_scores)
half = n // 2
if n % 2 == 1:
    median = sorted_scores[half]
else:
    m0, m1 = sorted_scores[half - 1], sorted_scores[half]
    median = (m0 + m1) / 2
median

56.5

In [9]:
# Median via NumPy's built-in function
np.median(scores)

56.5

In [10]:
# Median via the median method of DataFrame/Series
scores_df.median()

点数    56.5
dtype: float64

In [11]:
# Mode: the most frequently occurring value
pd.Series([1, 1, 1, 2, 2, 3]).mode()

0    1
dtype: int64

In [12]:
# When no value repeats, mode() returns every element
pd.Series([1, 2, 3, 4, 5]).mode()

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [13]:
# Deviation: how far each score lies from the mean
mean = np.mean(scores)
deviation = scores - mean
deviation

array([-13.,  14.,   1., -14.,   2.,  -7.,  10.,  -6.,  10.,   3.])

In [14]:
# Table of each student's score alongside its deviation;
# assign() returns a new frame, so scores_df itself is left untouched
summary_df = scores_df.assign(**{'偏差': deviation})
summary_df

Unnamed: 0_level_0,点数,偏差
生徒,Unnamed: 1_level_1,Unnamed: 2_level_1
A,42,-13.0
B,69,14.0
C,56,1.0
D,41,-14.0
E,57,2.0
F,48,-7.0
G,65,10.0
H,49,-6.0
I,65,10.0
J,58,3.0


In [15]:
# Column means of summary_df; the deviations average out to zero
summary_df.mean()

点数    55.0
偏差     0.0
dtype: float64

In [16]:
# Variance: the mean of the squared deviations
np.mean(deviation ** 2)

86.0

In [17]:
# Variance via NumPy; matches the hand-computed mean of squared deviations
np.var(scores)

86.0

In [18]:
# Variance via DataFrame/Series.
# NOTE: the result here (95.556) differs from np.var's 86.0 because pandas'
# var() defaults to ddof=1 (divides by n - 1); pass ddof=0 to match NumPy.
scores_df.var()

点数    95.556
dtype: float64

In [19]:
# Append a squared-deviation column to summary_df
summary_df['偏差二乗'] = deviation ** 2
summary_df

Unnamed: 0_level_0,点数,偏差,偏差二乗
生徒,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,42,-13.0,169.0
B,69,14.0,196.0
C,56,1.0,1.0
D,41,-14.0,196.0
E,57,2.0,4.0
F,48,-7.0,49.0
G,65,10.0,100.0
H,49,-6.0,36.0
I,65,10.0,100.0
J,58,3.0,9.0


In [20]:
# Column means again; the mean of the '偏差二乗' (squared deviation) column
# equals the variance computed earlier
summary_df.mean()

点数      55.0
偏差       0.0
偏差二乗    86.0
dtype: float64

In [21]:
# Standard deviation: the square root of the variance
np.sqrt(np.var(scores, ddof=0))

9.273618495495704

In [22]:
# Standard deviation computed directly with NumPy's np.std (ddof=0)
np.std(scores, ddof=0)

9.273618495495704

In [23]:
# Range: the spread between the largest and the smallest score
scores.max() - scores.min()

28

In [24]:
# Interquartile range (IQR): third quartile minus first quartile
scores_Q1, scores_Q3 = np.percentile(scores, [25, 75])
scores_IQR = scores_Q3 - scores_Q1
scores_IQR

15.0

In [25]:
# Summary statistics in one call: count, mean, std, min, quartiles, max.
# NOTE: std here is 9.775 rather than the 9.274 from np.std(ddof=0),
# since pandas uses ddof=1 by default.
pd.Series(scores).describe()

count    10.000
mean     55.000
std       9.775
min      41.000
25%      48.250
50%      56.500
75%      63.250
max      69.000
dtype: float64

In [26]:
# Normalization / standardization: standardize the test scores by
# subtracting the mean and dividing by the standard deviation
z = (scores - scores.mean()) / scores.std()
z

array([-1.402,  1.51 ,  0.108, -1.51 ,  0.216, -0.755,  1.078, -0.647,
        1.078,  0.323])

In [27]:
# Mean and standard deviation of the standardized data
np.mean(z), np.std(z, ddof=0)

(-1.6653345369377347e-17, 0.9999999999999999)

In [28]:
# Deviation value (hensachi) of each student: 50 + 10 * standardized score
# NOTE: this rebinds z, which held the standardized scores until now
z = 50 + 10 * (scores - np.mean(scores)) / np.std(scores)

In [29]:
# Deviation value (hensachi) of each student: 50 + 10 * standardized score
# NOTE(review): exact duplicate of the preceding cell; one of the two can be removed
z = 50 + 10 * (scores - np.mean(scores)) / np.std(scores)

In [30]:
# Display the deviation values
z


array([35.982, 65.097, 51.078, 34.903, 52.157, 42.452, 60.783, 43.53 ,
       60.783, 53.235])

In [31]:
# Show each score next to its deviation value in the DataFrame
# NOTE: this adds the '偏差値' (deviation value) column to scores_df in place
scores_df['偏差値'] = z
scores_df

Unnamed: 0_level_0,点数,偏差値
生徒,Unnamed: 1_level_1,Unnamed: 2_level_1
A,42,35.982
B,69,65.097
C,56,51.078
D,41,34.903
E,57,52.157
F,48,42.452
G,65,60.783
H,49,43.53
I,65,60.783
J,58,53.235
