# P041 数据指标计算-平均绝对误差 MAE

In [5]:
# 平均绝对误差表达相同现象的成对观测值之间误差的度量

In [7]:
import numpy as np
import pandas as pd

In [13]:
df = pd.read_csv("./p041-predictions.csv")

In [15]:
df

Unnamed: 0,y_true,y_pred
0,109.934283,113.175123
1,97.234714,93.383891
2,112.953771,106.184551
3,130.460597,136.57736
4,95.316933,105.626928


In [17]:
def mean_absolute_error(y_true, y_pred):
    return abs(y_true-y_pred).sum() / len(y_true)

In [19]:
mae = mean_absolute_error(df["y_true"], df["y_pred"])

In [21]:
mae

6.057528199999998

# P042 数据指标计算-均方误差 MSE

In [24]:
# 均方误差表达估计值和实际值之间的平均平方误差

In [26]:
import numpy as np
import pandas as pd

In [28]:
df = pd.read_csv("./p042-predictions.csv")
df

Unnamed: 0,y_true,y_pred
0,109.934283,113.175123
1,97.234714,93.383891
2,112.953771,106.184551
3,130.460597,136.57736
4,95.316933,105.626928
5,95.317261,104.630062
6,131.584256,123.192081
7,115.348695,112.256571
8,90.610512,93.923147
9,110.851201,120.606652


In [30]:
def mean_squared_error(y_true, y_pred):
    return ((y_true-y_pred) ** 2).sum() / len(y_true)

In [32]:
mse = mean_absolute_error(df["y_true"], df["y_pred"])

In [34]:
mse

6.415282700000001

# P043 数据指标计算-Sigmoid函数

In [37]:
# Sigmoid函数是一个在生物学中常见的S型函数，常被用作神经网络的激活函数，将变量映射到0，1之间

In [41]:
import numpy as np
import pandas as pd

In [67]:
df = pd.DataFrame(
    data = np.random.rand(10),
    columns = ["var1"]
)
df

Unnamed: 0,var1
0,0.514967
1,0.012484
2,0.89891
3,0.833433
4,0.888657
5,0.314287
6,0.994596
7,0.826539
8,0.503269
9,0.066761


In [69]:
def sigmoid(x):
    return 1 / (1+np.exp(-x))

In [71]:
sigmoid(np.array([1,2,3]))

array([0.73105858, 0.88079708, 0.95257413])

In [73]:
df["var1_sigmoid"] = df["var1"].map(sigmoid)

In [75]:
df

Unnamed: 0,var1,var1_sigmoid
0,0.514967,0.62597
1,0.012484,0.503121
2,0.89891,0.710726
3,0.833433,0.69708
4,0.888657,0.708613
5,0.314287,0.577931
6,0.994596,0.729995
7,0.826539,0.695623
8,0.503269,0.623227
9,0.066761,0.516684


# P044 数据指标计算-entropy熵函数

In [78]:
# 熵的概念最早起源于物理学，用于度量一个热力学系统的无序程度。在信息论里面，熵是对不确定性的测量，越随机的信源的熵越大

In [80]:
import numpy as np
import pandas as pd

In [86]:
df = pd.DataFrame(
    {
        "val_1": np.arange(0.01, 1, 0.1),
        "val_2": 1- np.arange(0.01, 1, 0.1),
    }
)
df

Unnamed: 0,val_1,val_2
0,0.01,0.99
1,0.11,0.89
2,0.21,0.79
3,0.31,0.69
4,0.41,0.59
5,0.51,0.49
6,0.61,0.39
7,0.71,0.29
8,0.81,0.19
9,0.91,0.09


In [88]:
def entropy(x):
    return -np.sum(x*np.log2(x))

In [100]:
df["entropy"] = df.apply(
    lambda x : entropy([x["val_1"], x["val_2"]]), 
    axis=1
)

In [102]:
df

Unnamed: 0,val_1,val_2,entropy
0,0.01,0.99,0.080793
1,0.11,0.89,0.499916
2,0.21,0.79,0.741483
3,0.31,0.69,0.893173
4,0.41,0.59,0.9765
5,0.51,0.49,0.999711
6,0.61,0.39,0.9648
7,0.71,0.29,0.868721
8,0.81,0.19,0.701471
9,0.91,0.09,0.43647


# P045 数据指标计算-准确率 accuracy_score

In [105]:
# accuracy_score计算准确率，即正确预测的分数（默认）或计数（normalize=False）

In [107]:
import numpy as np
import pandas as pd

In [109]:
from sklearn.metrics import accuracy_score

In [111]:
df = pd.read_csv("./p045-predictions.csv")

In [113]:
df.head(10)

Unnamed: 0,y_true,y_pred
0,1,0
1,0,0
2,1,1
3,2,2
4,1,1
5,0,0
6,1,1
7,1,0
8,0,0
9,1,1


In [115]:
accuracy = accuracy_score(df["y_true"], df["y_pred"])

In [117]:
accuracy

0.8

# P046 数据指标计算-混淆矩阵 confusion-matrix

In [122]:
import numpy as np
import pandas as pd

In [124]:
from sklearn.metrics import confusion_matrix

In [130]:
df = pd.read_csv("./p046-predictions.txt")

In [132]:
df

Unnamed: 0,y_true,y_pred
0,1,0
1,0,0
2,1,1
3,2,2
4,1,1
5,0,0
6,1,1
7,1,0
8,0,0
9,1,1


In [134]:
cm = confusion_matrix(df["y_true"], df["y_pred"])

In [136]:
cm

array([[ 6,  1,  0],
       [ 3, 10,  2],
       [ 0,  2,  5]])