# Python Handbook

## Utils

In [None]:
from scipy.special import binom as binomial
from Utils import plotEMF, plotEDF, emfToEdf


def binomial_emf(n, p):
    """Return the Binomial(n, p) mass function as a list of (k, P(X = k)) pairs.

    The support of a Binomial(n, p) variable is k = 0, 1, ..., n, so the list
    has n + 1 entries (the endpoint k = n must be included for the masses to
    sum to one).
    """
    return [(k, binomial(n, k) * (p ** k) * ((1 - p) ** (n - k))) for k in range(n + 1)]


# Bernoulli(p): mass 1-p at 0 and mass p at 1.
p = 0.1
plotEMF([(0,1-p),(1,p)])

# NOTE(review): the extra leading (0,0) pair is kept exactly as in the original;
# presumably emfToEdf needs a starting point for the step function -- confirm.
plotEDF(emfToEdf([(0,0),(0,1-p),(1,p)]))

# Binomial(n, p): mass function and distribution function.
n = 20
p = 0.5
plotEMF(binomial_emf(n, p))

plotEDF(emfToEdf(binomial_emf(n, p)))

## 重要公式


In [None]:
import numpy as np

# Hoeffding's inequality: epsilon solves 2*exp(-2*n*epsilon**2) = 1 - confidence,
# using the sample size n defined earlier in the notebook.
confidence = 0.95
epsilon = np.sqrt(-1/(2*n)*np.log((1-confidence)/2))
# Bennett's inequality (standard-deviation version)
def bennett_epsilon(n,b,sigma,alpha):
    """Numerically solve Bennett's bound for the deviation epsilon.

    Finds epsilon such that
        exp(-n*sigma**2/b**2 * h(b*epsilon/sigma**2)) = alpha/2,
    where h(u) = (1+u)*log(1+u) - u, using scipy.optimize.fsolve started
    at 0.002.

    Parameters (presumably -- confirm against callers):
        n:     sample size.
        b:     bound on the variable.
        sigma: standard deviation.
        alpha: total failure probability (alpha/2 is used in the equation).

    Prints the residual of the equation at the returned point (labelled
    "Numerical error") and returns the absolute value of the root found.
    """
    import scipy.optimize as so

    def h(u):
        return (1+u)*np.log(1+u)-u

    def residual(epsilon):
        # Left-hand side of the bound minus the target level alpha/2.
        return np.exp(-n*sigma**2/b**2*h(b*epsilon/sigma**2))-alpha/2

    root = so.fsolve(residual,0.002)
    epsilon = np.abs(root[0])
    print("Numerical error", residual(epsilon))
    return epsilon

bennett_epsilon(n=50, b=300, sigma=20, alpha=0.05)


# Linear congruential generator
def linConGen(m, a, b, x0, n):
    '''A linear congruential sequence generator.

    Param m is the integer modulus to use in the generator.
    Param a is the integer multiplier.
    Param b is the integer increment.
    Param x0 is the integer seed.
    Param n is the integer number of desired pseudo-random numbers.

    Returns a list of n pseudo-random integer modulo m numbers; the first
    element is the seed reduced modulo m. Returns an empty list when n <= 0.'''

    # Without this guard the seed would be returned even when no numbers
    # were requested, contradicting the "list of n numbers" contract.
    if n <= 0:
        return []

    x = x0 # the seed
    retValue = [x % m]  # start the list with x=x0
    for _ in range(n - 1):  # the seed already accounts for one value
        x = (a * x + b) % m # the generator, using modular arithmetic
        retValue.append(x) # append the new x to the list
    return retValue

# Markov chain: count transitions between consecutive observations
from Utils import makeFreq

# Row 0 is the series without its last element, row 1 the series without its
# first element, so each column is one (current, next) pair.
transitions_data = np.stack([all_daysdata[:-1], all_daysdata[1:]])
transitions_data[:, 200:210]

transition_counts = makeFreq(transitions_data)
transition_counts

# NOTE(review): presumably makeFreq returns one row per distinct pair, ordered
# (0,0), (0,1), (1,0), (1,1), with the count in the last column -- confirm.
n_00, n_01, n_10, n_11 = (transition_counts[row, -1] for row in range(4))
n_00, n_01, n_10, n_11
def estimateMatrix(n_00,n_01,n_10,n_11):
    """Estimate a 2x2 transition matrix from the four transition counts.

    n_ij is the number of observed transitions from state i to state j.
    Each row is normalised by the total count leaving that state; the
    off-diagonal entries are computed as one minus the diagonal entry.

    Returns an np.matrix [[p00, p01], [p10, p11]].
    """
    stay_0 = n_00/(n_00+n_01)
    stay_1 = n_11/(n_11+n_10)
    rows = [
        [stay_0, 1-stay_0],
        [1-stay_1, stay_1],
    ]
    return np.matrix(rows)
transition_matrix = estimateMatrix(n_00,n_01,n_10,n_11)

# The stationary distribution pi satisfies pi P = pi, i.e. it is a LEFT
# eigenvector of P. np.linalg.eig returns right eigenvectors, so the matrix
# must be transposed first (this transpose was forgotten during the lecture).
evals,evecs = np.linalg.eig(transition_matrix.T)

# eig does not guarantee any ordering of the eigenvalues, so pick the one
# closest to 1 instead of assuming it sits in column 0.
stationary_index = np.argmin(np.abs(evals - 1))
stationary_vector = evecs[:,stationary_index]

# Normalise so that the entries sum to one.
stationary_vector/np.sum(stationary_vector)

特征值和特征向量在分析马尔可夫链的行为中起着重要的作用。特别是，马尔可夫链的稳态分布可以通过求解转移矩阵的特征向量来找到。

马尔可夫链的转移矩阵是一个方阵，我们可以找到它的特征值和特征向量。对于马尔可夫链，我们特别关注特征值为1的特征向量，因为这个特征向量对应的就是马尔可夫链的稳态分布。

稳态分布是一个概率分布，它描述了马尔可夫链在长时间运行后，各个状态的概率分布。在稳态分布下，马尔可夫链的状态分布不再改变，即使我们继续进行状态转移。

在Python中，我们可以使用 `numpy.linalg.eig` 函数来求解特征值和特征向量。由于稳态分布 π 满足 πP = π（即 π 是转移矩阵 P 的左特征向量），而 `eig` 返回的是右特征向量，所以需要对转移矩阵的转置 `P.T` 调用该函数。这个函数返回两个数组，第一个数组包含了所有的特征值（顺序不保证），第二个数组的每一列是对应的特征向量。我们找到特征值为1（数值上最接近1）的特征向量，然后将这个特征向量归一化，使其元素之和为1，就得到了马尔可夫链的稳态分布。

In [None]:
from sklearn.metrics import average_precision_score
from Utils import classification_report_interval, epsilon_bounded

# Classifier `per`: report on the hard predictions, then average precision
# computed from the second column of predict_proba.
print(classification_report_interval(Y_test, per.predict(X_test)))
average_precision_score(Y_test, per.predict_proba(X_test)[:, 1])

# Regressor `lr`: mean absolute error on the test set.
MAE = np.mean(np.abs(Y_test - lr.predict(X_test)))

# NOTE(review): presumably epsilon_bounded(n, range, alpha) gives a
# concentration bound for variables with the given range -- confirm in Utils.
epsilon = epsilon_bounded(len(Y_test), span*2, 0.05)
epsilon





### 数据读取

In [None]:
import csv

# newline='' is what the csv module documentation asks for, so that the
# reader (not the file object) handles line endings inside quoted fields.
with open('data/NYPowerBall.csv', mode='r', newline='') as f:
    csv_reader = csv.reader(f)
    # First row holds the column names; None if the file is empty.
    header = next(csv_reader, None)
    # Remaining rows, each a list of strings.
    data = list(csv_reader)

## Numpy

In [None]:
import numpy as np

# Write the array first so that the file exists before it is read back.
a = np.arange(0,10000).reshape(100,100)
np.savetxt("temp.csv", a, delimiter=",")

b = np.loadtxt("temp.csv", delimiter=",")
print(b)        # print the array b
print(b.max())  # largest value in b
print(b.min())  # smallest value in b


import numpy as np

a1 = np.arange(10)
# 1.1 Indexing
print(a1[4]) # 4
# 1.2 Slicing
print(a1[4:6])# [4 5]
# 1.3 Slicing with a step
print(a1[::2])# [0 2 4 6 8]
# 1.4 Negative index
print(a1[-1]) # 9


a2 = np.random.randint(0,10,size=(4,6))
print(a2)

print(a2[0])

print(a2[1:3])

print(a2[[0,2,3]]) # several non-contiguous rows

print(a2[[1,3],[4,4]]) # fancy indexing: elements (1,4) and (3,4); row and column indices start at 0
# a2[:, [0]] selects column 0 (the list index keeps the result 2-D)
print(a2[1:3,4:6]) # rows 1-2, columns 4-5
'''
[[4 2 0 2 4 0]
 [9 5 5 5 7 9]
 [2 6 3 0 1 6]
 [3 6 9 0 5 4]]
**********
[4 2 0 2 4 0]
**********
[[9 5 5 5 7 9]
 [2 6 3 0 1 6]]
**********
[[4 2 0 2 4 0]
 [2 6 3 0 1 6]
 [3 6 9 0 5 4]]
--------------------
[7 5]
--------------------
[[7 9]
 [1 6]]
'''




In [None]:
# ①若是a.reshape(x, -1)则是将矩阵a变成行数为x，列数不规定的矩阵，具体列数按照总元素个数除行数，均分得到。
# ②若是a.reshape(-1, x)则是将矩阵a变成列数为x，行数不规定的矩阵，具体行数按照总元素个数除列数，均分得到。

## Pandas

In [None]:
import pandas as pd

# Load the dataset into `data`
data = pd.read_csv('StudentPerformance.csv')
# Show the data
print(data)
# Names in the header row (column labels)
labels = list(data.columns.values)
print(labels)

# Data type of each column
print(data.dtypes)

# Column names
print(data.columns)

# Row index
print(data.index)

# Null counts and dtypes per column. info() prints its report itself and
# returns None, so it is not wrapped in print().
data.info(verbose=True)

# First n rows (n defaults to 5)
print(data.head(n=5))

## 画图

In [None]:
# Histogram of 100000 standard-normal draws; binding the return value to `_`
# keeps the bin arrays out of the cell output.
_ = plt.hist(np.random.normal(size=100000), bins=200)


# Two x-values spanning the data are enough to draw a straight line.
x_pred = np.linspace(np.min(X), np.max(X), 2)
# NOTE(review): presumably result['x'] holds (slope, intercept) from an
# optimiser result -- confirm where `result` is produced.
slope, intercept = result['x'][0], result['x'][1]
y_pred = x_pred*slope + intercept
plt.scatter(X, Y)
plt.plot(x_pred, y_pred, color='green')