# 生成学生模拟表现数据

In [9]:
# 使用 python faker
import faker
import scipy.stats as stats
import pandas as pd
import numpy as np

K_MAP = {'K0':'平行四边形计算公式', 'K1':'整数加法', 'K2':'整数减法', 'K3':'整数乘法', 'K4':'整数除法','K5':'常用的面积单位',
         'K6':'单位之间的换算','K7':'货币单位换算','K8':'质量常用换算','K9':'时间单位换算','K10':'鸡兔问题'}

gen_size = 3000

# 生成范围之内的正太分布数据
def gen_k_performance(lower=60,uper=95, mu=80,sigma=10, size=1000):
    x = stats.truncnorm(int((lower-mu)/sigma), int((uper-mu)/sigma), mu, sigma)
    data = x.rvs(size)
    data = data.astype(np.int)
    return data

# 生成模拟成绩数据
def gen_all_data(size=1000):
    data = {}
    data['K1'] = gen_k_performance(size=size)
    data['K2'] = gen_k_performance(size=size)
    data['K3'] = ((data['K1'] + data['K2']) / 2).astype(np.int) + np.random.randint(-5,5,size)
    data['K4'] = ((data['K1'] + data['K2']) / 2).astype(np.int) + np.random.randint(-5,5,size)
    data['K5'] = gen_k_performance(size=size)
    data['K6'] = data['K5'] + np.random.randint(-5,5,size)
    data['K7'] = gen_k_performance(size=size)
    data['K8'] = gen_k_performance(size=size)
    data['K9']=  gen_k_performance(size=size)
    data['K10'] = gen_k_performance(size=size)
    data['K0'] = ((data['K3'] + data['K4'] + data['K6']) / 3).astype(np.int) +  np.random.randint(-5,5,size)
    return data

fk = faker.Faker(locale='zh-CN')
s_names = np.array([fk.name() for _ in range(gen_size)])
p_data  = gen_all_data(gen_size)
p_data_values = np.array([x.tolist() for x in p_data.values()])
p_data =  pd.DataFrame(p_data_values.T, columns=p_data.keys())
p_data['NAME'] = s_names
p_data.to_csv('D:/PROJECT_TW/git/data/kg/s_performance.csv')
print(p_data)




      K1  K2  K3  K4  K5  K6  K7  K8  K9  K10  K0 NAME
0     83  80  80  76  73  75  80  80  62   88  77   陈涛
1     89  80  83  83  76  71  80  87  74   75  82   李伟
2     76  73  70  75  67  71  77  82  85   71  70   位想
3     77  77  77  76  84  81  84  88  84   68  76  栗红梅
4     85  87  81  86  70  72  67  89  85   73  81   蔡文
...   ..  ..  ..  ..  ..  ..  ..  ..  ..  ...  ..  ...
2995  76  62  72  72  86  83  75  85  86   82  78  段玉梅
2996  77  82  79  79  63  58  68  84  84   77  76   关凯
2997  86  84  89  85  76  79  77  70  72   73  80  刘淑兰
2998  82  69  78  76  83  83  76  85  89   63  76  顾小红
2999  73  78  79  78  77  73  66  68  75   66  72  曾淑华

[3000 rows x 12 columns]


# 模型训练

## 贝叶斯网络图形化 

In [12]:
# 处理流程：
# 1、从图谱取带有Next关系的知识实体 (省略)
# 2、根据关系建立网络，进行训练
# K1->K3,K4, K2 -> K3, K4, K6 -> K5,      K3,K4,K5 --> K0
from pgmpy.models import BayesianModel
from bokeh.io import show, output_file,output_notebook
from bokeh.models import Plot, Range1d, MultiLine, Circle, HoverTool, TapTool, BoxSelectTool,LabelSet,ColumnDataSource
from bokeh.models.graphs import from_networkx, NodesAndLinkedEdges, EdgesAndLinkedNodes
from bokeh.palettes import Spectral4
import networkx as nx
''', ('K4','K0'),('K6'),('K0')'''
g = nx.DiGraph()
model = BayesianModel([('K1', 'K3'), ('K1', 'K4'), ('K2','K3'),('K2','K4'),('K5','K6'),('K3','K0'),('K4','K0'),('K6','K0')])  # WD -> ST <- Month
print(model.nodes)
print(model.edges)
g.add_nodes_from(model.nodes)
g.add_edges_from(model.edges)
output_notebook() 
position = nx.spring_layout(g, scale=3)
print('position -->{}'.format(position))
plot = Plot(x_range=Range1d(-3.5,3.5), y_range=Range1d(-3.1,3.1))
plot.title.text = '知识图谱关系'
graph_renderer = from_networkx(g, nx.spring_layout, scale=3, center=(0,0))
graph_renderer.node_renderer.glyph = Circle(size=15, fill_color=Spectral4[0])
graph_renderer.node_renderer.selection_glyph = Circle(size=15, fill_color=Spectral4[2])
graph_renderer.node_renderer.hover_glyph = Circle(size=15, fill_color=Spectral4[1])
graph_renderer.edge_renderer.glyph = MultiLine(line_color="#CCCCCC", line_alpha=0.8, line_width=5)
graph_renderer.edge_renderer.selection_glyph = MultiLine(line_color=Spectral4[2], line_width=5)
graph_renderer.edge_renderer.hover_glyph = MultiLine(line_color=Spectral4[1], line_width=5)
graph_renderer.selection_policy = NodesAndLinkedEdges()
graph_renderer.inspection_policy = EdgesAndLinkedNodes()


# 得到位置信息, 并设置label
x, y = zip(*graph_renderer.layout_provider.graph_layout.values())
x = [j+0.1 for j in x]
print('x,y',x,'-',y)
node_labels = graph_renderer.node_renderer.data_source.data['index']
print('index ---> ',)
source = ColumnDataSource({'x': x, 'y': y,
                           'name': [node_labels[i] for i in range(len(x))]})
labels = LabelSet(x='x', y='y', text='name', source=source, background_fill_color='white')

plot.renderers.append(graph_renderer)
plot.renderers.append(labels)
# show(plot)


['K1', 'K3', 'K4', 'K2', 'K5', 'K6', 'K0']
[('K1', 'K3'), ('K1', 'K4'), ('K3', 'K0'), ('K4', 'K0'), ('K2', 'K3'), ('K2', 'K4'), ('K5', 'K6'), ('K6', 'K0')]


position -->{'K1': array([-0.02504565,  0.95053109]), 'K3': array([ 0.06894955, -0.169889  ]), 'K4': array([-0.26536549,  1.71211327]), 'K2': array([-1.93733498,  0.29118944]), 'K5': array([-0.9941461, -3.       ]), 'K6': array([ 0.79425619, -1.08428635]), 'K0': array([2.35868647, 1.30034156])}
x,y [-0.34089602812408415, -0.6826934332724419, 0.9082385635288357, -1.228823431953393, -0.19881373513219233, 0.2697640181754912, 1.9732240467777833] - (1.245718465244687, -0.324118731145795, 1.758696915448292, 1.509777462201177, -3.0, -1.146173727108124, -0.043900384640237114)
index ---> 


## 参数训练 

In [51]:
import pandas as pd
import numpy as np
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.inference import VariableElimination

data = pd.read_csv('D:/PROJECT_TW/git/data/kg/s_performance.csv',index_col=0)
model = BayesianModel([('K1', 'K3'), ('K1', 'K4'), ('K2','K3'),('K2','K4'),('K5','K6'),('K3','K0'),('K4','K0'),('K6','K0')])  # WD -> ST <- Month

p_data = data.loc[:,['K0','K1','K2','K3','K4','K5','K6']]
percen = np.percentile(p_data,[60, 80, 100])
print(percen)
percen_fun = lambda x : 1 if x <= percen[0] else 2 if x > percen[0] and x <= percen[1] else 3  
percen_class_data = p_data.applymap(percen_fun)
mle = MaximumLikelihoodEstimator(model, percen_class_data)
model.fit(percen_class_data, estimator=MaximumLikelihoodEstimator)
# print("\n", mle.estimate_cpd('K0'))
# print("\n", mle.estimate_cpd('K6'))

infer = VariableElimination(model)
# print('K0:\n',infer.query(['K0']))
# print('MFI:BS\n',infer.query(['ST'], evidence={'MFI': 'BS'}))




[79. 83. 93.]


In [62]:
# print(percen_class_data.head(5))
# print(model.get_cpds())
# print(model.check_model())
# print(model.name)
print(infer.query(variables=['K6'],evidence={'K0':3},show_progress=False))


+-------+-----------+
| K6    |   phi(K6) |
| K6(1) |    0.2099 |
+-------+-----------+
| K6(2) |    0.2368 |
+-------+-----------+
| K6(3) |    0.5533 |
+-------+-----------+


# 已有该学生历史成绩，规划学习路径

In [None]:
# MATCH (K1:Knowledge {name:'平行四边形计算公式'})-[relN:NEXT]- (K2:Knowledge) return K1,K2,relN
# 注意：数据因为是模拟生成的不一定精准， 得到平行四边形计算前置的知识点包括：整数加、减、乘、除及常用面积单位，并得到影响权重系数
# 根据学生历史成绩预测 本知识点可能的掌握情况， 并根据学生在该知识点想达到的目标，得到其它的前置知识点需要的最低掌握情况
'''
处理流程：
1、训练贝叶斯模型（根据历史所有成绩）
2、根据该学生成绩，预测知识点的掌握情况。（？是否可以根据权重系数来判断，？？？？？？）
'''

# 无该学生历史成绩，预测学生前置知识点掌握情况

In [None]:
# 无学生历史成绩， 根据学生在该知识点的掌握情况（通过测试得到），得到其它前置知识点掌握情况
