## define training data and test data

In [1]:
# 训练样本, separate by space
train_samples = ["<=30 高 否 中 否",
                 "<=30 高 否 优 否",
                 "31~40 高 否 中 是",
                 ">40 中 否 中 是",
                 ">40 低 是 中 是",
                 ">40 低 是 优 否",
                 "31~40 低 是 优 是",
                 "<=30 中 否 中 否",
                 "<=30 低 是 中 是",
                 ">40 中 是 中 是",
                 "<=30 中 是 优 是",
                 "31~40 中 否 优 是",
                 "31~40 高 是 中 是",
                 ">40 中 否 优 否"]
# 待分类样本
test_sample = '<=30 中 是 中'

## convert training data and test data

In [2]:
from tabulate import tabulate # to print lists in a pretty form
# 描述属性分别用数字替换
# 年龄, <=30-->0, 31~40-->1, >40-->2
# 收入, '低'-->0, '中'-->1, '高'-->2
# 是否学生, '是'-->0, '否'-->1
# 信誉: '中'-->0, '优'-->1
# 类别属性用数字替换
# 购买电脑是-->0, 不购买电脑否-->1
MAP_text2num = [{'<=30': 0, '31~40': 1, '>40': 2},
       {'低': 0, '中': 1, '高': 2},
       {'是': 0, '否': 1},
       {'中': 0, '优': 1},
       {'是': 0, '否': 1}]

# 下面步骤将文字，转化为对应数字
train_samples = [sample.split(' ') for sample in train_samples]
print(f"splited train_samples:\n {tabulate(train_samples)}", '\n')
train_samples = [[MAP_text2num[i][attr] for i, attr in enumerate(sample)]for sample in train_samples]
print(f"splited train_samples after maping:\n {tabulate(train_samples)}", '\n')
# convert the test sample
test_sample = [MAP_text2num[i][attr] for i, attr in enumerate(test_sample.split(' '))]
print(f"splited test sample after maping:\n {test_sample}", '\n')

ModuleNotFoundError: No module named 'tabulate'

In [40]:
# statistic

# 单个样本的维度： 描述属性和类别属性个数
dim_sample = len(train_samples[0])

# 计算每个属性有哪些取值
attr_list = []
for i in range(0, dim_sample):
    attr_list.append([])
print(f"empty attrs:\n{attr_list}\n")

for sample in train_samples:
    for i in range(0, dim_sample):
        if sample[i] not in attr_list[i]:
            attr_list[i].append(sample[i])
print(f"attr list:\n{tabulate(attr_list)}\n")
attr_list_sorted = [sorted(attrs) for attrs in attr_list]
# sort attributes
print(f"attr_list_sorted:\n{tabulate(attr_list_sorted)}\n")
# 每个属性取值的个数
n_attr = [len(attr) for attr in attr_list_sorted]
print(f"n_attr:\n{n_attr}\n")

# 记录不同类别的样本个数
n_c = []
for i in range(0, n_attr[-1]):
    n_c.append(0)
print(f"all n_c:\n{n_c}\n")
# 计算不同类别的样本个数, 是 or 否 for this example
for sample in train_samples:
    n_c[sample[-1]] += 1
    # print(f"After adding sample {sample}, n_c is:\n{n_c}")
print(f"Frequencies for [0, 1] are:\n{n_c}\n")

# 计算不同类别样本所占概率
p_c = [n_cx / sum(n_c) for n_cx in n_c]
print(f"Odds for [0, 1] are:\n{p_c}\n")

# 将用户按照购买电脑case 分类
samples_at_c = {}
for c in attr_list_sorted[-1]:
    samples_at_c[c] = []
print(f"empty dictionary samples_at_c:\n{samples_at_c}\n")
for sample in train_samples:
    samples_at_c[sample[-1]].append(sample)
print(f"classfied samples_at_c:\n{tabulate(samples_at_c)}\n")

empty attrs:
[[], [], [], [], []]

attr list:
-  -  -
0  1  2
2  1  0
1  0
0  1
1  0
-  -  -

attr_list_sorted:
-  -  -
0  1  2
0  1  2
0  1
0  1
0  1
-  -  -

n_attr:
[3, 3, 2, 2, 2]

all n_c:
[0, 0]

Frequencies for [0, 1] are:
[9, 5]

Odds for [0, 1] are:
[0.6428571428571429, 0.35714285714285715]

empty dictionary samples_at_c:
{0: [], 1: []}

classfied samples_at_c:
---------------  ---------------
[1, 2, 1, 0, 0]  [0, 2, 1, 0, 1]
[2, 1, 1, 0, 0]  [0, 2, 1, 1, 1]
[2, 0, 0, 0, 0]  [2, 0, 0, 1, 1]
[1, 0, 0, 1, 0]  [0, 1, 1, 0, 1]
[0, 0, 0, 0, 0]  [2, 1, 1, 1, 1]
[2, 1, 0, 0, 0]
[0, 1, 0, 1, 0]
[1, 1, 1, 1, 0]
[1, 2, 0, 0, 0]
---------------  ---------------



In [41]:
# 记录 每个类别的训练样本中，取待分类样本的某个属性值的样本个数
n_attr_test_sample = {}
for c in attr_list_sorted[-1]:
    n_attr_test_sample[c] = []
    for j in range(0, dim_sample-1):
        n_attr_test_sample[c].append(0)
print(f"empty dictionary n_attr_test_sample:\n{n_attr_test_sample}\n")
# 计算 每个类别的训练样本中，取待分类样本的某个属性值的样本个数
for c in samples_at_c:
    samples_at_cx = samples_at_c[c]
    for sample in samples_at_cx:
        for i in range(0, dim_sample-1):
            if test_sample[i] == sample[i]:
                n_attr_test_sample[c][i] += 1
print(f"dictionary n_attr_test_sample:\n{n_attr_test_sample}\n")

# 字典转化为list
n_attr_test_sample = list(n_attr_test_sample.values())
print(f"list n_attr_test_sample:\n{tabulate(n_attr_test_sample)}\n")
# 存储最终的概率
result_p = []
for i in range(0, n_attr[-1]):
    result_p.append(p_c[i])
print(f"training odds for [0, 1]:\n{result_p}\n")
    
# 计算概率
for i in range(0, n_attr[-1]):
    n_attr_test_sample[i] = [x/n_c[i] for x in n_attr_test_sample[i]]
    for x in n_attr_test_sample[i]:
        result_p[i] *= x

print('概率分别为', result_p)

# 找到概率最大对应的那个类别，就是预测样本的分类情况
predict_class = result_p.index(max(result_p))
print(predict_class)

empty dictionary n_attr_test_sample:
{0: [0, 0, 0, 0], 1: [0, 0, 0, 0]}

dictionary n_attr_test_sample:
{0: [2, 4, 6, 6], 1: [3, 2, 1, 2]}

list n_attr_test_sample:
-  -  -  -
2  4  6  6
3  2  1  2
-  -  -  -

training odds for [0, 1]:
[0.6428571428571429, 0.35714285714285715]

概率分别为 [0.02821869488536155, 0.006857142857142858]
0
