In [2]:
import pandas as pd
import scipy as sp
from scipy.stats import multinomial
import os
import numpy as np
import math

熵
- 公式：$Entropy = - \sum_{i} P(i)\log_2 P(i)$
    
    $P(i)$ 表示第 i 种状态出现的可能性。
- 理解：
    是一个衡量概率分布的不确定性的定量指标。
    
    如果熵比较大(即平均编码长度较长)，意味着这一信息有较多的可能状态，相应的每个状态的可能性比较低；
    因此每当来了一个新的信息，我们很难对其作出准确预测，即有着比较大的混乱程度/不确定性/不可预测性。
    
    并且当一个罕见的信息到达时，比一个常见的信息有着更多的信息量，因为它排除了别的很多的可能性，告诉了我们一个确切的信息。

In [3]:
def entropy(c):
    """Return the Shannon entropy, in bits, of a probability distribution.

    Args:
        c: Sized iterable of probabilities, one per state.

    Returns:
        ``-sum(p * log2(p))`` over the entries of ``c``. An empty ``c``
        returns the sentinel ``-1``.

    Raises:
        ValueError: If an entry is negative (``math.log2`` domain error).
    """
    if len(c) == 0:
        return -1
    result = 0
    for x in c:
        # A state with probability 0 contributes nothing (0 * log 0 is taken
        # as 0); calling log on it would raise a domain error instead.
        if x == 0:
            continue
        result += (-x) * math.log2(x)
    return result

In [None]:
# Training answers: the records every feature below is computed from.
df = pd.read_csv('../train_data/train_task_3_4.csv')

# Side information, read in the order answer / question / student.
answer_meta_data, question_meta_data, student_meta_data = (
    pd.read_csv(metadata_path)
    for metadata_path in (
        '../metadata/answer_metadata_task_3_4.csv',
        '../metadata/question_metadata_task_3_4.csv',
        '../metadata/student_metadata_task_3_4.csv',
    )
)

# Submission template: the frame that the feature columns are attached to.
submission_file = pd.read_csv('../starter_kit/submission_templates/submission_task_3.csv')


### 计算每道题目答案取值的熵，并做 Z-Score 标准化

Z-Score 通过 $(x - \mu)/\sigma$（$\mu$ 为均值，$\sigma$ 为标准差）将两组或多组数据转化为无单位的 Z-Score 分值，使数据标准统一，提高了数据可比性，但削弱了数据解释性。

In [None]:
# Entropy of the distribution of chosen answers, one value per question.
# multinomial.entropy(1, p) is the entropy of a single categorical draw with
# probabilities p (scipy uses the natural logarithm).
choice_entropy = df.groupby('QuestionId')['AnswerValue'].agg(
    lambda x: multinomial.entropy(1, x.value_counts(normalize=True))
)

# choice_entropy is indexed by QuestionId. Assigning it directly would align it
# with submission_file's row index, which is only correct when that index
# happens to equal QuestionId, so look the value up by QuestionId explicitly.
submission_file['entropy_choice'] = submission_file['QuestionId'].map(choice_entropy)

# Z-score using the population standard deviation (np.std, ddof=0).
submission_file['z_entropy_choice'] = (
    submission_file['entropy_choice'] - np.mean(submission_file['entropy_choice'])
) / np.std(submission_file['entropy_choice'])

### 对每道题统计学生回答时自信度的平均值，并做 Z-Score 标准化

如果题目回答记录中存在 confidence 信息，且已知 confidence 的记录不少于 20 条，则计算该题目下的所有已知 confidence 的平均值。

若不存在（或已知记录不足 20 条），则用所有满足条件的题目的自信度均值的平均值做填充。


In [None]:
# A question needs at least this many confidence-rated answers before its own
# average is used; questions below the threshold receive the fallback value.
MIN_CONFIDENCE_ANSWERS = 20

# Attach answer-level metadata (Confidence and the other metadata columns) to
# every training record; new_df is reused by later cells.
new_df = df.merge(answer_meta_data, on='AnswerId', how='left')
notnull_confidence = new_df[new_df['Confidence'].notnull()]

# Number of confidence-rated answers per question.
que_num = notnull_confidence.groupby('QuestionId').size()
num_confid = que_num.to_dict()
valid_que = que_num[que_num >= MIN_CONFIDENCE_ANSWERS].index.tolist()
notnull_confidence = notnull_confidence[notnull_confidence['QuestionId'].isin(valid_que)]

# Mean confidence of each retained question, computed in one grouped pass
# instead of re-filtering the frame once per question.
que_avg_confid = notnull_confidence.groupby('QuestionId')['Confidence'].mean().to_dict()
all_que_confid = list(que_avg_confid.values())

# Fallback for questions without a usable average: the unweighted mean of the
# per-question averages. It is computed once here rather than on every lookup.
fallback_confid = np.mean(all_que_confid) if all_que_confid else np.nan

submission_file['confidence'] = (
    submission_file['QuestionId'].map(que_avg_confid).fillna(fallback_confid)
)

# Z-score using the population standard deviation (np.std, ddof=0).
submission_file['z_confidence'] = (
    submission_file['confidence'] - np.mean(submission_file['confidence'])
) / np.std(submission_file['confidence'])

### 计算所有题目作答情况（正确与否）的熵，并做 Z-Score 标准化

In [None]:
# Entropy of the correct/incorrect outcome distribution, one value per question.
right_entropy = df.groupby('QuestionId')['IsCorrect'].agg(
    lambda x: multinomial.entropy(1, x.value_counts(normalize=True))
)

# right_entropy is indexed by QuestionId. Assigning it directly would align it
# with submission_file's row index, which is only correct when that index
# happens to equal QuestionId, so look the value up by QuestionId explicitly.
submission_file['right_entropy'] = submission_file['QuestionId'].map(right_entropy)

# Z-score using the population standard deviation (np.std, ddof=0).
submission_file['z_entropy_right'] = (
    submission_file['right_entropy'] - np.mean(submission_file['right_entropy'])
) / np.std(submission_file['right_entropy'])

### 对每道题目，以小组为单位，计算该题目作答情况的条件熵，并做 Z-Score 标准化

条件熵计算公式 ：$H(right\&wrong|group)$

一个题目可以属于多个学生小组，也可以在一个小组中重复出现（即，被组内不同的学生做）

one_df: 一个题目（i）的所有作答记录

some_res: 记录题目 i 分别在不同组作答情况的熵

one_score：题目 i 在不同组作答情况熵的期望，即该题目的条件熵（H）


In [None]:
def get_one_condition_entropy(one_df):
    """Return the conditional entropy of ``IsCorrect`` given ``GroupId``.

    Args:
        one_df: DataFrame holding the answer records of a single question; it
            must contain the columns ``GroupId`` and ``IsCorrect``.

    Returns:
        The sum, over the groups found in ``one_df``, of the entropy of
        ``IsCorrect`` inside the group weighted by the group's share of the
        rows of ``one_df``. Returns ``0`` when ``one_df`` has no rows.
    """
    total_rows = one_df.shape[0]
    if total_rows == 0:
        return 0

    one_score = 0
    # One pass over the groups: each group's outcomes give both its entropy
    # and its size, so the frame is not re-filtered for every group.
    # NOTE(review): groupby skips rows whose GroupId is missing while
    # total_rows still counts them, so the weights then sum to less than 1.
    for _, outcomes in one_df.groupby('GroupId')['IsCorrect']:
        group_entropy = multinomial.entropy(1, outcomes.value_counts(normalize=True))
        one_score += group_entropy * (len(outcomes) / total_rows)
    return one_score

In [None]:
# Conditional entropy of the outcome given the student group, per question.
# Iterating the groupby splits new_df once, instead of boolean-filtering the
# whole frame for every question id.
cond_right_group_entropy = {}
for one_que_id, one_df in new_df.groupby('QuestionId'):
    cond_right_group_entropy[one_que_id] = get_one_condition_entropy(one_df)

# apply acts as a per-row lookup here; a question missing from the training
# data raises KeyError rather than silently producing NaN.
submission_file['cond_entropy_group'] = submission_file['QuestionId'].apply(
    lambda x: cond_right_group_entropy[x]
)

# NOTE(review): Series.std() defaults to the sample standard deviation
# (ddof=1), while the z-scores built with np.std elsewhere in this notebook use
# ddof=0. Left unchanged so the scores stay reproducible; confirm the intent.
submission_file['z_cond_entropy'] = (
    submission_file['cond_entropy_group'] - submission_file['cond_entropy_group'].mean()
) / submission_file['cond_entropy_group'].std()

### 与上面类似，以题目所在 quiz 为单位，计算作答情况的条件熵，并做 Z-Score 标准化

条件熵计算公式 ：$H(right\&wrong|quiz)$


In [None]:
def get_one_quiz_condition_entropy(one_df):
    """Return the conditional entropy of ``IsCorrect`` given ``QuizId``.

    Args:
        one_df: DataFrame holding the answer records of a single question; it
            must contain the columns ``QuizId`` and ``IsCorrect``.

    Returns:
        The sum, over the quizzes found in ``one_df``, of the entropy of
        ``IsCorrect`` inside the quiz weighted by the quiz's share of the rows
        of ``one_df``. Returns ``0`` when ``one_df`` has no rows.
    """
    total_rows = one_df.shape[0]
    if total_rows == 0:
        return 0

    one_score = 0
    # One pass over the quizzes: each quiz's outcomes give both its entropy
    # and its size, so the frame is not re-filtered for every quiz.
    # NOTE(review): groupby skips rows whose QuizId is missing while
    # total_rows still counts them, so the weights then sum to less than 1.
    for _, outcomes in one_df.groupby('QuizId')['IsCorrect']:
        quiz_entropy = multinomial.entropy(1, outcomes.value_counts(normalize=True))
        one_score += quiz_entropy * (len(outcomes) / total_rows)
    return one_score

In [None]:
# Conditional entropy of the outcome given the quiz, per question.
# Iterating the groupby splits new_df once, instead of boolean-filtering the
# whole frame for every question id.
cond_right_quiz_entropy = {}
for one_que_id, one_df in new_df.groupby('QuestionId'):
    cond_right_quiz_entropy[one_que_id] = get_one_quiz_condition_entropy(one_df)

# Per-row lookup; a question missing from the training data raises KeyError
# rather than silently producing NaN.
submission_file['cond_entropy_quiz'] = submission_file['QuestionId'].apply(
    lambda x: cond_right_quiz_entropy[x]
)

# NOTE(review): Series.std() defaults to the sample standard deviation
# (ddof=1), while the z-scores built with np.std elsewhere in this notebook use
# ddof=0. Left unchanged so the scores stay reproducible; confirm the intent.
submission_file['z_cond_quiz_entropy'] = (
    submission_file['cond_entropy_quiz'] - submission_file['cond_entropy_quiz'].mean()
) / submission_file['cond_entropy_quiz'].std()

### 排序

z_entropy_choice，z_cond_entropy，z_cond_quiz_entropy，z_entropy_right 加权求和，再减去 z_confidence，得到最终分数

In [None]:
# Weights applied to the group conditional entropy, the quiz conditional
# entropy and the correctness entropy respectively.
float0 = 0.7
float_1 = 0.1
float_2 = 1

# Answer-choice entropy enters with weight 1 and confidence is subtracted.
# The terms are summed left to right in the same order as before.
submission_file['final_score'] = (
    submission_file['z_entropy_choice']
    + float0 * submission_file['z_cond_entropy']
    + float_1 * submission_file['z_cond_quiz_entropy']
    + float_2 * submission_file['z_entropy_right']
    - submission_file['z_confidence']
)

In [None]:
# rank(method='first') breaks ties by order of appearance in the data, so each
# question gets a distinct rank; ascending=False gives rank 1 to the highest
# final_score.
ranking = (
    submission_file['final_score']
    .rank(method='first', ascending=False)
    .astype('int16')
)
submission_file['ranking'] = ranking

# Write the (QuestionId, ranking) pairs in template order, read that file back,
# then write a second copy sorted from rank 1 downwards.
submission_file.loc[:, ['QuestionId','ranking']].to_csv(
    '../submissions/final_report.csv',
    index=False,
)
first_try = pd.read_csv('../submissions/final_report.csv')
first_try_zip = first_try.sort_values(by="ranking", ascending=True)
first_try_zip.to_csv(
    '../submissions/submission_task_3_report.csv',
    index=False,
)