In [14]:
import pandas as pd
import numpy as np

# --- Load the raw tables from the working directory ---
assessments = pd.read_csv("assessments.csv")
student_info = pd.read_csv("studentInfo.csv")
student_assessment = pd.read_csv("studentAssessment.csv")
student_reg = pd.read_csv("studentRegistration.csv")
student_vle = pd.read_csv("studentVle.csv")

# Columns that together identify one student's enrolment in one module
# presentation; used both for the VLE aggregation and for every join below.
merge_keys = ['code_module', 'code_presentation', 'id_student']

# Collapse the VLE click log to one row per enrolment:
#   total_clicks -> sum of `sum_click`
#   n_activities -> number of distinct `id_site` values accessed
vle_agg = (
    student_vle
    .groupby(merge_keys)
    .agg(
        total_clicks=pd.NamedAgg(column='sum_click', aggfunc='sum'),
        n_activities=pd.NamedAgg(column='id_site', aggfunc='nunique'),
    )
    .reset_index()
)

# Attach the module/presentation of each assessment to the submissions so
# that student_assessment carries the full set of merge keys.
assessment_keys = assessments[['id_assessment', 'code_module', 'code_presentation']]
student_assessment = student_assessment.merge(
    assessment_keys, on='id_assessment', how='left'
)

# Left joins starting from student_info, so every student_info row is kept
# even when it has no matching assessment, registration or VLE rows.
df = (
    student_info
    .merge(student_assessment, on=merge_keys, how='left')
    .merge(student_reg, on=merge_keys, how='left')
    .merge(vle_agg, on=merge_keys, how='left')
)

# Sanity checks on `df`: look for scores that contradict the student's final
# result, then drill into the assessment that looks suspicious.

# Columns shown for each suspicious (student, assessment) row.
anomaly_cols = ['id_student', 'id_assessment', 'score', 'final_result', 'code_module']
# Assessment inspected in detail by the checks further down.
suspect_assessment = 15020

# Checks Distinction students with scores < 10
is_distinction = df['final_result'] == 'Distinction'
print(df.loc[is_distinction & (df['score'] < 10), anomaly_cols])

# Checks Fail students with scores > 90
is_fail = df['final_result'] == 'Fail'
print(df.loc[is_fail & (df['score'] > 90), anomaly_cols])

# Checks the suspect assessment's metadata
print(assessments.loc[assessments['id_assessment'] == suspect_assessment,
                      ['id_assessment', 'assessment_type', 'weight', 'code_module']])

# Checks if other assessments in BBB show similar anomalies
bbb_assessments = assessments[assessments['code_module'] == 'BBB']
print(bbb_assessments[['id_assessment', 'assessment_type', 'weight']])

# Checks if students with 1.0 in the suspect assessment did well in OTHER
# assessments. The suspect assessment's own rows are excluded from the
# summary; otherwise its 1.0 score would be counted among the "other" scores
# and pull down min/mean for every student listed. A student with no rows
# besides the suspect assessment will therefore not appear in the summary.
is_suspect = df['id_assessment'] == suspect_assessment
student_ids = df.loc[is_suspect & (df['score'] == 1.0), 'id_student'].unique()
other_scores = df[df['id_student'].isin(student_ids) & ~is_suspect]
print(other_scores.groupby('id_student')['score'].describe())

        id_student  id_assessment  score final_result code_module
36040       622505        15016.0    0.0  Distinction         BBB
40228        89850        15020.0    1.0  Distinction         BBB
40339       150398        15020.0    1.0  Distinction         BBB
40354       155484        15020.0    1.0  Distinction         BBB
40573       280152        15020.0    1.0  Distinction         BBB
...            ...            ...    ...          ...         ...
87077        84576        25358.0    0.0  Distinction         DDD
95011       603303        25365.0    6.0  Distinction         DDD
171890      521644        37426.0    0.0  Distinction         GGG
172368      598017        37427.0    0.0  Distinction         GGG
179784      688038        37442.0    0.0  Distinction         GGG

[82 rows x 5 columns]
        id_student  id_assessment  score final_result code_module
3194         23629        14991.0  100.0         Fail         BBB
3195         23629        14992.0  100.0         Fail