In [4]:
import pandas as pd
import numpy as np

# Load each raw table; the file names are resolved relative to the working directory.
assessments = pd.read_csv("assessments.csv")
student_info = pd.read_csv("studentInfo.csv")
student_assessment = pd.read_csv("studentAssessment.csv")
student_reg = pd.read_csv("studentRegistration.csv")
student_vle = pd.read_csv("studentVle.csv")

# Collapse the VLE log to one row per (code_module, code_presentation, id_student).
vle_agg = (
    student_vle
    .groupby(['code_module', 'code_presentation', 'id_student'])
    .agg(
        # Sum of sum_click: total number of clicks the student made in the VLE.
        total_clicks=pd.NamedAgg(column='sum_click', aggfunc='sum'),
        # Distinct id_site values: how many different activities the student accessed.
        n_activities=pd.NamedAgg(column='id_site', aggfunc='nunique'),
    )
    .reset_index()  # turn the group keys back into ordinary columns
)

# Attach code_module / code_presentation to every assessment record.
# A left join keeps each student_assessment row even if its id_assessment
# has no match in assessments.
student_assessment = student_assessment.merge(
    assessments.loc[:, ['id_assessment', 'code_module', 'code_presentation']],
    how='left',
    on='id_assessment',
)

# Key shared by the student-level tables.
merge_keys = ['code_module', 'code_presentation', 'id_student']

# Left-join everything onto student_info so that every student_info row
# survives, even when it has no assessment, registration or VLE rows.
# The result holds one row per matching student_assessment record.
df = (
    student_info
    .merge(student_assessment, how='left', on=merge_keys)
    .merge(student_reg, how='left', on=merge_keys)
    .merge(vle_agg, how='left', on=merge_keys)
)

# Boolean mask over df: True for every row whose merge key occurs more than
# once. keep=False flags all occurrences, the first one included.
duplicates = df.duplicated(subset=merge_keys, keep=False)

print("Number of duplicates:", duplicates.sum())
# Sorting by the key places rows that share a key next to each other.
print("Duplicate examples:\n", df.loc[duplicates].sort_values(by=merge_keys))

# Count distinct assessments per merge key and keep only the keys with more
# than one. These are the students whose repeated rows may simply come from
# having several assessment records.
dup_assessments = (
    df.groupby(merge_keys)['id_assessment']
    .nunique()
    .reset_index()
    .loc[lambda counts: counts['id_assessment'] > 1]
)

print(f"Students with multiple assessments: {len(dup_assessments)}")
print(dup_assessments.head())

# Rows that match another row in every column (no subset given), with all
# occurrences kept rather than only the repeats.
true_duplicates = df.loc[df.duplicated(keep=False)]
print(f"Fully identical duplicates: {len(true_duplicates)}")

# Pull every row for one student id so the repeated records can be compared
# side by side on their assessment-level columns.
sample_student = df.loc[df['id_student'].eq(11391)]
print(sample_student.loc[:, ['id_assessment', 'score', 'date_submitted']])

# Preview the first rows whose id_assessment is missing, i.e. students
# without assessment data after the merges.
print(df.loc[df['id_assessment'].isnull()].head())

Number of duplicates: 171392
Duplicate examples:
        code_module code_presentation  id_student gender               region  \
0              AAA             2013J       11391      M  East Anglian Region   
1              AAA             2013J       11391      M  East Anglian Region   
2              AAA             2013J       11391      M  East Anglian Region   
3              AAA             2013J       11391      M  East Anglian Region   
4              AAA             2013J       11391      M  East Anglian Region   
...            ...               ...         ...    ...                  ...   
180657         GGG             2014J     2684003      F     Yorkshire Region   
180658         GGG             2014J     2684003      F     Yorkshire Region   
180659         GGG             2014J     2684003      F     Yorkshire Region   
180660         GGG             2014J     2684003      F     Yorkshire Region   
180661         GGG             2014J     2684003      F     Yorkshire 