# Data preparation

In [1]:
import pandas
from IPython.display import display

def read_data(file_name, data_dir='../data'):
    """Load one CSV table into a DataFrame.

    Parameters
    ----------
    file_name : str
        Base name of the CSV file, without the ``.csv`` extension.
    data_dir : str, optional
        Directory that contains the CSV files. Defaults to ``'../data'``,
        so existing one-argument calls read from the same location as before.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``<data_dir>/<file_name>.csv``.
    """
    return pandas.read_csv(f'{data_dir}/{file_name}.csv')

# Key columns used as the join key for the per-student merges in this notebook.
student_course_identifier = [
    "code_module",
    "code_presentation",
    "id_student",
]

# Load the base student table and show its first row as a quick sanity check.
student_info = read_data('studentInfo')
student_info.loc[0]

code_module                             AAA
code_presentation                     2013J
id_student                            11391
gender                                    M
region                  East Anglian Region
highest_education          HE Qualification
imd_band                            90-100%
age_band                               55<=
num_of_prev_attempts                      0
studied_credits                         240
disability                                N
final_result                           Pass
Name: 0, dtype: object

\pagebreak
## Merge 1
merge: studentInfo\
with: studentRegistration

In [2]:
# Merge 1: attach the studentRegistration columns to each studentInfo row.
# This is an inner join (the pandas default), so only key combinations that
# are present in both tables are kept.
student_registration = read_data("studentRegistration")
students_merged_step_1 = student_info.merge(
    student_registration,
    how="inner",
    on=student_course_identifier,
)
students_merged_step_1.loc[0]

code_module                             AAA
code_presentation                     2013J
id_student                            11391
gender                                    M
region                  East Anglian Region
highest_education          HE Qualification
imd_band                            90-100%
age_band                               55<=
num_of_prev_attempts                      0
studied_credits                         240
disability                                N
final_result                           Pass
date_registration                    -159.0
date_unregistration                     NaN
Name: 0, dtype: object

\pagebreak
## Merge 2
merge: studentInfo and studentRegistration\
with: courses

In [3]:
# Merge 2: add the course-level columns, joined on the module/presentation
# pair (inner join, the pandas default).
courses = read_data("courses")
students_merged_step_2 = students_merged_step_1.merge(
    courses,
    how="inner",
    on=['code_module', 'code_presentation'],
)
students_merged_step_2.loc[0]

code_module                                   AAA
code_presentation                           2013J
id_student                                  11391
gender                                          M
region                        East Anglian Region
highest_education                HE Qualification
imd_band                                  90-100%
age_band                                     55<=
num_of_prev_attempts                            0
studied_credits                               240
disability                                      N
final_result                                 Pass
date_registration                          -159.0
date_unregistration                           NaN
module_presentation_length                    268
Name: 0, dtype: object

\pagebreak
## Merge 3
merge: studentInfo, studentRegistration and courses\
with: vle and studentVle

In [4]:
# Prepare the VLE data so it can be merged onto the students later:
# join the two VLE tables on module, presentation and site.
vles = pandas.merge(
    read_data("vle"),
    read_data("studentVle"),
    on=['code_module', 'code_presentation', 'id_site'],
)

# Total clicks per student, site, day and activity type.
grouped_vles_per_day = (
    vles
    .groupby(student_course_identifier + ["id_site", "date", "activity_type"])
    .agg({"sum_click": "sum"})
    .reset_index()
)

# Pack each daily record into a single column value:
# [id_site, date, activity_type, sum_click].
grouped_vles_per_day["vles"] = (
    grouped_vles_per_day[["id_site", "date", "activity_type", "sum_click"]]
    .values
    .tolist()
)

# Collapse the separate daily rows into one row per student and course
# presentation; the builtin `list` gathers that group's packed records.
grouped_vles_per_student = (
    grouped_vles_per_day
    .groupby(student_course_identifier, as_index=False)
    .agg({"vles": list})
)

In [5]:
# Merge 3: put the per-student VLE lists next to the student/course columns.
# Inner join (the pandas default): a student with no row in
# grouped_vles_per_student does not appear in the result.
students_merged_step_3 = grouped_vles_per_student.merge(
    students_merged_step_2,
    how="inner",
    on=student_course_identifier,
)
students_merged_step_3.loc[0]

code_module                                                                 AAA
code_presentation                                                         2013J
id_student                                                                11391
vles                          [[546614, -5, homepage, 7], [546614, 0, homepa...
gender                                                                        M
region                                                      East Anglian Region
highest_education                                              HE Qualification
imd_band                                                                90-100%
age_band                                                                   55<=
num_of_prev_attempts                                                          0
studied_credits                                                             240
disability                                                                    N
final_result                            

\pagebreak
## Merge 4
merge: studentInfo, studentRegistration, courses, vle and studentVle\
with: assessments and studentAssessment

In [6]:
# Raw assessment tables ("_bm" presumably stands for "before merge" — TODO confirm).
assessments_bm = read_data("assessments")
student_assessments_bm = read_data("studentAssessment")

# Couple the assessment info to each student assessment record.
student_assessments = student_assessments_bm.merge(
    assessments_bm,
    on=['id_assessment'],
)

# Pack each record into a single column value:
# [id_assessment, date_submitted, score, assessment_type, date, weight].
student_assessments["assessments"] = (
    student_assessments[
        ["id_assessment", "date_submitted", "score", "assessment_type", "date", "weight"]
    ]
    .values
    .tolist()
)

# One row per student and course presentation, holding all of their records.
student_assessments_per_student = (
    student_assessments
    .groupby(student_course_identifier, as_index=False)
    .agg({"assessments": list})
)
student_assessments_per_student.loc[0]

code_module                                                        AAA
code_presentation                                                2013J
id_student                                                       11391
assessments          [[1752, 18, 78.0, TMA, 19.0, 10.0], [1753, 53,...
Name: 0, dtype: object

In [7]:
# Merge 4: put the per-student assessment lists next to everything merged so far.
# Inner join (the pandas default): a student with no row in
# student_assessments_per_student does not appear in the result.
students_merged_step_4 = student_assessments_per_student.merge(
    students_merged_step_3,
    how="inner",
    on=student_course_identifier,
)
students_merged_step_4.loc[0]

code_module                                                                 AAA
code_presentation                                                         2013J
id_student                                                                11391
assessments                   [[1752, 18, 78.0, TMA, 19.0, 10.0], [1753, 53,...
vles                          [[546614, -5, homepage, 7], [546614, 0, homepa...
gender                                                                        M
region                                                      East Anglian Region
highest_education                                              HE Qualification
imd_band                                                                90-100%
age_band                                                                   55<=
num_of_prev_attempts                                                          0
studied_credits                                                             240
disability                              

## Save DataFrame to CSV file and pickle file

In [11]:
# Persist the final merged table in two formats under ../data.
# The CSV is written with pandas' default options, so the row index is saved
# as an unnamed first column.
students_merged_step_4.to_csv(path_or_buf="../data/merged_data.csv")
# The pickle copy serializes the DataFrame object itself.
students_merged_step_4.to_pickle(path="../data/merged_data.pickle")