# Student Data Anonymization

Student data at Birkbeck College is stored in a format that includes students' identifying information. This script anonymizes individual BSIS module data sheets so that they can be exported outside the university. Anonymization entails stripping identifying student information and assigning a random unique code to each student. A dictionary maps each random code to the student's Birkbeck student code; this dictionary is saved as a csv file (which can be opened in Excel) and kept at Birkbeck.

In [None]:
# import standard python libraries and custom script package written by KV
import csv
import os
import shutil

import pandas as pd

import LLB_custom_scripts

In [None]:
# Paths below must be specified by the user before running the
# remaining cells.
#
# Every path is joined to a file name by plain string concatenation
# further down (e.g. input_path + name), so each one must end with a
# path separator on every operating system.
#
# If using Windows operating system, the user must:
#
# 1) Use double quotes around the path names
# 2) Use "/" instead of "\"
# 3) Put a "/" at the end of the path

# Directory listed by the conversion loop; holds the source data files.
input_path = "path where xls files are located"
# Directory handed to csv_from_xls and later listed by the anonymizer.
csv_path = "path for csv files to be saved"
# Directory that receives 'SPR Code Dictionary.csv'.
SPR_dict_path = "path for SPR code dictionary to be saved"
# Directory that receives one csv per anonymized module dataframe.
anon_module_csv_path = "path for anonymized modules to be saved"
# Directory that receives one csv per anonymized progression dataframe.
anon_progression_csv_path = "path for anonymized prog. files to be saved"

# This and the next block use methods from the LLB_custom_scripts
# package KV wrote and which was imported above.
#
# This block gathers every data file as a csv in 'csv_path', since csv
# files are easier to process in Python.
#
# For each file in the directory given by 'input_path':
#   - xls/xlsx files are passed to LLB_custom_scripts.csv_from_xls
#     together with 'csv_path';
#   - files that are already csv are copied to 'csv_path' unchanged;
#   - anything else is reported and skipped.

for name in os.listdir(input_path):
    if name == '.DS_Store':
        # macOS Finder metadata, not a data file.
        continue
    # Compare the extension case-insensitively so 'FILE.XLS' is handled too.
    extension = name.rsplit('.', 1)[-1].lower()
    # A membership test is required here: "ext == 'xlsx' or 'xls'" is
    # always true because the non-empty string 'xls' is truthy.
    if extension in ('xlsx', 'xls'):
        LLB_custom_scripts.csv_from_xls(input_path + name, csv_path)
    elif extension == 'csv':
        try:
            shutil.copy(input_path + name, csv_path + name)
        except shutil.SameFileError:
            # input_path and csv_path are the same directory, so the
            # csv is already where it needs to be.
            pass
    else:
        print('skipping unsupported file: ' + name)

In [None]:
# This block anonymizes csv files and collects the results so they can
# be saved to separate directories (folders) for module files and
# progression files.
#
# These folders should be compressed and sent to KV for further
# processing.
#
# The keys to the anonymized SPR codes are stored in global_SPR_dict.
#
# Each file's SPR code dictionary is stored temporarily
# in order to add it to the global SPR code dictionary.
#
# Each anonymized module and progression file's df is stored
# in a df dictionary with the file name (minus extension) as key.
#
# File names are classified as follows:
#   - second '_'-separated field equals 'PROGRESSION' -> progression file
#   - name starts with '2'                            -> module file
#   - anything else is reported and skipped.

global_SPR_dict = {}
tmp_module_SPR_dict = {}
tmp_prog_SPR_dict = {}
anon_module_dict = {}
anon_prog_dict = {}

for filename in os.listdir(csv_path):
    if filename == '.DS_Store':
        # macOS Finder metadata, not a data file.
        continue

    name = filename.split('.')[0]
    name_fields = name.split('_')

    # Check the field count first: indexing [1] on a name without an
    # underscore would raise IndexError instead of reaching the
    # error report below.
    if len(name_fields) > 1 and name_fields[1] == 'PROGRESSION':
        # Run progression anonymizer.
        print('progfile: ' + filename)
        tmp_prog_SPR_dict, new_df = \
            LLB_custom_scripts.anonymize_progression(
                csv_path + filename, global_SPR_dict)
        global_SPR_dict.update(tmp_prog_SPR_dict)
        anon_prog_dict[name] = new_df
    elif name.startswith('2'):  # Module file
        # Run module anonymizer.
        print('modfile: ' + filename)
        tmp_module_SPR_dict, new_df = \
            LLB_custom_scripts.anonymize_module(
                csv_path + filename, global_SPR_dict)
        global_SPR_dict.update(tmp_module_SPR_dict)
        anon_module_dict[name] = new_df
    else:
        print('error in processing ' + filename)
            

In [None]:
# The global SPR code dictionary is exported to a csv file
# found at SPR_dict_path, one "key,value" row per dictionary entry.
#
# newline='' lets the csv module control line endings itself; without
# it, every row is followed by an empty row when run on Windows.
with open(
        SPR_dict_path + 'SPR Code Dictionary.csv', 'w',
        newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerows(global_SPR_dict.items())

In [None]:
# Write every anonymized dataframe to '<name>.csv' so that other users
# can import it if necessary: module frames are written first, into
# anon_module_csv_path, then progression frames, into
# anon_progression_csv_path.

for out_dir, frames in (
        (anon_module_csv_path, anon_module_dict),
        (anon_progression_csv_path, anon_prog_dict)):
    for dfname, df in frames.items():
        target = out_dir + f'{dfname}' + '.csv'
        df.to_csv(target)