# Compile the PELIC Dataset

In [1]:
# import libraries
import pandas as pd

In [2]:
# Directory (relative to this notebook) that holds the PELIC corpus CSV files
path = '../data/PELIC-dataset/corpus_files/'

# Load each corpus table into its own DataFrame; the names on the left line up
# one-to-one, in order, with the file names listed below.
question, answer, student_info, course, scores = (
    pd.read_csv(path + csv_name)
    for csv_name in (
        'question.csv',
        'answer.csv',
        'student_information.csv',
        'course.csv',
        'test_scores.csv',
    )
)

In [3]:
# Start from the answers and left-join the question, student, course, and
# test-score tables onto them (keys: 'question_id', 'anon_id', 'course_id').
# Left joins keep every answer row; unmatched rows get NaN in the joined columns.
# NOTE(review): a left join repeats the left row once per matching right row, so
# if any right-hand table holds several rows per key (e.g. more than one `scores`
# row per 'anon_id') the result has more rows than `answer` — confirm intended.
merged_df = (
    answer
    .merge(question, how='left', on='question_id')
    .merge(student_info, how='left', on='anon_id')
    .merge(course, how='left', on='course_id')
    .merge(scores, how='left', on='anon_id')
)

In [4]:
# Align column names with the naming used by the other data sets.
# All four renames are applied in a single in-place call.
merged_df.rename(
    columns={
        'stem': 'question',
        'text': 'answer',
        'native_language': 'L1',
        'level_id': 'level',
    },
    inplace=True,
)

In [5]:
# Label for each numeric question type id; ids are consecutive and start at 1,
# so the position in this list (counting from 1) is the id.
question_type_mapping = dict(enumerate(
    [
        'Paragraph writing',
        'Short answer',
        'Multiple choice',
        'Essay',
        'Fill-in-the-blank',
        'Sentence completion',
        'Word bank',
        'Chart',
        'Word selection',
        'Audio recording',
    ],
    start=1,
))

# Translate each 'question_type_id' into its label via the mapping dictionary,
# then store the labels as the new 'question_type' column. Ids that are missing
# or not in the mapping come out as NaN.
question_type_labels = merged_df['question_type_id'].map(question_type_mapping)
merged_df['question_type'] = question_type_labels

In [6]:
# Summarize the merged table: list every column with its non-null count and dtype,
# which makes columns containing missing values easy to spot.
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47667 entries, 0 to 47666
Data columns (total 47 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   answer_id                   47667 non-null  int64  
 1   question_id                 47667 non-null  int64  
 2   anon_id                     47667 non-null  object 
 3   course_id                   47667 non-null  int64  
 4   version                     47667 non-null  int64  
 5   created_date                47667 non-null  object 
 6   text_len                    47667 non-null  int64  
 7   answer                      47664 non-null  object 
 8   tokens                      47667 non-null  object 
 9   tok_lem_POS                 47667 non-null  object 
 10  question_type_id            47538 non-null  float64
 11  question                    47241 non-null  object 
 12  allow_text                  47538 non-null  float64
 13  gender                      476

In [7]:
# Write the compiled table to CSV.
# The row index is written as well (pandas' default, spelled out here), so the
# file starts with an unnamed index column.
# NOTE(review): switch to index=False if readers of this file do not expect that
# extra column — confirm against the code that loads it.
merged_df.to_csv('../data/PELIC_compiled.csv', index=True)