# Compile the PELIC Dataset
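This notebook compiles the PELIC dataset: it merges the answer, question, student-information, and course tables, keeps the columns needed for analysis, adds per-answer sentence statistics computed with spaCy, and writes the result to `../data/PELIC_compiled.csv`.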

In [1]:
# Import libraries
import pandas as pd
import spacy

In [2]:
# Load spaCy's small English pipeline once, up front, so later cells can reuse it.
SPACY_MODEL = 'en_core_web_sm'
nlp = spacy.load(SPACY_MODEL)

In [3]:
# Read the four corpus tables (CSV files) from the corpus_files directory
path = '../data/PELIC-dataset/corpus_files/'
question, answer, student_info, course = (
    pd.read_csv(f'{path}{table_name}.csv')
    for table_name in ('question', 'answer', 'student_information', 'course')
)

In [4]:
# Left-join everything onto the answers so every answer row is kept:
# questions via 'question_id', students via 'anon_id', courses via 'course_id'.
# 'stem' is renamed to 'question' and 'native_language' to 'L1' along the way.
merged_df = (
    answer
    .merge(question, on='question_id', how='left')
    .rename(columns={'stem': 'question'})
    .merge(student_info, on='anon_id', how='left')
    .rename(columns={'native_language': 'L1'})
    .merge(course, on='course_id', how='left')
)

In [5]:
# Question-type labels, listed in ID order: the first label belongs to ID 1, the last to ID 10
question_type_labels = (
    'Paragraph writing',
    'Short answer',
    'Multiple choice',
    'Essay',
    'Fill-in-the-blank',
    'Sentence completion',
    'Word bank',
    'Chart',
    'Word selection',
    'Audio recording',
)
question_type_mapping = dict(enumerate(question_type_labels, start=1))

# Add a 'question_type' column holding the label for each 'question_type_id';
# IDs that are not in the mapping (including missing IDs) come out as NaN
merged_df['question_type'] = merged_df['question_type_id'].map(question_type_mapping)

In [6]:
# Summarise the merged frame: column names, non-null counts and dtypes
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46204 entries, 0 to 46203
Data columns (total 38 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   answer_id                   46204 non-null  int64  
 1   question_id                 46204 non-null  int64  
 2   anon_id                     46204 non-null  object 
 3   course_id                   46204 non-null  int64  
 4   version                     46204 non-null  int64  
 5   created_date                46204 non-null  object 
 6   text_len                    46204 non-null  int64  
 7   text                        46201 non-null  object 
 8   tokens                      46204 non-null  object 
 9   tok_lem_POS                 46204 non-null  object 
 10  question_type_id            46075 non-null  float64
 11  question                    45778 non-null  object 
 12  allow_text                  46075 non-null  float64
 13  gender                      462

In [7]:
# Preview the first rows of the merged frame
merged_df.head()

Unnamed: 0,answer_id,question_id,anon_id,course_id,version,created_date,text_len,text,tokens,tok_lem_POS,...,ways_of_study_lang3,course_history,yrs_of_english_learning,yrs_in_english_environment,age,class_id,level_id,semester,section,question_type
0,1,5,eq0,149,1,2006-09-20 16:11:08,177,I met my friend Nife while I was studying in a...,"['I', 'met', 'my', 'friend', 'Nife', 'while', ...","[('I', 'I', 'PRP'), ('met', 'meet', 'VBD'), ('...",...,,1;7;13;19;25;42;53;64;75;86;101;113;125;149;16...,more than 5 years,less than 1 year,25.0,g,4,2006_fall,M,Paragraph writing
1,2,5,am8,149,1,2006-09-20 22:09:14,137,"Ten years ago, I met a women on the train betw...","['Ten', 'years', 'ago', ',', 'I', 'met', 'a', ...","[('Ten', 'ten', 'CD'), ('years', 'year', 'NNS'...",...,,19;31;32;33;35;47;58;65;91;106;118;149;182;208,less than 1 year,none,23.0,g,4,2006_fall,M,Paragraph writing
2,3,12,dk5,115,1,2006-09-21 10:16:17,63,In my country we usually don't use tea bags. F...,"['In', 'my', 'country', 'we', 'usually', 'do',...","[('In', 'in', 'IN'), ('my', 'my', 'PRP$'), ('c...",...,,103;115;127;151,more than 5 years,less than 1 year,27.0,w,4,2006_fall,Q,Paragraph writing
3,4,13,dk5,115,1,2006-09-21 10:16:17,6,I organized the instructions by time.,"['I', 'organized', 'the', 'instructions', 'by'...","[('I', 'I', 'PRP'), ('organized', 'organize', ...",...,,103;115;127;151,more than 5 years,less than 1 year,27.0,w,4,2006_fall,Q,Paragraph writing
4,5,12,ad1,115,1,2006-09-21 10:19:01,59,"First, prepare a port, loose tea, and cup.\nSe...","['First', ',', 'prepare', 'a', 'port', ',', 'l...","[('First', 'first', 'RB'), (',', ',', ','), ('...",...,,46;57;68;79;90;103;115;129;153;175;208,1-2 years,none,18.0,w,4,2006_fall,Q,Paragraph writing


In [8]:
# Columns to carry over from the merged frame
desired_columns = ['level_id','L1','question_type','question','text']
# Build the working dataframe in one pass: select the columns, give two of them
# shorter names, and store the answers with the pandas 'string' dtype so that
# the .str accessor methods used below behave consistently
df = (
    merged_df[desired_columns]
    .rename(columns={'level_id': 'level', 'text': 'answer'})
    .astype({'answer': 'string'})
)

In [9]:
# Check for answers that are empty or contain only whitespace.
# str.isspace() is False for the empty string, so strip each answer and compare
# it with '' instead; missing answers (NA) are not flagged by this check.
blank_answers = df['answer'].str.strip().eq('').fillna(False)
df.loc[blank_answers].index

Index([], dtype='int64')

In [10]:
# Drop every row that has a missing value in any of the selected columns
df = df.dropna(axis='index', how='any')

In [11]:
# Define a function that adds per-answer text statistics to the dataframe:
# the character length of the answer, its number of sentences,
# its average sentence length in tokens, and its total number of tokens
def sentence_length(df, pipeline=None):
    '''
    Return a copy of the dataframe with per-answer statistics added.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'answer' column whose values are strings.
    pipeline : optional
        Object with a spaCy-style ``pipe(texts)`` method that yields one
        document per text, each exposing ``sents`` (an iterable of sentences
        supporting ``len``). Defaults to the notebook-level ``nlp`` pipeline.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with four float64 columns added, in this order:
        'length' (characters in the answer), 'num_sentences',
        'avg_sentence_length' (tokens per sentence; 0.0 when the answer has
        no sentences) and 'total_tokens'.
    '''
    # Only fall back to the global pipeline when none was passed in
    if pipeline is None:
        pipeline = nlp
    # Work on a copy so the caller's dataframe is left untouched
    df = df.copy()
    texts = df['answer'].tolist()

    char_lengths = []
    sentence_counts = []
    sentence_averages = []
    token_totals = []

    # pipe() processes the texts as a stream and yields documents in input order
    for text, doc in zip(texts, pipeline.pipe(texts)):
        sentence_sizes = [len(sentence) for sentence in doc.sents]
        num_sentences = len(sentence_sizes)
        total_tokens = sum(sentence_sizes)

        char_lengths.append(len(text))
        sentence_counts.append(num_sentences)
        token_totals.append(total_tokens)
        # A document with no sentences would otherwise divide by zero
        sentence_averages.append(total_tokens / num_sentences if num_sentences else 0.0)

    new_columns = {
        'length': char_lengths,
        'num_sentences': sentence_counts,
        'avg_sentence_length': sentence_averages,
        'total_tokens': token_totals,
    }
    # Assign whole columns by position (safe even if index labels repeat) and
    # keep them as float64, the dtype these columns had when filled row by row
    for column, values in new_columns.items():
        df[column] = pd.Series(values, dtype='float64').to_numpy()

    return df

In [12]:
# Apply the function to the dataframe
# sentence_length runs the spaCy pipeline over every answer, so this is likely the most expensive cell in the notebook
df = sentence_length(df)

In [13]:
# Preview the first rows, now including the statistics columns added by sentence_length
df.head()

Unnamed: 0,level,L1,question_type,question,answer,length,num_sentences,avg_sentence_length,total_tokens
0,4,Arabic,Paragraph writing,Write a paragraph about a relatioship that is...,I met my friend Nife while I was studying in a...,923.0,12.0,16.083333,193.0
1,4,Thai,Paragraph writing,Write a paragraph about a relatioship that is...,"Ten years ago, I met a women on the train betw...",668.0,10.0,15.6,156.0
2,4,Turkish,Paragraph writing,"In five sentences or less, give instructions o...",In my country we usually don't use tea bags. F...,278.0,5.0,14.4,72.0
3,4,Turkish,Paragraph writing,"How do you organize the instructions: by time,...",I organized the instructions by time.,37.0,1.0,7.0,7.0
4,4,Korean,Paragraph writing,"In five sentences or less, give instructions o...","First, prepare a port, loose tea, and cup. Sec...",290.0,5.0,15.6,78.0


In [17]:
# Take a look at the first answer to make sure that its number of sentences, tokens, and length look correct.
# Use a positional lookup: a label lookup with 0 fails if the row labelled 0 was dropped earlier.
df['answer'].iloc[0]

'I met my friend Nife while I was studying in a middle school. I was happy when I met him because he was a good student in our school. We continued the middle and high school to gather in the same school. We were studying in the different classes in the middle school; however, in the high school we were studying in the same class. We went to many places in the free time while we were studying in the high school. When we finished from the high school, I went to K.S University and he went to I.M University. While we were enjoying in academic life, we made many achievement in these universities. I graduated when Nife was studying in the last semester in the university. After that, I got a job. Fortunately, it was nearby my home. I worked two years then I got scholarship from ministry of high education in my country. When I came here to U.S, my friend Nife arrange some documents to study at grad school in Malaysia.'

In [14]:
# Check the final row count, dtypes, and that no column has missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45775 entries, 0 to 46203
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   level                45775 non-null  int64  
 1   L1                   45775 non-null  object 
 2   question_type        45775 non-null  object 
 3   question             45775 non-null  object 
 4   answer               45775 non-null  string 
 5   length               45775 non-null  float64
 6   num_sentences        45775 non-null  float64
 7   avg_sentence_length  45775 non-null  float64
 8   total_tokens         45775 non-null  float64
dtypes: float64(4), int64(1), object(3), string(1)
memory usage: 4.5+ MB


In [16]:
# Write the compiled dataframe to disk; the row index is written as the first, unnamed column
df.to_csv('../data/PELIC_compiled.csv', index=True)