# Compile and clean the ASAG dataset
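This notebook compiles and cleans the CEFR-ASAG dataset: it reads the labelled XML files into a pandas DataFrame, converts the CEFR levels to integers, removes very short answers and the lowest and highest levels, and saves the result as a CSV file.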

In [1]:
# Import libraries
import pandas as pd
from bs4 import BeautifulSoup as bs
import os
import spacy

In [2]:
# Relative path of the directory that holds the labelled XML files read by create_cefr_df()
path = '../data/cefr-asag-dataset-1.0.1/corpus/release-1.0/labelled/'

In [3]:
# Load the spaCy "en_core_web_sm" pipeline; num_sentences() uses it to split each answer into sentences
nlp = spacy.load("en_core_web_sm")

In [4]:
# Create a function that reads the CEFR data from the XML files into a Pandas DataFrame
def create_cefr_df(path):
    '''
    Read the CEFR data from the XML files in a directory
    into a Pandas DataFrame.

    Every file directly inside ``path`` whose name ends in ``.xml`` becomes
    one row. A field whose XML element is absent from a file is stored as
    None. Files are processed in sorted name order so the row order does not
    depend on the file system.

    Parameters
    ----------
    path : str
        Directory containing the XML files.

    Returns
    -------
    pandas.DataFrame
        One row per XML file, with the columns listed in ``df_columns``
        below. The columns are present even when no XML file is found.
    '''

    # Define DataFrame columns (this also fixes the column order of the result)
    df_columns = ['file_name', 'age_participant','sex_participant', 'education', 'L1', 'sex_examiner1', 'sex_examiner2', 'sex_examiner3', 'setting', 'question', 'word_limit', 'level', 'answer', 'grade_examiner1', 'grade_examiner2', 'grade_examiner3', 'grade_majority_vote']

    def attr_of(tag, name):
        '''Return attribute ``name`` of ``tag``, or None when the tag is missing.'''
        return tag.get(name) if tag is not None else None

    def text_of(tag, child=None):
        '''Return the text of ``tag`` (or of its first ``child`` descendant), or None when either is missing.'''
        if tag is not None and child is not None:
            tag = tag.find(child)
        return tag.text if tag is not None else None

    # Create an empty list to store one dictionary per file
    all_data = []

    # Loop through XML files in the directory; os.listdir() returns names in
    # arbitrary order, so sort them to keep the result reproducible
    for filename in sorted(os.listdir(path)):
        if not filename.endswith('.xml'):
            continue
        file_path = os.path.join(path, filename)

        # Read the raw bytes so that the XML parser decodes the document
        # instead of the platform's default text encoding
        with open(file_path, 'rb') as file:
            contents = file.read()

        # Parse XML
        soup = bs(contents, 'xml')

        # The participant element carries both the age and the sex attribute
        participant = soup.find('person', {'role': 'participant'})

        # Extract data from XML and append it to the list as a dictionary
        all_data.append({
            'file_name': filename,
            'age_participant': attr_of(participant, 'age'),
            'sex_participant': attr_of(participant, 'sex'),
            'education': attr_of(soup.find('education'), 'type'),
            'L1': attr_of(soup.find('langKnown'), 'tag'),
            'sex_examiner1': attr_of(soup.find('person', {'xml:id': 'examiner.1'}), 'sex'),
            'sex_examiner2': attr_of(soup.find('person', {'xml:id': 'examiner.2'}), 'sex'),
            'sex_examiner3': attr_of(soup.find('person', {'xml:id': 'examiner.3'}), 'sex'),
            'setting': text_of(soup.find('settingDesc'), 'p'),
            'question': text_of(soup.find('div', {'type': 'question'}), 'p'),
            'word_limit': text_of(soup.find('note', {'type': 'word-limit'})),
            'level': text_of(soup.find('label', {'type': 'level'}), 'span'),
            'answer': text_of(soup.find('div', {'type': 'answer'}), 'p'),
            'grade_examiner1': text_of(soup.find('label', {'corresp': '#examiner.1'}), 'span'),
            'grade_examiner2': text_of(soup.find('label', {'corresp': '#examiner.2'}), 'span'),
            'grade_examiner3': text_of(soup.find('label', {'corresp': '#examiner.3'}), 'span'),
            'grade_majority_vote': text_of(soup.find('label', {'subtype': 'majority-vote'}), 'span'),
        })

    # Create DataFrame from the list of dictionaries; passing the column list
    # keeps the schema stable even when the list is empty
    df_all = pd.DataFrame(all_data, columns=df_columns)

    return df_all

In [5]:
# Call the function and display the resulting DataFrame
# (one row per XML file found in `path`; head() shows the first rows only)
df = create_cefr_df(path)
df.head()

Unnamed: 0,file_name,age_participant,sex_participant,education,L1,sex_examiner1,sex_examiner2,sex_examiner3,setting,question,word_limit,level,answer,grade_examiner1,grade_examiner2,grade_examiner3,grade_majority_vote
0,0001.xml,18,M,higher-secondary,fr,F,F,F,collected in a university-level language learn...,What are your daily habits? What time do you g...,(at least 30 words),A1,everyday i get up at 8 a clock. I always turn ...,A1,A2,A2,A2
1,0002.xml,19,F,higher-secondary,fr,F,F,F,collected in a university-level language learn...,Describe your family.,(at least 30 words),A1,My family is very small. I have a big borther....,A1,A1,A1,A1
2,0003.xml,22,F,lower-secondary,fr,F,F,F,collected in a university-level language learn...,Describe your family.,(at least 30 words),A1,My name is {name},A1,A1,A1,A1
3,0004.xml,21,F,higher-secondary,fr,F,F,F,collected in a university-level language learn...,Describe your hobbies.,(at least 30 words),A1,"Hi my name is {name},",A2,A2,A2,A2
4,0005.xml,18,F,higher-secondary,fr,F,F,F,collected in a university-level language learn...,Describe your family.,(at least 30 words),A1,"I have one sister, she is married and she has ...",A2,A1,A2,A2


In [6]:
# Inspect the columns, non-null counts and dtypes of the freshly parsed DataFrame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   file_name            299 non-null    object
 1   age_participant      299 non-null    object
 2   sex_participant      299 non-null    object
 3   education            299 non-null    object
 4   L1                   299 non-null    object
 5   sex_examiner1        299 non-null    object
 6   sex_examiner2        299 non-null    object
 7   sex_examiner3        299 non-null    object
 8   setting              299 non-null    object
 9   question             299 non-null    object
 10  word_limit           299 non-null    object
 11  level                299 non-null    object
 12  answer               299 non-null    object
 13  grade_examiner1      299 non-null    object
 14  grade_examiner2      299 non-null    object
 15  grade_examiner3      299 non-null    object
 16  grade_ma

In [7]:
# Replace CEFR levels with integers.
replacement_dict = {
    'A1': 1,
    'A2': 2,
    'B1': 3,
    'B2': 4,
    'C1': 5,
    'C2': 6
}

# Only these columns hold CEFR labels. Replacing across the whole DataFrame
# would also rewrite any free-text cell (for example an answer) whose entire
# content happens to equal one of the level codes.
level_columns = ['level', 'grade_examiner1', 'grade_examiner2', 'grade_examiner3', 'grade_majority_vote']

# Perform the string replacements column by column. infer_objects() converts a
# fully replaced column to an integer dtype explicitly instead of relying on
# replace() to downcast object columns implicitly.
for level_column in level_columns:
    df[level_column] = df[level_column].replace(replacement_dict).infer_objects()

In [8]:
# Define a mapping dictionary for language codes to language names
language_mapping = {
    'fr': 'French',
    'it': 'Italian',
    'ru': 'Russian',
    'es': 'Spanish',
    'sw': 'Swahili',
    'ar': 'Arabic',
    'kab': 'Kabyle',
    'fa': 'Persian',
    'nl': 'Dutch',
    'de': 'German',
    'bg': 'Bulgarian'
}

# The 'L1' column contains the language codes; replace them with language names.
# Values without an entry in the mapping are kept as they are instead of being
# turned into NaN, so an unknown code stays visible and re-running this cell
# does not wipe out names that were already translated.
df['L1'] = df['L1'].map(language_mapping).fillna(df['L1'])

In [9]:
# Keep only the columns needed downstream
columns = ['L1','question','answer','grade_majority_vote']
# Build the reduced DataFrame in one chain: every step returns a new object, so
# nothing is written into a slice of the original frame (which would raise
# SettingWithCopyWarning).
df = (
    df[columns]
    # Rename the column 'grade_majority_vote' to 'level'
    .rename(columns={'grade_majority_vote': 'level'})
    # To maintain consistency with PELIC, add a question_type column
    .assign(question_type='Paragraph writing')
)

In [10]:
# Define a function to add the number of sentences per text
def num_sentences(df):
    '''
    Return a copy of ``df`` with an added ``num_sentences`` column.

    Each value in the ``answer`` column is processed with the module-level
    spaCy pipeline ``nlp`` and the number of sentences it yields is stored
    in ``num_sentences`` (dtype float64). The input DataFrame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame with an ``answer`` column of texts.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the extra ``num_sentences`` column. The column is
        added even when ``df`` has no rows.
    '''
    # Work on a copy so the caller's DataFrame is left untouched
    result = df.copy()
    # Count the sentences spaCy finds in each answer, in row order
    sentence_counts = [
        sum(1 for _ in nlp(answer_text).sents)
        for answer_text in result['answer']
    ]
    # Assign by position rather than by index label, so rows that share an
    # index label each keep their own count; float64 is used explicitly so the
    # dtype is the same for empty and non-empty frames
    result['num_sentences'] = pd.Series(sentence_counts, dtype='float64').to_numpy()
    return result

In [11]:
# Apply the function that adds a num_sentences column (sentence count of each answer) to the df
df = num_sentences(df)

In [12]:
# Drop every answer whose sentence count is exactly 1 or 2
df = df[~df.num_sentences.isin([1, 2])]

In [13]:
# Check to make sure that there are no answers with only 1 or 2 sentences left
df.num_sentences.unique()

array([ 9.,  4.,  5.,  3.,  6.,  7., 11.,  8., 15., 10., 13., 14., 16.,
       20., 18., 12.])

In [14]:
# Keep only the rows whose level is neither 1 nor 6
df = df.loc[df.level.ne(1) & df.level.ne(6)]

In [15]:
# Inspect the remaining rows, columns and dtypes after filtering
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 172 entries, 0 to 298
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   L1             172 non-null    object 
 1   question       172 non-null    object 
 2   answer         172 non-null    object 
 3   level          172 non-null    int64  
 4   question_type  172 non-null    object 
 5   num_sentences  172 non-null    float64
dtypes: float64(1), int64(1), object(4)
memory usage: 9.4+ KB


In [16]:
# Show how many answers remain for each level
df.level.value_counts()

level
3    71
2    50
4    38
5    13
Name: count, dtype: int64

In [17]:
# Save the df in a csv to be augmented in another notebook
# NOTE(review): to_csv writes the index as an unnamed first column by default, and the
# index is no longer contiguous after the filters above — confirm the other notebook expects this
df.to_csv('../data/ASAG_cleaned.csv')