# Clean the ASAG Dataset

Since the data was scraped cleanly from XML files, little cleaning is needed. However, to remain consistent with the PELIC dataset, we keep only the answers that contain at least one subject and one verb; all other rows are dropped.

In [1]:
# import libraries
import pandas as pd
import spacy

In [2]:
# Load the spaCy pipeline "en_core_web_sm"; the filter function defined later
# reads each token's dependency label (dep_) and part-of-speech tag (pos_) from it
nlp = spacy.load("en_core_web_sm")

In [3]:
# Read the compiled ASAG data and discard the 'Unnamed: 0' column
df = pd.read_csv('../data/ASAG_compiled.csv').drop(columns=['Unnamed: 0'])

In [4]:
# Summarise the columns, dtypes and non-null counts of the loaded data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   file_name        299 non-null    object 
 1   age_participant  299 non-null    int64  
 2   sex_participant  299 non-null    object 
 3   education        299 non-null    object 
 4   L1               299 non-null    object 
 5   sex_examiner1    299 non-null    object 
 6   sex_examiner2    299 non-null    object 
 7   sex_examiner3    299 non-null    object 
 8   setting          299 non-null    object 
 9   question         299 non-null    object 
 10  word_limit       299 non-null    object 
 11  level_course     299 non-null    int64  
 12  answer           299 non-null    object 
 13  grade_examiner1  299 non-null    int64  
 14  grade_examiner2  297 non-null    float64
 15  grade_examiner3  299 non-null    int64  
 16  level            299 non-null    int64  
 17  question_type   

In [5]:
# Define a function to filter the data for answers that contain at least one subject and verb
def contains_subject_and_verb(text):
    '''
    Report whether a text has both a subject and a verb.

    The text is parsed with the module-level spaCy pipeline ``nlp``.
    Only exact label matches count: a token whose dependency label is
    "nsubj" is a subject, and a token whose part-of-speech tag is
    "VERB" is a verb.

    Returns True when at least one of each is found, otherwise False.
    '''
    parsed = nlp(text)
    # Without a subject the answer is already known, so skip the verb scan
    if not any(tok.dep_ == "nsubj" for tok in parsed):
        return False
    # A subject exists; the result now depends only on finding a verb
    return any(tok.pos_ == "VERB" for tok in parsed)

def filter_rows_with_subject_and_verb(df, column='answer'):
    '''
    Keep only the rows whose text passes contains_subject_and_verb.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text to check. It is not modified.
    column : str, default 'answer'
        Name of the column whose values are passed, one at a time,
        to contains_subject_and_verb.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` for which the check returned True, with their
        original index labels and all columns.
    '''
    # Evaluate the check on every value of the chosen column
    mask = df[column].apply(contains_subject_and_verb)
    # On an empty frame `apply` yields an object-dtype Series, which pandas
    # would treat as a list of column labels rather than a row mask (dropping
    # every column). Casting to bool keeps it a row mask in all cases.
    mask = mask.astype(bool)
    # Boolean indexing keeps only the rows where the check succeeded
    return df[mask]

In [6]:
# Keep only the rows whose answer passes the subject-and-verb check
# (df is reassigned, so it holds the filtered data from this cell onward)
df = filter_rows_with_subject_and_verb(df)

In [7]:
# Re-inspect the frame to see how many rows remain after filtering
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 268 entries, 0 to 298
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   file_name        268 non-null    object 
 1   age_participant  268 non-null    int64  
 2   sex_participant  268 non-null    object 
 3   education        268 non-null    object 
 4   L1               268 non-null    object 
 5   sex_examiner1    268 non-null    object 
 6   sex_examiner2    268 non-null    object 
 7   sex_examiner3    268 non-null    object 
 8   setting          268 non-null    object 
 9   question         268 non-null    object 
 10  word_limit       268 non-null    object 
 11  level_course     268 non-null    int64  
 12  answer           268 non-null    object 
 13  grade_examiner1  268 non-null    int64  
 14  grade_examiner2  267 non-null    float64
 15  grade_examiner3  268 non-null    int64  
 16  level            268 non-null    int64  
 17  question_type    268 

In [8]:
# Save the filtered data.
# NOTE(review): `index=False` is not passed, so pandas also writes the row index as an
# unnamed first column — confirm that downstream readers expect (or drop) that column.
df.to_csv('../data/ASAG_cleaned.csv')