# Compile the PELIC and ASAG datasets
This notebook compiles the PELIC and ASAG datasets and adds columns for token tags and token dependencies.

In [1]:
import pandas as pd
import spacy
import re

In [2]:
# Load spaCy's small English pipeline once; `nlp` is called on each answer
# by the extraction helpers defined further down.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Read each cleaned corpus and discard its 'Unnamed: 0' column
# (presumably an index saved by an earlier to_csv call — confirm).
pelic = pd.read_csv('../data/PELIC_sub_verb_cleaned.csv').drop(columns=['Unnamed: 0'])
asag = pd.read_csv('../data/ASAG_sub_verb_cleaned.csv').drop(columns=['Unnamed: 0'])

# Stack PELIC on top of ASAG and renumber the rows 0..n-1.
df = pd.concat([pelic, asag], ignore_index=True)

In [4]:
# Preview the first rows of the combined PELIC + ASAG frame.
df.head()

Unnamed: 0,level,L1,question_type,question,answer,length,num_sentences,avg_sentence_length,total_tokens,dataset
0,4,Arabic,Paragraph writing,Write a paragraph about a relatioship that is...,I met my friend Nife while I was studying in a...,923.0,12.0,16.083333,193.0,PELIC
1,4,Thai,Paragraph writing,Write a paragraph about a relatioship that is...,"Ten years ago, I met a women on the train betw...",668.0,10.0,15.6,156.0,PELIC
2,4,Turkish,Paragraph writing,"In five sentences or less, give instructions o...",In my country we usually don't use tea bags. F...,278.0,5.0,14.4,72.0,PELIC
3,4,Turkish,Paragraph writing,"How do you organize the instructions: by time,...",I organized the instructions by time.,37.0,1.0,7.0,7.0,PELIC
4,4,Korean,Paragraph writing,"In five sentences or less, give instructions o...","First, prepare a port, loose tea, and cup.\nSe...",290.0,5.0,15.6,78.0,PELIC


In [5]:
# Summarise column dtypes and non-null counts of the combined frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33324 entries, 0 to 33323
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   level                33324 non-null  int64  
 1   L1                   33324 non-null  object 
 2   question_type        33324 non-null  object 
 3   question             33324 non-null  object 
 4   answer               33324 non-null  object 
 5   length               33324 non-null  float64
 6   num_sentences        33324 non-null  float64
 7   avg_sentence_length  33324 non-null  float64
 8   total_tokens         33324 non-null  float64
 9   dataset              33324 non-null  object 
dtypes: float64(4), int64(1), object(5)
memory usage: 2.5+ MB


## Extract token tags and dependencies

In [6]:
# Define a function to extract token tags
def extract_tags(text):
    '''
    Runs the spaCy pipeline (`nlp`) on a single text
    and returns a list of fine-grained part-of-speech
    tags (`token.tag_`), one per token, in document order.
    '''
    doc = nlp(text)
    return [token.tag_ for token in doc]

In [7]:
# Define a function to extract token dependencies
def extract_deps(text):
    '''
    Parses a single text with the spaCy pipeline (`nlp`)
    and collects the dependency label (`token.dep_`) of
    every token, in document order.

    Returns a list of strings, one label per token.
    '''
    return [tok.dep_ for tok in nlp(text)]

In [8]:
# Tag every answer and store the per-token tag lists in a new column
df['token_tag'] = df['answer'].apply(extract_tags)

# Flatten each tag list into a single space-separated string
df['token_tag_flat'] = df['token_tag'].apply(' '.join)

In [9]:
# Parse every answer and keep the per-token dependency labels as a list
df['token_dep'] = df['answer'].map(extract_deps)

# Collapse each label list into a single space-delimited string
df['token_dep_flat'] = df['token_dep'].map(' '.join)

In [10]:
# Persist the compiled frame, including the new tag/dependency columns.
# NOTE(review): to_csv writes the row index by default, so reading this file
# back yields an extra 'Unnamed: 0' column (the loaders above drop one).
# The list-valued columns are presumably stored as their string form —
# confirm that downstream readers expect this.
df.to_csv('../data/PELIC_ASAG_Compiled_tags_deps.csv')

In [11]:
# Preview the frame again now that the tag and dependency columns exist.
df.head()

Unnamed: 0,level,L1,question_type,question,answer,length,num_sentences,avg_sentence_length,total_tokens,dataset,token_tag,token_tag_flat,token_dep,token_dep_flat
0,4,Arabic,Paragraph writing,Write a paragraph about a relatioship that is...,I met my friend Nife while I was studying in a...,923.0,12.0,16.083333,193.0,PELIC,"[PRP, VBD, PRP$, NN, NNP, IN, PRP, VBD, VBG, I...",PRP VBD PRP$ NN NNP IN PRP VBD VBG IN DT JJ NN...,"[nsubj, ROOT, poss, dobj, npadvmod, mark, nsub...",nsubj ROOT poss dobj npadvmod mark nsubj aux a...
1,4,Thai,Paragraph writing,Write a paragraph about a relatioship that is...,"Ten years ago, I met a women on the train betw...",668.0,10.0,15.6,156.0,PELIC,"[CD, NNS, RB, ,, PRP, VBD, DT, NNS, IN, DT, NN...","CD NNS RB , PRP VBD DT NNS IN DT NN IN DT NN V...","[nummod, npadvmod, advmod, punct, nsubj, ccomp...",nummod npadvmod advmod punct nsubj ccomp det d...
2,4,Turkish,Paragraph writing,"In five sentences or less, give instructions o...",In my country we usually don't use tea bags. F...,278.0,5.0,14.4,72.0,PELIC,"[IN, PRP$, NN, PRP, RB, VBP, RB, VB, NN, NNS, ...",IN PRP$ NN PRP RB VBP RB VB NN NNS . RB PRP VB...,"[prep, poss, pobj, nsubj, advmod, aux, neg, RO...",prep poss pobj nsubj advmod aux neg ROOT compo...
3,4,Turkish,Paragraph writing,"How do you organize the instructions: by time,...",I organized the instructions by time.,37.0,1.0,7.0,7.0,PELIC,"[PRP, VBD, DT, NNS, IN, NN, .]",PRP VBD DT NNS IN NN .,"[nsubj, ROOT, det, dobj, prep, pobj, punct]",nsubj ROOT det dobj prep pobj punct
4,4,Korean,Paragraph writing,"In five sentences or less, give instructions o...","First, prepare a port, loose tea, and cup.\nSe...",290.0,5.0,15.6,78.0,PELIC,"[RB, ,, VB, DT, NN, ,, JJ, NN, ,, CC, NN, ., _...","RB , VB DT NN , JJ NN , CC NN . _SP NNP , VB N...","[advmod, punct, ROOT, det, dobj, punct, amod, ...",advmod punct ROOT det dobj punct amod conj pun...


In [12]:
# Re-inspect dtypes and non-null counts after adding the token columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33324 entries, 0 to 33323
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   level                33324 non-null  int64  
 1   L1                   33324 non-null  object 
 2   question_type        33324 non-null  object 
 3   question             33324 non-null  object 
 4   answer               33324 non-null  object 
 5   length               33324 non-null  float64
 6   num_sentences        33324 non-null  float64
 7   avg_sentence_length  33324 non-null  float64
 8   total_tokens         33324 non-null  float64
 9   dataset              33324 non-null  object 
 10  token_tag            33324 non-null  object 
 11  token_tag_flat       33324 non-null  object 
 12  token_dep            33324 non-null  object 
 13  token_dep_flat       33324 non-null  object 
dtypes: float64(4), int64(1), object(9)
memory usage: 3.6+ MB
