# Add the Augmented Data to the ASAG Dataset
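This notebook appends the augmented answers for levels 2 and 5 to the cleaned ASAG dataset, to help balance the number of answers per level, and saves the combined data to a new CSV file.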

In [1]:
# Import libraries
import pandas as pd

In [2]:
# Load the cleaned ASAG data and the two augmented subsets.
# Every file contains an 'Unnamed: 0' column (presumably an index written out by
# an earlier to_csv call -- confirm), which is discarded right after loading.
asag = pd.read_csv('../data/ASAG_cleaned.csv').drop(columns=['Unnamed: 0'])
level_2 = pd.read_csv('../data/ASAG_level_2_augmented.csv').drop(columns=['Unnamed: 0'])
level_5 = pd.read_csv('../data/ASAG_level_5_augmented.csv').drop(columns=['Unnamed: 0'])

In [3]:
# Flag every row of the original ASAG dataframe as non-augmented
# (is_augmented == 0) so it can be told apart from the augmented rows later.
asag = asag.assign(is_augmented=0)

In [4]:
# Show column names, dtypes and non-null counts of the original dataframe
asag.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 268 entries, 0 to 267
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   file_name        268 non-null    object 
 1   age_participant  268 non-null    int64  
 2   sex_participant  268 non-null    object 
 3   education        268 non-null    object 
 4   L1               268 non-null    object 
 5   sex_examiner1    268 non-null    object 
 6   sex_examiner2    268 non-null    object 
 7   sex_examiner3    268 non-null    object 
 8   setting          268 non-null    object 
 9   question         268 non-null    object 
 10  word_limit       268 non-null    object 
 11  level_course     268 non-null    int64  
 12  answer           268 non-null    object 
 13  grade_examiner1  268 non-null    int64  
 14  grade_examiner2  267 non-null    float64
 15  grade_examiner3  268 non-null    int64  
 16  level            268 non-null    int64  
 17  question_type   

In [5]:
# Number of rows per level in the original (non-augmented) data
asag['level'].value_counts()

level
3    97
4    67
2    54
5    28
1    17
6     5
Name: count, dtype: int64

In [6]:
# Prepare the level 2 augmented rows so their columns match the original ASAG dataframe:
#   - flag them as augmented (is_augmented == 1)
#   - overwrite 'answer' with the augmented text, then drop 'augmented_answer'
#   - keep only the first two augmented answers, since this is all we need for balancing
level_2 = (
    level_2
    .assign(is_augmented=1, answer=level_2['augmented_answer'])
    .drop(columns=['augmented_answer'])
    # head(2) selects by position; the previous .loc[0:1] was a label-based,
    # end-inclusive slice that only meant "first two rows" because the index
    # happened to be the default 0..n-1 range.
    .head(2)
)

In [7]:
# Check that the level 2 subset now has the expected rows and columns
level_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   file_name        2 non-null      object 
 1   age_participant  2 non-null      int64  
 2   sex_participant  2 non-null      object 
 3   education        2 non-null      object 
 4   L1               2 non-null      object 
 5   sex_examiner1    2 non-null      object 
 6   sex_examiner2    2 non-null      object 
 7   sex_examiner3    2 non-null      object 
 8   setting          2 non-null      object 
 9   question         2 non-null      object 
 10  word_limit       2 non-null      object 
 11  level_course     2 non-null      int64  
 12  answer           2 non-null      object 
 13  grade_examiner1  2 non-null      int64  
 14  grade_examiner2  2 non-null      float64
 15  grade_examiner3  2 non-null      int64  
 16  level            2 non-null      int64  
 17  question_type    2 n

In [8]:
# Preview the augmented text that now sits in the 'answer' column
level_2['answer'].head()

0    And that's why i always want you to know, i'm ...
1    My mother is a big man. I have one sister and ...
Name: answer, dtype: object

In [9]:
# Bring the level 5 augmented rows into the same layout as the original data:
# mark them as augmented (is_augmented == 1), replace 'answer' with the
# augmented text, then remove the now-redundant 'augmented_answer' column.
# Every row is kept here because all of them are needed for balancing.
level_5 = (
    level_5
    .assign(is_augmented=1, answer=level_5['augmented_answer'])
    .drop(columns=['augmented_answer'])
)

In [10]:
# Check that the level 5 subset now has the expected rows and columns
level_5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28 entries, 0 to 27
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   file_name        28 non-null     object 
 1   age_participant  28 non-null     int64  
 2   sex_participant  28 non-null     object 
 3   education        28 non-null     object 
 4   L1               28 non-null     object 
 5   sex_examiner1    28 non-null     object 
 6   sex_examiner2    28 non-null     object 
 7   sex_examiner3    28 non-null     object 
 8   setting          28 non-null     object 
 9   question         28 non-null     object 
 10  word_limit       28 non-null     object 
 11  level_course     28 non-null     int64  
 12  answer           28 non-null     object 
 13  grade_examiner1  28 non-null     int64  
 14  grade_examiner2  27 non-null     float64
 15  grade_examiner3  28 non-null     int64  
 16  level            28 non-null     int64  
 17  question_type    2

In [11]:
# Preview the augmented text that now sits in the 'answer' column
level_5['answer'].head()

0    I love this book because it is a story about c...
1    I think that Japan's economy is recovering. I ...
2    You need to think about your career. \nSo, her...
3    The researchers analysed a large number of soc...
4    You may be online with people who are online w...
Name: answer, dtype: object

In [12]:
# Stack the original rows and both augmented subsets into a single dataframe,
# discarding the individual indexes in favour of a fresh 0..n-1 index.
df = pd.concat([asag, level_2, level_5], ignore_index=True)

In [13]:
# Verify the combined dataframe: row count, columns, dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 298 entries, 0 to 297
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   file_name        298 non-null    object 
 1   age_participant  298 non-null    int64  
 2   sex_participant  298 non-null    object 
 3   education        298 non-null    object 
 4   L1               298 non-null    object 
 5   sex_examiner1    298 non-null    object 
 6   sex_examiner2    298 non-null    object 
 7   sex_examiner3    298 non-null    object 
 8   setting          298 non-null    object 
 9   question         298 non-null    object 
 10  word_limit       298 non-null    object 
 11  level_course     298 non-null    int64  
 12  answer           298 non-null    object 
 13  grade_examiner1  298 non-null    int64  
 14  grade_examiner2  296 non-null    float64
 15  grade_examiner3  298 non-null    int64  
 16  level            298 non-null    int64  
 17  question_type   

In [14]:
# Number of rows per level after the augmented rows have been added
df['level'].value_counts()

level
3    97
4    67
2    56
5    56
1    17
6     5
Name: count, dtype: int64

In [15]:
# Save the combined dataset. The index is written as well (the pandas default,
# made explicit here), so the file gets an unnamed first column that appears
# as 'Unnamed: 0' when the CSV is read back with pd.read_csv.
df.to_csv('../data/ASAG_augmented.csv', index=True)