# Balance the ASAG dataset
This notebook is for balancing the ASAG dataset. Before balancing, some preprocessing steps are required.

In [1]:
# Import libraries
import pandas as pd
import re
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
df = pd.read_csv('../data/ASAG_augmented.csv')
df = df.drop(['Unnamed: 0'], axis=1)

In [4]:
# Step 1: Replace text in 'answer' column with text from 'augmented_answer' column
df['answer'] = df.apply(lambda row: row['augmented_answer'] if pd.notnull(row['augmented_answer']) else row['answer'], axis=1)

# Step 2: Add a column indicating original (0) or augmented (1) answer
df['is_augmented'] = df['augmented_answer'].notnull().astype(int)

# Step 3: Drop the 'augmented_answer' column
df.drop(columns=['augmented_answer'], inplace=True)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185 entries, 0 to 184
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   L1             185 non-null    object 
 1   question       185 non-null    object 
 2   answer         185 non-null    object 
 3   level          185 non-null    int64  
 4   question_type  185 non-null    object 
 5   num_sentences  185 non-null    float64
 6   is_augmented   185 non-null    int32  
dtypes: float64(1), int32(1), int64(1), object(4)
memory usage: 9.5+ KB


## Remove augmented answers with repeating phrases
Let's take a look and see if we have augmented answers that have repeating phrases.

In [6]:
# Filter the df for augmented answers and arrange them in descending order by length
augmented = df[df['is_augmented'] == 1].copy()
augmented['length'] = augmented['answer'].apply(len)
augmented_sorted = augmented.sort_values(by='length', ascending=False)
augmented_sorted.head(10)

Unnamed: 0,L1,question,answer,level,question_type,num_sentences,is_augmented,length
179,French,Eating a balanced diet is the most important f...,"It brings you everything you need ( vitamins, ...",5,Paragraph writing,5.0,1,1852
182,German,Eating a balanced diet is the most important f...,Eat well in the evening\n\n36. Eat well in the...,5,Paragraph writing,9.0,1,1415
177,French,Do people who live in the public eye have a ri...,I think that his writing skills are very good....,5,Paragraph writing,10.0,1,1374
180,French,Do people who live in the public eye have a ri...,"They are either using it to get information, o...",5,Paragraph writing,6.0,1,1144
176,French,You are invited to a friend's birthday. You re...,"There is no character limit, and the game is s...",5,Paragraph writing,6.0,1,632
178,French,What is the impact of social networks on relat...,"I did not know that I was a bad person, and I ...",5,Paragraph writing,8.0,1,593
173,French,Do people who live in the public eye have a ri...,The government has shown that it can make peop...,5,Paragraph writing,3.0,1,360
175,French,What is the impact of social networks on relat...,I think we can say things that we really want ...,5,Paragraph writing,3.0,1,323
181,French,Eating a balanced diet is the most important f...,"In addition, they have good health effects. Th...",5,Paragraph writing,3.0,1,284
174,French,Do people who live in the public eye have a ri...,It is guaranteed by the Constitution. The righ...,5,Paragraph writing,5.0,1,238


In [7]:
sample_idx = 179
sample = augmented_sorted.answer[sample_idx]
sample

'It brings you everything you need ( vitamins, calcium,... ) and keeps away illness. You get the nutrients you need in a healthy way. \n- The "healthy" way to lose weight\n\n- The "fat" way to lose weight\n\n- The "fat" way to lose weight\n\n- The "healthy" way to lose weight\n\n- The "healthy" way to lose weight\n\n- The "healthy" way to lose weight\n\n- The "healthy" way to lose weight\n\n- The "healthy" way to lose weight\n\n- The "healthy" way to lose weight\n\n- The "healthy" way to lose weight\n\n- The "healthy" way to lose weight\n\n- The "healthy" way to lose weight\n\n- The "healthy" way to lose weight\n\n- The "healthy" way to lose weight\n\n- The "healthy" way to lose weight\n\n- The "healthy" way to lose weight\n\n- The "healthy" way to lose weight\n\n- The "healthy" way to lose weight\n\n- The "healthy" way to lose weight\n\n- The "healthy" way to lose weight\n\n- The "healthy" way to lose weight\n\n- The "healthy" way to lose weight\n\n- The "healthy" way to lose weight\n

Now, let's remove repeating sentences from the augmented answers.

In [8]:
def split_into_sentences(text):
    # Define regex pattern to find all sentences in the text
    sentences = re.findall(r'[^.!?]+[.!?]', text)
    return sentences

def remove_repeating_sentence(text):
    # Split the text into sentences using custom logic
    sentences = split_into_sentences(text)

    # Keep track of the last three unique sentences
    unique_sentences = []

    # Iterate through the sentences
    for sentence in sentences:
        # If the current sentence is not the same as any of the last three unique sentences, add it
        if sentence not in unique_sentences:
            unique_sentences.append(sentence)

    # Join the remaining unique sentences into a single string
    cleaned_text = ' '.join(unique_sentences)
    return cleaned_text.strip()

In [9]:
# Apply the function to rows where "is_augmented" is equal to 1
df.loc[df['is_augmented'] == 1, 'answer'] = df[df['is_augmented'] == 1]['answer'].apply(remove_repeating_sentence)

In [10]:
# Check to see what the longest answers are now that the function has been applied
augmented = df[df['is_augmented'] == 1].copy()
augmented['length'] = augmented['answer'].apply(len)
augmented_sorted = augmented.sort_values(by='length', ascending=False)
augmented_sorted.head(10)

Unnamed: 0,L1,question,answer,level,question_type,num_sentences,is_augmented,length
182,German,Eating a balanced diet is the most important f...,Eat well in the evening\n\n36. Eat well in th...,5,Paragraph writing,9.0,1,1438
177,French,Do people who live in the public eye have a ri...,I think that his writing skills are very good....,5,Paragraph writing,10.0,1,1394
180,French,Do people who live in the public eye have a ri...,"They are either using it to get information, o...",5,Paragraph writing,6.0,1,1156
176,French,You are invited to a friend's birthday. You re...,"There is no character limit, and the game is s...",5,Paragraph writing,6.0,1,640
178,French,What is the impact of social networks on relat...,"I did not know that I was a bad person, and I ...",5,Paragraph writing,8.0,1,598
173,French,Do people who live in the public eye have a ri...,The government has shown that it can make peop...,5,Paragraph writing,3.0,1,363
175,French,What is the impact of social networks on relat...,I think we can say things that we really want ...,5,Paragraph writing,3.0,1,327
181,French,Eating a balanced diet is the most important f...,"In addition, they have good health effects. T...",5,Paragraph writing,3.0,1,288
174,French,Do people who live in the public eye have a ri...,It is guaranteed by the Constitution. The rig...,5,Paragraph writing,5.0,1,240
184,French,Do people who live in the public eye have a ri...,9. The media is not trustworthy. \nThe media...,5,Paragraph writing,7.0,1,223


In [11]:
sample_idx = 182
sample = augmented_sorted.answer[sample_idx]
sample

'Eat well in the evening\n\n36.  Eat well in the evening\n\n37.  Eat well in the evening\n\n38.  Eat well in the evening\n\n39.  Eat well in the evening\n\n40.  Eat well in the evening\n\n41.  Eat well in the evening\n\n42.  Eat well in the evening\n\n43.  Eat well in the evening\n\n44.  Eat well in the evening\n\n45.  Eat well in the evening\n\n46.  Eat well in the evening\n\n47.  Eat well in the evening\n\n48.  Eat well in the evening\n\n49.  Eat well in the evening\n\n50.  Eat well in the evening\n\n51.  Eat well in the evening\n\n52.  Eat well in the evening\n\n53.  Eat well in the evening\n\n54.  Eat well in the evening\n\n55.  Eat well in the evening\n\n56.  Eat well in the evening\n\n57.  Eat well in the evening\n\n58.  Eat well in the evening\n\n59.  Eat well in the evening\n\n60.  Eat well in the evening\n\n61.  Eat well in the evening\n\n62.  Eat well in the evening\n\n63.  Eat well in the evening\n\n64.  Eat well in the evening\n\n65.  Eat well in the evening\n\n66.  Eat wel

In [12]:
sample_idx = 177
sample = augmented_sorted.answer[sample_idx]
sample

"I think that his writing skills are very good.  The best thing about his writing is that he is very open about his sexuality and he doesn't talk about his sexuality too much.  He is very kind and he has a very open mind.  I think that he has the ability to make a lot of great, important and interesting things happen.  \nI think that Charles Dickens is a great writer and a great person.  He is a very intelligent and very funny man.  I think he has the ability to be a very good teacher.  He is very smart and he has a great sense of humour.  I think that he has a great sense of humour, and he is very good at making things interesting.  \nI have a friend who is a professional writer.  He is very good at making things interesting and he is very good at making people feel like they are living in a fantasy world.  I think that he is a great writer and a great person.  \nI am very happy that I have met Charles Dickens.  He is very nice to talk to.  He is very kind and he has a very good sense

In [13]:
sample_idx = 180
sample = augmented_sorted.answer[sample_idx]
sample

'They are either using it to get information, or are using it to get information about their activities, or are using it to do something else, or they are using it to use your phone to do something else.  It is a very confusing situation.  \nSo, here is what I am trying to say, that I\'m trying to help you with this situation.  I\'m not saying that you should just use this tool, but that you should try to find the right person, and that you should use it as an effective tool.  You need to use it for your own personal gain, so that you can use it for the right purpose, and not for the interests of the people who use it.  If you want to learn more about what this tool is, I suggest you read my article, "How to Use Google Now for Your Personal Use".  \nSo, the next step is to use the tool to find out the person you are talking to.  \nThe next step is to find out who you are talking to.  \nYou have to find out who you are talking to.  You have to find out what your phone is doing.  You hav

<p>The longest paragraphs in the dataframe are repetitive; however, there is only one remaining that has very repetitive sentences. We're going to remove that one data point.</p>

In [14]:
# Remove row with index 182
df.drop(index=182, inplace=True)

# Reset the index
df.reset_index(drop=True, inplace=True)

<p>Since the augmented answers were added after the first time that the number of sentences was calculated, we now need to do a re-calculation. We also want to add a column for the length of each answer.</p>

In [15]:
# Define a function to add the number of sentences per text
def num_sentences(df):
    # Create a copy of the DataFrame to avoid the SettingWithCopyWarning
    df = df.copy()
    # Iterate over rows in the DataFrame
    for index, row in df.iterrows():
        # Get the answer text from the DataFrame
        answer_text = row['answer']
        # Process the answer text with spaCy
        doc = nlp(answer_text)
        # Initialize variables to accumulate total tokens and count of sentences
        num_sentences = 0
        # Iterate over sentences and accumulate total tokens
        for sentence in doc.sents:
            num_sentences += 1
        # Add num_sentences in the DataFrame
        df.loc[index, 'num_sentences'] = num_sentences
    return df

In [16]:
df = num_sentences(df)

In [17]:
df['length'] = df['answer'].apply(len)

I'm also going to add a column that indicates that the data from this set comes from the ASAG dataset, so that when I merge all the data later, it'll be easily identifiable.

In [18]:
df['dataset'] = 'ASAG'

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 184 entries, 0 to 183
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   L1             184 non-null    object 
 1   question       184 non-null    object 
 2   answer         184 non-null    object 
 3   level          184 non-null    int64  
 4   question_type  184 non-null    object 
 5   num_sentences  184 non-null    float64
 6   is_augmented   184 non-null    int32  
 7   length         184 non-null    int64  
 8   dataset        184 non-null    object 
dtypes: float64(1), int32(1), int64(2), object(5)
memory usage: 12.3+ KB


In [20]:
df.head()

Unnamed: 0,L1,question,answer,level,question_type,num_sentences,is_augmented,length,dataset
0,French,What are your daily habits? What time do you g...,everyday i get up at 8 a clock. I always turn ...,2,Paragraph writing,9.0,0,248,ASAG
1,French,Describe your family.,"I have one sister, she is married and she has ...",2,Paragraph writing,5.0,0,293,ASAG
2,French,Describe your family.,I have a mother and a father and they are stil...,2,Paragraph writing,3.0,0,204,ASAG
3,French,Describe your hobbies.,I really like playing video games online with ...,2,Paragraph writing,3.0,0,175,ASAG
4,French,Describe your family.,"I have a little family, i live with my father ...",2,Paragraph writing,4.0,0,235,ASAG


## Balance the dataset by level

In [21]:
df.level.value_counts()

level
3    71
2    50
4    38
5    25
Name: count, dtype: int64

In [22]:
# Select 25 longest answers from level 2
level_2_sorted = df[df.level == 2].sort_values(by='length', ascending=False)
level_2_selected = level_2_sorted.head(25).copy()

# Select 25 longest answers from level 3
level_3_sorted = df[df.level == 3].sort_values(by='length', ascending=False)
level_3_selected = level_3_sorted.head(25).copy()

# Select 25 longest answers from level 4
level_4_sorted = df[df.level == 4].sort_values(by='length', ascending=False)
level_4_selected = level_4_sorted.head(25).copy()

# Select all from level 5
level_5_selected = df[df.level == 5].copy()

In [23]:
print(len(level_2_selected))
print(len(level_3_selected))
print(len(level_4_selected))
print(len(level_5_selected))

25
25
25
25


In [24]:
# Merge all dataframes into a single dataframe
merged_df = pd.concat([level_2_selected, level_3_selected, level_4_selected, level_5_selected])

In [25]:
merged_df.head()

Unnamed: 0,L1,question,answer,level,question_type,num_sentences,is_augmented,length,dataset
124,French,Imagine your life in 10 years time. What will ...,My life in 10 years time will be Happy !!! i h...,2,Paragraph writing,20.0,0,1364,ASAG
90,French,Describe your family.,"I'm seventy seven,years old. I've got a big fa...",2,Paragraph writing,16.0,0,869,ASAG
126,French,"Share a memory of a holiday. Where was it, who...","Last year, I went to Antalya for ten days with...",2,Paragraph writing,10.0,0,625,ASAG
105,Arabic,"Share a memory of a holiday. Where was it, who...","Idream to fly again to Moskow, where i was in ...",2,Paragraph writing,9.0,0,620,ASAG
9,French,"Share a memory of a holiday. Where was it, who...",Last week was the easter break. The first real...,2,Paragraph writing,11.0,0,599,ASAG


In [26]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, 124 to 183
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   L1             100 non-null    object 
 1   question       100 non-null    object 
 2   answer         100 non-null    object 
 3   level          100 non-null    int64  
 4   question_type  100 non-null    object 
 5   num_sentences  100 non-null    float64
 6   is_augmented   100 non-null    int32  
 7   length         100 non-null    int64  
 8   dataset        100 non-null    object 
dtypes: float64(1), int32(1), int64(2), object(5)
memory usage: 7.4+ KB


In [27]:
merged_df.level.value_counts()

level
2    25
3    25
4    25
5    25
Name: count, dtype: int64

In [29]:
merged_df.to_csv('../data/ASAG_balanced.csv')