In [1]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
# Import libraries
import pandas as pd
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import re

In [3]:
df = pd.read_csv('drive/MyDrive/ESL_Writing_Classification/ASAG_compiled.csv')

In [4]:
df.level.value_counts()

level
3    97
4    67
2    54
5    28
1    17
6     5
Name: count, dtype: int64

In [37]:
# Filter rows where level equals 2 and sample 1049 rows
level_5 = df[df['level'] == 5].drop(['Unnamed: 0'],axis=1)

In [38]:
level_5.head()

Unnamed: 0,L1,question,answer,level,question_type,length,num_sentences,avg_sentence_length,total_tokens,dataset
54,French,What is the best book you've ever read and why?,"The best book I've ever read was ""La guerre de...",5,Paragraph writing,732.0,13.0,12.846154,167.0,ASAG
65,French,"If you had 1 million euros, what would you do ...",The most important thing to know is that if I ...,5,Paragraph writing,321.0,2.0,35.5,71.0,ASAG
91,French,Should study abroad be a compulsory part of ed...,"In my opinion, it should be a part of educatio...",5,Paragraph writing,173.0,1.0,38.0,38.0,ASAG
92,Spanish,What is the impact of social networks on relat...,It seems to be rather obvious that social netw...,5,Paragraph writing,348.0,2.0,32.0,64.0,ASAG
93,French,What is the impact of social networks on relat...,Social networks have grown to become a huge pa...,5,Paragraph writing,284.0,2.0,26.5,53.0,ASAG


In [39]:
level_5.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28 entries, 54 to 263
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   L1                   28 non-null     object 
 1   question             28 non-null     object 
 2   answer               28 non-null     object 
 3   level                28 non-null     int64  
 4   question_type        28 non-null     object 
 5   length               28 non-null     float64
 6   num_sentences        28 non-null     float64
 7   avg_sentence_length  28 non-null     float64
 8   total_tokens         28 non-null     float64
 9   dataset              28 non-null     object 
dtypes: float64(4), int64(1), object(5)
memory usage: 2.4+ KB


In [8]:
# Load pre-trained GPT model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [9]:
def generate_augmented_answer(original_answer, temperature=0.7, top_p=0.9, max_repetitions=1):
    '''
    Rephrases a paragraph and continues writing in the same style as the original paragraph.
    Top-k and nucleus sampling are used to ensure the consistency in style.
    Truncates the last half of the paragraph to get a new paragraph.
    '''
    # Tokenize the original answer
    input_ids = tokenizer.encode(original_answer, return_tensors="pt", max_length=len(original_answer), truncation=True)

    # Calculate the max length for generation (twice the length of the original answer)
    max_length = min(len(original_answer) * 2, tokenizer.model_max_length)

    try:
        # Generate text using the model with top-k sampling and nucleus sampling
        output = model.generate(input_ids, max_length=max_length, num_return_sequences=1, do_sample=True, temperature=temperature, top_p=top_p, top_k=50, pad_token_id=tokenizer.eos_token_id)

        # Decode the generated text
        augmented_answer = tokenizer.decode(output[0], skip_special_tokens=True)

        # Find sentence boundaries in the generated text
        sentences = re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s", augmented_answer)

        # Calculate the midpoint index
        midpoint_index = len(sentences) // 2

        # Extract sentences from the second half of the generated text
        second_half_sentences = sentences[midpoint_index:]

        # Keep track of unique sentences to detect repetition
        unique_sentences = set()

        # Construct the new paragraph, avoiding repetition
        new_paragraph = ''
        repetitions = 0
        for sentence in second_half_sentences:
            if sentence not in unique_sentences:
                new_paragraph += sentence + ' '
                unique_sentences.add(sentence)
            else:
                repetitions += 1
                if repetitions >= max_repetitions:
                    break

        return new_paragraph.strip()
    except IndexError as e:
        print("Error:", e)
        return None


In [10]:
doc = 'I tried this so many times but it was taking so long.'
doc_augmented = generate_augmented_answer(doc)
doc_augmented

"I wanted to keep on top of it. I wanted to be able to do whatever I wanted to do. I was a child of the Internet, and that's what I wanted to be, a kid who was always on top of things. I wanted to be able to do whatever I"

In [40]:
# Apply augmentation to the first 30 sampled rows
augmented_answers = []

for answer in level_5['answer'].iloc[0:28]:
    augmented_answer = generate_augmented_answer(answer)
    augmented_answers.append(augmented_answer)

# Add augmented answers back to the DataFrame
level_5['augmented_answer'] = None  # Initialize the column with None values
level_5['augmented_answer'].iloc[0:28] = augmented_answers  # Assign augmented answers to the first 30 rows

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  level_5['augmented_answer'].iloc[0:28] = augmented_answers  # Assign augmented answers to the first 30 rows


In [41]:
# Append augmented samples back to the original DataFrame
# df = pd.concat([df, level_2], ignore_index=True)

level_5.to_csv('drive/MyDrive/ESL_Writing_Classification/ASAG_level_5_Augmented.csv')