In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [2]:
import os
# Debugging aid: asks CUDA to run kernel launches synchronously so that a
# device-side error is reported at the call that triggered it.
# NOTE(review): this is set after `import torch` in the first cell; presumably it
# still takes effect because no CUDA work has run yet — confirm. Expect slower
# training while it is enabled.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [3]:
# Held-out prompts: loaded from the IELTS writing scored-essays dataset and used
# only in the final cells, where the fine-tuned model writes an essay per prompt.
df_2 = pd.read_csv("/kaggle/input/ielts-writing-scored-essays-dataset/ielts_writing_dataset.csv")

df_2 = df_2.rename(columns={'Question': 'prompt', 'Essay': 'essay'})
# Keep essays with an overall band of 6.0 or higher.
# drop=True: the old index is not needed, so do not materialise it as a column.
df_2 = df_2[df_2["Overall"] >= 6.0].reset_index(drop=True)[["prompt", "essay"]]

# Take 10 rows: the first half of the last 20 rows (positions -20..-11),
# i.e. NOT the final 10 rows of the frame.
df_test = df_2.tail(20)
df_test = df_test.head(10)

df_test

Unnamed: 0,prompt,essay
1095,Financial education should be a mandatory comp...,In many countries the discussion about financi...
1096,The best way to reduce the number of traffic a...,With the rapid increase in the number of vehic...
1097,Dieting can change a person’s life for the bet...,Dieting seems to be of our life in this modern...
1098,Even though globalization affects the world’s ...,Everything has two sides and the globalization...
1099,In many countries children are engaged in some...,"In the Third World, children are usually sent ..."
1100,"You can get up-to-date news from the radio, TV...","Nowadays, there are several channels to get ne..."
1101,Millions of people every year move to English-...,"Nowadays, language becomes a major factor to ...."
1102,The best way to reduce the number of traffic a...,Traffic accidents are on the rise these days. ...
1103,Financial education should be a mandatory comp...,Children in this modern era have all the magni...
1104,Even though globalization affects the world’s ...,Globalization is a process of advancement and ...


# Explore the training data

In [4]:
# Training data: read the generated IELTS dataset, normalise the column names,
# keep essays with a band of 6.0 or higher, and retain only the two text columns.
df = (
    pd.read_csv("/kaggle/input/ielts-rubrish-score-for-evaluate/IELTS-gen_dataset.csv")
    .rename(columns={'Topic': 'prompt', 'Essay': 'essay'})
    .loc[lambda frame: frame["Band"] >= 6.0]
    .reset_index()
    [["prompt", "essay"]]
)

In [5]:
# Preview the filtered training frame (pandas abbreviates the middle rows in the display).
df

Unnamed: 0,prompt,essay
0,Some people believe that entertainers are paid...,The entertainment industry is one of the large...
1,School children are becoming far too dependent...,Children are born into the digital world. From...
2,One of the consequences of improved medical ca...,Since the medical care system has made signifi...
3,Some people think that physical strength is im...,Physical energy is considered to be an imperat...
4,In some countries governments are encouraging ...,"Nowadays, industry and business have been enco..."
...,...,...
16452,Some people believe that teenagers should conc...,All foundation school subjects should be focus...
16453,Some people say it is a waste of time to plan ...,People have always been concerned with their f...
16454,Do you agree or disagree with the following st...,"Education empowers knowledge, In today's circu..."
16455,2.It is becoming increasingly popular to have ...,Taking a gap year off between finishing school...


In [6]:
# Spot-check a few (prompt, essay) pairs: start at row 500 and jump `step` rows each time.
step = 6000
for i in range(500, len(df), step):
    sample_row = df.loc[i]
    print("Prompt:", sample_row["prompt"])
    print("Essay:", sample_row["essay"])


Prompt: People living in the 21st century have a better quality of life than the previous centuries. To what extent do you agree or disagree?
Essay: Given the fast-paced development, an increase in quality of life is more frequently witnessed in individuals’ lives these days than in the past. I agree with the opinion and this essay attempts to shed light on the reasons explaining my perspective.
First and foremost, with the presence and evolution of technology, humans can take advantage of them to facilitate their lives in terms of health services. Specifically, top-notch equipment would be an excellent tool to allow doctors to implement several sophisticated treatments, thereby leading to a decline in the death rate due to some diseases that were considered not to be able to be dealt with in the previous centuries. For instance, medical devices can be provided for doctors and nurses in order that they can carry out essential research to figure out the best way to treat their patients 

In [7]:
# n_test = 40

# # test set: last 40 rows
# test_df = df.iloc[-n_test:].reset_index(drop=True)

# # train set: all the other rows
# train_df = df.iloc[:-n_test].reset_index(drop=True)

In [8]:
# test_df

In [9]:
# text = "Prompt: " + test_df["prompt"][2] + "\nEssay: " + test_df["essay"][2]

In [10]:
# print(text)

## Custom Dataset

In [11]:
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import GPT2TokenizerFast, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling, GPT2Config

2025-05-26 09:05:10.271487: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748250310.435434      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748250310.487331      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [12]:
class PromptEssayDataset(Dataset):
    """Pre-tokenized (prompt, essay) pairs for causal language-model fine-tuning.

    Every row is turned into one training string,
    ``prompt + tokenizer.eos_token + essay + " " + end_token``, which is
    tokenized once in ``__init__`` and truncated / padded to ``max_length``.

    Args:
        dataframe: DataFrame with string columns ``'prompt'`` and ``'essay'``.
        tokenizer: Callable tokenizer that exposes ``eos_token`` and returns
            ``input_ids`` and ``attention_mask`` tensors of shape
            ``(1, max_length)`` when called with ``return_tensors='pt'``.
        max_length: Length every example is truncated / padded to.
        end_token: Marker appended to each essay so the model can learn where
            an essay stops.

    Each item is a dict of 1-D tensors of length ``max_length``:
    ``input_ids``, ``attention_mask`` and ``labels``. ``labels`` is a copy of
    ``input_ids`` in which padded positions are replaced by ``IGNORE_INDEX``
    so they do not contribute to the loss.

    NOTE(review): with truncation enabled, texts longer than ``max_length``
    presumably lose their trailing ``end_token`` (assuming the tokenizer
    truncates on the right) — confirm whether that is acceptable.
    """

    # Label value skipped by the cross-entropy loss (PyTorch's default ignore_index).
    IGNORE_INDEX = -100

    def __init__(self, dataframe, tokenizer, max_length=512, end_token="<end_of_essay>"):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.end_token = end_token
        self.examples = []
        # Iterate over the two columns directly; cheaper than DataFrame.iterrows().
        for prompt_text, essay_text in zip(dataframe['prompt'], dataframe['essay']):
            prompt = prompt_text.strip()
            essay = (essay_text + " " + end_token).strip()
            # The EOS token separates the prompt from the essay
            text = prompt + tokenizer.eos_token + essay
            # Tokenize + pad + truncate in one call
            enc = tokenizer(
                text,
                truncation=True,
                max_length=self.max_length,
                padding='max_length',
                return_tensors='pt'
            )
            # squeeze(0) removes only the batch axis; a bare squeeze() would also
            # collapse the sequence axis when max_length == 1.
            input_ids = enc['input_ids'].squeeze(0)
            attention_mask = enc['attention_mask'].squeeze(0)
            # Mask by attention_mask rather than by pad id, so real tokens that
            # share an id with the pad token (e.g. pad == eos) keep their label.
            labels = input_ids.clone()
            labels[attention_mask == 0] = self.IGNORE_INDEX
            self.examples.append({
                'input_ids': input_ids,
                'attention_mask': attention_mask,
                'labels': labels
            })

    def __len__(self):
        """Number of tokenized examples."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Return the pre-tokenized example dict at position ``idx``."""
        return self.examples[idx]

## Get tokenizer of GPT2

In [13]:
from transformers import AutoTokenizer, AutoModelForCausalLM

In [14]:

# # Import tokenizer
# tokenizer = GPT2TokenizerFast.from_pretrained("gpt2-medium")
# special_tokens_dict = {
#     'eos_token': '',
#     'pad_token': '',
#     'additional_special_tokens': ['<end_of_essay>']
# }
# tokenizer.add_special_tokens(special_tokens_dict)

# Load the tokenizer stored alongside a previously fine-tuned checkpoint
# (Kaggle input dataset) instead of rebuilding it from gpt2-medium.
tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/fine-tunedgpt2/gpt2-essay-finetuned")

# Tokenize the whole training frame up front; each example is padded/truncated to 512 tokens.
dataset = PromptEssayDataset(df, tokenizer, max_length=512)

# Model

In [15]:
# from transformers import GPT2Config

# # Load config of gpt2
# config = GPT2Config.from_pretrained("gpt2-medium", loss_type="causal_lm")

# """ Load model with config """
# model = GPT2LMHeadModel.from_pretrained("gpt2-medium", config=config)

# model.resize_token_embeddings(len(tokenizer))

In [16]:
# Continue from the previously fine-tuned checkpoint (same directory as the tokenizer)
# rather than from the base gpt2-medium weights loaded in the commented-out cell.
model = AutoModelForCausalLM.from_pretrained("/kaggle/input/fine-tunedgpt2/gpt2-essay-finetuned")

## Trainer API

In [17]:
# Collator for causal language modelling (mlm=False disables the masked-LM objective)
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Hyper-parameters for the fine-tuning run
training_args = TrainingArguments(
    # --- output / checkpointing ---
    output_dir="./gpt2-essay-finetuned",
    overwrite_output_dir=True,
    save_steps=250,
    save_total_limit=2,
    # --- optimisation ---
    num_train_epochs=4,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,  # 8 micro-batches of size 1 are accumulated per optimizer step
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=100,
    fp16=torch.cuda.is_available(),  # mixed precision only when a CUDA device is present
    # --- logging ---
    logging_dir="./logs",
    logging_steps=50,
    report_to="none",
)

# Wire model, arguments, data and collator together
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=dataset,
)


## Train model

In [18]:
# Run the fine-tuning loop configured above; the training loss is logged every
# 50 steps (logging_steps) and a checkpoint is written every 250 steps (save_steps).
print("Start Training")
trainer.train()
print("Train OK!")

Start Training


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
50,2.7001
100,2.6752
150,2.6379
200,2.6574
250,2.6744
300,2.6632
350,2.6168
400,2.6325
450,2.6318
500,2.5311


Train OK!


## Save model

In [19]:
# Save the fine-tuned weights and config.
trainer.save_model("./gpt2-essay-finetuned")
# Save the tokenizer into the same directory as well: the Trainer was not given
# the tokenizer, and the checkpoint is later reloaded with
# AutoTokenizer.from_pretrained(<same dir>), which needs the tokenizer files
# (including the added <end_of_essay> token) to be present there.
tokenizer.save_pretrained("./gpt2-essay-finetuned")

## Load the fine-tuned model (if one is already available)

In [20]:
# Use the GPU when one is available, otherwise fall back to the CPU so the
# generation cells still run on a CPU-only runtime.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [21]:
# # This model was fine-tuned as shown above, in another version of this notebook
# from transformers import AutoTokenizer, AutoModelForCausalLM
# tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/fine-tunedgpt2/gpt2-essay-finetuned")
# model = AutoModelForCausalLM.from_pretrained("/kaggle/input/fine-tunedgpt2/gpt2-essay-finetuned")

In [22]:
# Move the model to the target device and switch to inference mode:
# after trainer.train() the module is still in training mode, so dropout
# would otherwise stay active during generation. eval() returns the module,
# so the cell still displays the model summary.
model.to(device).eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50258, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=3072, nx=1024)
          (c_proj): Conv1D(nf=1024, nx=1024)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=4096, nx=1024)
          (c_proj): Conv1D(nf=1024, nx=4096)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50258, bias=False)
)

# Generation Function

In [23]:
def generate_essay(prompt: str, device='cuda', max_length: int = 512):
    """Sample one essay from the fine-tuned model for ``prompt``.

    The input mirrors the training format: ``prompt + tokenizer.eos_token``.
    Sampling stops at the ``<end_of_essay>`` token or after ``max_length``
    newly generated tokens, whichever comes first.

    Uses the notebook-level ``tokenizer`` and ``model``.

    Args:
        prompt: Essay question / topic.
        device: Device the input tensors are moved to; must be the device the
            model lives on.
        max_length: Maximum number of NEW tokens to generate (the prompt
            tokens are not counted).

    Returns:
        The generated essay text only (prompt and special tokens removed),
        stripped of surrounding whitespace.

    Raises:
        ValueError: If the prompt already fills the model's context window.
    """
    end_token = "<end_of_essay>"

    # Inference only: make sure dropout is off even if the model was just trained.
    model.eval()

    # Tokenize via __call__ to get an explicit attention mask. Without it,
    # generate() may infer the mask from pad_token_id, which would mask out the
    # EOS separator that ends the prompt whenever pad and EOS share an id.
    enc = tokenizer(prompt + tokenizer.eos_token, return_tensors="pt").to(device)
    input_ids = enc["input_ids"]
    prompt_len = input_ids.shape[-1]

    # Never ask for more tokens than the position embeddings can address.
    new_token_budget = max_length
    context_limit = getattr(model.config, "max_position_embeddings", None)
    if context_limit is not None:
        new_token_budget = min(max_length, context_limit - prompt_len)
        if new_token_budget <= 0:
            raise ValueError(
                f"Prompt uses {prompt_len} tokens, which leaves no room to generate "
                f"within the model's {context_limit}-token context window."
            )

    # Generate output (same sampling settings as before; max_new_tokens replaces
    # the equivalent `max_length = prompt_len + max_length` arithmetic)
    output = model.generate(
        input_ids,
        attention_mask=enc["attention_mask"],
        max_new_tokens=new_token_budget,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.convert_tokens_to_ids(end_token),
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.8,
        num_return_sequences=1
    )

    # Decode only the newly generated tokens. Slicing the decoded string by
    # len(prompt) is fragile because decoding does not always reproduce the
    # prompt character-for-character.
    generated_text = tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True)

    # Safety net: cut at the end marker if it survived decoding
    if end_token in generated_text:
        generated_text = generated_text.split(end_token)[0]

    return generated_text.strip()


# Result

In [24]:
# Free-form prompt (not in IELTS style) as a first smoke test of the generator.
prompt = "Write an essay about the importance of artificial intelligence in modern society."
# Pass the notebook-level `device` explicitly, like the other generation cells,
# so the inputs always land on the device the model was moved to.
continuation = generate_essay(prompt, device)
print("Prompt:\n", prompt)
print("\nGenerated essay:\n", continuation)


Prompt:
 Write an essay about the importance of artificial intelligence in modern society.

Generated essay:
 Artificial intelligence has been a hot topic these days. The idea of artificial intelligence (AI) has become a topic of interest due to the fact that it is a concept that is both revolutionary and challenging to comprehend. This essay will examine the significance of AI in the modern world and then propose some possible impacts of AI on society.
First of all, AI has revolutionized the way we live, work, and communicate. It has made tasks, information, and information flows much faster and more efficient. AI is capable of doing everything from calculations to creating models of the world. It is also helping to solve complex problems that require a lot of imagination and creativity. For example, AI can analyze data, analyze complex data, and even program to make decisions.
Another important benefit of AI is the way it makes people’s lives more convenient. It helps us get things d

In [25]:
# IELTS-style "agree or disagree" prompt.
prompt = "The best way to solve world’s environmental problem is to increase the cost of fuel for cars and other vehicles. To what extent do you agree or disagree?"
# Pass the notebook-level `device` explicitly, like the other generation cells,
# so the inputs always land on the device the model was moved to.
continuation = generate_essay(prompt, device)
print("Prompt:\n", prompt)
print("\nGenerated essay:\n", continuation)


Prompt:
 The best way to solve world’s environmental problem is to increase the cost of fuel for cars and other vehicles. To what extent do you agree or disagree?

Generated essay:
 It is argued that increasing the cost of petrol is the best solution to environmental issues. I partially agree with this statement.
There are several reasons why I agree with this statement. First, by increasing the cost of petrol, we will reduce the number of vehicles and use less fuel. This is because if people use less petrol they will use less fuel and eventually reduce the amount of pollution. Second, if the government increases the cost of petrol it will encourage people to use public transport instead of private vehicles. People will therefore become more careful with their actions and save the environment from damage.
However, I also believe that there are better ways to solve environmental issues. One way to solve environmental issues is to increase the usage of renewable energy. This can be done 

In [26]:
# Two-sentence IELTS prompt. Adjacent string literals are concatenated by Python;
# the explicit trailing space keeps "whole." and "Do" apart (a backslash line
# continuation inside the literal glued them together as "whole.Do").
prompt = (
    "Some people think that all teenagers should be required to do unpaid work in their free time to help the local community. They believe this would benefit both the individual teenager and society as a whole. "
    "Do you agree or disagree?"
)
continuation = generate_essay(prompt, device)
print("Prompt:\n", prompt)
print("\nGenerated essay:\n", continuation)

Prompt:
 Some people think that all teenagers should be required to do unpaid work in their free time to help the local community. They believe this would benefit both the individual teenager and society as a whole.Do you agree or disagree?

Generated essay:
 In this day and age, it is believed that it is beneficial for teenagers to contribute to their community for free. In my opinion, I totally agree with this idea, as it benefits both the young and the community.
To begin with, young people can acquire valuable life experiences and skills through volunteer work. They are likely to learn the importance of hard work and perseverance, which are indispensable for personal growth. For example, volunteering at a local hospital can help individuals understand the importance of hard work and develop a strong sense of responsibility for their well-being. As a result, this practice could inspire young people to put in more effort and dedicate themselves to making a difference in their communi

In [27]:
# Sanity check: number of held-out prompts the next cell generates essays for.
print(len(df_test))

10


In [28]:
# Generate one essay for every held-out prompt and print prompt/essay pairs
# separated by a ruler line. Only the prompt column is needed, so iterate over
# it directly instead of using iterrows().
for prompt in df_test["prompt"]:
    continuation = generate_essay(prompt, device)
    print("Prompt:\n", prompt)
    print("\nGenerated essay:\n", continuation)
    print("--------------------------------------------------------------------------------------------\n")
    print()

Prompt:
 Financial education should be a mandatory component of the school program. To what extent do you agree or disagree with this statement?

Generated essay:
 In today’s modern society, education plays a crucial role in shaping the future of a nation. While some argue that education should be mandatory in schools, I strongly disagree with this view and believe that children should be provided with an opportunity to gain knowledge through their extracurricular activities.
First of all, acquiring knowledge through extracurricular activities helps students to cultivate critical thinking and analytical abilities. This can be useful in various fields, including business, science, and technology. For instance, in businesses, the ability to think independently is essential for effective decision-making. Furthermore, in science, students often need to understand the basics of science to have a better understanding of how the world operates.
Moreover, extracurricular activities contribute 