In [1]:
!pip install -q together datasets

In [2]:
!pip install -q huggingface_hub

In [1]:
from datasets import load_dataset

# Load the REALEC GEC dataset by its Hub id. The cells below read the
# 'source' column of its train / validation / test splits.
ds = load_dataset("Zlovoblachko/REALEC_GEC_dataset")

In [2]:
from together import Together
from tqdm import tqdm

In [3]:
ds

DatasetDict({
    train: Dataset({
        features: ['source', 'target', '__index_level_0__'],
        num_rows: 28465
    })
    validation: Dataset({
        features: ['source', 'target', '__index_level_0__'],
        num_rows: 3558
    })
    test: Dataset({
        features: ['source', 'target', '__index_level_0__'],
        num_rows: 3559
    })
})

In [4]:
len(ds['test']['source'])

3559

In [218]:
len(ds['validation']['source'])

3558

In [13]:
# Accumulator for corrected sentences.
# NOTE(review): no cell visible in this notebook appends to this list, yet later
# cells report len(corrections) == 3558 and upload it as the validation targets.
# It is presumably filled by a cell that is no longer present; confirm before re-running.
corrections = []

In [6]:
# Number of sentences sent to the model per request.
CHUNK_SIZE = 10

# Pull the column out of the dataset once. The original expression evaluated
# ds['test']['source'] again for every chunk (and once more inside range()).
test_sources = list(ds['test']['source'])

# Consecutive slices of CHUNK_SIZE sentences; the final chunk may be shorter.
chunks = [
    test_sources[start:start + CHUNK_SIZE]
    for start in range(0, len(test_sources), CHUNK_SIZE)
]

In [None]:
import os
from getpass import getpass

# API_KEY is not defined by any cell in this notebook, so on a fresh kernel the
# original line raised NameError. Resolve the key without hardcoding it:
# reuse an API_KEY already present in the session, otherwise read the
# TOGETHER_API_KEY environment variable, otherwise prompt for it.
API_KEY = (
    globals().get("API_KEY")
    or os.environ.get("TOGETHER_API_KEY")
    or getpass("Together API key: ")
)
client = Together(api_key=API_KEY)

In [13]:
# Accumulator for corrected test sentences; the correction loop extends it
# chunk by chunk. Re-running this cell discards everything collected so far.
corrections_test_1 = []

In [14]:
import json

from pydantic import BaseModel, Field
from tqdm import tqdm

# Index of the first chunk to send to the model. Earlier chunks are skipped,
# which is how a previously interrupted run is resumed.
START_CHUNK = 76

SYSTEM_PROMPT = "You are an assistant proficient in English that corrects text. Only answer in JSON format. Return exactly the same number of sentences as provided."

pending_chunks = chunks[START_CHUNK:]

# Send each remaining chunk to the LLM and collect the corrected sentences in
# corrections_test_1. The loop stops at the first failure (API error, invalid
# JSON, wrong sentence count) and reports the index of the failing chunk.
for chunk_index, chunk_sentences in tqdm(
    enumerate(pending_chunks, start=START_CHUNK),
    total=len(pending_chunks),
):
    try:
        sentences_text = '\n'.join(chunk_sentences)
        sentence_count = len(chunk_sentences)

        # The schema is rebuilt per chunk because the required list length
        # depends on the chunk size (the last chunk can be shorter).
        # min_length / max_length are the Pydantic v2 names for the list-size
        # constraints; min_items / max_items are the deprecated v1 spellings.
        class CorrectedSentences(BaseModel):
            corrections: list[str] = Field(
                description="A list of corrected sentences with grammatical, lexical and style errors fixed",
                min_length=sentence_count,
                max_length=sentence_count
            )

        prompt = f"Correct grammatical, lexical and style errors in the following {sentence_count} sentences without changing the meaning. Paraphrase the sentences to avoid uncommon and ungrammatical expressions. Do not make more corrections than necessary. Return exactly {sentence_count} corrected sentences in the same order as the original."

        # Call the LLM with the JSON schema
        completion = client.chat.completions.create(
            model="meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
            messages=[
                {
                    "role": "system",
                    "content": SYSTEM_PROMPT
                },
                {
                    "role": "user",
                    "content": prompt + "\n\n" + sentences_text
                }
            ],
            response_format={
                "type": "json_object",
                "schema": CorrectedSentences.model_json_schema()
            },
            temperature=0.3,
            seed=42,
            stream=False  # JSON mode isn't compatible with streaming
        )

        # Parse the JSON response. `output` stays a module-level name because
        # later cells inspect output['corrections'] after a failure.
        output = json.loads(completion.choices[0].message.content)

        # Reject the chunk if the model dropped or added sentences; otherwise
        # sources and corrections would drift out of alignment.
        if len(output["corrections"]) != sentence_count:
            raise ValueError(f"Expected {sentence_count} sentences, but got {len(output['corrections'])}")

        corrections_test_1.extend(output["corrections"])
    except Exception as e:
        print(f"Error processing chunk: {e}")
        # Report where the run stopped so it can be resumed from this chunk.
        print(f"Stopped at chunk index {chunk_index}; set START_CHUNK = {chunk_index} to resume.")
        break

  0%|          | 0/280 [00:00<?, ?it/s]

100%|██████████| 280/280 [35:18<00:00,  7.57s/it]


In [15]:
# Append the corrected test sentences, one per line.
# Every sentence is newline-terminated: this file is appended to by more than
# one cell, and '\n'.join() leaves no trailing newline, so the next append
# would otherwise glue its first sentence onto the last line written here.
with open('corrections_test.json', 'a', encoding='utf-8') as f:
    f.writelines(sentence + '\n' for sentence in corrections_test_1)

In [10]:
chunks[75]

['Another noticeable similarity is the numbers of men/women workes are almost the same in the industial field in both countries.',
 'The chart shows the time spent doing sport and exercise in England in 2012 by men and women of different age.',
 'At the same way the rate went down in Latin America but only for 1 percent.',
 'The first chart illustrates number of users in millions and the second one - different reasons to use Facebook for men and women.',
 'However, this age-category prevailed in attending art courses.',
 'Industry sector was more popular amongst male and female - 32% male and 11% female.',
 'Graphs provide us information about changes in population size, birth rate and death rate in the USA from 1750 till 2000.',
 'There are 2 kind of transportation: by rail and by road.',
 'The bar chart shows the percentage of overweight girls and boys in the period between 1985 and 2005, and the table below it depicts percentage of boys and girls who did regular physical activity fr

In [11]:
output['corrections']

['Another noticeable similarity is that the number of men and women working in the industrial field is almost the same in both countries.',
 'The chart shows the time spent on sports and exercise in England in 2012 by men and women of different ages.',
 'In the same way, the rate decreased in Latin America, but only by 1 percent.',
 'The first chart illustrates the number of users in millions, and the second one shows different reasons for men and women to use Facebook.',
 'However, this age category prevailed in terms of attending art courses.',
 'The industry sector was more popular among males and females, with 32% male and 11% female.',
 'The graphs provide information about changes in population size, birth rate, and death rate in the USA from 1750 to 2000.',
 'There are two kinds of transportation: by rail and by road.',
 'The bar chart shows the percentage of overweight girls and boys between 1985 and 2005, and the table below it depicts the percentage of boys and girls who enga

In [12]:
# Append the corrections of the most recently parsed chunk, one per line.
# Every sentence is newline-terminated: this file is appended to by more than
# one cell, and '\n'.join() leaves no trailing newline, so the next append
# would otherwise glue its first sentence onto the last line written here.
with open('corrections_test.json', 'a', encoding='utf-8') as f:
    f.writelines(sentence + '\n' for sentence in output['corrections'])

In [205]:
len(corrections)

3558

In [206]:
len(ds['validation']['source'])

3558

In [187]:
chunks[1886]

['The charts shows us goods transportation by rail and by road in Eastern Europe in the year 2008.',
 'Moreover, boys who have not access to primary school are more than girls around the world.',
 'The graph which is painted below demonstrate us the differences in the proportions of population of 65 - years and over people for 100 years between 1940 and 2040 in Japan, Sweden and in the USA.',
 'While the table presents the propotion of them involved in physical activity daily.',
 'All the other changes shown on a chart are smooth and steady.',
 'Over the span of 12 years a number of boys and girls in the rets of world decreased from 10, 6 and 12,8 millions to 5,5 and 7,6 millions respectively.',
 'on the countrary the lowest temputer in Rio de Janeiro is in Julay (about 14 degrees above zero), if we look at graph loketted to Russia, we can notice that in comparing to Brazil the difference between the maximum and minimum temperature is more than 60 degrees.',
 'That goods constitute jus

In [161]:
output['corrections']

['Overall, it can be clearly seen that the mentioned difference decreased in all countries.',
 'Overall, the gap in the amount of investment in renewable energy between developed and developing countries decreased.',
 'The majority of students in this course are 18 to 25 years old, accounting for slightly more than half of the applicants, and the age group with the least number of students was 40 years old and above.',
 'For Indonesia, it can be clearly seen that most men preferred working in services, but for women, there was only a slight difference between the number of employees in the agricultural sector, with 43% of workers, and the services sector, with 49% of workers.',
 'The first countries are the USA, UK, and Japan, whose numbers started at around 40 in 1980 and decreased to less than 15 in 1995.',
 'The rest of the world presents a different picture.',
 'Moreover, it can be said that Japan had less than 5% until 2000.',
 'The agricultural sector in Indonesia, as the table s

In [156]:
# Append the corrections of the most recently parsed chunk, one per line.
# Every sentence is newline-terminated so that a later append to this file
# starts on a fresh line; '\n'.join() would leave the last line open and fuse
# two sentences, breaking the line-by-line pairing with sources.json.
with open('corrections.json', 'a', encoding='utf-8') as f:
    f.writelines(sentence + '\n' for sentence in output['corrections'])

In [159]:
len(corrections_cont_21)

840

In [None]:
# Append the validation corrections, one per line.
# Every sentence is newline-terminated so that a later append to this file
# starts on a fresh line instead of being fused onto the last sentence.
with open('corrections_validation.json', 'a', encoding='utf-8') as f:
    f.writelines(sentence + '\n' for sentence in corrections)

In [167]:
len(ds['train']['source'])

28465

In [168]:
# Append the train source sentences, one per line.
# Every sentence is newline-terminated so that a later append to this file
# starts on a fresh line instead of being fused onto the last sentence.
# NOTE(review): the file is opened in append mode, so re-running this cell
# adds the whole split a second time.
with open('sources.json', 'a', encoding='utf-8') as f:
    f.writelines(sentence + '\n' for sentence in ds['train']['source'])

In [172]:
# Append the sentences in corrections_cont, one per line.
# Every sentence is newline-terminated so that a later append to this file
# starts on a fresh line instead of being fused onto the last sentence.
# NOTE(review): corrections_cont is not assigned in any visible cell.
with open('corrections_clean.txt', 'a', encoding='utf-8') as f:
    f.writelines(sentence + '\n' for sentence in corrections_cont)

In [24]:
corrections_cont[0:10]

['The graph below shows the volume of goods transported between Eastern European countries in 2008.',
 'Overall, it can be seen that the younger the people, the more time they spent on sports and exercises.',
 'Many women use Facebook primarily to share photos and videos with other users or to view funny posts.',
 'It is noticeable that in both countries, there are more male workers than female workers in the industry sector, while in the other two sectors, the percentage depends on the country.',
 'The interesting fact is that the 55-64 age group differs significantly from the other groups.',
 'Additionally, chemicals are transported almost equally by rail and road, although the percentage is relatively high (11-16%).',
 'In conclusion, it is clear that the number of Facebook users started to decline, and the main audience of the network is women who enjoy using the site to share photos and enjoy memes.',
 'The first graphic illustrates the goods that were transported by rail.',
 'Ove

In [173]:
len(chunks)

2847

In [174]:
# Write every source sentence to sources_clean.json, one per line.
# The file is opened once rather than once per chunk, and each sentence is
# newline-terminated. The original wrote '\n'.join(chunk) per chunk with no
# separator between chunks, so the last sentence of each chunk was fused with
# the first sentence of the next one.
with open('sources_clean.json', 'a', encoding='utf-8') as f:
    for chunk_sentences in tqdm(chunks):
        f.writelines(sentence + '\n' for sentence in chunk_sentences)

100%|██████████| 2847/2847 [00:00<00:00, 77484.30it/s]


In [188]:
# Read the saved corrections and sources back, one sentence per line.
# The line terminator is stripped: readlines() keeps the trailing '\n' on each
# entry, and those newlines would otherwise be stored in the 'source' and
# 'target' fields of the train split built from these lists.
with open('corrections.json', 'r', encoding='utf-8') as f:
    corrections_fw = [line.rstrip('\n') for line in f]
with open('sources.json', 'r', encoding='utf-8') as f:
    sources_fw = [line.rstrip('\n') for line in f]

In [189]:
# Sources and corrections are paired by line position, so both files must
# contain the same number of lines. This checks the count only, not that
# line i of one file actually corresponds to line i of the other.
assert len(corrections_fw) == len(sources_fw)

In [None]:
import os

from huggingface_hub import login

# TOKEN is not defined by any cell in this notebook, so on a fresh kernel the
# original call raised NameError. Reuse a TOKEN already present in the session,
# otherwise read the HF_TOKEN environment variable. When neither is set,
# token=None makes login() ask for the token interactively.
TOKEN = globals().get("TOKEN") or os.environ.get("HF_TOKEN")
login(token=TOKEN)

In [208]:
ds['validation']['source']

['Sports and Health cources can be named as the "oldest" because their visitors mostly 26-40 years old- 62% and older than 40 years old - 23% from total amount of people (630 students).',
 'To sum up, young generation prefer communication via the Internet more often than elderly people.',
 'Also, the percentage of 40 years old and above people on different courses is less then half.',
 'It can be clearly seen from the graph that the trend of Japan and Sweden old citizens is similar from 1940 to approximately 1992.',
 'On the second place manufactured goods with 25%.',
 'In 1990 the gradual decline slowed down in UK, USA and New Zealand, while Germany even experienced a significant incline of the income difference.',
 'In sum, Agriculture, Industry and Services had 72, 23, and 105 workers respectively.',
 'The least active category, in terms of time of exercising, is an age group over 75, the time spent on training does not exceed for both genders 30 minutes.',
 'This feature covers Uni

In [207]:
corrections

["Sports and Health courses can be considered the 'oldest' because their visitors are mostly between 26-40 years old (62%) and older than 40 years old (23%) out of a total of 630 students.",
 'To sum up, the young generation prefers communication via the Internet more often than elderly people.',
 'Also, the percentage of people 40 years old and above on different courses is less than half.',
 "It can be clearly seen from the graph that the trend of Japan and Sweden's older citizens is similar from 1940 to approximately 1992.",
 'In second place are manufactured goods, with 25%.',
 'In 1990, the gradual decline slowed down in the UK, USA, and New Zealand, while Germany even experienced a significant increase in the income difference.',
 'In sum, Agriculture, Industry, and Services had 72, 23, and 105 workers, respectively.',
 'The least active category, in terms of exercise time, is the age group over 75, with both genders spending no more than 30 minutes on training.',
 'This feature 

In [209]:
# Column data for each split of the dataset to publish.
# Train pairs come from the lines read back from disk; validation pairs use
# the original validation sources with the in-memory `corrections` list.
train_columns = dict(source=sources_fw, target=corrections_fw)
validation_columns = dict(source=ds['validation']['source'], target=corrections)

dataset_dict = dict(train=train_columns, validation=validation_columns)

In [210]:
from datasets import Dataset, DatasetDict

In [211]:
# Turn each split's column dict into a Dataset and bundle them in a DatasetDict.
train_dataset = Dataset.from_dict(dataset_dict['train'])
validation_dataset = Dataset.from_dict(dataset_dict['validation'])

hf_dataset = DatasetDict({'train': train_dataset, 'validation': validation_dataset})

In [None]:
ds['validation']['source']

In [212]:
# Print the size of each split, then preview the first few rows of each.
# The printed text is unchanged from the original cell.
for split_name in ('train', 'validation'):
    print(f"Dataset contains {len(hf_dataset[split_name])} examples")

for split_name in ('train', 'validation'):
    split = hf_dataset[split_name]
    print("\nFirst 3 examples:")
    # Bound the preview by the length of the split being printed. The original
    # used len(hf_dataset['train']) for the validation preview as well, which
    # would index past the end of a validation split shorter than 3 rows.
    for i in range(min(3, len(split))):
        row = split[i]  # fetch the row once instead of once per field
        print(f"Source: {row['source']}")
        print(f"Target: {row['target']}")

Dataset contains 28497 examples
Dataset contains 3558 examples

First 3 examples:
Source: First of all, we can see increasing tendency of overweighting during the hole period.

Target: First of all, we can see an increasing tendency of overweight during the whole period.

Source: Food products were mostly transportaded by the road.

Target: Food products were mostly transported by road.

Source: For the same period of life, man dramastically decreases spent time from 282,1 minutes to 60,5 minutes.

Target: For the same period of life, men dramatically decrease the time spent from 282.1 minutes to 60.5 minutes.


First 3 examples:
Source: Sports and Health cources can be named as the "oldest" because their visitors mostly 26-40 years old- 62% and older than 40 years old - 23% from total amount of people (630 students).
Target: Sports and Health courses can be considered the 'oldest' because their visitors are mostly between 26-40 years old (62%) and older than 40 years old (23%) out of 

In [214]:
# Hub account and dataset name; joined as "<username>/<dataset_name>" to form
# the repository id used when pushing.
username = "Zlovoblachko"
dataset_name = "Llama-REALEC-GEC-train_val"

In [215]:
# Upload the train and validation splits to the Hub repository
# "<username>/<dataset_name>".
hf_dataset.push_to_hub(f"{username}/{dataset_name}")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/29 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Zlovoblachko/Llama-REALEC-GEC-train_val/commit/9839b681894165a151a9dd7599a96c1f85dbe047', commit_message='Upload dataset', commit_description='', oid='9839b681894165a151a9dd7599a96c1f85dbe047', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Zlovoblachko/Llama-REALEC-GEC-train_val', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Zlovoblachko/Llama-REALEC-GEC-train_val'), pr_revision=None, pr_num=None)