# Prepare dataset (jsonl file)

Datasets with **features as text (explicit form)** for **all-in-one predictions**

https://jsonlines.org/

## Libraries

In [1]:
import os
import json
import pandas as pd
import random

## Load csv file

In [2]:
data_dir = os.path.join(os.getcwd(), "data")

In [3]:
df = pd.read_csv(os.path.join(data_dir, "persuasive_essays_dataset.csv"), index_col=0)

In [4]:
# df.isna().sum()

In [5]:
df.head()

Unnamed: 0,tag,label,start,end,argument_component,essay_file,essay_title,essay_text,sentence,nr_essay_paragraphs,paragraph_nr,paragraph,is_component_in_intro_paragraph,is_component_in_conclusion_paragraph,is_component_first_in_paragraph,is_component_last_in_paragraph,split,structral_featxt,argument_counter
0,T1,MajorClaim,503,575,we should attach more importance to cooperatio...,essay001.txt,Should students be taught to compete or to coo...,Should students be taught to compete or to coo...,"From this point of view, I firmly believe that...",4,1,It is always said that competition can effecti...,1,0,1,1,TRAIN,Topic: Should students be taught to compete or...,1
1,T3,Claim,591,714,"through cooperation, children can learn about ...",essay001.txt,Should students be taught to compete or to coo...,Should students be taught to compete or to coo...,"First of all, through cooperation, children ca...",4,2,"First of all, through cooperation, children ca...",0,0,1,0,TRAIN,Topic: Should students be taught to compete or...,2
2,T4,Premise,716,851,What we acquired from team work is not only ho...,essay001.txt,Should students be taught to compete or to coo...,Should students be taught to compete or to coo...,What we acquired from team work is not only ho...,4,2,"First of all, through cooperation, children ca...",0,0,0,0,TRAIN,Topic: Should students be taught to compete or...,3
3,T5,Premise,853,1086,"During the process of cooperation, children ca...",essay001.txt,Should students be taught to compete or to coo...,Should students be taught to compete or to coo...,"During the process of cooperation, children ca...",4,2,"First of all, through cooperation, children ca...",0,0,0,0,TRAIN,Topic: Should students be taught to compete or...,4
4,T6,Premise,1088,1191,All of these skills help them to get on well w...,essay001.txt,Should students be taught to compete or to coo...,Should students be taught to compete or to coo...,All of these skills help them to get on well w...,4,2,"First of all, through cooperation, children ca...",0,0,0,1,TRAIN,Topic: Should students be taught to compete or...,5


In [6]:
len(df)

6089

In [7]:
df.split.value_counts()

split
TRAIN    4823
TEST     1266
Name: count, dtype: int64

In [8]:
train_essays_l = list(df[df.split=="TRAIN"].essay_file.value_counts().index)
len(train_essays_l)

322

In [9]:
# validation set: 10% of train set

val_size = int(322 * 10/100)
val_size

32

In [10]:
random.seed(42)
val_essays_l = random.sample(train_essays_l, val_size)

In [11]:
len(val_essays_l)
# val_essays_l

32

In [12]:
train_essays_l = list(set(train_essays_l) - set(val_essays_l))
len(train_essays_l)

290

In [13]:
test_essays_l = list(df[df.split=="TEST"].essay_file.value_counts().index)
len(test_essays_l)

80

In [14]:
# test_essays_l

## Prepare prompt

In [15]:
# Dataset in chat completion format

def formatting_fct(task_description="", question="", answer="", mode="train"):
    
    prompt_d = {"messages": [
        {"role": "system", "content": f"{task_description}"},
        {"role": "user", "content": f"{question}"},
        {"role": "assistant", "content": f"{answer if mode=='train' else ''}"}
    ]
             }
    
    return prompt_d

In [16]:
my_task_description = """### Your task is to classify an ordered list of argument components from an essay into three possible classes: major claim, claim, or premise. For this purpose, we give you the following information in order:
A. The essay in which the argument components appear
B. The ordered list of argument components, where each component is followed by 5 structural features: paragraph number of the component, is the component in the introduction (1=yes/0=no), is the component in the conclusion (1=yes/0=no), is the component first in its paragraph (1=yes/0=no), is the component last in its paragraph (1=yes/0=no).
"""

In [17]:
print(my_task_description)

### Your task is to classify an ordered list of argument components from an essay into three possible classes: major claim, claim, or premise. For this purpose, we give you the following information in order:
A. The essay in which the argument components appear
B. The ordered list of argument components, where each component is followed by 5 structural features: paragraph number of the component, is the component in the introduction (1=yes/0=no), is the component in the conclusion (1=yes/0=no), is the component first in its paragraph (1=yes/0=no), is the component last in its paragraph (1=yes/0=no).



In [18]:
df.columns

Index(['tag', 'label', 'start', 'end', 'argument_component', 'essay_file',
       'essay_title', 'essay_text', 'sentence', 'nr_essay_paragraphs',
       'paragraph_nr', 'paragraph', 'is_component_in_intro_paragraph',
       'is_component_in_conclusion_paragraph',
       'is_component_first_in_paragraph', 'is_component_last_in_paragraph',
       'split', 'structral_featxt', 'argument_counter'],
      dtype='object')

In [19]:
def build_question(essay_text, argument_components, structural_features):
    
    ac_l = ""
    
    for i, x in enumerate(argument_components):
        
        ac_l = ac_l + f"""argument {i+1}: {x}
Structural features {i+1}: {structural_features["paragraph_nr"][i]}, {structural_features["is_component_in_intro_paragraph"][i]}, {structural_features["is_component_in_conclusion_paragraph"][i]}, {structural_features["is_component_first_in_paragraph"][i]}, {structural_features["is_component_last_in_paragraph"][i]}.
"""
    
    question = f"""### Here is the information about the argument components:
A. Essay:\n{essay_text}

B. Ordered list of argument components:\n{ac_l}
You must return the list of classes of the sucessive argument components given in point B in format [<class of argument 1>,...,<class of argument n>]. The list must be of lenght n, where n is the number of components. The 3 possible classes are major claim, claim or premise.
"""
    
    return question

In [20]:
df.columns

Index(['tag', 'label', 'start', 'end', 'argument_component', 'essay_file',
       'essay_title', 'essay_text', 'sentence', 'nr_essay_paragraphs',
       'paragraph_nr', 'paragraph', 'is_component_in_intro_paragraph',
       'is_component_in_conclusion_paragraph',
       'is_component_first_in_paragraph', 'is_component_last_in_paragraph',
       'split', 'structral_featxt', 'argument_counter'],
      dtype='object')

In [21]:
essay_file = train_essays_l[0]
essay_sub_df = df[df.essay_file==essay_file]
essay_text = essay_sub_df.iloc[0].essay_text
argument_components = list(essay_sub_df.argument_component.values)
structural_fts = essay_sub_df[["paragraph_nr",
                               "is_component_in_intro_paragraph",
                               "is_component_in_conclusion_paragraph", 
                               "is_component_first_in_paragraph", 
                               "is_component_last_in_paragraph"]].to_dict(orient="list")
argument_labels = list(essay_sub_df.label.values)

question = build_question(essay_text, argument_components, structural_fts)
print(question)

### Here is the information about the argument components:
A. Essay:
Advertising impact on people's choice of consumption

Nowdays, people's choice of consumption is heavily affected by various of advertising on television, street, magazine etc. This argument may be true that people will be swayed by the advertising and buy things they do not need it for their life. For example, in most asian countries, advertisement company usually hire popular singers or actors to advertise the products to boosts the sales, and some people, especially adolescent may buy things from their favourite singer advertised to support their idol instead of to fill their needs.
In addition, products usually look more gorgeous and with good quality in advertisement, people are more likely to buy goods impulsively without a second thought. A good advertisement can convince people to believe that it is what they want, even though it is not the real needs of the society. And there are always people purchase partic

In [22]:
def build_answer(argument_labels):
    
    map_d = {"MajorClaim": "major claim", "Claim": "claim", "Premise": "premise"}
    
    return [map_d[x] for x in argument_labels]

In [23]:
answer = build_answer(argument_labels)
print(answer)

['claim', 'premise', 'premise', 'claim', 'premise', 'premise', 'claim', 'premise', 'premise', 'premise', 'claim', 'premise', 'major claim', 'premise', 'claim']


In [24]:
print(formatting_fct(my_task_description, question, answer, mode="train"))

{'messages': [{'role': 'system', 'content': '### Your task is to classify an ordered list of argument components from an essay into three possible classes: major claim, claim, or premise. For this purpose, we give you the following information in order:\nA. The essay in which the argument components appear\nB. The ordered list of argument components, where each component is followed by 5 structural features: paragraph number of the component, is the component in the introduction (1=yes/0=no), is the component in the conclusion (1=yes/0=no), is the component first in its paragraph (1=yes/0=no), is the component last in its paragraph (1=yes/0=no).\n'}, {'role': 'user', 'content': "### Here is the information about the argument components:\nA. Essay:\nAdvertising impact on people's choice of consumption\n\nNowdays, people's choice of consumption is heavily affected by various of advertising on television, street, magazine etc. This argument may be true that people will be swayed by the ad

## Prepare data files

### Train set

In [25]:
data_file_train = []

for essay_file in train_essays_l:
    
    essay_sub_df = df[df.essay_file==essay_file]
    essay_text = essay_sub_df.iloc[0].essay_text
    argument_components = list(essay_sub_df.argument_component.values)
    structural_fts = essay_sub_df[["paragraph_nr",
                               "is_component_in_intro_paragraph",
                               "is_component_in_conclusion_paragraph", 
                               "is_component_first_in_paragraph", 
                               "is_component_last_in_paragraph"]].to_dict(orient="list")
    argument_labels = list(essay_sub_df.label.values)

    question = build_question(essay_text, argument_components, structural_fts)
    answer = build_answer(argument_labels)
    
    data_file_train.append( formatting_fct(my_task_description, question, answer, mode="train") )

In [26]:
len(data_file_train)

290

In [27]:
for i in range(3):
    
    print(data_file_train[i])
    print()

{'messages': [{'role': 'system', 'content': '### Your task is to classify an ordered list of argument components from an essay into three possible classes: major claim, claim, or premise. For this purpose, we give you the following information in order:\nA. The essay in which the argument components appear\nB. The ordered list of argument components, where each component is followed by 5 structural features: paragraph number of the component, is the component in the introduction (1=yes/0=no), is the component in the conclusion (1=yes/0=no), is the component first in its paragraph (1=yes/0=no), is the component last in its paragraph (1=yes/0=no).\n'}, {'role': 'user', 'content': "### Here is the information about the argument components:\nA. Essay:\nAdvertising impact on people's choice of consumption\n\nNowdays, people's choice of consumption is heavily affected by various of advertising on television, street, magazine etc. This argument may be true that people will be swayed by the ad

### Validation set

In [28]:
data_file_val = []

for essay_file in val_essays_l:
    
    essay_sub_df = df[df.essay_file==essay_file]
    essay_text = essay_sub_df.iloc[0].essay_text
    argument_components = list(essay_sub_df.argument_component.values)
    argument_labels = list(essay_sub_df.label.values)
    structural_fts = essay_sub_df[["paragraph_nr",
                               "is_component_in_intro_paragraph",
                               "is_component_in_conclusion_paragraph", 
                               "is_component_first_in_paragraph", 
                               "is_component_last_in_paragraph"]].to_dict(orient="list")
    question = build_question(essay_text, argument_components, structural_fts)
    answer = build_answer(argument_labels)
    
    data_file_val.append( formatting_fct(my_task_description, question, answer, mode="train") )

In [29]:
len(data_file_val)

32

In [30]:
for i in range(3):
    
    print(data_file_val[i])
    print()

{'messages': [{'role': 'system', 'content': '### Your task is to classify an ordered list of argument components from an essay into three possible classes: major claim, claim, or premise. For this purpose, we give you the following information in order:\nA. The essay in which the argument components appear\nB. The ordered list of argument components, where each component is followed by 5 structural features: paragraph number of the component, is the component in the introduction (1=yes/0=no), is the component in the conclusion (1=yes/0=no), is the component first in its paragraph (1=yes/0=no), is the component last in its paragraph (1=yes/0=no).\n'}, {'role': 'user', 'content': '### Here is the information about the argument components:\nA. Essay:\nSome people believe that the Earth is being harmed by human\n\nWhether human activity is making the Earth a better place to live or damaging it is a debatable issue. Some people advocate the idea that human activity is advantageously influen

### Test set

In [31]:
data_file_test = []

for essay_file in test_essays_l:
    
    essay_sub_df = df[df.essay_file==essay_file]
    essay_text = essay_sub_df.iloc[0].essay_text
    argument_components = list(essay_sub_df.argument_component.values)
    argument_labels = list(essay_sub_df.label.values)
    structural_fts = essay_sub_df[["paragraph_nr",
                               "is_component_in_intro_paragraph",
                               "is_component_in_conclusion_paragraph", 
                               "is_component_first_in_paragraph", 
                               "is_component_last_in_paragraph"]].to_dict(orient="list")
    question = build_question(essay_text, argument_components, structural_fts)
    answer = build_answer(argument_labels)
    
    data_file_test.append( formatting_fct(my_task_description, question, answer, mode="test") )

In [32]:
len(data_file_test)

80

In [33]:
for i in range(3):
    
    print(data_file_test[i])
    print()

{'messages': [{'role': 'system', 'content': '### Your task is to classify an ordered list of argument components from an essay into three possible classes: major claim, claim, or premise. For this purpose, we give you the following information in order:\nA. The essay in which the argument components appear\nB. The ordered list of argument components, where each component is followed by 5 structural features: paragraph number of the component, is the component in the introduction (1=yes/0=no), is the component in the conclusion (1=yes/0=no), is the component first in its paragraph (1=yes/0=no), is the component last in its paragraph (1=yes/0=no).\n'}, {'role': 'user', 'content': "### Here is the information about the argument components:\nA. Essay:\nThe precondition of doing research by professors\n\nNowadays, many professors conduct research while teaching in colleges or universities. Although research could bring funding and latest achievements in the field, the research takes up too 

## Save `jsonl` files

In [34]:
file_name = "data_train_v4.jsonl"

with open(os.path.join(data_dir, file_name), 'w') as fh:
    
    for entry in data_file_train:
        
        json.dump(entry, fh)
        fh.write('\n')

In [35]:
file_name = "data_val_v4.jsonl"

with open(os.path.join(data_dir, file_name), 'w') as fh:
    
    for entry in data_file_val:
        
        json.dump(entry, fh)
        fh.write('\n')

In [36]:
file_name = "data_test_v4.jsonl"

with open(os.path.join(data_dir, file_name), 'w') as fh:
    
    for entry in data_file_test:
        
        json.dump(entry, fh)
        fh.write('\n')