# Prepare dataset (jsonl file)

Datasets with **features as text** for **all-in-one predictions**

https://jsonlines.org/

## Libraries

In [1]:
import os
import json
import pandas as pd
import random

## Load csv file

In [2]:
data_dir = os.path.join(os.getcwd(), "data")

In [3]:
df = pd.read_csv(os.path.join(data_dir, "persuasive_essays_dataset.csv"), index_col=0)

In [4]:
#Â df.isna().sum()

In [5]:
df.head()

Unnamed: 0,tag,label,start,end,argument_component,essay_file,essay_title,essay_text,sentence,nr_essay_paragraphs,paragraph_nr,paragraph,is_component_in_intro_paragraph,is_component_in_conclusion_paragraph,is_component_first_in_paragraph,is_component_last_in_paragraph,split,structral_featxt,argument_counter
0,T1,MajorClaim,503,575,we should attach more importance to cooperatio...,essay001.txt,Should students be taught to compete or to coo...,Should students be taught to compete or to coo...,"From this point of view, I firmly believe that...",4,1,It is always said that competition can effecti...,1,0,1,1,TRAIN,Topic: Should students be taught to compete or...,1
1,T3,Claim,591,714,"through cooperation, children can learn about ...",essay001.txt,Should students be taught to compete or to coo...,Should students be taught to compete or to coo...,"First of all, through cooperation, children ca...",4,2,"First of all, through cooperation, children ca...",0,0,1,0,TRAIN,Topic: Should students be taught to compete or...,2
2,T4,Premise,716,851,What we acquired from team work is not only ho...,essay001.txt,Should students be taught to compete or to coo...,Should students be taught to compete or to coo...,What we acquired from team work is not only ho...,4,2,"First of all, through cooperation, children ca...",0,0,0,0,TRAIN,Topic: Should students be taught to compete or...,3
3,T5,Premise,853,1086,"During the process of cooperation, children ca...",essay001.txt,Should students be taught to compete or to coo...,Should students be taught to compete or to coo...,"During the process of cooperation, children ca...",4,2,"First of all, through cooperation, children ca...",0,0,0,0,TRAIN,Topic: Should students be taught to compete or...,4
4,T6,Premise,1088,1191,All of these skills help them to get on well w...,essay001.txt,Should students be taught to compete or to coo...,Should students be taught to compete or to coo...,All of these skills help them to get on well w...,4,2,"First of all, through cooperation, children ca...",0,0,0,1,TRAIN,Topic: Should students be taught to compete or...,5


In [6]:
len(df)

6089

In [7]:
df.split.value_counts()

split
TRAIN    4823
TEST     1266
Name: count, dtype: int64

In [8]:
train_essays_l = list(df[df.split=="TRAIN"].essay_file.value_counts().index)
len(train_essays_l)

322

In [9]:
# validation set: 10% of train set

val_size = int(322 * 10/100)
val_size

32

In [10]:
random.seed(42)
val_essays_l = random.sample(train_essays_l, val_size)

In [11]:
len(val_essays_l)
# val_essays_l

32

In [12]:
train_essays_l = list(set(train_essays_l) - set(val_essays_l))
len(train_essays_l)

290

In [13]:
test_essays_l = list(df[df.split=="TEST"].essay_file.value_counts().index)
len(test_essays_l)

80

In [14]:
test_essays_l

['essay169.txt',
 'essay154.txt',
 'essay289.txt',
 'essay243.txt',
 'essay252.txt',
 'essay310.txt',
 'essay172.txt',
 'essay221.txt',
 'essay373.txt',
 'essay149.txt',
 'essay202.txt',
 'essay061.txt',
 'essay180.txt',
 'essay331.txt',
 'essay182.txt',
 'essay220.txt',
 'essay234.txt',
 'essay204.txt',
 'essay006.txt',
 'essay278.txt',
 'essay097.txt',
 'essay126.txt',
 'essay328.txt',
 'essay021.txt',
 'essay265.txt',
 'essay212.txt',
 'essay364.txt',
 'essay142.txt',
 'essay229.txt',
 'essay301.txt',
 'essay211.txt',
 'essay245.txt',
 'essay306.txt',
 'essay322.txt',
 'essay348.txt',
 'essay359.txt',
 'essay266.txt',
 'essay398.txt',
 'essay193.txt',
 'essay163.txt',
 'essay129.txt',
 'essay104.txt',
 'essay086.txt',
 'essay117.txt',
 'essay218.txt',
 'essay187.txt',
 'essay341.txt',
 'essay255.txt',
 'essay241.txt',
 'essay091.txt',
 'essay287.txt',
 'essay316.txt',
 'essay277.txt',
 'essay052.txt',
 'essay077.txt',
 'essay072.txt',
 'essay355.txt',
 'essay199.txt',
 'essay227.txt

## Prepare prompt

In [15]:
# Dataset in chat completion format

def formatting_fct(task_description="", question="", answer="", mode="train"):
    
    prompt_d = {"messages": [
        {"role": "system", "content": f"{task_description}"},
        {"role": "user", "content": f"{question}"},
        {"role": "assistant", "content": f"{answer if mode=='train' else ''}"}
    ]
             }
    
    return prompt_d

In [68]:
my_task_description = """### Your task is to classify an ordered list of argument components from an essay. Each argument component must be classified into one of the three possible classes: 'major claim', 'claim', or 'premise'. For this purpose, we give you the following information in order:
A. The essay in which the argument components appear
B. The ordered list of argument components
"""

In [69]:
print(my_task_description)

### Your task is to classify an ordered list of argument components from an essay. Each argument component must be classified into one of the three possible classes: 'major claim', 'claim', or 'premise'. For this purpose, we give you the following information in order:
A. The essay in which the argument components appear
B. The ordered list of argument components



In [70]:
def build_question(essay_text, argument_components):
    
    ac_l = ""
    for i, x in enumerate(argument_components):
        ac_l = ac_l + f"""argument {i+1}: {x}\n"""
    
    question = f"""### Here is the information about the argument components:
A. Essay:\n{essay_text}\n
B. Ordered list of argument components:\n{ac_l}""" + """
You must return a JSON with format {'Argument 1': <predicted class for Argument 1 (str)>, ..., 'Argument n': <predicted class for Argument n (str)>}. The 3 possible classes are: 'major claim', 'claim' or 'premise'.
"""
    return question

In [71]:
df.columns

Index(['tag', 'label', 'start', 'end', 'argument_component', 'essay_file',
       'essay_title', 'essay_text', 'sentence', 'nr_essay_paragraphs',
       'paragraph_nr', 'paragraph', 'is_component_in_intro_paragraph',
       'is_component_in_conclusion_paragraph',
       'is_component_first_in_paragraph', 'is_component_last_in_paragraph',
       'split', 'structral_featxt', 'argument_counter'],
      dtype='object')

In [72]:
essay_file = train_essays_l[0]
essay_sub_df = df[df.essay_file==essay_file]
essay_text = essay_sub_df.iloc[0].essay_text
argument_components = list(essay_sub_df.argument_component.values)
argument_labels = list(essay_sub_df.label.values)

question = build_question(essay_text, argument_components)
print(question)

### Here is the information about the argument components:
A. Essay:
Reading fictions is more pleasant

There are many different opinions regarding hobbies. Some people would rather watch TV. However, in my view, reading fiction is more enjoyable than watching movies for the following reasons.
Firstly, fiction books pay more attention to details. In fact, we can know characters, places, and times of a novel very well during reading it. Most of the novels describe not only main characters but also other people in the novels, where each scene is happening. For example, after I read animal farms which is a story of Russian revolution, I could understand the leaders and different segments of the Russian people. However, a movie just gives us a very small review of its stories and characters.
Secondly, reading a novel is more convenient. Some novels have around 100 pages and readers cannot finish them in just one hour. Yet these people can continue to read their novels wherever they find fr

In [73]:
def build_answer(argument_labels):
    
    map_d = {"MajorClaim": "major claim", "Claim": "claim", "Premise": "premise"}
    
    return {f"Argument {i+1}": map_d[x] for i, x in enumerate(argument_labels)}

In [74]:
answer = build_answer(argument_labels)
print(answer)

{'Argument 1': 'major claim', 'Argument 2': 'claim', 'Argument 3': 'premise', 'Argument 4': 'premise', 'Argument 5': 'premise', 'Argument 6': 'premise', 'Argument 7': 'claim', 'Argument 8': 'premise', 'Argument 9': 'premise', 'Argument 10': 'premise', 'Argument 11': 'premise', 'Argument 12': 'premise', 'Argument 13': 'premise', 'Argument 14': 'claim', 'Argument 15': 'premise', 'Argument 16': 'premise', 'Argument 17': 'premise', 'Argument 18': 'premise', 'Argument 19': 'major claim', 'Argument 20': 'claim'}


In [75]:
print(formatting_fct(my_task_description, question, answer, mode="train"))

{'messages': [{'role': 'system', 'content': "### Your task is to classify an ordered list of argument components from an essay. Each argument component must be classified into one of the three possible classes: 'major claim', 'claim', or 'premise'. For this purpose, we give you the following information in order:\nA. The essay in which the argument components appear\nB. The ordered list of argument components\n"}, {'role': 'user', 'content': "### Here is the information about the argument components:\nA. Essay:\nReading fictions is more pleasant\n\nThere are many different opinions regarding hobbies. Some people would rather watch TV. However, in my view, reading fiction is more enjoyable than watching movies for the following reasons.\nFirstly, fiction books pay more attention to details. In fact, we can know characters, places, and times of a novel very well during reading it. Most of the novels describe not only main characters but also other people in the novels, where each scene i

## Prepare data files

### Train set

In [76]:
data_file_train = []

for essay_file in train_essays_l:
    
    essay_sub_df = df[df.essay_file==essay_file]
    essay_text = essay_sub_df.iloc[0].essay_text
    argument_components = list(essay_sub_df.argument_component.values)
    argument_labels = list(essay_sub_df.label.values)

    question = build_question(essay_text, argument_components)
    answer = build_answer(argument_labels)
    
    data_file_train.append( formatting_fct(my_task_description, question, answer, mode="train") )

In [77]:
len(data_file_train)

290

In [78]:
for i in range(3):
    
    print(data_file_train[i])
    print()

{'messages': [{'role': 'system', 'content': "### Your task is to classify an ordered list of argument components from an essay. Each argument component must be classified into one of the three possible classes: 'major claim', 'claim', or 'premise'. For this purpose, we give you the following information in order:\nA. The essay in which the argument components appear\nB. The ordered list of argument components\n"}, {'role': 'user', 'content': "### Here is the information about the argument components:\nA. Essay:\nReading fictions is more pleasant\n\nThere are many different opinions regarding hobbies. Some people would rather watch TV. However, in my view, reading fiction is more enjoyable than watching movies for the following reasons.\nFirstly, fiction books pay more attention to details. In fact, we can know characters, places, and times of a novel very well during reading it. Most of the novels describe not only main characters but also other people in the novels, where each scene i

### Validation set

In [79]:
data_file_val = []

for essay_file in val_essays_l:
    
    essay_sub_df = df[df.essay_file==essay_file]
    essay_text = essay_sub_df.iloc[0].essay_text
    argument_components = list(essay_sub_df.argument_component.values)
    argument_labels = list(essay_sub_df.label.values)

    question = build_question(essay_text, argument_components)
    answer = build_answer(argument_labels)
    
    data_file_val.append( formatting_fct(my_task_description, question, answer, mode="train") )

In [80]:
len(data_file_val)

32

In [81]:
for i in range(3):
    
    print(data_file_val[i])
    print()

{'messages': [{'role': 'system', 'content': "### Your task is to classify an ordered list of argument components from an essay. Each argument component must be classified into one of the three possible classes: 'major claim', 'claim', or 'premise'. For this purpose, we give you the following information in order:\nA. The essay in which the argument components appear\nB. The ordered list of argument components\n"}, {'role': 'user', 'content': '### Here is the information about the argument components:\nA. Essay:\nSome people believe that the Earth is being harmed by human\n\nWhether human activity is making the Earth a better place to live or damaging it is a debatable issue. Some people advocate the idea that human activity is advantageously influencing to the Earth. However, I strongly believe that human activities are having a bad effect upon our planet, for human are polluting the Earth\'s environment, exhausting natural resource and threatening the living of other species.\nPeople 

### Test set

In [82]:
data_file_test = []

for essay_file in test_essays_l:
    
    essay_sub_df = df[df.essay_file==essay_file]
    essay_text = essay_sub_df.iloc[0].essay_text
    argument_components = list(essay_sub_df.argument_component.values)
    argument_labels = list(essay_sub_df.label.values)

    question = build_question(essay_text, argument_components)
    answer = build_answer(argument_labels)
    
    data_file_test.append( formatting_fct(my_task_description, question, answer, mode="test") )

In [83]:
len(data_file_test)

80

In [84]:
for i in range(3):
    
    print(data_file_test[i])
    print()

{'messages': [{'role': 'system', 'content': "### Your task is to classify an ordered list of argument components from an essay. Each argument component must be classified into one of the three possible classes: 'major claim', 'claim', or 'premise'. For this purpose, we give you the following information in order:\nA. The essay in which the argument components appear\nB. The ordered list of argument components\n"}, {'role': 'user', 'content': "### Here is the information about the argument components:\nA. Essay:\nThe precondition of doing research by professors\n\nNowadays, many professors conduct research while teaching in colleges or universities. Although research could bring funding and latest achievements in the field, the research takes up too much teaching time. As far as I am concerned, professors should spend more time on preparing courses than research.\nTo begin with, it is vital that professors should assist students to acquire knowledge. The professors' duty is to nurture s

## Save `jsonl` files

In [85]:
file_name = "data_train_v3b.jsonl"

with open(os.path.join(data_dir, file_name), 'w') as fh:
    
    for entry in data_file_train:
        
        json.dump(entry, fh)
        fh.write('\n')

In [86]:
file_name = "data_val_v3b.jsonl"

with open(os.path.join(data_dir, file_name), 'w') as fh:
    
    for entry in data_file_val:
        
        json.dump(entry, fh)
        fh.write('\n')

In [87]:
file_name = "data_test_v3b.jsonl"

with open(os.path.join(data_dir, file_name), 'w') as fh:
    
    for entry in data_file_test:
        
        json.dump(entry, fh)
        fh.write('\n')