In [1]:
# Create sample dataset from the askreddit.csv file.
# The askreddit.csv file has the following format: the first row is the header, and the rest of the rows are the questions. The second column is the question, and the first column is the question id.
# The dataset should be a jsonl file with one JSON object per line, in the following format:
# {"messages": [{"role": "system", "content": "You are an reddit user posting on r/AskReddit."}, {"role": "user", "content": "Ask a question for the users of r/AskReddit"}, {"role": "assistant", "content": "Question from the askreddit.csv file"}]}

import csv
import json

# Prompts used by create_jsonl_file for the system and user turns of each record.
# Both strings are written verbatim into the output dataset.
system_message_post = "You are an reddit user posting on r/AskReddit."
user_message_post = "Ask a question for the users of r/AskReddit"

# Create a jsonl file from the askreddit.csv file
def create_jsonl_file(input_file, output_file, system_message=None, user_message=None):
    """Convert a questions CSV into a chat-format JSONL file.

    The first CSV row is treated as a header and skipped. For every remaining
    row, the second column is taken as the question text and one line of the
    form ``{"messages": [system, user, assistant]}`` is written, with the
    question text as the assistant turn.

    Args:
        input_file: Path of the UTF-8 CSV file to read.
        output_file: Path of the UTF-8 JSONL file to (over)write.
        system_message: Content of the system turn. Defaults to the
            module-level ``system_message_post``.
        user_message: Content of the user turn. Defaults to the
            module-level ``user_message_post``.

    Rows with fewer than two columns (including blank lines, which the csv
    reader yields as empty lists) are skipped. An input file with no rows
    produces an empty output file.
    """
    # Resolve defaults at call time so the notebook-level constants stay the
    # source of truth when the caller does not override them.
    if system_message is None:
        system_message = system_message_post
    if user_message is None:
        user_message = user_message_post

    # newline='' hands line-ending handling to the csv module, so quoted
    # fields that contain line breaks are read back unchanged.
    with open(input_file, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header; the default avoids StopIteration on an empty file
        question_texts = [row[1] for row in reader if len(row) >= 2]

    with open(output_file, 'w', encoding='utf-8') as f:
        for question_text in question_texts:
            messages = [{"role": "system", "content": system_message},
                        {"role": "user", "content": user_message},
                        {"role": "assistant", "content": question_text}]
            # ensure_ascii=False keeps non-ASCII characters readable in the file.
            json.dump({"messages": messages}, f, ensure_ascii=False)
            f.write('\n')

# Build the training set of posts from the questions CSV.
create_jsonl_file(
    input_file='askreddit.csv',
    output_file='askreddit.jsonl',
)

In [2]:
# Create a second dataset with data from the askreddit.csv and the askreddit_comments.csv files
# The askreddit_comments.csv file has the following format: the first row is the header, and the rest of the rows are the comments. The first column is the question id, the second column is the comment id, and the third column is the comment.
# The dataset should be a jsonl file with one JSON object per line, in the following format:
# {"messages": [{"role": "system", "content": "You are an reddit user commenting on a question on r/AskReddit."}, {"role": "user", "content": "Question from the askreddit.csv file"}, {"role": "assistant", "content": "Comment from the askreddit_comments.csv file"}]}
# There are multiple comments for each question, so the dataset should have one record (line) per question/comment pair.

# Prompt used by create_jsonl_file_with_comments for the system turn of each record.
# The string is written verbatim into the output dataset.
system_message_comment = "You are an reddit user commenting on a question on r/AskReddit."

def create_jsonl_file_with_comments(questions_file, comments_file, output_file, system_message=None):
    """Join questions with their comments and write chat-format JSONL.

    Both CSV files are read with their first row skipped as a header. In the
    questions file, column 0 is the question id and column 1 the question
    text. In the comments file, column 0 is the question id and column 2 the
    comment text. One line ``{"messages": [system, user, assistant]}`` is
    written per matching (question, comment) pair, with the question as the
    user turn and the comment as the assistant turn.

    Output order follows the questions file; within a question, comments
    keep the order they have in the comments file. Comments whose question
    id does not appear in the questions file are not written.

    Args:
        questions_file: Path of the UTF-8 questions CSV.
        comments_file: Path of the UTF-8 comments CSV.
        output_file: Path of the UTF-8 JSONL file to (over)write.
        system_message: Content of the system turn. Defaults to the
            module-level ``system_message_comment``.

    Question rows with fewer than two columns and comment rows with fewer
    than three columns (including blank lines) are skipped.
    """
    # Resolve the default at call time so the notebook-level constant stays
    # the source of truth when the caller does not override it.
    if system_message is None:
        system_message = system_message_comment

    # newline='' hands line-ending handling to the csv module, so quoted
    # fields that contain line breaks are read back unchanged.
    with open(questions_file, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header; the default avoids StopIteration on an empty file
        questions = [(row[0], row[1]) for row in reader if len(row) >= 2]

    # Group comment texts by question id in one pass instead of rescanning
    # every comment for every question.
    comments_by_question = {}
    with open(comments_file, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)
        for row in reader:
            if len(row) >= 3:
                comments_by_question.setdefault(row[0], []).append(row[2])

    with open(output_file, 'w', encoding='utf-8') as f:
        for question_id, question_text in questions:
            for comment_text in comments_by_question.get(question_id, ()):
                messages = [{"role": "system", "content": system_message},
                            {"role": "user", "content": question_text},
                            {"role": "assistant", "content": comment_text}]
                # ensure_ascii=False keeps non-ASCII characters readable in the file.
                json.dump({"messages": messages}, f, ensure_ascii=False)
                f.write('\n')

# Build the training set of question/comment pairs.
create_jsonl_file_with_comments(
    questions_file='askreddit.csv',
    comments_file='askreddit_comments.csv',
    output_file='askreddit_comments.jsonl',
)

In [1]:
# Create a validation set with the same format as the training set for the askreddit.jsonl file
import csv
import json

# Prompts used by create_jsonl_file for the system and user turns of each record.
# Both strings are written verbatim into the output dataset.
system_message_post = "You are an reddit user posting on r/AskReddit."
user_message_post = "Ask a question for the users of r/AskReddit"

# Create a jsonl file from the askreddit.csv file
def create_jsonl_file(input_file, output_file, system_message=None, user_message=None):
    """Convert a questions CSV into a chat-format JSONL file.

    The first CSV row is treated as a header and skipped. For every remaining
    row, the second column is taken as the question text and one line of the
    form ``{"messages": [system, user, assistant]}`` is written, with the
    question text as the assistant turn.

    Args:
        input_file: Path of the UTF-8 CSV file to read.
        output_file: Path of the UTF-8 JSONL file to (over)write.
        system_message: Content of the system turn. Defaults to the
            module-level ``system_message_post``.
        user_message: Content of the user turn. Defaults to the
            module-level ``user_message_post``.

    Rows with fewer than two columns (including blank lines, which the csv
    reader yields as empty lists) are skipped. An input file with no rows
    produces an empty output file.
    """
    # Resolve defaults at call time so the notebook-level constants stay the
    # source of truth when the caller does not override them.
    if system_message is None:
        system_message = system_message_post
    if user_message is None:
        user_message = user_message_post

    # newline='' hands line-ending handling to the csv module, so quoted
    # fields that contain line breaks are read back unchanged.
    with open(input_file, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header; the default avoids StopIteration on an empty file
        question_texts = [row[1] for row in reader if len(row) >= 2]

    with open(output_file, 'w', encoding='utf-8') as f:
        for question_text in question_texts:
            messages = [{"role": "system", "content": system_message},
                        {"role": "user", "content": user_message},
                        {"role": "assistant", "content": question_text}]
            # ensure_ascii=False keeps non-ASCII characters readable in the file.
            json.dump({"messages": messages}, f, ensure_ascii=False)
            f.write('\n')

# Build the validation set of posts from the validation questions CSV.
create_jsonl_file(
    input_file='askreddit_post_validation.csv',
    output_file='askreddit_post_validation.jsonl',
)

In [2]:
# Create a validation set with the same format as the training set for the askreddit_comments.jsonl file
import csv
import json

# Prompt used by create_jsonl_file_with_comments for the system turn of each record.
# The string is written verbatim into the output dataset.
system_message_comment = "You are an reddit user commenting on a question on r/AskReddit."

def create_jsonl_file_with_comments(questions_file, comments_file, output_file, system_message=None):
    """Join questions with their comments and write chat-format JSONL.

    Both CSV files are read with their first row skipped as a header. In the
    questions file, column 0 is the question id and column 1 the question
    text. In the comments file, column 0 is the question id and column 2 the
    comment text. One line ``{"messages": [system, user, assistant]}`` is
    written per matching (question, comment) pair, with the question as the
    user turn and the comment as the assistant turn.

    Output order follows the questions file; within a question, comments
    keep the order they have in the comments file. Comments whose question
    id does not appear in the questions file are not written.

    Args:
        questions_file: Path of the UTF-8 questions CSV.
        comments_file: Path of the UTF-8 comments CSV.
        output_file: Path of the UTF-8 JSONL file to (over)write.
        system_message: Content of the system turn. Defaults to the
            module-level ``system_message_comment``.

    Question rows with fewer than two columns and comment rows with fewer
    than three columns (including blank lines) are skipped.
    """
    # Resolve the default at call time so the notebook-level constant stays
    # the source of truth when the caller does not override it.
    if system_message is None:
        system_message = system_message_comment

    # newline='' hands line-ending handling to the csv module, so quoted
    # fields that contain line breaks are read back unchanged.
    with open(questions_file, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header; the default avoids StopIteration on an empty file
        questions = [(row[0], row[1]) for row in reader if len(row) >= 2]

    # Group comment texts by question id in one pass instead of rescanning
    # every comment for every question.
    comments_by_question = {}
    with open(comments_file, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)
        for row in reader:
            if len(row) >= 3:
                comments_by_question.setdefault(row[0], []).append(row[2])

    with open(output_file, 'w', encoding='utf-8') as f:
        for question_id, question_text in questions:
            for comment_text in comments_by_question.get(question_id, ()):
                messages = [{"role": "system", "content": system_message},
                            {"role": "user", "content": question_text},
                            {"role": "assistant", "content": comment_text}]
                # ensure_ascii=False keeps non-ASCII characters readable in the file.
                json.dump({"messages": messages}, f, ensure_ascii=False)
                f.write('\n')

# Build the validation set of question/comment pairs.
create_jsonl_file_with_comments(
    questions_file='askreddit_post_validation.csv',
    comments_file='askreddit_comments_validation.csv',
    output_file='askreddit_comments_validation.jsonl',
)