In [1]:
import pandas as pd
import numpy as np

# Seed NumPy's legacy global random number generator with a fixed value so that
# any later np.random call in this notebook is repeatable across runs.
np.random.seed(0)

In [2]:
# Number of prompts kept per corpus. We use min(8, 9) = 8 prompts, so both
# corpora end up with the same number of prompts.
n_prompts = 8
# Number of responses kept per prompt: the longest n_samples_per_prompt responses
# are selected, where length is measured as the number of whitespace-separated words.
n_samples_per_prompt = 100

In [3]:
# Two-level column labels for a frame that carries the two metadata columns
# followed by 768 embedding columns: the metadata columns are labelled
# (name, name) and the embedding columns ("bert", 0..767).
n_bert_dims = 768
meta_columns = ["prompt_id", "text"]

index_0 = np.concatenate([meta_columns, ["bert"] * n_bert_dims])
index_1 = np.concatenate([meta_columns, list(range(n_bert_dims))])
index = pd.MultiIndex.from_arrays([index_0, index_1])

# Reddit

In [4]:
# Load every (prompt, response) pair from the three WritingPrompts splits.
# Line i of <split>.wp_source is paired with line i of <split>.wp_target.
pairs = []
for name in ["train", "test", "valid"]:
    # Decode explicitly as UTF-8: the responses contain non-ASCII characters
    # (e.g. curly quotes), so the platform default encoding must not be relied on.
    with open(
        "reddit/writingPrompts/" + name + ".wp_source", encoding="utf-8"
    ) as f_prompts:
        prompts = f_prompts.readlines()
    with open(
        "reddit/writingPrompts/" + name + ".wp_target", encoding="utf-8"
    ) as f_responses:
        texts = f_responses.readlines()
    assert len(prompts) == len(texts)
    pairs.extend(zip(prompts, texts))
df_reddit = pd.DataFrame(pairs, columns=["prompt", "text"])

# Drop prompts that mention "hitler" (case-insensitive) and repeated responses.
df_reddit = df_reddit[~df_reddit["prompt"].str.contains("hitler", case=False)]
df_reddit = df_reddit.drop_duplicates(subset=["text"])

# Keep only the n_prompts prompts with the most responses (value_counts() orders
# by descending frequency). The .copy() makes the filtered frame independent so
# the column assignments below do not trigger SettingWithCopyWarning.
top_prompts = df_reddit["prompt"].value_counts().index[:n_prompts]
df_reddit = df_reddit[df_reddit["prompt"].isin(top_prompts)].copy()
df_reddit["prompt_id"] = df_reddit["prompt"].astype("category").cat.codes

# Get the longest n_samples_per_prompt responses for each prompt
df_reddit["text_len"] = df_reddit["text"].str.split().str.len()
df_reddit = (
    df_reddit.groupby("prompt_id")
    .apply(lambda x: x.nlargest(n_samples_per_prompt, "text_len"), include_groups=False)
    # include_groups=False removes prompt_id from the groups; bring it back from
    # the outer index level, then discard the leftover row index.
    .reset_index(level=0, drop=False)
    .reset_index(drop=True)
)

# Break prompt into prompt_tag and prompt: everything before the first " ]" is
# the tag (with "[" removed), everything after it is the prompt text.
# Raw strings are required here: "\]" and "\[" are invalid escape sequences in
# ordinary string literals and raise SyntaxWarning on recent Python versions.
prompt_parts = df_reddit["prompt"].str.split(r" \]", n=1, regex=True)
df_reddit["prompt_tag"] = (
    prompt_parts.str[0].str.replace(r"\[", "", regex=True).str.strip()
)
df_reddit["prompt"] = "Prompt\n" + prompt_parts.str[1].str.strip()
# Re-derive the ids from the cleaned prompt text so they follow its sort order.
df_reddit["prompt_id"] = df_reddit["prompt"].astype("category").cat.codes

In [5]:
# Build the prompt lookup table: one row per distinct (prompt_id, prompt,
# prompt_tag) combination, ordered by prompt_id, and write it to reddit_prompts.csv.
prompt_columns = ["prompt_id", "prompt", "prompt_tag"]
df_reddit_prompts = df_reddit.loc[:, prompt_columns].drop_duplicates()
df_reddit_prompts = df_reddit_prompts.sort_values("prompt_id", ignore_index=True)
df_reddit_prompts.to_csv("reddit_prompts.csv", index=False)

In [6]:
# Reduce the responses to the id and text columns, order them by prompt, and
# write the result to reddit.csv.
df_reddit = df_reddit.loc[:, ["prompt_id", "text"]].sort_values(
    "prompt_id", ignore_index=True
)
df_reddit.to_csv("reddit.csv", index=False)

```
WP: Writing Prompt
SP: Simple Prompt
EU: Established Universe
CW: Constrained Writing
TT: Theme Thursday
PM: Prompt Me
MP: Media Prompt
IP: Image Prompt
PI: Prompt Inspired
OT: Off Topic
* OT as an Advertisement!
RF: Reality Fiction
```

https://www.reddit.com/r/WritingPrompts/wiki/how_to_tag_prompts/

In [7]:
# Display the Reddit prompt table (prompt_id, prompt text, prompt tag).
df_reddit_prompts

Unnamed: 0,prompt_id,prompt,prompt_tag
0,0,Prompt\nA peaceful alien race is besieged by a...,WP
1,1,Prompt\nThere is no prompt . Just write a stor...,WP
2,2,Prompt\nThis is the prologue ( or the first ch...,WP
3,3,Prompt\nWrite a short story where the first se...,WP
4,4,Prompt\nWrite the first and last paragraph of ...,CW
5,5,Prompt\nWrite the letter that you always wante...,WP
6,6,Prompt\nYou live in a city full of people with...,WP
7,7,Prompt\n`` She said she loved him . '' Insert ...,WP


In [8]:
# Display the selected Reddit responses (prompt_id, text).
df_reddit

Unnamed: 0,prompt_id,text
0,0,`` We left them there to study ! '' Proclaimed...
1,0,“ I suggest we initiate protocol Zestraol ” <n...
2,0,Our War Council was surprised when these Human...
3,0,"`` Drax , the Slovians have taken E13-49e , 4t..."
4,0,"`` They 've taken Marin , sir . '' <newline> <..."
...,...,...
795,7,"I 'd see her walking down the hall , her hair ..."
796,7,Wait . <newline> <newline> Doubt was settling ...
797,7,It started as a chauvinistic affair meant to m...
798,7,Izzard stalked through the once slicked stone ...


In [9]:
# Summary statistics of the whitespace-separated word count per Reddit response.
print("Number of words in the responses:")
df_reddit["text"].str.split().str.len().describe()

Number of words in the responses:


count     800.00000
mean      695.49625
std       514.91980
min       121.00000
25%       290.00000
50%       520.00000
75%       932.25000
max      2594.00000
Name: text, dtype: float64

# The Hewlett Foundation: Automated Essay Scoring

https://www.kaggle.com/competitions/asap-aes/code

In [10]:
import os

hewlett_prompts_dir = "hewlett/prompts"

# Read every prompt file. The part of the file name before the first "." is the
# 1-based prompt number; it is shifted to a 0-based prompt_id.
prompts = []
for file in sorted(os.listdir(hewlett_prompts_dir)):
    stem = file.split(".")[0]
    if not stem.isdecimal():
        # Skip stray directory entries (e.g. ".ipynb_checkpoints", ".DS_Store")
        # whose name does not start with a number; int() would raise on them.
        continue
    with open(os.path.join(hewlett_prompts_dir, file)) as f:
        prompt = f.read()
    prompts.append((int(stem) - 1, prompt))

df_hewlett_prompts = pd.DataFrame(prompts, columns=["prompt_id", "prompt"])

# Tag each prompt by whether its text contains the literal marker "Source Essay".
is_source_dependent = df_hewlett_prompts["prompt"].str.contains(
    "Source Essay", regex=False
)
df_hewlett_prompts["prompt_tag"] = is_source_dependent.map(
    {True: "source dependent responses", False: "persuasive / narrative / expository"}
)
df_hewlett_prompts = df_hewlett_prompts.sort_values("prompt_id").reset_index(drop=True)
df_hewlett_prompts.to_csv("hewlett_prompts.csv", index=False)

In [11]:
hewlett_dir = "hewlett"

filenames = [
    "training_set_rel3.tsv",
    "valid_set.tsv",
    "test_set.tsv",
]

# Load the essay-set number and essay text from every split.
dfs = []
for filename in filenames:
    raw = pd.read_csv(f"{hewlett_dir}/{filename}", sep="\t", encoding="ISO-8859-1")
    # Build a new frame with rename/assign instead of mutating a column slice in
    # place, which avoids SettingWithCopyWarning and hidden state on re-runs.
    df = (
        raw[["essay_set", "essay"]]
        .rename(columns={"essay_set": "prompt_id", "essay": "text"})
        # essay_set is 1-based; shift it to a 0-based prompt_id.
        .assign(prompt_id=lambda d: d["prompt_id"].astype(int) - 1)
    )
    dfs.append(df)

# Don't need to remove the responses of any prompts because there are 8 distinct prompts in this dataset
df_hewlett = pd.concat(dfs, ignore_index=True)

# Drop empty, missing, and fully duplicated rows.
df_hewlett = df_hewlett[df_hewlett["text"] != ""]
df_hewlett = df_hewlett.dropna()
df_hewlett = df_hewlett.drop_duplicates()

# Get the longest n_samples_per_prompt responses for each prompt
df_hewlett["text_len"] = df_hewlett["text"].str.split().str.len()
df_hewlett = (
    df_hewlett.groupby("prompt_id")
    .apply(lambda x: x.nlargest(n_samples_per_prompt, "text_len"), include_groups=False)
    # include_groups=False removes prompt_id from the groups; bring it back from
    # the outer index level, then discard the leftover row index.
    .reset_index(level=0, drop=False)
    .reset_index(drop=True)
)
df_hewlett = df_hewlett[["prompt_id", "text"]]

df_hewlett.to_csv("hewlett.csv", index=False)

In [12]:
# Display the Hewlett prompt table (prompt_id, prompt text, prompt tag).
df_hewlett_prompts

Unnamed: 0,prompt_id,prompt,prompt_tag
0,0,"Prompt\nMore and more people use computers, bu...",persuasive / narrative / expository
1,1,"Prompt\nCensorship in the Libraries\n""All of u...",persuasive / narrative / expository
2,2,Source Essay\nROUGH ROAD AHEAD: Do Not Exceed ...,source dependent responses
3,3,Source Essay\nWinter Hibiscus by Minfong Ho\nS...,source dependent responses
4,4,Source Essay\nNarciso Rodriguez\nfrom Home: Th...,source dependent responses
5,5,Source Essay\nThe Mooring Mast\nby Marcia Amid...,source dependent responses
6,6,Prompt\nWrite about patience. Being patient me...,persuasive / narrative / expository
7,7,Prompt\nWe all understand the benefits of laug...,persuasive / narrative / expository


In [13]:
# Display the selected Hewlett essays (prompt_id, text).
df_hewlett

Unnamed: 0,prompt_id,text
0,0,My standing postion on this cause is that comp...
1,0,"@ORGANIZATION1, @CAPS1? Are you there?"" ""@CAPS..."
2,0,"Dear The @CAPS1 newspaper, @CAPS2 in front of ..."
3,0,Dear @CAPS1 Society: Computers are perhaps one...
4,0,"Dear @ORGANIZATION1, The creation of computers..."
...,...,...
795,7,"We couldn't control our selves, our eyes wate..."
796,7,It all started at the play ground @CAPS9 me ...
797,7,For my family laughter is important to us bec...
798,7,"Laughter, one of the greatest gifts in life. ..."


In [14]:
# Summary statistics of the whitespace-separated word count per Hewlett essay.
print("Number of words in the responses:")
df_hewlett["text"].str.split().str.len().describe()

Number of words in the responses:


count     800.000000
mean      465.585000
std       246.627283
min       205.000000
25%       254.000000
50%       351.500000
75%       718.000000
max      1064.000000
Name: text, dtype: float64

In [15]:
"""
# Expand the bert column into 768 columns
df_hewlett = pd.concat(
    [df_hewlett, pd.DataFrame(np.zeros((len(df_hewlett), 768)))], axis=1
)
df_hewlett.columns = index
"""

'\n# Expand the bert column into 768 columns\ndf_hewlett = pd.concat(\n    [df_hewlett, pd.DataFrame(np.zeros((len(df_hewlett), 768)))], axis=1\n)\ndf_hewlett.columns = index\n'

- tokenize/replacements for both?
- then vectorize