In [272]:
import pandas as pd
import numpy as np
import ast

# Seed NumPy's global RNG so the random prompt selection (np.random.choice) is
# repeatable. The DataFrame.sample calls further down pass no random_state and
# presumably draw from this same global state, so exact reproduction requires
# running the notebook top to bottom from a fresh kernel.
np.random.seed(0)

In [273]:
# Minimum response length: responses with fewer than 20 whitespace-separated
# words are excluded.
min_words: int = 20
# One corpus has 8 prompts and the other 9; keep min(8, 9) = 8 so both corpora
# end up with the same number of prompts.
n_prompts: int = 8
# Smallest per-prompt response count across both corpora (421); drawing this
# many per prompt gives every prompt the same number of samples in both corpora.
n_samples: int = 421

In [274]:
# Two-level column header shared by both corpora: the two metadata columns
# repeat their own name on both levels, and each embedding dimension is
# addressed as ("bert", "<dimension number>").
bert_dim = 768
meta_columns = ["prompt_id", "text"]

index_0 = np.array(meta_columns + ["bert"] * bert_dim)
# The dimension labels are strings ("0" .. "767"), not integers.
index_1 = np.array(meta_columns + [str(dim) for dim in range(bert_dim)])
index = pd.MultiIndex.from_arrays([index_0, index_1])

# Reddit

In [275]:
reddit_dir = "reddit"

# Load the raw Reddit corpus; only the prompt, the response text and the
# serialized BERT embedding are used.
df_reddit = pd.read_csv(f"{reddit_dir}/data.csv")
df_reddit = df_reddit[["prompt", "response", "bert"]]

# Randomly pick n_prompts distinct prompts. Each prompt's prompt_id is its
# position in the random draw (0 .. n_prompts - 1). The mapping is saved so
# the ids can be traced back to the prompt text.
selected_prompts = np.random.choice(
    df_reddit["prompt"].unique(), n_prompts, replace=False
)
prompts = pd.DataFrame(
    {"prompt_id": range(len(selected_prompts)), "prompt": selected_prompts}
)
prompts.to_csv("reddit_prompts.csv", index=False)

# Keep only responses to the selected prompts, drop incomplete and duplicated
# rows, drop responses shorter than min_words, then draw exactly n_samples
# responses per prompt. sample() raises if a prompt has fewer than n_samples
# eligible responses.
df_reddit = (
    df_reddit[df_reddit["prompt"].isin(prompts["prompt"])]
    .dropna()
    .drop_duplicates()
    .loc[lambda d: d["response"].str.split().str.len() >= min_words]
    .groupby("prompt")
    .apply(lambda x: x.sample(n=n_samples), include_groups=False)
    .reset_index()
)

# Prompts are drawn before cleaning, so a selected prompt whose responses were
# all filtered out would silently disappear from the groupby. Fail loudly
# instead of writing a corpus with fewer prompts than intended.
assert len(df_reddit) == n_prompts * n_samples, (
    f"expected {n_prompts} prompts x {n_samples} samples, got {len(df_reddit)} rows"
)

# Replace the prompt text by its integer id and keep the final three columns.
prompt_to_id = prompts.set_index("prompt")["prompt_id"]
df_reddit = df_reddit.rename(columns={"response": "text"})
df_reddit["prompt_id"] = df_reddit["prompt"].map(prompt_to_id)
df_reddit = df_reddit[["prompt_id", "text", "bert"]]

# Expand the bert column into one column per embedding dimension. Each cell is
# parsed with ast.literal_eval, so it is expected to hold a Python-literal
# sequence of numbers stored as text.
bert_vectors = pd.DataFrame(df_reddit["bert"].map(ast.literal_eval).tolist())
df_reddit = pd.concat([df_reddit[["prompt_id", "text"]], bert_vectors], axis=1)

# Install the shared two-level header; this raises if the embedding width does
# not match the header length.
df_reddit.columns = index
df_reddit.to_csv("reddit.csv", index=False)
df_reddit

Unnamed: 0_level_0,prompt_id,text,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert
Unnamed: 0_level_1,prompt_id,text,0,1,2,3,4,5,6,7,...,758,759,760,761,762,763,764,765,766,767
0,4,"Once upon a time , there lived a young girl na...",-0.304774,-0.066621,-0.458460,-0.348629,-0.235056,0.135589,0.684298,1.056225,...,-0.431397,-0.122974,0.248682,-0.381900,-0.144469,-0.483375,0.060971,0.093469,0.429061,0.313885
1,4,"Once upon a time , there was a young girl name...",-0.256528,-0.244718,-0.544889,-0.247178,-0.191813,0.082795,0.285416,1.609235,...,-0.249019,-0.138192,0.095791,-0.361293,-0.187787,-0.122126,0.196495,0.472413,0.372561,0.254166
2,4,"Once upon a time , there was a young girl name...",-0.457420,-0.014174,-0.715954,-0.129813,-0.221006,0.117725,0.516658,1.094169,...,-0.854530,-0.360928,0.002992,-0.536758,-0.056030,-0.203375,0.023213,0.044519,0.640650,0.065401
3,4,Once upon a time there was a young girl named ...,-0.119870,0.253916,-0.700468,-0.100959,-0.242528,-0.062642,0.606121,1.204461,...,-0.351698,-0.090153,-0.140591,-0.567719,-0.400338,-0.262284,0.340783,0.313515,0.605345,0.346677
4,4,"Once upon a time , in a far-off kingdom , ther...",-0.312125,-0.081119,-0.847111,-0.184394,-0.388980,0.037795,0.650106,1.244395,...,-0.216049,-0.217917,-0.051048,-0.588675,-0.208345,-0.323563,0.155095,0.100316,0.628687,0.077700
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3363,0,Yet only when purely laid pudding cups twice-f...,0.082633,0.038716,0.090078,-0.121538,-0.578873,-0.498635,0.227699,0.645669,...,0.273625,0.267646,-0.218779,-0.269339,0.435099,-0.361356,-0.336388,-0.298026,0.433533,0.481662
3364,0,Here is a story using the sentence `` She said...,-0.310450,0.004769,-0.149769,0.128894,-0.527246,0.003578,0.272628,1.033892,...,-0.193349,-0.057503,0.362484,-0.531271,-0.236963,0.052515,-0.058448,-0.014917,0.681543,0.465854
3365,0,"She said she loved him , only reluctantly . Al...",-0.253552,-0.274067,-0.165790,0.088392,-0.429238,0.036131,0.008327,0.289262,...,-0.165354,-0.047397,0.063767,-0.285791,0.152759,0.148773,-0.103898,-0.361780,0.306210,0.382839
3366,0,`` She said she only loved him . '' Tears stre...,-0.330699,0.388120,-0.043100,-0.140850,-0.233332,-0.199558,0.447998,0.001792,...,0.328447,0.169778,0.117445,-0.042216,-0.276515,-0.109386,-0.061173,-0.142409,0.308049,0.339029


In [276]:
# Sanity check: every sampled prompt should appear exactly n_samples times.
df_reddit["prompt_id"].value_counts()

prompt_id
0            421
1            421
2            421
3            421
4            421
5            421
6            421
7            421
Name: count, dtype: int64

In [277]:
print("Number of words in the responses:")

# Whitespace-separated token count per response, summarised with describe().
df_reddit["text", "text"].str.split().str.len().describe()

Number of words in the responses:


count    3368.000000
mean      356.817399
std       274.552991
min        20.000000
25%       197.750000
50%       282.000000
75%       444.000000
max      2395.000000
Name: (text, text), dtype: float64

# The Hewlett Foundation: Automated Essay Scoring

https://www.kaggle.com/competitions/asap-aes/code

In [278]:
hewlett_dir = "hewlett"

filenames = [
    "training_set_rel3.tsv",
    "valid_set.tsv",
    "test_set.tsv",
]

# Read every split, keeping only the prompt identifier and the essay text,
# renamed to the column names used for the Reddit corpus. Each frame is built
# without in-place mutation.
dfs = []
for filename in filenames:
    df = (
        pd.read_csv(f"{hewlett_dir}/{filename}", sep="\t", encoding="ISO-8859-1")[
            ["essay_set", "essay"]
        ]
        .rename(columns={"essay_set": "prompt_id", "essay": "text"})
        .astype({"prompt_id": int})
    )
    dfs.append(df)

# Merge the splits, drop empty, incomplete and duplicated essays, drop essays
# shorter than min_words, then draw exactly n_samples essays per prompt.
# sample() raises if a prompt has fewer than n_samples eligible essays.
df_hewlett = (
    pd.concat(dfs, ignore_index=True)
    .loc[lambda d: d["text"] != ""]
    .dropna()
    .drop_duplicates()
    .loc[lambda d: d["text"].str.split().str.len() >= min_words]
    .groupby("prompt_id")
    .apply(lambda x: x.sample(n=n_samples), include_groups=False)
    .reset_index(drop=False, level=0)
    .reset_index(drop=True)
)

# No prompt is removed here because the dataset is expected to contain exactly
# 8 distinct prompts. Verify that assumption instead of relying on it silently.
assert len(df_hewlett) == n_prompts * n_samples, (
    f"expected {n_prompts} prompts x {n_samples} samples, got {len(df_hewlett)} rows"
)

# Pad with all-zero embedding columns so the frame matches the shared header.
# NOTE(review): these zeros look like a placeholder for embeddings computed
# elsewhere; confirm before using them as features.
n_embedding_columns = len(index) - df_hewlett.shape[1]
df_hewlett = pd.concat(
    [df_hewlett, pd.DataFrame(np.zeros((len(df_hewlett), n_embedding_columns)))],
    axis=1,
)
df_hewlett.columns = index
df_hewlett.to_csv("hewlett.csv", index=False)
df_hewlett

Unnamed: 0_level_0,prompt_id,text,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert,bert
Unnamed: 0_level_1,prompt_id,text,0,1,2,3,4,5,6,7,...,758,759,760,761,762,763,764,765,766,767
0,1,Daer People of the Newspaper I think that comp...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,"Dear Newspaper @CAPS1, @CAPS2 I have an artcle...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,"Dear news paper, Computers are good for people...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,"Dear local newspaper, @CAPS1 do you keep in to...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,"Dear @ORGANIZATION1, @CAPS1 you know that comp...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3363,8,"Human beings are...unusual...Okay, we're weir...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3364,8,My dad and i went out to teach me how to driv...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3365,8,"Every spring break my brothers and I, go spen...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3366,8,"It has been said that ""Laughter is the spark ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [279]:
# Sanity check: every prompt should appear exactly n_samples times.
df_hewlett["prompt_id"].value_counts()

prompt_id
1            421
2            421
3            421
4            421
5            421
6            421
7            421
8            421
Name: count, dtype: int64

In [280]:
print("Number of words in the responses:")

# Whitespace-separated token count per essay, summarised with describe().
df_hewlett["text", "text"].str.split().str.len().describe()

Number of words in the responses:


count    3368.000000
mean      248.237827
std       203.768354
min        20.000000
25%       101.000000
50%       171.000000
75%       350.000000
max       974.000000
Name: (text, text), dtype: float64