I ran this section on Databricks (PySpark).

In [1]:
from datasets import load_dataset_builder
from psutil._common import bytes2human
from datasets import load_dataset

In [2]:
def print_dataset_size_if_provided(*args, **kwargs):
  """Print the download size and dataset size reported by a dataset builder.

  All positional and keyword arguments are forwarded unchanged to
  ``load_dataset_builder``. If either size is missing (or falsy), a notice is
  printed instead of the sizes.
  """
  info = load_dataset_builder(*args, **kwargs).info

  # Guard clause: both figures must be present to print the size summary
  if not (info.download_size and info.dataset_size):
    print('Dataset size is not provided by uploader')
    return

  print(f'download_size={bytes2human(info.download_size)}, dataset_size={bytes2human(info.dataset_size)}')

# Report the sizes for the "generics_kb_best" configuration before loading it
print_dataset_size_if_provided("community-datasets/generics_kb", "generics_kb_best")

download_size=37.2M, dataset_size=95.4M


In [3]:
# Load the "generics_kb_best" configuration; the "train" split is read from `ds` in the next cell
ds = load_dataset("community-datasets/generics_kb", "generics_kb_best")

In [4]:
# Materialise the train split as a pandas DataFrame
df = ds["train"].to_pandas()

# Inspect the schema, the first 5 rows of the two columns of interest, and the row count
print(df.columns)
print(df[["term", "generic_sentence"]].head(5))
print(len(df))

# From here on only the term and its sentence are needed
df = df[["term", "generic_sentence"]]

Index(['source', 'term', 'quantifier_frequency', 'quantifier_number',
       'generic_sentence', 'score'],
      dtype='object')
              term                                   generic_sentence
0       aa battery  AA batteries maintain the settings if the powe...
1  aardvark female  Aardvark females appear to come into season on...
2    aardvark hole  Aardvark holes are used by small buck as a res...
3    aardvark skin        Aardvark skin is thick and sparsely haired.
4         aardvark                               Aardvark isa mammal.
1020868


In [5]:
# Normalise the text columns, then deduplicate.
# - term: lowercased.
# - generic_sentence: punctuation removed (everything that is not a word
#   character or whitespace). regex=True must be explicit: when the pattern is
#   not treated as a regular expression it is matched as a literal string and
#   no punctuation is removed.
# assign() builds a new frame instead of writing into `df`, which is a column
# subset of the frame loaded earlier.
# NOTE(review): sentences that differ only by trailing whitespace survive the
# deduplication — consider stripping if that is not intended.
df = df.assign(
    term=df["term"].str.lower(),
    generic_sentence=df["generic_sentence"].str.replace(r'[^\w\s]', '', regex=True),
).drop_duplicates()

  df["generic_sentence"] = df["generic_sentence"].str.replace(r'[^\w\s]', '')


In [6]:
# Filter out rows with terms and sentences that are too short or too long:
# keep terms of 3-20 characters and sentences of 20-40 characters (inclusive).
# .str.len() is vectorised and yields NaN (-> excluded) for missing values,
# where .apply(len) would raise; both conditions are combined into one mask.
df = df[
    df["term"].str.len().between(3, 20)
    & df["generic_sentence"].str.len().between(20, 40)
]
print(df.iloc[0:20][["term", "generic_sentence"]])
print(len(df))

        term                         generic_sentence
5   aardvark           Aardvarks also dig to get food
8   aardvark       Aardvarks are a nocturnal creature
11  aardvark                    Aardvarks are animals
13  aardvark         Aardvarks are capable of burrows
23  aardvark                    Aardvarks are mammals
29  aardvark                  Aardvarks are nocturnal
31  aardvark                 Aardvarks are placentals
32  aardvark           Aardvarks are powerful animals
34  aardvark                 Aardvarks are quadrupeds
44  aardvark                Aardvarks are vertebrates
45  aardvark        Aardvarks are very gentle animals
46  aardvark                    Aardvarks dig burrows
49  aardvark           Aardvarks eat first solid food
51  aardvark                    Aardvarks eat insects
52  aardvark  Aardvarks eat mostly ants and termites 
53  aardvark   Aardvarks eat mostly ants and termites
54  aardvark                 Aardvarks eat solid food
55  aardvark          Aardva

In [7]:
# A term usually comes with many generic sentences; retain at most the
# first three rows of every term, in the frame's current row order
df = df.groupby("term").head(3)
print(df[["term", "generic_sentence"]].head(20))
print(len(df))

            term                          generic_sentence
5       aardvark            Aardvarks also dig to get food
8       aardvark        Aardvarks are a nocturnal creature
11      aardvark                     Aardvarks are animals
198     aardwolf                Aardwolfs consume termites
199     aardwolf                Aardwolfs have part brains
200     aardwolf               Aardwolfs have part breasts
298          aba            ABA is incorporated in Florida
302          aba           ABA is the science of behavior 
306          aba  ABA is used to change a persons behavior
318       abacus        An abacus is a calculating machine
323      abalone            Abalone are in peril worldwide
325      abalone                 Abalone are marine snails
326      abalone    Abalone are nocturnal  active at night
356  abandonment                  Abandonment is disposals
357  abandonment                  Abandonment is rejection
360    abasement        Abasement is as infinite as desi

In [8]:
# Show the final table. This cell only displays `df`; it does not write a file itself.
display(df)
# The result was saved as data/generic_sentences.csv
# (NOTE(review): the save step is not part of this cell — presumably exported from the displayed table; confirm).
# Takeaway: the dataset is dominated by '"Noun" is ...' style sentences, so it is not very useful.

Unnamed: 0,term,generic_sentence
5,aardvark,Aardvarks also dig to get food
8,aardvark,Aardvarks are a nocturnal creature
11,aardvark,Aardvarks are animals
198,aardwolf,Aardwolfs consume termites
199,aardwolf,Aardwolfs have part brains
...,...,...
1020832,zygote,Zygotes are created by fertilization
1020834,zygote,Zygotes are embryology
1020862,zygotic gene,Zygotic genes pattern the early embryo
1020866,zyplar soil,Zyplar soils are on pediments


I ran this section locally.

In [9]:
# Load the OneStopEnglish dataset (rebinds `ds`, replacing the previous dataset)
ds = load_dataset("iastate/onestop_english")

# Work on the train split as a pandas DataFrame (rebinds `df`)
df = ds["train"].to_pandas()

In [10]:
df = df[["text"]]
print(len(df))

# Split every document into lines, then every line on periods, giving one
# row per fragment. split() + explode() produces the long form directly,
# without first building a wide frame padded with missing values.
# The resulting frame has a fresh RangeIndex, an "index" column (position of
# the line the fragment came from) and the "text" column.
df = (
    df["text"]
    .str.split("\n")
    .explode()
    .dropna()
    .reset_index(drop=True)
    .str.split(".")
    .explode()
    .dropna()
    .rename("text")
    .reset_index()
)

# Only keep alphanumeric characters and whitespace, then trim the ends.
# regex=True must be explicit: when the pattern is not treated as a regular
# expression it is matched literally and nothing is removed.
df["text"] = df["text"].str.replace(r'[^a-zA-Z0-9\s]', '', regex=True).str.strip()

print(len(df))

567
27412


  df["text"] = df["text"].str.replace(r'[^a-zA-Z0-9\s]', '').str.strip()


In [11]:
# Store the character count of every fragment and summarise its distribution
df["length"] = df["text"].map(len)
print(df["length"].describe())

count    27412.000000
mean        78.844119
std         69.536195
min          0.000000
25%          0.000000
50%         73.000000
75%        123.000000
max        811.000000
Name: length, dtype: float64


In [12]:
# Keep fragments whose length is between 20 and 100 characters (both bounds included)
df = df[df["length"].between(20, 100)]
print(len(df))

10107


In [13]:
from textnoisr import noise

def add_noise(text):
    """Return `text` after running it through a character-level noise augmenter.

    A fresh ``noise.CharNoiseAugmenter`` with ``noise_level=0.1`` is created on
    every call and its ``add_noise`` result is returned.
    """
    return noise.CharNoiseAugmenter(noise_level=0.1).add_noise(text)

In [14]:
# Produce a noisy counterpart for every clean sentence
df["noisy_text"] = df["text"].map(add_noise)

In [15]:
# Preview the first clean/noisy pair
print(df[["text", "noisy_text"]].head(1))

                                                text  \
1  These are the questions in a debate about the ...   

                                          noisy_text  
1  These arerthe quetMios n a debaWe aou the inte...  


In [16]:
# Keep only the clean/noisy pair and renumber the rows from 0
df = df[["text", "noisy_text"]].reset_index(drop=True)
# Write the pairs without the index column (the relative "data/" path is resolved against the working directory)
df.to_csv("data/onestop_english_sentences.csv", index=False)

In [17]:
# Convert to ShareGPT style


def convert(row):
    """Build a two-turn ShareGPT-style conversation from one row.

    The human turn asks to correct ``row["noisy_text"]``; the gpt turn answers
    with ``row["text"]``.

    Returns:
        A list of two dicts, each with the keys "from" and "value".
    """
    prompt = "Correct this sentence: " + row["noisy_text"]
    return [
        {"from": "human", "value": prompt},
        {"from": "gpt", "value": row["text"]},
    ]

# Build one conversation per row (axis=1 passes each row to `convert`)
df["conversations"] = df.apply(convert, axis=1)

In [18]:
# Keep only the conversations column for export
df = df[["conversations"]]
# NOTE(review): each cell holds a list of dicts, which a CSV presumably stores as its
# Python string form rather than JSON — confirm the downstream loader can parse this.
df.to_csv("data/onestop_english_sentences_sharegpt.csv", index=False)