# Dataset generator

## Imports

In [None]:
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
import re
import string
# Register tqdm's pandas integration; this is what makes `progress_apply`
# (used when classifying the dataset) available on DataFrames.
tqdm.pandas()

# Fetch the Punkt tokenizer data that `nltk.word_tokenize` relies on.
# NOTE(review): newer NLTK releases may require the 'punkt_tab' resource
# instead — confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## Helper functions

The following functions are used to extract the right sentences for each class based on certain rules

### normalize

Runs the following normalizations on a string:
- lowercase all letters
- remove all punctuation
- collapse every run of consecutive spaces into a single space

In [None]:
def normalize(sentence: str):
    """Return a lowercased, punctuation-free copy of ``sentence``.

    Every character listed in ``string.punctuation`` is deleted (not replaced
    by a space), after which each run of consecutive spaces is collapsed into
    a single space. Other whitespace (tabs, newlines) is left untouched.
    """
    drop_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = sentence.lower().translate(drop_punctuation)
    return re.sub(r' +', ' ', cleaned)

### get_ngrams

This helper function takes a sentence and an integer n, and returns all ngrams of length n

In [None]:
def get_ngrams(sentence: str, n: int):
    """Tokenize ``sentence`` with NLTK and return its ngrams of length ``n``.

    The result is a list of tuples, each holding ``n`` consecutive tokens.
    """
    return list(nltk.ngrams(nltk.word_tokenize(sentence), n))

### parse_wildcard

This function takes a wildcard rule and a token, and checks whether the token matches the rule

In [None]:
def _find_top_level(expression: str, operator: str) -> int:
    """Return the index of the first ``operator`` outside any parentheses, or -1."""
    depth = 0
    for index, char in enumerate(expression):
        if char == '(':
            depth += 1
        elif char == ')':
            depth -= 1
        elif char == operator and depth == 0:
            return index
    return -1


def _evaluate_expression(expression: str, string: str) -> bool:
    """Evaluate ``expression`` against ``string``, splitting on top-level ``|`` before ``&``."""
    for operator in ('|', '&'):
        index = _find_top_level(expression, operator)
        if index == -1:
            continue
        left, right = expression[:index], expression[index + 1:]
        if operator == '|':
            return _evaluate_expression(left, string) or _evaluate_expression(right, string)
        return _evaluate_expression(left, string) and _evaluate_expression(right, string)

    # No operator at this nesting level: the expression is a single rule.
    return parse_wildcard(expression, string)


def parse_wildcard(wildcard: str, string: str):
    """Check whether the token ``string`` satisfies the rule ``wildcard``.

    Supported rules (the leading ``_`` of an ngram rule must already have been
    stripped by the caller):

    - ``''``              always matches
    - ``<N`` / ``>N``     the token is shorter / longer than ``N`` characters
    - ``=N`` / ``!N``     the token length equals / differs from ``N``
                          (``N`` consisting of decimal digits only)
    - ``=text``/``!text`` the token equals / differs from ``text``
    - ``(A|B)``, ``(A&B)`` either / both sub-rules match; sub-rules may be
                          nested groups and operators may be chained
                          (``|`` is split first, so ``&`` binds tighter)
    - ``(A)``             same as ``A``
    - ``!(...)``          negation of the parenthesised rule

    Because an all-digit operand is interpreted as a length, ``=1`` cannot be
    used to match the literal token ``"1"``.

    Args:
        wildcard: The rule to evaluate.
        string: The token the rule is checked against.

    Returns:
        True if the token satisfies the rule, False otherwise (including when
        the rule does not start with a known comparator).

    Raises:
        ValueError: If the operand of ``<`` or ``>`` is not an integer.
    """
    if not wildcard:
        return True

    length = len(string)
    comparator, operand = wildcard[0], wildcard[1:]

    if comparator == '<':
        return length < int(operand)
    if comparator == '>':
        return length > int(operand)
    if comparator == '=':
        # isdecimal() only accepts characters that int() can actually parse.
        if operand.isdecimal():
            return length == int(operand)
        return string == operand
    if comparator == '!':
        if operand.startswith('(') and operand.endswith(')'):
            # Keep both parentheses so the operand is parsed as a group.
            return not parse_wildcard(operand, string)
        if operand.isdecimal():
            return length != int(operand)
        return string != operand
    if comparator == '(' and wildcard[-1] == ')':
        return _evaluate_expression(wildcard[1:-1], string)

    # Unknown comparator: treat as "no match" explicitly.
    return False

### get_matching_ngrams

get_matching_ngrams takes a sentence and a list of ngram rules of length n, and returns all ngrams of the sentence that match one of the rules.

In [None]:
def get_matching_ngrams(sentence: str, ngrams: list, n: int):
    """Return the ngrams of ``sentence`` that satisfy any rule in ``ngrams``.

    ``ngrams`` holds rule tuples of length ``n``. A rule position matches a
    token when the two are equal, or when the rule entry starts with ``_`` and
    the remainder is a wildcard that ``parse_wildcard`` accepts for the token.
    A sentence ngram is appended once for every rule it satisfies, so the
    returned list may contain duplicates.
    """
    def position_matches(rule_token, token):
        wildcard_hit = rule_token.startswith('_') and parse_wildcard(rule_token[1:], token)
        return token == rule_token or wildcard_hit

    hits = []
    for candidate in get_ngrams(sentence, n):
        for rule in ngrams:
            # all() stops at the first position that fails to match.
            if all(position_matches(rule[k], candidate[k]) for k in range(n)):
                hits.append(candidate)

    return hits

### split_ngrams

Splits a list of ngrams of arbitrary length into multiple lists, where each list contains all ngrams of a particular length

In [None]:
def split_ngrams(ngrams: list):
    """Group ``ngrams`` by length.

    Returns a dict mapping each length to the list of ngrams of that length,
    keeping the ngrams in their original relative order.
    """
    by_length = {}
    for ngram in ngrams:
        by_length.setdefault(len(ngram), []).append(ngram)

    return by_length


### get_matching_ngrams_by_length

Takes a sentence and a list of ngram rules of arbitrary length, and returns all ngrams of the sentence that match one of the rules

In [None]:
def get_matching_ngrams_by_length(sentence: str, ngrams: list):
    """Match ``sentence`` against ngram rules of mixed lengths.

    The rules are first grouped by length; each group is then matched against
    the sentence ngrams of that same length and all hits are concatenated.
    """
    hits = []
    for length, rules in split_ngrams(ngrams).items():
        hits.extend(get_matching_ngrams(sentence, rules, length))

    return hits


## ngram rules

These rules are used to extract the right sentences for each class

In [None]:
# Rule syntax: every tuple is an ngram pattern that is compared token by token
# against the normalized sentence. Plain entries must equal the token; entries
# starting with '_' are wildcards handled by parse_wildcard, e.g.
#   '_=1'          -> any token of exactly one character (such as a variable)
#   '_>1'          -> any token longer than one character
#   '_(=and|=or)'  -> the token 'and' or the token 'or'
# classify_sentence checks the classes in order 1, 2, 3 and returns the first
# class that has a matching rule.

# Rules for class '1'.
class_1_ngrams = [
    ('it', 'is', 'not', 'the', 'case', 'that', '_=1', 'is'),
    ('it', 'is', 'not', 'the', 'case', 'that', 'for', 'all', '_=1'),
    ('_=1', 'is', '_>1', '_(=and|=or)', '_=1', 'is', '_>1'),
    ('it', 'is', 'not', 'the', 'case', 'that', 'there', 'is', 'an', 'element', '_=1'),
]

# Rules for class '2' (only consulted when no class '1' rule matched).
class_2_ngrams = [
    ('for', 'all', '_>1'),
    ('then', '_=1'),
    ('there', 'is', 'a', '_>1', '_=1', 'such', 'that', '_=1', 'is', '_>1'),
    ('_=1', 'is', 'not', '_>1', 'than', '_=1'),
    ('_=1', 'is', '_>1', 'than', '_=1'),
    ('_=1', 'is', 'in', 'front', 'of', '_=1'),
    ('something', 'is', 'to', 'the', '_(=left|=right)', 'of', '_=1'),
    ('_=1', 'and', '_=1', 'is', 'a', '_>1'),
    ('all', 'these', 'hold'),
    ('at', 'least', 'one', 'of', 'these', 'holds'),
    ('_=1', 'is', '_>1', '_=1', 'is'),
    ('_=1', 'is', 'a', '_>1', '_=1', 'is'),
    ('_=1', 'is', 'not', '_>1', '_=1', 'is'),
    ('_=1', 'is', 'not', 'a', '_>1', '_=1', 'is'),
]

# Rules for class '3' (only consulted when no class '1' or '2' rule matched).
class_3_ngrams = [
    ('any', '_>1'),
    ('every', '_>1'),
    ('some', '_>1'),
    ('a', '_>1', 'is'),
    ('_>1', 'are', '_>1'),
]

## classify_sentence

This function is used to classify a particular sentence

In [None]:
def classify_sentence(row):
    """Return the class label ('0'-'3') for the translation stored in ``row``.

    ``row['Translation']`` is normalized and then tested against the rule
    sets of class 1, 2 and 3, in that order. The label of the first rule set
    with at least one matching ngram is returned; '0' means no rule matched.
    """
    text = normalize(row['Translation'])

    rules_by_label = (
        ('1', class_1_ngrams),
        ('2', class_2_ngrams),
        ('3', class_3_ngrams),
    )
    for label, rules in rules_by_label:
        if get_matching_ngrams_by_length(text, rules):
            return label

    return '0'


## Load the dataset and normalize

We use 2 base datasets to generate our new dataset. The first dataset includes more formulaic sentences and comes from [Enhancing and Evaluating the Grammatical Framework Approach to Logic-to-Text Generation](https://aclanthology.org/2022.gem-1.13) (Calò et al., GEM 2022).
The second dataset includes less formulaic sentences and comes from [Harnessing the Power of Large Language Models for Natural Language to First-Order Logic Translation](https://arxiv.org/abs/2305.15541) (Yuan Yang et al., 2023).

This code loads and normalizes the base datasets. We also filter out the following rows:
- Duplicate rows (keeping the first one)
- Rows without a translation

In [None]:
# First base dataset: a local CSV. The code below reads its 'Formula',
# 'Baseline', 'Ranta' and 'LoLa' columns.
df = pd.read_csv('all_results.csv')
# Second base dataset (MALLS): downloaded from the Hugging Face hub on every
# run, so this line needs network access. The code below reads its 'FOL' and
# 'NL' fields.
df2 = pd.read_json('https://huggingface.co/datasets/yuan-yang/MALLS-v0/raw/main/MALLS-v0.1-train.json')

def preproc(s):
    """Clean up one generated translation so it reads as a sentence.

    Steps: replace LaTeX ``\\item`` markers with an HTML line break plus a
    bullet character, drop one leading space, capitalize the first character,
    remove the space in front of commas and colons, and append a full stop.

    Args:
        s: The raw translation. Non-string values are converted with ``str``.

    Returns:
        The cleaned sentence, or ``None`` when the input is missing (``None``
        or a float NaN, which is how pandas represents empty CSV cells) or
        empty. Returning ``None`` instead of the text ``"Nan."`` lets the
        ``isinstance(x, str)`` filter applied to the 'Translation' column
        drop rows without a translation.
    """
    # NaN is the only float that is not equal to itself.
    if s is None or (isinstance(s, float) and s != s):
        return None

    s = str(s)
    s = s.replace(r'\item', "<br>" + chr(8226))

    if s.startswith(" "):
        s = s[1:]
    if not s:
        # Nothing left to capitalize; treat as a missing translation.
        return None

    s = s[0].upper() + s[1:]
    s = s.replace(" ,", ",").replace(" :", ":")
    s = s + "."

    return s

# One output row per (formula, system) pair of the first dataset ...
new_data = [
    {
        "Formula": row["Formula"],
        "Translation": preproc(row[translation_type]),
        "Translation_Type": translation_type,
    }
    for _, row in df.iterrows()
    for translation_type in ["Baseline", "Ranta", "LoLa"]
]

# ... followed by one row per MALLS example, whose text is used unchanged.
new_data.extend(
    {
        "Formula": row["FOL"],
        "Translation": row["NL"],
        "Translation_Type": "MALLS",
    }
    for _, row in df2.iterrows()
)

transformed_df = pd.DataFrame(new_data)
# Keep only the first occurrence of every translation text.
transformed_df = transformed_df.drop_duplicates(subset=['Translation'])
# Drop rows whose translation is not a string (i.e. missing values).
has_text = transformed_df['Translation'].apply(lambda x: isinstance(x, str))
transformed_df = transformed_df[has_text]

## Classify the dataset

Now we can classify the whole dataset

In [None]:
# Classify every row; `progress_apply` behaves like `apply` but shows a tqdm
# progress bar (it is available because `tqdm.pandas()` was called earlier).
# The labels are the strings '0' (no rule matched), '1', '2' and '3'.
transformed_df['Class'] = transformed_df.progress_apply(classify_sentence, axis=1)

  0%|          | 0/37523 [00:00<?, ?it/s]

## Quality checks

Now let's check how good our classifications are

### Basic stats

How many datapoints were we able to classify?

In [None]:
# Class '0' marks the sentences that no rule matched.
is_unclassifiable = transformed_df['Class'] == '0'
print("Classified: ", int((~is_unclassifiable).sum()))
print("Unclassifiable: ", int(is_unclassifiable.sum()))

Classified:  17620
Unclassifiable:  19903


### Mark difference

Marks all sentences that received a different class than we expected based on which system was used to generate the sentence, and prints the ratios

In [None]:
# For every class label, the translation systems whose sentences are expected
# to end up in that class. 'Unclassifiable' is not an actual translation type,
# so every class '0' row counts as a difference.
class_to_translation_type = {
    '0': ['Unclassifiable'],
    '1': ['Baseline'],
    '2': ['Ranta', 'LoLa'],
    '3': ['MALLS'],
}


def mark_difference(row):
    """Return True when the row's source system is unexpected for its class.

    ``row`` must provide the keys 'Class' and 'Translation_Type'.
    """
    allowed_systems = class_to_translation_type[row['Class']]
    actual_system = row['Translation_Type']
    return actual_system not in allowed_systems

# Flag every row whose class is not one expected for its source system.
transformed_df['Difference'] = transformed_df.apply(mark_difference, axis=1)

# Number of rows per class.
class_sizes = transformed_df['Class'].value_counts()
print(class_sizes[['0', '1', '2', '3']])

# Share of expected / unexpected rows within each classified class.
difference_ratios = transformed_df.groupby('Class')['Difference'].value_counts(normalize=True)
print(difference_ratios[['1', '2', '3']])

# Absolute number of unexpected rows among the classified ones (class '0' excluded).
is_different = transformed_df['Difference'] == True
is_classified = transformed_df['Class'] != '0'
print("Difference: ", len(transformed_df[is_different & is_classified]))

0    19903
1     4117
2     4786
3     8717
Name: Class, dtype: int64
Class  Difference
1      False         0.773864
       True          0.226136
2      False         0.695361
       True          0.304639
3      False         0.980039
       True          0.019961
Name: Difference, dtype: float64
Difference:  2563


### Pick sentences

Now let's pick 4000 sentences from each class, so that all the classes are of the same size

In [None]:
# Balance the classes by drawing the same number of rows from each of them.
# Every group is sampled explicitly instead of through `groupby(...).apply`:
# `apply` passing the grouping column to the function is deprecated in recent
# pandas, and once it is excluded the 'Class' column would be lost after
# `reset_index(drop=True)`. Sampling each group with the same seed and
# concatenating in group order selects the same rows as the `apply` version.
# `sample` raises a ValueError if a class has fewer rows than requested.
samples_per_class = 4000
transformed_df = pd.concat(
    [
        group.sample(n=samples_per_class, random_state=1)
        for _, group in transformed_df.groupby('Class')
    ]
).reset_index(drop=True)

## Save the dataset

Filter out all instances without a label and save the transformed and classified dataset

In [None]:
# NOTE(review): the filtered frame on the next line is neither assigned nor
# saved, so the line has no effect: rows with class '0' are still written to
# 'classified.csv' and are still present in the statistics and the train/test
# split further down. Confirm whether class '0' is meant to be kept; if not,
# assign the filtered frame back (or save the filtered frame) instead.
transformed_df[transformed_df['Class'] != '0']
# Write the balanced, classified dataset without the index column.
transformed_df.to_csv('classified.csv', index=False)

## Split the dataset

Now that we have a good dataset, let's split it into a training and test set.

### Count sentence length

First we count the length of each sentence, so we can later check if the lengths are evenly distributed

In [None]:
# Sentence length = number of whitespace-separated words after normalization.
transformed_df['length'] = transformed_df['Translation'].map(
    lambda text: len(normalize(str(text)).split())
)

### Check distribution

Before splitting let's check the distribution of the sentence lengths for each class

In [None]:
# Per-class mean, median, maximum and minimum of the sentence lengths.
lengths_by_class = transformed_df.groupby('Class')['length']
for statistic in ('mean', 'median', 'max', 'min'):
    print(getattr(lengths_by_class, statistic)())

Class
0    15.92825
1    31.80075
2    25.85150
3    16.16975
Name: length, dtype: float64
Class
0    15.0
1    31.0
2    26.0
3    16.0
Name: length, dtype: float64
Class
0    46
1    69
2    50
3    46
Name: length, dtype: int64
Class
0    2
1    9
2    5
3    3
Name: length, dtype: int64


### Split

Now let's actually split the dataset

In [None]:
# Hold out 5% of the rows as test set; stratifying on 'Class' keeps the class
# proportions the same in both parts, and the fixed seed makes the split
# reproducible.
train, test = train_test_split(
    transformed_df,
    stratify=transformed_df['Class'],
    test_size=0.05,
    random_state=1,
)

df_train, df_test = pd.DataFrame(train), pd.DataFrame(test)

### Check distribution again

Now we check the distribution again, to make sure that it didn't change too much

In [None]:
# Print the same per-class length statistics for both parts of the split.
for title, part in (("train: ", df_train), ("test: ", df_test)):
    print(title)
    part_lengths = part.groupby('Class')['length']
    for statistic in ('mean', 'median', 'max', 'min'):
        print(getattr(part_lengths, statistic)())

train: 
Class
0    15.948684
1    31.750000
2    25.823421
3    16.171842
Name: length, dtype: float64
Class
0    15.0
1    31.0
2    26.0
3    16.0
Name: length, dtype: float64
Class
0    46
1    69
2    49
3    46
Name: length, dtype: int64
Class
0    2
1    9
2    5
3    3
Name: length, dtype: int64
test: 
Class
0    15.540
1    32.765
2    26.385
3    16.130
Name: length, dtype: float64
Class
0    15.0
1    33.0
2    27.0
3    16.0
Name: length, dtype: float64
Class
0    31
1    59
2    50
3    33
Name: length, dtype: int64
Class
0     3
1    13
2     8
3     3
Name: length, dtype: int64


### Save train and test datasets

Finally, let's save both the train and test datasets to a csv

In [None]:
# Write both parts of the split to CSV, without the index column.
for part, path in ((df_train, 'train.csv'), (df_test, 'test.csv')):
    part.to_csv(path, index=False)