# Cleaning the QuaTrain datasets 

## Sequential filtering 

In [1]:
import pandas as pd

In [2]:
import itertools

In [3]:
import pickle 

In [4]:
import numpy as np 
import math 

In [5]:
# data from https://github.com/potamides/uniformers/tree/main
df_de = pd.read_json('dataset/QuaTrain-de.json', lines=True)
df_en = pd.read_json('dataset/QuaTrain-en.json', lines=True)

In [66]:
len(df_de)

1483685

In [5]:
df_de.head()

Unnamed: 0,text,language,rhyme,meter,alliteration
0,"[Da jener Elbe-Schwan der Gottheit Ruhm, Da Br...",de,ABCD,iambus,0.040973
1,"[Da Brokkes Saitenspiel, so oft das Herze, Das...",de,ABCD,iambus,0.054848
2,"[gen, der belebt Feld, Berg und Thal;, Der erw...",de,ABCD,iambus,0.023747
3,"[Ueber die erhabnen Berge, und erwekt das Feld...",de,ABCD,alexandrine,0.044927
4,[Als mit holden Dünsten füllet; da was aus den...,de,ABCD,other,0.084108


In [67]:
len(df_en)

662885

In [None]:
vocab_de_original = np.unique(np.concatenate(list(df_de["text"])))

In [7]:
len(vocab_de_original)

1416395

In [None]:
vocab_en_original = np.unique(np.concatenate(list(df_en["text"])))


In [10]:
len(vocab_en_original)

969129

In [17]:
def sequential_filter(df):
    last_t = ""
    overlapping = []
    unique_rows = []
    for i, row in df.iterrows():
        if list(row["text"])[0] == last_t and len(overlapping) < 4:
            overlapping.append(row)
        elif len(overlapping) == 0:
            overlapping = [row]
        else:
            unique_rows.append(overlapping[0])
            overlapping = [row]
        last_t = list(row["text"])[1]
    unique_rows.append(overlapping[0]) 
    #print(unique_rows)   
    df_unique = pd.DataFrame(unique_rows)
    return(df_unique)

In [18]:
df_de_unique = sequential_filter(df_de)

In [19]:
len(df_de_unique)

587556

In [20]:
vocab, inv, counts = np.unique(np.concatenate(list(df_de_unique["text"])), return_inverse=True, return_counts = True)

In [30]:
df_en_unique = sequential_filter(df_en)

In [12]:
len(df_de_unique)

363698

In [32]:
len(df_en_unique)

236753

In [41]:
with open('dataset/QuaTrain-en-unique.pkl', 'wb') as f:
    pickle.dump(df_en_unique, f)

In [42]:
with open('dataset/QuaTrain-de-unique.pkl', 'wb') as f:
    pickle.dump(df_de_unique, f)

## Eliminating remaining overlaps 

In [11]:
with open('dataset/QuaTrain-de-unique.pkl', 'rb') as f:
    df_de_unique = pickle.load(f)

In [14]:
with open('dataset/QuaTrain-en-unique.pkl', 'rb') as f:
    df_en_unique = pickle.load(f)

In [25]:
def get_unique_quatrains(quads, vocab, index, p = 10):
    i = 0
    quads_current = quads.copy()
    while sum(np.unique(quads_current, return_counts=True)[1]>1) > 0: 
        print("iteration ", i)
        i += 1 
        tokens, inv, counts = np.unique(quads_current, return_counts=True, return_inverse=True)
        vocab = vocab[tokens]
        quads_current = np.resize(inv, (int(inv.size/4), 4))
        #print("quads_current", quads_current)
        print("number of double elements:", sum(counts>1))
        quad_counts = []
        for arr in quads_current: 
            num_arr = []
            for num in arr:
                num_arr.append(counts[num])
            quad_counts.append(num_arr)
        quad_count_sums = [sum(arr) for arr in quad_counts]
        quad_indices_sorted = sorted(range(len(quad_count_sums)), key=lambda k: quad_count_sums[k])
        double_num = (np.array(quad_count_sums)>4).sum()
        num_to_delete = math.ceil(double_num/p)
        print("elements to be deleted: ", num_to_delete)
        to_delete = np.array(quad_indices_sorted[-num_to_delete:])
        mask = np.ones(len(quads_current), dtype=bool)
        mask[to_delete] = False
        quads_current = quads_current[mask]
        index = index[mask]
        print("number of QuaTrains remaining: ", len(quads_current))
    return(quads_current, vocab, index)

### German data

In [12]:
vocab, inv, counts = np.unique(np.concatenate(list(df_de_unique["text"])), return_inverse=True, return_counts = True)

In [39]:
# number of unique lines in the data 
len(vocab)

1295972

In [22]:
# Quatrains with lines encoded as indexes in vocab
quads = np.resize(inv, (int(inv.size/4), 4))

In [23]:
print(quads)

[[ 108516  102251  144594 1140759]
 [1234052  240664  952902  458133]
 [ 907584 1102406  672988   62561]
 ...
 [ 806337  423299  969311  865509]
 [1160603  843661 1016932 1160434]
 [ 502582   61155  725472  654373]]


In [26]:
# 1/p is the fraction of Quatrains containing duplicates that are eliminated at each step
quads_unique_p10, vocab_new, quad_index = get_unique_quatrains(quads, vocab, df_de_unique.index, p=10)

iteration  0
number of double elements: 993653
elements to be deleted:  51577
number of QuaTrains remaining:  535979
iteration  1
number of double elements: 918406
elements to be deleted:  46200
number of QuaTrains remaining:  489779
iteration  2
number of double elements: 826082
elements to be deleted:  41475
number of QuaTrains remaining:  448304
iteration  3
number of double elements: 743099
elements to be deleted:  37326
number of QuaTrains remaining:  410978
iteration  4
number of double elements: 668425
elements to be deleted:  33592
number of QuaTrains remaining:  377386
iteration  5
number of double elements: 601237
elements to be deleted:  30232
number of QuaTrains remaining:  347154
iteration  6
number of double elements: 540753
elements to be deleted:  27208
number of QuaTrains remaining:  319946
iteration  7
number of double elements: 486333
elements to be deleted:  24487
number of QuaTrains remaining:  295459
iteration  8
number of double elements: 437350
elements to be de

In [27]:
# df containing non-overlapping Quatrains
unique_rows_de = df_de_unique.loc[quad_index, :]

In [28]:
len(vocab_new)

305115

In [29]:
len(unique_rows_de)

76278

In [None]:
with open('dataset/QuaTrain-de-unique-p10.pkl', 'wb') as f:
    pickle.dump(unique_rows_de, f)

### English data

In [33]:
vocab_en, inv_en, counts_en = np.unique(np.concatenate(list(df_en_unique["text"])), return_inverse=True, return_counts = True)
quads_en = np.resize(inv_en, (int(inv_en.size/4), 4))
quads_unique_en_p10, vocab_new_en, quad_index_en = get_unique_quatrains(quads_en, vocab_en, df_en_unique.index, p=10)
unique_rows_en = df_en_unique.loc[quad_index_en, :]
unique_text_en = vocab_new_en[quads_unique_en_p10]
#unique_text_en

iteration  0
number of double elements: 50664
elements to be deleted:  4349
number of QuaTrains remaining:  232404
iteration  1
number of double elements: 46005
elements to be deleted:  3877
number of QuaTrains remaining:  228527
iteration  2
number of double elements: 39391
elements to be deleted:  3399
number of QuaTrains remaining:  225128
iteration  3
number of double elements: 30549
elements to be deleted:  2928
number of QuaTrains remaining:  222200
iteration  4
number of double elements: 24332
elements to be deleted:  2597
number of QuaTrains remaining:  219603
iteration  5
number of double elements: 19620
elements to be deleted:  2261
number of QuaTrains remaining:  217342
iteration  6
number of double elements: 15220
elements to be deleted:  1939
number of QuaTrains remaining:  215403
iteration  7
number of double elements: 11884
elements to be deleted:  1689
number of QuaTrains remaining:  213714
iteration  8
number of double elements: 9762
elements to be deleted:  1465
numbe

In [35]:
len(unique_rows_en)

201856

In [38]:
with open('dataset/QuaTrain-en-unique-p10.pkl', 'wb') as f:
    pickle.dump(unique_rows_en, f)