# Cleaning the QuaTrain datasets 

## Sequential filtering 

In [1]:
import pandas as pd

In [2]:
import itertools

In [3]:
import pickle 

In [4]:
import numpy as np 
import math 

In [5]:
import json

In [6]:
from tqdm.notebook import tqdm

In [64]:
# data from https://github.com/potamides/uniformers/tree/main
df_de = pd.read_json('dataset/QuaTrain-de.json', lines=True)
df_en = pd.read_json('dataset/QuaTrain-en.json', lines=True)

In [5]:
dlk_path = 'dataset/DLK/dlk_full/dlk.v5.german.poetry.corpus.full.json'

In [9]:

with open(dlk_path) as f:
    dlk_dict = json.load(f)

# converting json dataset from dictionary to dataframe
#dlk_df = pd.DataFrame.from_dict(dlk_dict, orient='index')
#dlk_df.head()

In [10]:
all_quads = []
all_quads_nh = []
for pidx, poem in dlk_dict.items():
    poem_quads = []
    for sidx, stanza in poem['poem'].items():
        lines = [line[1]["text"] for line in list(stanza.items())]
        quad_num = len(lines)//4
        quads = []
        for a in range(quad_num):
            b = a * 4
            quad = lines[b:b+4]
            quads.append(quad)
            all_quads_nh.append(quad)
        poem_quads.append(quads)     
    all_quads.append(poem_quads)


In [12]:
with open('dataset/DLK/quatrains.pkl', 'wb') as f:
    pickle.dump(all_quads, f)

In [7]:
with open('dataset/DLK/quatrains.pkl', 'rb') as f:
    all_quads = pickle.load(f)

In [8]:
all_quads_nh = []
for poem in all_quads:
    for stanza in poem:
        for quad in stanza:
            all_quads_nh.append(quad)
            
#print(all_quads_nh)

In [9]:
first_lines = [ar[0] for ar in all_quads_nh]
print(first_lines[:10])

['mein Trieb, der waget warlich', 'Die Schuldigkeit befahl, die arme Schüch-', 'Und öfters knarrend gehn; da süsse Melo-', 'Die Schüchternheit die sprach: zurük mit', 'Den Liedern fehlt das Feur, das in die Her-', 'Sie wandte ferner ein: bedenkest du denn', 'Da Brokkes Saitenspiel, so oft das Herze', 'So sprach ', 'Des Herzens Trieb befahl und zeigte deine', 'Du kennest meinen Trieb auch in den nie-']


In [27]:
vocab_fl = np.unique(first_lines)


In [12]:
len(first_lines)

614789

In [32]:
len(vocab_fl)

343963

In [31]:
print(vocab_fl[1])

             Als ich jüngst, mich zu erquicken,


In [18]:
fl_dic = {}
for i, ar in enumerate(tqdm(list(df_de["text"])[:1000])):
    fl_ar = list(ar)[0] 
    if fl_ar in first_lines:
        if fl_ar in fl_dic.keys():
            fl_dic[fl_ar] += ar
        else:
            fl_dic[fl_ar] = [ar]
            

    

  0%|          | 0/1000 [00:00<?, ?it/s]

In [37]:
vocab_fl_list = list(vocab_fl)

In [39]:
fl_quads = []
fl_quad_indices = []
for i, ar in enumerate(tqdm(list(df_de_unique["text"]))):
    fl_ar = list(ar)[0] 
    if fl_ar in vocab_fl_list:
        fl_quads.append(fl_ar)
        fl_quad_indices.append(i)
        vocab_fl_list.remove(fl_ar)
    

  0%|          | 0/587556 [00:00<?, ?it/s]

In [40]:
len(fl_quads)

291699

Index([   3028,    3042,    3043,    3047,    3098,    3102,    3106,    3110,
          3114,    3118,
       ...
       1483616, 1483620, 1483621, 1483625, 1483626, 1483630, 1483658, 1483661,
       1483664, 1483667],
      dtype='int64', length=291699)

In [46]:
df_fl = df_de_unique.loc[df_de_unique.index[fl_quad_indices], :]

In [48]:
_, counts = np.unique(np.concatenate(list(df_fl["text"])), return_counts=True)
num_doubles = (counts>1).sum()
print(num_doubles)

9636


In [9]:
def sequential_filter(df):
    last_t = ""
    overlapping = []
    unique_rows = []
    for i, row in df.iterrows():
        if list(row["text"])[0] == last_t and len(overlapping) < 4:
            overlapping.append(row)
        elif len(overlapping) == 0:
            overlapping = [row]
        else:
            unique_rows.append(overlapping[0])
            overlapping = [row]
        last_t = list(row["text"])[1]
    unique_rows.append(overlapping[0]) 
    #print(unique_rows)   
    df_unique = pd.DataFrame(unique_rows)
    return(df_unique)

### German data

In [10]:
len(df_de)

1483685

In [5]:
df_de.head()

Unnamed: 0,text,language,rhyme,meter,alliteration
0,"[Da jener Elbe-Schwan der Gottheit Ruhm, Da Br...",de,ABCD,iambus,0.040973
1,"[Da Brokkes Saitenspiel, so oft das Herze, Das...",de,ABCD,iambus,0.054848
2,"[gen, der belebt Feld, Berg und Thal;, Der erw...",de,ABCD,iambus,0.023747
3,"[Ueber die erhabnen Berge, und erwekt das Feld...",de,ABCD,alexandrine,0.044927
4,[Als mit holden Dünsten füllet; da was aus den...,de,ABCD,other,0.084108


In [None]:
vocab_de_original = np.unique(np.concatenate(list(df_de["text"])))

In [None]:
# number of unique lines 
len(vocab_de_original)

In [11]:
df_de_unique = sequential_filter(df_de)

In [12]:
len(df_de_unique)

587556

In [14]:
with open('dataset/QuaTrain-de-sequential.pkl', 'wb') as f:
    pickle.dump(df_de_unique, f)

### English data

In [67]:
len(df_en)

662885

In [None]:
vocab_en_original = np.unique(np.concatenate(list(df_en["text"])))


In [10]:
len(vocab_en_original)

969129

In [30]:
df_en_unique = sequential_filter(df_en)

In [32]:
len(df_en_unique)

236753

In [41]:
with open('dataset/QuaTrain-en-unique.pkl', 'wb') as f:
    pickle.dump(df_en_unique, f)

## Eliminating remaining overlaps 

In [15]:
load_unique_dfs = True

In [16]:
if load_unique_dfs:
    with open('dataset/QuaTrain-de-sequential.pkl', 'rb') as f:
        df_de_unique = pickle.load(f)
    #with open('dataset/QuaTrain-en-unique.pkl', 'rb') as f:
        #df_en_unique = pickle.load(f)

In [39]:
def get_unique_lines(df):
    tokens, inv, counts = np.unique(np.concatenate(list(df["text"])), return_counts=True, return_inverse=True)
    index = df.index
    quads = np.resize(inv, (int(inv.size/4), 4))
    i_first = []
    i_nf = []
    i_ul = []
    i_nul = []
    for i, a in enumerate(quads):
        if counts[a[0]]==1:
            i_first.append(index[i])
        else:
            i_nf.append(index[i])
        if 1 in [counts[x] for x in a]:
            i_ul.append(index[i])
        else:
            i_nul.append(index[i])
            
    df_first = df.loc[i_first, :]
    df_nf = df.loc[i_nf, :]
    df_ul = df.loc[i_ul, :]
    df_nul = df.loc[i_nul, :]
    return((df_first, df_nf), (df_ul, df_nul))

(df_f, df_nf), (df_ul, df_nul) = get_unique_lines(df_de_unique[:100000])
print(len(df_f))
print(len(df_nf))
print(len(df_ul))
print(len(df_nul))


81256
18744
82732
17268


In [40]:
def check_remaining(df_ul, df_remaining):
    text_ul = np.concatenate(list(df_ul["text"]))
    tokens, inv, counts = np.unique(text_ul, return_counts=True, return_inverse=True)
    index_remaining = df_remaining.index
    i_to_keep = []
    for i, l in tqdm(enumerate(list(df_remaining["text"]))): 
        unique = True 
        l = list(l)
        for t in l:
            if t in tokens: 
                unique = False
                break
        if unique:
            text_ul = np.append(text_ul, l)
            tokens, inv, counts = np.unique(text_ul, return_counts=True, return_inverse=True)
            i_to_keep.append(index_remaining[i])
    return(i_to_keep)
        



In [41]:
i_to_keep = check_remaining(df_ul, df_nul)

235it [00:40,  5.86it/s]


KeyboardInterrupt: 

In [38]:
print(i_to_keep)

[]


In [36]:
from tqdm import tqdm 

In [53]:
def get_unique_quatrains(quads, vocab, index, p = 10):
    i = 0
    quads_current = quads.copy()
    while sum(np.unique(quads_current, return_counts=True)[1]>1) > 0: 
        print("iteration ", i)
        i += 1 
        tokens, inv, counts = np.unique(quads_current, return_counts=True, return_inverse=True)
        vocab = vocab[tokens]
        quads_current = np.resize(inv, (int(inv.size/4), 4))
        #print("quads_current", quads_current)
        print("number of double elements:", sum(counts>1))
        quad_counts = []
        for arr in quads_current: 
            num_arr = []
            for num in arr:
                num_arr.append(counts[num])
            quad_counts.append(num_arr)
        quad_count_sums = [sum(arr) for arr in quad_counts]
        quad_indices_sorted = sorted(range(len(quad_count_sums)), key=lambda k: quad_count_sums[k])
        double_num = (np.array(quad_count_sums)>4).sum()
        #double_num = np.array(quad_count_sums).sum() - 4 * len(quad_count_sums)
        num_to_delete = math.ceil(double_num/p)
        print("elements to be deleted: ", num_to_delete)
        to_delete = np.array(quad_indices_sorted[-num_to_delete:])
        #to_delete = np.array([ind for ind in to_delete if not 1 in quad_counts[ind]])
        mask = np.ones(len(quads_current), dtype=bool)
        mask[to_delete] = False
        quads_current = quads_current[mask]
        index = index[mask]
        print("number of QuaTrains remaining: ", len(quads_current))
    return(quads_current, vocab, index)

### German data

In [49]:
vocab, inv, counts = np.unique(np.concatenate(list(df_fl["text"])), return_inverse=True, return_counts = True)

In [50]:
# number of unique lines in the data 
len(vocab)

1155299

In [51]:
# Quatrains with lines encoded as indexes in vocab
quads = np.resize(inv, (int(inv.size/4), 4))

In [57]:
print(quads)

[[ 862186  806399  173619  591538]
 [1049307   96214  909415   94428]
 [ 559946  826978  429788  836124]
 ...
 [ 721903  376581  866741  774077]
 [1036802  755233  909174 1036650]
 [ 447637   54339  648461  584401]]


In [56]:
quads.size

1166796

In [58]:
# 1/p is the fraction of Quatrains containing duplicates that are eliminated at each step
quads_unique_p10, vocab_new, quad_index = get_unique_quatrains(quads, vocab, df_fl.index, p=20)

iteration  0
number of double elements: 9636
elements to be deleted:  639
number of QuaTrains remaining:  291060
iteration  1
number of double elements: 9444
elements to be deleted:  606
number of QuaTrains remaining:  290454
iteration  2
number of double elements: 9060
elements to be deleted:  571
number of QuaTrains remaining:  289883
iteration  3
number of double elements: 8205
elements to be deleted:  531
number of QuaTrains remaining:  289352
iteration  4
number of double elements: 7297
elements to be deleted:  495
number of QuaTrains remaining:  288857
iteration  5
number of double elements: 5921
elements to be deleted:  447
number of QuaTrains remaining:  288410
iteration  6
number of double elements: 5341
elements to be deleted:  421
number of QuaTrains remaining:  287989
iteration  7
number of double elements: 4734
elements to be deleted:  387
number of QuaTrains remaining:  287602
iteration  8
number of double elements: 4229
elements to be deleted:  359
number of QuaTrains re

number of double elements: 53
elements to be deleted:  6
number of QuaTrains remaining:  282123
iteration  74
number of double elements: 50
elements to be deleted:  5
number of QuaTrains remaining:  282118
iteration  75
number of double elements: 47
elements to be deleted:  5
number of QuaTrains remaining:  282113
iteration  76
number of double elements: 44
elements to be deleted:  5
number of QuaTrains remaining:  282108
iteration  77
number of double elements: 41
elements to be deleted:  5
number of QuaTrains remaining:  282103
iteration  78
number of double elements: 38
elements to be deleted:  4
number of QuaTrains remaining:  282099
iteration  79
number of double elements: 36
elements to be deleted:  4
number of QuaTrains remaining:  282095
iteration  80
number of double elements: 34
elements to be deleted:  4
number of QuaTrains remaining:  282091
iteration  81
number of double elements: 32
elements to be deleted:  4
number of QuaTrains remaining:  282087
iteration  82
number of 

In [59]:
# df containing non-overlapping Quatrains
unique_rows_de = df_fl.loc[quad_index, :]

In [60]:
len(vocab_new)

1128195

In [61]:
len(unique_rows_de)

282048

In [None]:
vocab_original = np.unique(np.concatenate(list(df_de["text"])))
len(vocab_original)

In [62]:
with open('dataset/QuaTrain-de-unique-p20-fl.pkl', 'wb') as f:
    pickle.dump(unique_rows_de, f)

### English data

In [33]:
vocab_en, inv_en, counts_en = np.unique(np.concatenate(list(df_en_unique["text"])), return_inverse=True, return_counts = True)
quads_en = np.resize(inv_en, (int(inv_en.size/4), 4))
quads_unique_en_p10, vocab_new_en, quad_index_en = get_unique_quatrains(quads_en, vocab_en, df_en_unique.index, p=10)
unique_rows_en = df_en_unique.loc[quad_index_en, :]
unique_text_en = vocab_new_en[quads_unique_en_p10]
#unique_text_en

iteration  0
number of double elements: 50664
elements to be deleted:  4349
number of QuaTrains remaining:  232404
iteration  1
number of double elements: 46005
elements to be deleted:  3877
number of QuaTrains remaining:  228527
iteration  2
number of double elements: 39391
elements to be deleted:  3399
number of QuaTrains remaining:  225128
iteration  3
number of double elements: 30549
elements to be deleted:  2928
number of QuaTrains remaining:  222200
iteration  4
number of double elements: 24332
elements to be deleted:  2597
number of QuaTrains remaining:  219603
iteration  5
number of double elements: 19620
elements to be deleted:  2261
number of QuaTrains remaining:  217342
iteration  6
number of double elements: 15220
elements to be deleted:  1939
number of QuaTrains remaining:  215403
iteration  7
number of double elements: 11884
elements to be deleted:  1689
number of QuaTrains remaining:  213714
iteration  8
number of double elements: 9762
elements to be deleted:  1465
numbe

In [35]:
len(unique_rows_en)

201856

In [38]:
with open('dataset/QuaTrain-en-unique-p10.pkl', 'wb') as f:
    pickle.dump(unique_rows_en, f)