## Data filtering with top 20 most common edits

### Import

In [20]:
from collections import Counter
import jsonlines

#### Install packages
pip install jsonlines

### Loading the data

In [21]:
# Read every training record into memory, then collect each record's label
# list. `edits` references the label lists exactly as they were loaded, i.e.
# before the later cell replaces d["labels"] with a stripped copy.
with jsonlines.open('./data_full/ptrain.jsonl') as f:
    data = list(f.iter())
edits = [record["labels"] for record in data]

In [22]:
# Read every validation record into memory, then collect each record's label
# list as loaded from the file.
with jsonlines.open('./data_full/val.jsonl') as f:
    data_val = list(f.iter())
edits_val = [record["labels"] for record in data_val]

### removing [CLS] and [SEP] tokens

In [23]:
def removetoken(x):
    """Return ``x`` without its first and last elements.

    Used below to drop the leading and trailing entries of the token and
    label sequences (the [CLS] / [SEP] positions). Works on any sliceable
    sequence; inputs with fewer than three elements yield an empty sequence.
    """
    return x[1:-1]

In [24]:
# Drop the first and last entry of the token and label sequences on every
# training record (the [CLS] / [SEP] positions). Each field is rebound to a
# new list, so lists captured earlier (e.g. in `edits`) are left untouched.
for record in data:
    for field in ("tokens", "labels"):
        record[field] = removetoken(record[field])

In [25]:
# Same boundary stripping as for the training records, applied to the
# validation records.
for record in data_val:
    for field in ("tokens", "labels"):
        record[field] = removetoken(record[field])

In [26]:
print(f"{len(data):,}")

200,000


In [27]:
print(f"{len(data_val):,}")

84,974


### Finding the 20 most common edits

In [28]:
# Flatten the per-sentence label lists into one long list of edit labels so
# they can be counted across the whole training set.
flatten_edits_1 = [label for sentence_labels in edits for label in sentence_labels]

In [29]:
print(f"{len(flatten_edits_1):,}")

5,145,773


In [30]:
len(edits)

200000

In [31]:
# Count every edit label, then keep the 20 most frequent as
# (label, count) pairs, most frequent first.
edit_counts = Counter(flatten_edits_1)
key_1 = edit_counts.most_common(20)

In [32]:
# Keep only the label from each (label, count) pair; the counts are not
# needed for filtering.
key_1 = [label for label, _count in key_1]

In [33]:
key_1

['$KEEP',
 '$DELETE',
 '$REPLACE_.',
 '$REPLACE_,',
 '$TRANSFORM_AGREEMENT_SINGULAR',
 '$APPEND_.',
 '$APPEND_,',
 '$APPEND_the',
 '$REPLACE_to',
 '$REPLACE_the',
 '$TRANSFORM_VERB_VBZ_VB',
 '$REPLACE_of',
 '$REPLACE_in',
 '$TRANSFORM_VERB_VBG_VB',
 '$TRANSFORM_VERB_VBN_VB',
 '$APPEND_to',
 '$TRANSFORM_AGREEMENT_PLURAL',
 '$APPEND_of',
 '$APPEND_and',
 '$APPEND_a']

### Filtering the sentence 
 
 Keep only the sentences whose labels all belong to the 20 most common edits.

In [34]:
# Keep only the training records whose labels are all among the top-20 edits.
# The allowed set is built once here rather than once per record, which the
# original loop did for every one of the records in `data`.
allowed_edits = frozenset(key_1)
final = [record for record in data if allowed_edits.issuperset(record["labels"])]

In [35]:
# Keep only the validation records whose labels are all among the top-20
# edits. The allowed set is built once here rather than once per record.
allowed_edits = frozenset(key_1)
final_val = [record for record in data_val if allowed_edits.issuperset(record["labels"])]

In [36]:
print(f"{len(final):,}")

52,731


In [37]:
print(f"{len(final_val):,}")

25,865


### Writing back to the file

In [38]:
# Persist the filtered training records, one JSON object per line. The
# records are written with their tokens/labels already stripped of the
# first and last entry by the earlier cell.
with jsonlines.open('./data_filter/ptrain.jsonl',mode='w') as f:
    f.write_all(final)

In [39]:
# Persist the filtered validation records, one JSON object per line.
with jsonlines.open('./data_filter/val.jsonl',mode='w') as f:
    f.write_all(final_val)

## Data filtering: fewer `$KEEP` labels, more of the other edits

### Import

In [61]:
from collections import Counter
import jsonlines
import random

### Loading the data

In [62]:
# Load the top-20-filtered training set. The two raw-sentence fields are
# deleted from each record before it is stored; the label list is collected
# separately in `edits`.
data = []
edits = []
with jsonlines.open('./data_filter/ptrain.jsonl') as f:
    for record in f.iter():
        for unused_key in ("original_sentence", "errored_sentence"):
            del record[unused_key]
        data.append(record)
        edits.append(record["labels"])

### removing [CLS] and [SEP] tokens

In [63]:
def removetoken(x):
    """Return ``x`` without its first and last elements.

    Intended for dropping the [CLS] / [SEP] positions from a token or label
    sequence. Works on any sliceable sequence; inputs with fewer than three
    elements yield an empty sequence.
    """
    return x[1:-1]

In [64]:
# The records in ./data_filter/ptrain.jsonl were written by the first section
# *after* their first and last entries had been removed, so stripping again
# unconditionally would throw away the first and last real token (and label)
# of every sentence. Strip only when the boundary markers are still present.
# NOTE(review): assumes the markers are the literal strings "[CLS]" and
# "[SEP]" in d["tokens"] -- confirm against the tokenizer output.
for d in data:
    tokens = d["tokens"]
    if tokens and tokens[0] == "[CLS]" and tokens[-1] == "[SEP]":
        d["tokens"] = removetoken(tokens)
        d["labels"] = removetoken(d["labels"])

In [65]:
# Keep the records in which $KEEP labels outnumber all other labels by at
# most 5 to 1, i.e. sentences with a comparatively high share of real edits.
# `data` and `edits` were filled in lockstep, so they can be walked together.
wanted = []
for record, labels in zip(data, edits):
    keep_count = labels.count("$KEEP")
    edit_count = len(labels) - keep_count
    if keep_count <= 5 * edit_count:
        wanted.append(record)

In [66]:
len(wanted)

9825

In [9]:
# Shuffle the selected records in place. No random seed is set in the cells
# shown here, so the resulting order differs from run to run.
random.shuffle(wanted)

In [67]:
# Double the list in place: every selected record now appears twice
# (presumably to oversample these sentences -- confirm the intent). The second
# half repeats the first half in the same order.
wanted *= 2

In [None]:
9825

In [68]:
# Persist the selected records, one JSON object per line.
# NOTE(review): "ptrin" looks like a typo for "ptrain"; the read-back cell
# below opens the same spelling, so rename both together if it is changed.
with jsonlines.open('./data_filter_1/ptrin.jsonl',mode='w') as f:
    f.write_all(wanted)

In [43]:
# Running count of records read back from the written file; must be reset
# here before re-running the counting cell, which only ever adds to it.
c=0

In [44]:
# Sanity check: add the number of records in the written file to `c`.
with jsonlines.open('./data_filter_1/ptrin.jsonl') as f:
    c += sum(1 for _ in f.iter())

In [45]:
c

19650

In [None]:
# Reload the full (unfiltered) training set. The two raw-sentence fields are
# deleted from each record before it is stored; the label list is collected
# separately in `edits`.
data = []
edits = []
with jsonlines.open('./data_full/ptrain.jsonl') as f:
    for record in f.iter():
        for unused_key in ("original_sentence", "errored_sentence"):
            del record[unused_key]
        data.append(record)
        edits.append(record["labels"])

In [None]:
def removetoken(x):
    """Return ``x`` without its first and last elements.

    Intended for dropping the [CLS] / [SEP] positions from a token or label
    sequence. Works on any sliceable sequence; inputs with fewer than three
    elements yield an empty sequence.
    """
    return x[1:-1]

In [None]:
# Drop the first and last entry of the token and label sequences on every
# record just loaded from the full training file (the [CLS] / [SEP]
# positions). Each field is rebound to a new list, so the lists held in
# `edits` keep their original length.
for record in data:
    for field in ("tokens", "labels"):
        record[field] = removetoken(record[field])

In [None]:
# Collect the indices of records in which $KEEP labels outnumber all other
# labels by more than 2 to 1.
unwanted = []
for idx, labels in enumerate(edits):
    keep_count = labels.count("$KEEP")
    edit_count = len(labels) - keep_count
    if keep_count > 2 * edit_count:
        unwanted.append(idx)

In [None]:
len(data)

In [None]:
len(edits)

In [None]:
len(unwanted)

In [None]:
200000-199515

In [6]:
10+10+16+5+7+10+2000+2005+1963

6026

In [10]:
# Scratch check of list concatenation: appending `d` onto `l` in place.
l, d = [1, 2, 3], [4, 5, 6]
l += d

In [11]:
l

[1, 2, 3, 4, 5, 6]

In [12]:
# Scratch check: extending a list with itself doubles it in place.
l *= 2

In [13]:
l

[1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6]