In [5]:
import json
import os

import sklearn.datasets
import re

import nlpaug.augmenter.word as naw
import nlpaug.model.word_stats as nmw
import numpy as np
# Seed NumPy's global RNG so runs of this notebook are repeatable.
np.random.seed(0)

data_root = "data/scienceie/multi_head_selection"
train_data = "train.txt"
dev_data = "dev.txt"
test_data = "test.txt"


def _load_texts(file_name):
    """Return the ``'text'`` field of every JSON line in ``data_root/file_name``.

    Each line of the file is parsed as one JSON object. The file is opened in
    a ``with`` block so the handle is closed as soon as reading finishes
    (the previous bare ``open(...)`` calls were never closed).

    Args:
        file_name: Name of a JSON-lines file located under ``data_root``.

    Returns:
        A list with one ``instance['text']`` value per line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a line is not valid JSON.
        KeyError: If a parsed line has no ``'text'`` key.
    """
    with open(os.path.join(data_root, file_name), 'r') as f:
        return [json.loads(line.strip("\n"))['text'] for line in f]


train_text_list = _load_texts(train_data)
dev_text_list = _load_texts(dev_data)
test_text_list = _load_texts(test_data)

# NOTE(review): the TF-IDF statistics are fitted on train + test texts (dev is
# left out) — confirm that including the test split is intended.
train_text_tokens = train_text_list + test_text_list

# Train TF-IDF model
tfidf_model = nmw.TfIdf()
tfidf_model.train(train_text_tokens)
tfidf_model.save('.')

# Load TF-IDF augmenter from the model files just saved to the working directory.
aug = naw.TfIdfAug(
    model_path='.',
    action='substitute',
    aug_p=0.3,
    tokenizer=lambda x: x.split(),
    reverse_tokenizer=lambda ls: " ".join(ls),
)

# Each dev entry is joined with single spaces to form the sentence string that
# is passed to the augmenter (presumably each entry is a list of tokens — verify).
texts = [" ".join(tokens) for tokens in dev_text_list]

In [6]:
# Length of the longest entry in the test split.
max(map(len, test_text_list))

119

In [None]:
# Augment every sentence in `texts` with the augmenter currently bound to `aug`,
# echo each original/augmented pair, and keep both as whitespace-split token lists.
original, augmented = [], []
separator = '-'*20
for text in texts:
    augmented_text = aug.augment(text)

    print(separator)
    print('Original Input:{}'.format(text))
    print('Agumented Output:{}'.format(augmented_text))

    original_tokens = text.split()
    augmented_tokens = augmented_text.split()
    original.append(original_tokens)
    augmented.append(augmented_tokens)

In [None]:
# Persist both token-list collections under data_root, one space-joined
# sentence per line; the original file is written first, then the augmented one.
for file_name, sentences in (
    ("unlabelled_original_test.txt", original),
    ("unlabelled_augmented_test.txt", augmented),
):
    with open(os.path.join(data_root, file_name), 'w') as f:
        f.writelines(" ".join(text) + '\n' for text in sentences)

In [None]:
# Print one original/augmented pair side by side for a quick visual check.
index = 150
for collection in (original, augmented):
    print(" ".join(collection[index]))

In [2]:
stopwords = ['et', 'al', 'i.' , 'e.', 'can']
# Raw string: '\.' is not a valid escape sequence in a normal string literal.
# The pattern value itself is unchanged.
stopwords_regex = r'.\.'
# Load the SciBERT contextual word-embedding augmenter (light setting, aug_p=0.1).
aug = naw.ContextualWordEmbsAug(
    model_path='allenai/scibert_scivocab_cased', action="substitute", device="cuda:1", aug_p=0.1, stopwords=stopwords, stopwords_regex=stopwords_regex )

texts = list(map(lambda x: " ".join(x), dev_text_list))
print("Number of sentences:", len(texts))

# Maximum number of re-augmentation attempts per sentence when the output
# contains the '[UNK]' placeholder.
max_unk_retries = 20

unlabelled = []
for text in texts:
    augmented_text1 = aug.augment(text)
    # The retry counter is reset for every sentence so that retries spent on
    # earlier sentences do not shrink the budget of later ones.
    unk_count = 0
    while '[UNK]' in augmented_text1 and unk_count < max_unk_retries:
        augmented_text1 = aug.augment(text)
        unk_count += 1
    # Skip only when the final attempt still contains '[UNK]'; a sentence whose
    # last permitted retry succeeded is kept.
    if '[UNK]' in augmented_text1:
        print("skipped", text)
        continue
    unlabelled.append(augmented_text1)

Number of sentences: 414
Augmenting: 0 A bond failure is thought of as a micro - crack nucleation , specifically as a separation between the adjacent cells in the cellular structure along their common face .
Augmenting: 1 Initially , the micro - cracks may be dispersed in the model reflecting the random distribution of pore sizes and the low level of interaction due to force redistribution .
Augmenting: 2 Interaction and coalescence may follow as the population of micro - cracks increases .
Augmenting: 3 These situations are illustrated in Fig . 3 .
Augmenting: 4 The structure of the failed surface can be represented with a mathematical graph , where graph nodes represent failed faces and graph edges exist between failed faces with common triple line in the cellular structure , i.e. where two micro - cracks formed a continuous larger crack .
skipped The structure of the failed surface can be represented with a mathematical graph , where graph nodes represent failed faces and graph edge

Augmenting: 50 Full updates of the interaction potential are only required for the grid points that are related to charges that hopped during the last iteration .
Augmenting: 51 Accumulative rounding errors that arise due to repetitive addition and subtraction are solve this by rounding all interaction potentials to a uniformly spaced range of floating point numbers .
Augmenting: 52 Half metallic ferromagnets ( HMF ) have attracted enormous interest due to their applications in spintronic devices [ 1]. Dilute magnetic semiconductors ( DMSs ) are considered to be the best materials to show half metallicity .
Augmenting: 53 These materials have two components , one being a semiconducting material with diamagnetic properties while the other is a magnetic dopant such as transition metal having un - paired d electrons
Augmenting: 54 [ 2].
Augmenting: 55 The major advantage of these materials is utilization of electron 's spin as information carrier since advanced functionalities in spintron

Augmenting: 90 – Nuclear Energy Agency ( OECD - NEA ) has sponsored an ongoing benchmark entitled “ Uncertainty Analysis in Modelling ” ( UAM ) related to these efforts .
Augmenting: 91 The goal of this work is to offer a strategy for computing lattice sensitivities using the DRAGON lattice code and WIMS - D4 multi - group library .
Augmenting: 92 Results are presented with comparison to those from TSUNAMI , developed by Oak Ridge National Laboratories .
Augmenting: 93 Zirconium alloys are used as fuel cladding in pressurised and boiling water nuclear reactors .
Augmenting: 94 As such these materials are exposed to a large number of environmental factors that will promote degradation mechanisms such as oxidation .
Augmenting: 95 At high burn - ups , i.e. extended service life , oxidation and the associated hydrogen pick - up can be a limiting factor in terms of fuel efficiency and safety .
skipped At high burn - ups , i.e. extended service life , oxidation and the associated hydrogen p

Augmenting: 135 Other optical methods , under more recent development , also show some promise to achieve enantiomer separation , as will be introduced later .
Augmenting: 136 There are some relevant studies on information dissemination in transportation systems using simulations .
Augmenting: 137 One category of studies look at how either local information ( only about the neighbours ) or global information ( about the entire network ) affects the global network performance .
Augmenting: 138 Our approach is different in the sense that we investigate the impact of information on the global network performance depending on the fraction of people that receive information .
Augmenting: 139 We analyse what is the effect of real time information dissemination and explain why this effect appears .
Augmenting: 140 Information is disseminated in real time and contains global details about how congested the roads are .
Augmenting: 141 This approach is important as it gives insights on the impac

Augmenting: 184 Moreover , this paper showed that the fluid mechanics are accurately modelled using the LES framework .
Augmenting: 185 In the current paper , this framework is extended to account for non - spherical particles .
Augmenting: 186 This work shows how our approach based on the combination of Statistical Mechanics and nonlinear PDEs theory provides us with a novel and powerful tool to tackle phase transitions .
Augmenting: 187 This method leads to solution of perhaps the most known test - case that exhibits a first order phase transition ( semi - heuristically described ) such as the van der Waals model .
Augmenting: 188 In particular we have obtained the first global mean field partition function ( Eq .
Augmenting: 189 ( 9 ) ) , for a system of finite number of particles .
Augmenting: 190 The partition function is a solution to the Klein – Gordon equation , reproduces the van der Waals isotherms away from the critical region and , in the thermodynamic limit
Augmenting: 191

Augmenting: 230 UO2+x reacts with the pyrocarbon coating layer according to the equilibrium:(1)UO2+x + xC → UO2 + xCO
Augmenting: 231 The remainder of our discussion proceeds as follows .
Augmenting: 232 In Section 2 we briefly describe the problem of cell tracking and introduce our approach to cell tracking , which may be regarded as fitting a mathematical model to experimental image data sets .
Augmenting: 233 We present the geometric evolution law model we seek to fit , which is a simplification of recently developed models in the literature that show good agreement with experiments [ 8,10–12,4,13,9].
Augmenting: 234 We finish Section 2 by reformulating our model into the phase field framework , which appears more suitable for the problem in hand , and we formulate the cell tracking problem as a PDE constrained optimisation problem .
Augmenting: 235 In Section 3 we propose an algorithm for the resolution of the PDE constrained optimisation problem and we discuss some practical aspec

Augmenting: 272 However , when the collisions can no longer be assumed as binary and instantaneous , the soft sphere model is the only realistic option .
Augmenting: 273 It is interesting to know whether the choice of the collision model affects the statistics .
Augmenting: 274 Fig .
Augmenting: 275 14 compares the mean velocity obtained from both models with the experimental data .
Augmenting: 276 The same comparison is performed for the smooth walls .
Augmenting: 277 The differences between the hard and soft sphere models for the smooth walls are almost negligible .
Augmenting: 278 However , the differences between the hard and soft sphere models for the rough walls are minor .
Augmenting: 279 This is because the rough wall treatment in the soft sphere implementation adds extra virtual walls during the collision of a particle with a wall , which is a more realistic representation of a rough wall compared to the hard sphere rough wall treatment where one random wall is considered .
Au

Augmenting: 316 The Mie function , as written above , deceivingly suggests that four parameters are needed to characterize the behaviour of an isotropic molecule , however the exponents λa
Augmenting: 317 and λr are intimately related , and for fluid phase equilibria
Augmenting: 318 , one needs not consider them as independent parameters [ 57].
Augmenting: 319 Accordingly , we choose herein to fix the attractive exponent to λa=6 which would be expected to be representative of the dispersion scaling of most simple fluids and refer from here on to the repulsive parameter as λ =
Augmenting: 320 λr .
Augmenting: 321 The potential simplifies
Augmenting: 322 to(2)ϕ(r)=λλ−6λ66/(λ−6)εσrλ−σr6
skipped to(2)ϕ(r)=λλ−6λ66/(λ−6)εσrλ−σr6
Augmenting: 323 The formulation in Table 1 was derived by an empirical approach and led to a non - classical glass matrix .
Augmenting: 324 Carter et al .
Augmenting: 325 [ 3 ] and Zhang et al .
Augmenting: 326 [ 4 ] took a more systematic approach to such glass - ce

Augmenting: 372 It has been known [ 9,14,18,22 ] that the fragmentation processes in polyatomic molecules induced by an intense ultrafast laser field can sometimes exhibit sensitive dependence on the instantaneous phase characteristics of the laser field .
Augmenting: 373 Depending on the change in sign the chirped laser pulses , fragmentation could be either enhanced or suppressed [ 14,18,22]. Controlling the outcome of such laser induced molecular fragmentation with chirped femtosecond laser pulses has brought forth a number of experimental and theoretical effects in the recent years .
Augmenting: 374 However , efforts are continuing for a specific fragment channel enhancement , which is difficult since it also is a function of the molecular system under study [ 20,22–24].
Augmenting: 375 Here we report the observation of a coherently enhanced fragmentation pathway of n - propyl benzene , which seems to have such specific fragmentation channel available .
Augmenting: 376 We found tha

In [3]:
stopwords = ['et', 'al', 'i.' , 'e.', 'can']
# Raw string: '\.' is not a valid escape sequence in a normal string literal.
# The pattern value itself is unchanged.
stopwords_regex = r'.\.'
# Load the SciBERT contextual word-embedding augmenter (heavy setting, aug_p=0.5).
aug = naw.ContextualWordEmbsAug(
    model_path='allenai/scibert_scivocab_cased', action="substitute", device="cuda:1", aug_p=0.5, stopwords=stopwords, stopwords_regex=stopwords_regex )

texts = list(map(lambda x: " ".join(x), dev_text_list))
print("Number of sentences:", len(texts))

# Maximum number of re-augmentation attempts per sentence when the output
# contains the '[UNK]' placeholder.
max_unk_retries = 20

augmented = []
for text in texts:
    augmented_text2 = aug.augment(text)
    # The retry counter is reset for every sentence so that retries spent on
    # earlier sentences do not shrink the budget of later ones.
    unk_count = 0
    while '[UNK]' in augmented_text2 and unk_count < max_unk_retries:
        augmented_text2 = aug.augment(text)
        unk_count += 1
    # Skip only when the final attempt still contains '[UNK]'; a sentence whose
    # last permitted retry succeeded is kept.
    if '[UNK]' in augmented_text2:
        print("skipped", text)
        continue
    augmented.append(augmented_text2)

Number of sentences: 414
skipped The structure of the failed surface can be represented with a mathematical graph , where graph nodes represent failed faces and graph edges exist between failed faces with common triple line in the cellular structure , i.e. where two micro - cracks formed a continuous larger crack .
skipped Methods that predict the cell temperature at maximum power point ( MPP ) operation offer a more realistic approach since they include the electrical energy generation of the solar cells ( i.e. real operating conditions ) ; Yandt et al .
skipped In the context of the chemical compass ( i.e. when the task is determining the magnetic field direction through anisotropic hyperfine interactions ) , an analogous configuration ( with only one spin-1/2 nucleus ) has been proposed [ 3 ] , and numerically characterized [ 8 ] , as being optimal : Additional nuclear spins would perturb the intuitive ‘ reference and probe ’ picture .
skipped At high burn - ups , i.e. extended serv

In [6]:
import spacy
# Shared spaCy pipeline used by spacy_tokenize below.
nlp = spacy.load("en_core_web_sm")

def spacy_tokenize(sent):
    """Run `sent` through the spaCy pipeline and return its tokens as strings.

    Args:
        sent: The sentence text to tokenize.

    Returns:
        A list containing ``str(token)`` for every token spaCy produced.
    """
    return [str(token) for token in nlp(sent)]

# Tokenize both versions of every pair and keep only pairs whose token counts
# agree; positions of disagreeing pairs are collected in error_list.
# NOTE(review): zip() pairs `unlabelled` and `augmented` purely by position.
# Both lists are built by loops that can skip sentences independently, so the
# pairs may be misaligned if the two runs skipped different sentences — verify.
error_list = []
new_unlabelled = []
new_augmented = []
for index, pair in enumerate(zip(unlabelled, augmented)):
    unlabel_tokens = spacy_tokenize(pair[0])
    augment_tokens = spacy_tokenize(pair[1])
    if len(unlabel_tokens) == len(augment_tokens):
        new_unlabelled.append(unlabel_tokens)
        new_augmented.append(augment_tokens)
    else:
        # these need manual intervention or can be ignored
        error_list.append(index)
        print(len(error_list), "errors so far...")

1 errors so far...
2 errors so far...
3 errors so far...
4 errors so far...
5 errors so far...
6 errors so far...
7 errors so far...
8 errors so far...
9 errors so far...
10 errors so far...
11 errors so far...
12 errors so far...
13 errors so far...
14 errors so far...
15 errors so far...
16 errors so far...


In [7]:
# Persist the length-matched token lists under data_root, one space-joined
# sentence per line; the original-side file is written first.
for file_name, sentences in (
    ("unlabelled_original_scibert2.txt", new_unlabelled),
    ("unlabelled_augmented_scibert2.txt", new_augmented),
):
    with open(os.path.join(data_root, file_name), 'w') as f:
        f.writelines(" ".join(text) + '\n' for text in sentences)

In [5]:
# Show every pair whose token counts disagreed so it can be fixed by hand.
for index in error_list:
    # error_list stores positions in `unlabelled` / `augmented`. The filtered
    # new_* lists contain only equal-length pairs and are shifted by every
    # earlier rejection, so indexing them here would show the wrong sentences
    # (and always a difference of 0). Re-tokenize the rejected pair instead.
    unlabelled_tokens = spacy_tokenize(unlabelled[index])
    augmented_tokens = spacy_tokenize(augmented[index])
    print(index + 1)
    print(len(augmented_tokens) - len(unlabelled_tokens))
    # zip stops at the shorter token list, so surplus tokens are not printed.
    for token_pair in zip(unlabelled_tokens, augmented_tokens):
        print(token_pair)
    print()
    print("########################")
    print()

5
-1
('with', 'First')
('reference', 'reference')
('to', 'in')
('ref', 'a.')

########################

11
1
('high', '40')
('load', '3')
('pv', 'M')
('(', '(')
('hcpv', '0')
(')', ')')
('s.', '0')

########################

30
1
('the', 'Modern')
('pipes', 'pipes')
('may', 'themselves')
('already', 'been')
('exposed', 'exposed')
('to', 'to')
('various', 'various')
('degradation', 'process')
('hazards', 'factors')
('(', '(')
('diverse', 'physical')
('hazards', 'mechanisms')
(',', ',')
('mechanical', 'like')
('fatigue', 'fatigue')
(',', ',')
('thermal', 'thermal')
('fatigue', 'defects')
(',', ',')
('stress', 'organic')
('corrosion', 'corrosion')
(',', ',')
('Ž.', 'etc')
(')', '.')
('.', ')')

########################

56
-1
('that', "What'm")
("'s", 'how')
('why', 'the')
('these', 'research')
('research', 'mostly')
('areas', 'goes')
('shifted', 'to')
('towards', 'large')
('large', 'band')
('band', 'gap')
('gap', 'systems')
('materials', '.')

########################

108
-1
('the', 'Si

In [8]:
# check with manually labelled data
# Re-read both saved files and verify that they still line up: same number of
# lines, and the same number of whitespace-separated tokens on every line.
with open(os.path.join(data_root, "unlabelled_original_scibert2.txt"), 'r') as f:
    original_lines = f.read().split('\n')
with open(os.path.join(data_root, "unlabelled_augmented_scibert2.txt"), 'r') as f:
    augmented_lines = f.read().split('\n')

# zip() stops at the shorter input, so a differing line count would otherwise
# go unnoticed; report it explicitly.
if len(original_lines) != len(augmented_lines):
    print("ERRORRR!!!", "line counts differ:", len(original_lines), "vs", len(augmented_lines))

for line_no, (original_line, augmented_line) in enumerate(zip(original_lines, augmented_lines), start=1):
    original_len = len(original_line.split())
    augmented_len = len(augmented_line.split())
    if original_len != augmented_len:
        # Include the 1-based line number so the offending pair can be located.
        print("ERRORRR!!!", "line", line_no, "token counts differ:", original_len, "vs", augmented_len)