In [1]:
import pandas as pd
import numpy as np
from cytoolz import *
from tqdm.auto import tqdm
tqdm.pandas()

In [2]:
# Read the term list (one term per line) and split each term into its tokens.
# The context manager closes the file as soon as it has been read.
with open('terms.txt') as f:
    terms = [t.split() for t in f]

In [3]:
terms[:10]

[['part', '-', 'of', '-', 'speech', 'tagging'],
 ['word', '-', 'to', '-', 'word'],
 ['part', '-', 'of', '-', 'speech'],
 ['state', '-', 'ofthe', '-', 'art'],
 ['tree', '-', 'to', '-', 'string'],
 ['-', 'fold', 'cross', '-', 'validation'],
 ['end', '-', 'to', '-', 'end'],
 ['state', '-', 'of', '-', 'theart'],
 ['sequence', '-', 'to', '-', 'sequence'],
 ['context', '-', 'free', 'grammar']]

In [4]:
df = pd.read_parquet('s3://ling583/micusp.parquet', storage_options={'anon':True})

In [5]:
df.head()

Unnamed: 0,filename,text
0,micusp/BIO.G0.15.1.html,"New York City, 1908: different colors of skin..."
1,micusp/BIO.G1.04.1.html,\tThe fish-tetrapod transition has been calle...
2,micusp/BIO.G3.03.1.html,\tIntracellular electric fields are of great ...
3,micusp/BIO.G0.11.1.html,Environmental stresses to plants have been st...
4,micusp/BIO.G1.01.1.html,\tThe recurrent cholera pandemics have been re...


---

**Remove non-specific terms**

---

### spaCy Setup

In [6]:
import spacy

Load a processing pipeline. This is a small English model. We only need the part-of-speech tags, so the components we do not use (parser, NER, lemmatizer and attribute ruler) are excluded.

In [7]:
nlp = spacy.load('en_core_web_sm', exclude=['parser', 'ner', 'lemmatizer', 'attribute_ruler'])

In [8]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7efbacab0ae0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7efbaca54220>)]

In [9]:
doc = nlp(df['text'].iloc[0])

In [10]:
doc

 New York City, 1908: different colors of skin swirl in the great melting pot to produce a cultural medley.  Now imagine such a metropolis spreading to cover every last crevice on Earth.  Over time, people will weave to produce an unprecedented uniformity; once discrete identities would be lost.  Our heritages will be remembered only by the history texts in the hands of our progeny.  A similar effect can be observed in environmental systems: we are in danger of losing our global biodiversity to a monotonous fate.   The threat of invasive species is now greater than the world has ever witnessed.  The number of introductions caused by international commerce is enormous (Mooney and Cleland, 2001).  Although only a small portion of emigrants survive, those survivors have aggregated to form a giant global problem of bioinvasions (Mack et al., 2000).  Extensive studies have been done on many exotics, such as the zebra mussel Dreissena polymorpha, and their biological and ecological threats p

In [11]:
doc[0:10]

 New York City, 1908: different colors of

In [12]:
doc[200]

.

In [13]:
doc[200].tag_, doc[200].norm_

('NNP', '.')

We will import the rule-based matcher from spacy

In [14]:
from spacy.matcher import Matcher

In [15]:
# Candidate-term pattern over part-of-speech tags: the match must start with
# an adjective or noun and end with a noun; anything in between is optional.
matcher = Matcher(nlp.vocab)
matcher.add('Term', [[{'TAG': {'IN': ['JJ', 'NN']}},  # first token: adjective (JJ) or noun (NN)
                      {'TAG': {'IN': ['JJ', 'NN', 'IN', 'HYPH']}, 'OP': '*'},  # zero or more of: adjective, noun, preposition (IN), hyphen (HYPH)
                      {'TAG': 'NN'}]])  # last token: noun (NN)

In [16]:
spans = matcher(doc, as_spans=True)

This is the first candidate in the first document:

In [17]:
tuple(tok.norm_ for tok in spans[0])

('skin', 'swirl')

### Extract candidate terms

In [18]:
def get_candidates(text):
    """Return every candidate term found in ``text``.

    The text is run through the ``nlp`` pipeline, the ``matcher`` pattern is
    applied to the tagged document, and each matching span is converted to a
    tuple of the normalized forms (``norm_``) of its tokens.
    """
    tagged = nlp(text)
    found = []
    for span in matcher(tagged, as_spans=True):
        found.append(tuple(tok.norm_ for tok in span))
    return found

In [19]:
get_candidates(df['text'].iloc[0])

[('skin', 'swirl'),
 ('great', 'melting'),
 ('melting', 'pot'),
 ('great', 'melting', 'pot'),
 ('cultural', 'medley'),
 ('last', 'crevice'),
 ('unprecedented', 'uniformity'),
 ('similar', 'effect'),
 ('global', 'biodiversity'),
 ('monotonous', 'fate'),
 ('invasive', 'species'),
 ('threat', 'of', 'invasive', 'species'),
 ('international', 'commerce'),
 ('small', 'portion'),
 ('global', 'problem'),
 ('giant', 'global', 'problem'),
 ('zebra', 'mussel'),
 ('human', 'interest'),
 ('ecosystem', 'disturbance'),
 ('hasty', 'action'),
 ('irreparable', 'damage'),
 ('global', 'asset'),
 ('asset', 'of', 'biodiversity'),
 ('global', 'asset', 'of', 'biodiversity'),
 ('invasive', 'predator'),
 ('minimal', 'disturbance'),
 ('human', 'contact'),
 ('insufficient', 'understanding'),
 ('predatory', 'land'),
 ('land', 'snail'),
 ('predatory', 'land', 'snail'),
 ('african', 'land'),
 ('giant', 'african', 'land'),
 ('land', 'snail'),
 ('african', 'land', 'snail'),
 ('giant', 'african', 'land', 'snail'),
 ('a

Now, we have to get all of the candidates for the entire dataset

In [20]:
candidates = list(concat(df['text'].progress_apply(get_candidates)))

  0%|          | 0/788 [00:00<?, ?it/s]

### Compute c-values

$$\mbox{C-value}(a)=\begin{cases}\log_2|a|\cdot f(a) & \mbox{if } a \mbox{ is not nested}\\\log_2|a|\left(f(a)-\frac{1}{P(T_a)}\sum_{b\in T_a}f(b)\right) & \mbox{otherwise}\\\end{cases}$$


Next, we will count the frequencies of all the candidates and organize them by length.

In [21]:
from collections import defaultdict, Counter

In [22]:
# Tally the candidates grouped by length: freqs[n][term] is the number of
# times the n-token candidate `term` occurs in `candidates`.
freqs = defaultdict(Counter)
for c in candidates:
    freqs[len(c)].update((c,))

In [23]:
freqs.keys()

dict_keys([2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])

In [24]:
freqs[5].most_common(5)

[(('trend', 'of', 'part', '-', 'time'), 15),
 (('asymmetry', 'in', 'stock', 'price', 'response'), 13),
 (('cycle', '-', 'to', '-', 'cycle'), 13),
 (('interaction', 'term', 'on', 'stock', 'price'), 10),
 (('basal', 'area', 'per', 'sample', 'area'), 9)]

In [25]:
from nltk import ngrams

In [26]:
# Sub-term lengths for a 5-token term: start at 5 - 1 = 4 and count down
# (step -1) to 2; the stop value 1 is excluded.
list(range(4, 1, -1))

[4, 3, 2]

In [27]:
def get_subterms(term):
    """Yield every contiguous sub-sequence of ``term`` that is shorter than
    ``term`` but at least two tokens long, longest sub-sequences first."""
    size = len(term) - 1
    while size > 1:
        for gram in ngrams(term, size):
            yield gram
        size -= 1

In [28]:
list(get_subterms(('trend', 'of', 'part', '-', 'time')))

[('trend', 'of', 'part', '-'),
 ('of', 'part', '-', 'time'),
 ('trend', 'of', 'part'),
 ('of', 'part', '-'),
 ('part', '-', 'time'),
 ('trend', 'of'),
 ('of', 'part'),
 ('part', '-'),
 ('-', 'time')]

In [29]:
from math import log2

In [30]:
def c_value(F, theta):
    """Score candidate terms with the C-value termhood measure.

    Parameters
    ----------
    F : mapping of int -> Counter
        ``F[k][term]`` is the frequency of the ``k``-token candidate ``term``
        (a tuple of tokens).
    theta : number
        Threshold; a term is kept only if its C-value is strictly greater.

    Returns
    -------
    Counter
        Maps each accepted term to its C-value.

    ``F`` is only read: a missing length bucket is treated as empty rather
    than being created (defaultdict) or raising ``KeyError`` (plain dict).
    """
    termhood = Counter()
    # For each nested term, the frequencies of the accepted longer terms
    # that contain it.
    longer = defaultdict(list)

    # Longest terms first, so a term's discount is complete before it is scored.
    for k in sorted(F, reverse=True):
        weight = log2(k)  # the extra boost given to longer sequences
        for term, freq in F[k].items():
            containing = longer.get(term)
            # Nested terms are discounted by the mean frequency of the
            # accepted longer terms in which they appear.
            discount = sum(containing) / len(containing) if containing else 0
            c = weight * (freq - discount)
            if c > theta:
                termhood[term] = c
                for subterm in get_subterms(term):
                    # .get() keeps the lookup from inserting into F.
                    if subterm in F.get(len(subterm), ()):
                        longer[subterm].append(freq)
    return termhood

In [31]:
terms = c_value(freqs, theta=80)

In [32]:
for t, c in terms.most_common(20):
    print(f'{c:8.2f} {freqs[len(t)][t]:4d} {" ".join(t)}')

  282.00  282 other hand
  264.00  264 health care
  252.00  126 part - time faculty
  206.00  206 same time
  177.52  112 long - term
  169.00  169 high school
  167.00  167 body color
  155.33   98 self - esteem
  146.00  146 wing venation
  138.00  138 eye color
  137.00  137 domestic violence
  125.21   79 stock price response
  120.46   76 decision - making
  112.53   71 low - income
  111.00  111 renewable energy
  103.02   65 quality of life
  103.02   65 state of nature
  103.02   65 spell - caster
  103.02   65 community violence exposure
  101.00  101 wild type


In [33]:
#Looking at the bottom of the list
for t, c in tail(20, terms.most_common()):
    print(f'{c:8.2f} {freqs[len(t)][t]:4d} {" ".join(t)}')

  137.00  137 domestic violence
  125.21   79 stock price response
  120.46   76 decision - making
  112.53   71 low - income
  111.00  111 renewable energy
  103.02   65 quality of life
  103.02   65 state of nature
  103.02   65 spell - caster
  103.02   65 community violence exposure
  101.00  101 wild type
   97.00   97 civil society
   96.68   61 middle - class
   96.00   48 psychological well - being
   93.00   93 great deal
   86.00   86 first time
   84.00   84 social support
   83.00   83 future research
   83.00   83 sexual harassment
   82.42   52 full - time
   81.00   81 social movement


In [34]:
# Save the accepted terms, one space-joined term per line.
with open('terms-final.txt', 'w') as f:
    f.writelines(' '.join(term) + '\n' for term in terms)

Comparing both lists: keep only the terms from `terms.txt` that do not also appear in `terms-final.txt`

In [35]:
# NOTE(review): rebinds `terms` (the Counter returned by c_value above) to an
# open file handle; the handle is never closed.
terms = open("terms.txt")

In [36]:
print(terms.read())

part - of - speech tagging
word - to - word
part - of - speech
state - ofthe - art
tree - to - string
- fold cross - validation
end - to - end
state - of - theart
sequence - to - sequence
context - free grammar
right - hand side
log - linear model
- fold cross validation
multi - document summarization
inter - annotator agreement
semi - supervised learning
multi - task learning
pre - trained word
natural language processing
high - level
low - level
machine translation system
first - order
sentence - level
predicate - argument
natural language generation
point of view
part of speech
finite - state
long - distance
real - time
word sense disambiguation
co - occurrence
trade - off
large - scale
real - world
high - quality
statistical machine translation
n - gram
long - term
question - answer
set of candidate
number of training
amount of training
second - order
word - level
f - measure
log - likelihood
chinese word segmentation
t - test
character - level
feed - forward
document - level
gram 

In [37]:
# NOTE(review): this file handle is never closed.
terms_2 = open("terms-final.txt")

In [38]:
print(terms_2.read())

part - time faculty
psychological well - being
long - term
quality of life
decision - making
low - income
middle - class
stock price response
self - esteem
full - time
state of nature
spell - caster
community violence exposure
other hand
same time
first time
wild type
health care
great deal
high school
renewable energy
wing venation
body color
eye color
future research
domestic violence
sexual harassment
civil society
social support
social movement



In [39]:
# Keep the terms from terms.txt that are not listed in terms-final.txt.
# Both files are re-read here: `terms` and `terms_2` are file handles whose
# contents were already consumed by the `.read()` calls above, so iterating
# them again would yield nothing.
with open("terms.txt") as all_file, open("terms-final.txt") as general_file:
    general_terms = {line.strip() for line in general_file}
    new_terms = [term for term in map(str.strip, all_file)
                 if term and term not in general_terms]

In [40]:
# Write one term per line, the same layout as terms.txt and terms-final.txt,
# instead of the Python repr of the list.
with open("new_terms_final.txt", "w") as output:
    for term in new_terms:
        # Tolerate entries that still carry their trailing newline.
        output.write(term.rstrip("\n") + "\n")