In [1]:
import os
import re
import sys

# Make the parent of the current working directory importable, so modules that
# live one level above this notebook can be imported without installation.
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
# Re-running this cell in the same kernel must not pile up duplicate entries.
if parentdir not in sys.path:
    sys.path.append(parentdir)

#  from itables import init_notebook_mode
#  init_notebook_mode(all_interactive=True)
# ---
# from techminer2.ingest import ingest_raw_data

# ingest_raw_data(
#    #
#    # DATABASE PARAMS:
#    root_dir="regtech/",
#    disable_progress_bar=False,
# )

import pandas as pd  # type: ignore
import numpy as np

In [2]:
def create_data_frame(root_dir, column):
    """Build a term-frequency table from one column of the main database.

    Reads ``databases/_main.csv.zip`` under ``root_dir``, splits the
    ``;``-separated values of ``column`` into individual terms, normalizes
    them and counts how many times each distinct term occurs.

    Args:
        root_dir: Directory that contains ``databases/_main.csv.zip``.
        column: Name of the column holding the ``;``-separated terms.

    Returns:
        A data frame with one row per distinct term, sorted by term, with the
        columns ``raw_term`` (normalized, upper-case term) and ``OCC`` (number
        of occurrences). Empty terms are not reported.

    :meta private:
    """

    #
    # The thesaurus is created only from the main database.
    file = os.path.join(root_dir, "databases/_main.csv.zip")
    data_frame = pd.read_csv(file, encoding="utf-8", compression="zip")
    # astype(str) keeps the .str accessor usable when the column is entirely
    # empty (pandas then infers a non-string dtype for it).
    terms = data_frame[column].dropna().astype(str)

    #
    # Basic preprocessing
    terms = terms.str.replace("_", " ", regex=False).str.strip().str.upper()

    #
    # Only non-empty descriptors
    terms = terms[terms.str.len() > 0]

    #
    # Explodes the terms list. The index is reset because explode() repeats
    # the index label of the source row for every term it produces.
    terms = terms.str.split(";").explode().str.strip().reset_index(drop=True)

    #
    # Replace strange characters: straight double quote, em dash and the
    # left/right typographic double quotes.
    for char in ('"', chr(8212), chr(8220), chr(8221)):
        terms = terms.str.replace(char, "", regex=False)

    #
    # Drop a leading hyphen, except when the hyphen is the whole term. The
    # length comparison needs its own parentheses: `&` binds tighter than `>`,
    # so without them the condition is never true.
    has_leading_hyphen = terms.str.startswith("-") & (terms.str.len() > 1)
    terms = terms.mask(has_leading_hyphen, terms.str[1:])

    #
    # The cleanup above can expose new surrounding blanks or leave nothing at
    # all (e.g. a trailing ";" in the source field), so normalize once more.
    terms = terms.str.strip()
    terms = terms[terms.str.len() > 0]

    #
    # Counts term frequency
    data_frame = terms.to_frame(name="raw_term").assign(OCC=1)
    data_frame = data_frame.groupby("raw_term", as_index=False).agg({"OCC": "sum"})

    return data_frame

In [3]:
# Term-frequency tables built from the `raw_keywords` and `raw_nlp_phrases`
# columns of the main database under "tm2/".
keywords_data_frame = create_data_frame(root_dir="tm2/", column="raw_keywords")
nlp_data_frame = create_data_frame(root_dir="tm2/", column="raw_nlp_phrases")

#
# NLP phrases may contain keywords, so we need to remove them.
# The explicit copy makes the filtered frame independent of the original, so
# columns can be added to it later without a SettingWithCopyWarning.
nlp_data_frame = nlp_data_frame.loc[
    ~nlp_data_frame["raw_term"].isin(keywords_data_frame["raw_term"]), :
].copy()

In [4]:
# Load the stopword list, one entry per line of the file.
file = "../techminer2/word_lists/stopwords.txt"

with open(file) as in_file:
    stopwords = in_file.read().split("\n")

In [17]:
# Load the second stopword list, one entry per line of the file.
file = "../techminer2/word_lists/r_stopwords.txt"

with open(file) as in_file:
    r_stopwords = in_file.read().split("\n")

# Keep only raw entries longer than two characters, then normalize them:
# apostrophes removed, upper case, surrounding blanks stripped.
r_stopwords = {w.replace("'", "").upper().strip() for w in r_stopwords if len(w) > 2}

# Merge with the `stopwords` list already in the kernel; the result is a
# sorted list without duplicates.
r_stopwords = sorted(r_stopwords | set(stopwords))

# Overwrite the stopword file with the merged list.
file = "../techminer2/word_lists/stopwords.txt"

with open(file, mode="w") as out_file:
    out_file.write("\n".join(r_stopwords))

In [14]:
# Individual space-separated words that occur in the NLP phrases.
word = nlp_data_frame["raw_term"].str.split(" ").explode().str.strip()

# Stopwords that actually occur as words in the phrases. The empty string is
# excluded: it would produce the pattern r"\b\b", which matches every phrase.
selected_stopwords = (set(r_stopwords) & set(word)) - {""}

if selected_stopwords:
    # One alternation instead of one pass per stopword. re.escape keeps
    # entries such as "E.G" or "AND/OR" from being read as regex syntax.
    pattern = r"\b(?:" + "|".join(re.escape(sword) for sword in sorted(selected_stopwords)) + r")\b"
    is_selected = nlp_data_frame["raw_term"].str.contains(pattern, regex=True)
else:
    # No stopword occurs in any phrase, so nothing is flagged.
    is_selected = pd.Series(False, index=nlp_data_frame.index, dtype=bool)

# assign() returns a new frame, so this works even when `nlp_data_frame` is a
# filtered slice of another frame.
nlp_data_frame = nlp_data_frame.assign(selected=is_selected)

# Most frequent phrases that contain at least one stopword.
nlp_data_frame.loc[nlp_data_frame["selected"], :].sort_values("OCC", ascending=False).head(40)

Unnamed: 0,raw_term,OCC,selected
45892,ELSEVIER LTD,2147,True
137264,SIMULATION RESULTS,1842,True
52762,EXPERIMENTAL RESULTS,1211,True
126076,RECENT YEARS,514,True
72092,IMPORTANT ROLE,308,True
58020,FRANCIS GROUP,257,True
36983,DIFFERENT TYPES,245,True
141361,SONS LTD,233,True
117933,PREDICTION RESULTS,226,True
79914,LARGE NUMBER,225,True


In [None]:
# Display the term-frequency table built from the `raw_keywords` column.
keywords_data_frame

In [None]:
# Normalize the stopword file in place: read it, clean every line, write it back.
file = "../techminer2/word_lists/stopwords.txt"

with open(file) as in_file:
    stopwords = in_file.read().split("\n")

# Upper case, surrounding blanks stripped, apostrophes removed (in that order).
stopwords = [entry.upper().strip().replace("'", "") for entry in stopwords]

with open(file, mode="w") as out_file:
    out_file.write("\n".join(stopwords))

In [None]:
text = """
a	needn	a	not	able	never
about	needn’t	accordance	now	above-	often
above	no	according	of	mentioned	others
after	nor	all	on	accordingly	otherwise
again	not	also	onto	across	overall
against	now	an	or	along	rather
ain	o	and	other	already	remarkably
all	of	another	particularly	alternatively	significantly
am	off	are	preferably	always	simply
an	on	as	preferred	among	sometimes
and	once	at	present	and/or	specifically
any	only	be	provide	anything	straight
are	or	because	provided	anywhere	forward
aren	other	been	provides	better	substantially
aren’t	our	being	relatively	disclosure	thereafter
as	ours	by	respectively	due	therebetween
at	ourselves	claim	said	easily	therefor
be	out	comprises	should	easy	therefrom
because	over	corresponding	since	e.g	therein
been	own	could	some	either	thereinto
before	re	described	such	elsewhere	thereon
being	s	desired	suitable	enough	therethrough
below	same	do	than	especially	therewith
between	shan	does	that	essentially	together
both	shan’t	each	the	et al	toward
but	she	embodiment	their	etc	towards
by	she’s	fig	then	eventually	typical
can	should	figs	there	excellent	upon
couldn	should’ve	for	thereby	finally	via
couldn’t	shouldn	from	therefore	furthermore	vice versa
d	shouldn’t	further	thereof	good	whatever
did	so	generally	thereto	hence	whereas
didn	some	had	these	he/she	whereat
didn’t	such	has	they	him/her	wherever
do	t	have	this	his/her	whether
does	than	having	those	ie	whose
doesn	that	herein	thus	ii	within
doesn’t	that’ll	however	to	iii	without
doing	the	if	use	instead	yet
don	their	in	various	later	
don’t	theirs	into	was	like	
down	them	invention	were	little	
during	themselves	is	what	many	
each	there	it	when	may	
few	these	its	where	meanwhile	
for	they	means	whereby	might	
from	this	wherein	moreover
further	those	which	much
had	through		while	must
hadn	to	who	
hadn’t	too	will	
has	under		with		
hasn	until	Would
hasn’t	up				
have	ve
haven	very
haven’t	was
having	wasn
he	wasn’t
her	we
here	were
hers	weren
herself	weren’t
him	what
himself	when
his	where
how	which
i	while
if	who
in	whom
into	why
is	will
isn	with
isn’t	won
it	won’t
it’s	wouldn
its	wouldn’t
itself	y
just	you
ll	you’d
m	you’ll
ma	you’re
me	you’ve
mightn	your
mightn’t	yours
more	yourself
most	yourselves
mustn	
mustn’t	
my	
myself
"""

# Turn the pasted, tab-separated table into one entry per cell. Apostrophes
# are removed in both forms: the ASCII one and the typographic one (U+2019),
# which is the form the pasted table actually contains.
text = text.replace("\t", "\n").replace("'", "").replace("\u2019", "").split("\n")

# Normalize, de-duplicate and sort. Empty cells of the table (blank lines,
# trailing tabs) are dropped so no empty stopword is written.
text = sorted({word.upper().strip() for word in text} - {""})

with open("stopwords.txt", "w") as out_file:
    out_file.write("\n".join(text))

In [None]:
# Reload the stopword file. The directory is spelled "word_lists", as in every
# other cell; the previous "word_Lists" spelling fails on case-sensitive
# filesystems.
with open("../techminer2/word_lists/stopwords.txt", "r") as in_file:
    stopwords = in_file.read().split("\n")

# Upper case without apostrophes, de-duplicated and sorted.
stopwords = [t.upper().replace("'", "") for t in stopwords]
stopwords = sorted(set(stopwords))

# Number of distinct stopwords.
len(stopwords)

In [None]:
# Display the current state of the NLP-phrase table.
nlp_data_frame