# Installing and importing libraries

In [None]:
# pip install bigartm

In [None]:
import csv
import json
import os
import re
from pathlib import *

# to use the BigARTM, you need to have a python version 3.8 or lower
import artm
import nltk
from gensim.parsing.preprocessing import STOPWORDS
from gensim.parsing.preprocessing import remove_stopwords
from nltk.stem.snowball import SnowballStemmer
from textblob import TextBlob, Word

nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

In [None]:
# Checking library: build a throwaway 1-topic model and print the BigARTM version.
# The model object is intentionally discarded; only a successful call matters here.
artm.ARTM(num_topics=1)
print(artm.version())

# Preparation

Remove stop words from the text and drop documents that contain fewer than 5 single-word terms. For a basic experiment you can skip this step and load the ready-made file Train_1.vw.txt directly; a link to it is given in the file "ACL_RD_TEC.txt" in the "code" folder.

In [None]:
def preparation(input_path='Dataset/Train_Test/Train/Train_1_stem.vw.txt',
                output_path='Dataset/Train_Test/Train/Train_1.vw.txt',
                stopwords=None):
    """
    Removing stop words from the text and deleting documents in which there are less than 5 terms
            Parameters:
                    input_path (str): text file to load, one document per line; the first
                                      whitespace-separated token of a line is its label
                    output_path (str): text file to save the cleaned documents to
                    stopwords (collection of str): words to drop; defaults to the gensim
                                      STOPWORDS set when omitted
            Return:
                    None. The cleaned documents are written to output_path, relabelled
                    consecutively as doc_0, doc_1, ...
    """
    if stopwords is None:
        # resolved lazily so that callers supplying their own list never touch gensim
        stopwords = STOPWORDS

    doc_index = 0
    with open(input_path, 'r', errors="ignore") as fin, open(output_path, 'w') as fout:
        # stream the file line by line instead of loading it whole with readlines()
        for line in fin:
            words = line.split()
            # the length check counts the label token and runs before stop-word removal
            if len(words) < 5:
                continue
            # only surviving documents get a number, so the labels stay gap-free
            words[0] = 'doc_' + str(doc_index)
            doc_index += 1
            kept = [word for word in words if len(word) > 2 and word not in stopwords]
            fout.write(' '.join(kept) + '\n')

It is necessary to separate subject topics from background topics. Here the first 2 topics are background, and the remaining 148 of the 150 topics are subject topics. The dictionary is created by a function from the BigARTM library, which also splits the dataset into batches. A single file containing the dataset in the Vowpal Wabbit format is passed as input.

In [None]:
# build the dictionary and split the dataset into batches
# (the batches are written to the 'batches' folder)
vw_filaname = 'terminology-extraction-master/ACTER/en/texts.vw.txt'
bv = artm.BatchVectorizer(
    data_path=vw_filaname,
    data_format='vowpal_wabbit',
    batch_size=900,
    target_folder='batches',
)

# Model and Train

In [None]:
def set_model(bv, num_topics=150, tau_dec=0.1, tau_dec_add=0.025, tau_phi=-0.1, tau_phi_add=0.2, \
              tau_theta=-0.1, tau_theta_add=0.2):
    """
    Creating a model and adding scores and regularizers to it
            Parameters:
                    bv (artm.BatchVectorizer): needed to set dictionary
                    num_topics (int): number of hidden topics -- number of columns of the phi matrix;
                                      the first two (topic_0, topic_1) are treated as background topics
                    tau_dec (float): the value of the decorrelation coefficient for subject topics
                    tau_dec_add (float): the value of the decorrelation coefficient for background topics
                    tau_phi (float): the value of the smooth-sparse coefficient of subject topics of phi-matrix
                    tau_phi_add (float): the value of the smooth-sparse coefficient of background topics of phi-matrix
                    tau_theta (float): the value of the smooth-sparse coefficient of subject topics of theta-matrix
                    tau_theta_add (float): the value of the smooth-sparse coefficient of background topics of theta-matrix
            Return:
                    model (artm.ARTM): the ARTM model with the scores 'perplexity', 'top-tokens'
                                       and 'sparse' and six regularizers attached
    """
    # BigARTM names the topics topic_0 .. topic_{num_topics - 1} when only num_topics is given,
    # so the subject topics stop at num_topics - 1 (there is no topic_{num_topics}).
    background_topics = ['topic_0', 'topic_1']
    subject_topics = ['topic_' + str(i) for i in range(2, num_topics)]

    # model
    model = artm.ARTM(num_topics=num_topics, dictionary=bv.dictionary)

    # scores
    model.scores.add(artm.PerplexityScore(name='perplexity', dictionary=bv.dictionary))
    model.scores.add(artm.TopTokensScore(name='top-tokens', num_tokens=10))
    # the training cell reports a score called 'sparse', so it has to be registered here
    model.scores.add(artm.SparsityPhiScore(name='sparse'))

    # regularizers: each kind is added twice, once for the subject and once for the background topics
    model.regularizers.add(
        artm.DecorrelatorPhiRegularizer(name='decor', tau=tau_dec, topic_names=subject_topics))
    model.regularizers.add(
        artm.DecorrelatorPhiRegularizer(name='decor_add', tau=tau_dec_add, topic_names=background_topics))
    model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(name='ssphi', tau=tau_phi, topic_names=subject_topics))
    model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(name='ssphi_add', tau=tau_phi_add, topic_names=background_topics))
    model.regularizers.add(
        artm.SmoothSparseThetaRegularizer(name='sstheta', tau=tau_theta, topic_names=subject_topics))
    model.regularizers.add(
        artm.SmoothSparseThetaRegularizer(name='sstheta_add', tau=tau_theta_add, topic_names=background_topics))

    return model

In [None]:
# build the model on the vectorized dataset with the default hyper-parameters of set_model
model = set_model(bv)

Perplexity is one of the quality metrics: the lower it is, the better.

In [None]:
# train: one collection pass per iteration so that the scores can be printed after every pass
for i in range(15):
    model.fit_offline(bv, num_collection_passes=1)
    print(f'Iter #{i}, perplexity: {model.score_tracker["perplexity"].last_value}')
    # a 'sparse' score is only tracked when one with that name was added to the model;
    # skip the line instead of failing with KeyError when it is missing
    if "sparse" in model.score_tracker:
        print(f'sparse: {model.score_tracker["sparse"].last_value}')

In [None]:
# 10 terms with the highest probability in each topic
# (read from the 'top-tokens' score, which set_model registers with num_tokens=10)
top_tokens = model.score_tracker['top-tokens'].last_tokens

# one printed list of tokens per topic, in the order of model.topic_names
for topic_name in model.topic_names:
    print(top_tokens[topic_name])

In [None]:
# saving the phi matrix in 'csv' format
# (all topics, tokens of the default class only; find_terms reads this file back)
name = 'Phi.csv'
model.get_phi(
    topic_names=model.topic_names,
    class_ids=['@default_class'],
    model_name=model.model_pwt,
).to_csv(name)

# Results

In [None]:
def find_terms(phi_name='Phi.csv', num_topics=150, min_prob=0.03, max_topics=3):
    """
    Search for words that have a high probability in a small number of topics
            Parameters:
                    phi_name (str): name of the file in the 'csv' format with the phi-matrix;
                                    the first row is a header, the first column holds the row
                                    label in the form "(<class>, '<token>')"
                    num_topics (int): number of hidden topics -- number of columns of the phi-matrix;
                                    at most this many probability columns are inspected per row
                    min_prob (float): minimum probability for the term
                    max_topics (int): maximum number of high probability topics for a term

            Return:
                    terms (list): the list of all possible terms in the dataset
    """
    terms = []

    # newline='' is the mode the csv module asks for when a file is handed to csv.reader
    with open(phi_name, newline='') as phi_file:
        reader = csv.reader(phi_file)
        next(reader, None)  # skip the header row with the topic names
        for row in reader:
            if not row:
                # csv.reader yields an empty list for a blank line
                continue
            # slicing (rather than indexing 1..num_topics) tolerates a file that
            # has fewer topic columns than num_topics
            hits = sum(float(prob) > min_prob for prob in row[1:num_topics + 1])
            if 0 < hits <= max_topics:
                # the label looks like "('@default_class', 'token')": take the second
                # component and strip the quote and the closing bracket
                terms.append(re.sub(r'\'|\)', '', row[0].split(', ')[1]))
    return terms

In [None]:
# candidate terms selected from the saved phi matrix ('Phi.csv') with the default thresholds
terms = find_terms()

In [None]:
def results(terms, truth_json='Dataset/Train_test/Train_1.json', input_dir = 'Dataset/TopMine_texts/'):
    """
    Compare the selected terms found in every document with its markup
            Parameters:
                    terms (list): the list of all possible terms in the dataset
                    truth_json (str): the name of the file with markup in 'json' format;
                                      it maps a document name to a list of marked-up strings
                    input_dir (str): directory with texts after TopMine; the text of a
                                      document is read from '<input_dir>/<document name>.txt'
            Return:
                    precision (list): precision for each document
                    recall (list): recall for each document
    """
    precision = []
    recall = []
    terms = set(terms)
    # the stemmer does not depend on the document, so it is built once for the whole run
    stemmer = SnowballStemmer(language='english')

    with open(truth_json, 'r') as truth_file:
        truth = json.load(truth_file)

    for iteration, (doc_name, markup) in enumerate(truth.items(), start=1):
        # markup stemming and deleting numbers; words of 1-2 characters are ignored
        truth_str = re.sub(r"\d+", "", ' '.join(markup), flags=re.UNICODE)
        gr_truth = {stemmer.stem(w) for w in TextBlob(truth_str).words if len(w) > 2}

        # os.path.join works whether or not input_dir ends with a separator
        with open(os.path.join(input_dir, doc_name + '.txt'), 'r') as doc_file:
            # terms of the model that actually occur in this document
            result = set(doc_file.read().split()) & terms

        # progress
        if iteration % 1000 == 0:
            print('Progress: ', iteration)

        # counting results
        # tp -- true-positive, fp -- false-positive, fn -- false-negative
        tp = len(result & gr_truth)
        fp = len(result) - tp
        fn = len(gr_truth) - tp
        # a document without predictions (or without markup) scores 0 instead of dividing by zero
        precision.append(tp / (tp + fp) if (tp + fp) >= 1 else 0)
        recall.append(tp / (tp + fn) if (tp + fn) >= 1 else 0)

    return precision, recall

In [None]:
# per-document precision and recall of the selected terms against the markup (default paths)
precision, recall = results(terms)

In [None]:
# Average the per-document scores; an empty list (no documents) yields 0 instead of ZeroDivisionError
prec = sum(precision) / len(precision) if precision else 0.0
print('Precision:', prec)

rec = sum(recall) / len(recall) if recall else 0.0
print('Recall:', rec)

# F1 of the two averages; it is undefined when both are zero, so report 0 in that case
f1 = 2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0.0
print('F1:', f1)