## Get sentence embeddings ([CLS] token embedding of the last hidden state) from a RoBERTa model fine-tuned on arXiv data for classification. The embeddings are then compared with SentenceTransformer embeddings.

In [None]:
# !pip install sentence_transformers

In [2]:
import functools
import string

import numpy as np
from numpy.linalg import norm
import pandas as pd
import tensorflow as tf
import transformers
from transformers import RobertaTokenizerFast, TFRobertaModel, RobertaConfig
from tensorflow.keras.models import load_model, Model
from tensorflow.keras.layers import Input
from sentence_transformers import SentenceTransformer

In [3]:
class TextPreprocessor:
    """
    Lower-cases a series of strings and optionally strips punctuation,
    digits and stop words from it.

    Only ``remove_punct``, ``remove_digits`` and ``remove_stop_words`` affect
    :meth:`preprocess`. The remaining constructor options
    (``remove_short_words``, ``minlen``, ``maxlen``, ``top_p``, ``bottom_p``)
    are stored on the instance but are not read by any method of this class.
    """

    # Characters that __remove_punct turns into a single space. The control
    # characters are real newline/CR/tab characters, not the two-character
    # sequences backslash + letter.
    _PUNCT_CHARS = "\n\r\t" + string.punctuation
    _PUNCT_TABLE = str.maketrans(_PUNCT_CHARS, " " * len(_PUNCT_CHARS))

    def __init__(self, remove_punct: bool = True, remove_digits: bool = True,
                 remove_stop_words: bool = True,
                 remove_short_words: bool = True, minlen: int = 1, maxlen: int = 1, top_p: float = None,
                 bottom_p: float = None):
        """
        :param remove_punct: bool, replace punctuation and newline/CR/tab with spaces
        :param remove_digits: bool, replace digits with spaces
        :param remove_stop_words: bool, drop tokens listed in ``self.stop_words``
        :param remove_short_words: bool, stored only (unused in this class)
        :param minlen: int, stored only (unused in this class)
        :param maxlen: int, stored only (unused in this class)
        :param top_p: float, stored only (unused in this class)
        :param bottom_p: float, stored only (unused in this class)
        """
        self.remove_punct = remove_punct
        self.remove_digits = remove_digits
        self.remove_stop_words = remove_stop_words
        self.remove_short_words = remove_short_words
        self.minlen = minlen
        self.maxlen = maxlen
        self.top_p = top_p
        self.bottom_p = bottom_p
        self.words_to_remove = []
        self.stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
                           'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself',
                           'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them',
                           'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that',
                           'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has',
                           'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'if', 'or',
                           'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about',
                           'into', 'through', 'during', 'before', 'after', 'to', 'from',
                           'in', 'out', 'on', 'off', 'further', 'then', 'once',
                           'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each',
                           'other', 'such', 'only', 'own', 'same', 'so', 'than',
                           'too', 'can', 'will', 'just', 'should',
                           'now']

    @staticmethod
    def __remove_double_whitespaces(text: str):
        """
        Collapses every run of whitespace into one space and trims both ends.
        :param text: str, input string
        :return: str, whitespace-normalised string
        """
        return " ".join(text.split())

    def __remove_punct(self, string_series: pd.Series):
        """
        Replaces punctuation, newlines, carriage returns and tabs with spaces,
        then collapses repeated whitespace.
        :param string_series: pd.Series, input string series
        :return: pd.Series, cleaned string series
        """
        # Single pass per string instead of one .str.replace() per character.
        clean_string_series = string_series.str.translate(self._PUNCT_TABLE)
        return clean_string_series.map(self.__remove_double_whitespaces)

    def __remove_digits(self, string_series: pd.Series):
        """
        Replaces every digit with a space, then collapses repeated whitespace.
        :param string_series: pd.Series, input string series
        :return: pd.Series, cleaned string series
        """
        clean_string_series = string_series.str.replace(pat=r'\d', repl=" ", regex=True)
        return clean_string_series.map(self.__remove_double_whitespaces)

    def __remove_stop_words(self, string_series: pd.Series):
        """
        Removes whitespace-separated tokens that appear in ``self.stop_words``.
        :param string_series: pd.Series, input string series
        :return: pd.Series, cleaned string series
        """
        # Built per call so later edits to self.stop_words are honoured,
        # while each token lookup stays O(1).
        stops = set(self.stop_words)

        def str_remove_stop_words(text: str):
            return " ".join(token for token in text.split() if token not in stops)

        return string_series.map(str_remove_stop_words)

    def preprocess(self, string_series: pd.Series, dataset: str = "train"):
        """
        Entry point.
        :param string_series: pd.Series, input string series
        :param dataset: str, "train" for training set, "test" for val/dev/test set.
            Currently not used by this method.
        :return: pd.Series, cleaned string series; strings that end up empty
            are replaced by "this is an empty message"
        """
        cleaned = string_series.str.lower()
        if self.remove_punct:
            cleaned = self.__remove_punct(string_series=cleaned)
        if self.remove_digits:
            cleaned = self.__remove_digits(string_series=cleaned)
        if self.remove_stop_words:
            cleaned = self.__remove_stop_words(string_series=cleaned)

        cleaned = cleaned.str.strip()
        # Placeholder so downstream tokenizers never receive an empty string.
        return cleaned.replace(to_replace="", value="this is an empty message")

In [4]:
# Fine-tuning code: https://github.com/ksv-muralidhar/hugging_face_tf_fine_tuning/blob/main/roberta_text_classification.ipynb
def load_finetuned_model(model_path='arxiv_classifier_hf_roberta.h5',
                         embedding_layer='tf.__operators__.getitem_9'):
    '''
    Function to load fine-tuned RoBERTa model to classify arxiv papers and
    re-wire it so that it outputs an intermediate layer instead of the
    classification result.

    model_path: path of the saved Keras model (.h5) to load.
    embedding_layer: name of the layer whose output becomes the output of the
        returned model. The default name looks like an auto-generated slicing
        op; presumably it is the [CLS] slice of the last hidden state -
        confirm against the fine-tuning notebook.

    Returns a Keras Model sharing the loaded model's inputs.
    '''
    # TFRobertaModel is not a built-in Keras layer, so it has to be
    # registered for deserialisation.
    classifier = tf.keras.models.load_model(
        model_path,
        custom_objects={"TFRobertaModel": transformers.TFRobertaModel})
    embedding_output = classifier.get_layer(embedding_layer).output
    return Model(inputs=classifier.input, outputs=embedding_output)

In [5]:
def get_sent_transformer_embeddings(sent_transformer, txt):
    '''
    Function to get the sentence embedding of `txt` from a SentenceTransformer
    (this notebook loads the 'roberta-base-nli-mean-tokens' checkpoint).

    The text is first cleaned with the notebook-level `text_preprocessor`;
    the value returned by `sent_transformer.encode` is passed back unchanged.
    '''
    cleaned_series = text_preprocessor.preprocess(pd.Series(txt))
    return sent_transformer.encode(cleaned_series[0], show_progress_bar=False)

In [6]:
@functools.lru_cache(maxsize=None)
def _load_tokenizer(model_checkpoint):
    '''
    Helper that loads the fast RoBERTa tokenizer for `model_checkpoint`.
    The result is cached so repeated embedding calls do not reload it.
    '''
    return RobertaTokenizerFast.from_pretrained(model_checkpoint)


def get_finetuned_model_embeddings(model, txt):
    '''
    Function to extract [CLS] token embedding from the last hidden state of
    fine-tuned RoBERTa model

    model: Keras model as returned by load_finetuned_model.
    txt: raw text; it is cleaned with the notebook-level `text_preprocessor`
        before tokenisation.

    Returns the first (and only) row of the model's prediction.
    '''
    tokenizer = _load_tokenizer("roberta-base")
    clean_txt = text_preprocessor.preprocess(pd.Series(txt))[0]
    # Fixed length of 200 tokens (padded/truncated); presumably this matches
    # the sequence length used during fine-tuning - confirm before changing.
    encoded = tokenizer([clean_txt],
                        max_length=200, padding="max_length", truncation=True, return_tensors="tf")
    prediction = model.predict([encoded['input_ids'], encoded['attention_mask']], verbose=0)
    return prediction[0]

In [7]:
def get_similarity_score(emb1, emb2):
    '''
    Function to compute cosine-similarity score.

    emb1, emb2: 1-D array-likes of equal length.

    Returns dot(emb1, emb2) / (|emb1| * |emb2|). If either vector has zero
    norm the similarity is undefined; 0.0 is returned instead of nan.
    '''
    denominator = norm(emb1) * norm(emb2)
    if denominator == 0:
        # Avoids a division by zero (nan plus RuntimeWarning) for all-zero embeddings.
        return 0.0
    return np.dot(emb1, emb2) / denominator

In [None]:
# Notebook-level objects; the embedding helpers and compare_models read them as globals.
# Default TextPreprocessor flags: strip punctuation, digits and stop words.
text_preprocessor = TextPreprocessor()
# Sentence-embedding baseline loaded from the 'roberta-base-nli-mean-tokens' checkpoint.
sent_transformer = SentenceTransformer('roberta-base-nli-mean-tokens')
# Fine-tuned classifier re-wired to output an intermediate layer (see load_finetuned_model).
finetuned_model = load_finetuned_model()

def compare_models(txt1, txt2):
    '''
    Function to print, for the pair of texts, the cosine similarity of their
    fine-tuned model embeddings and of their SentenceTransformer embeddings.

    Uses the notebook-level `sent_transformer` and `finetuned_model`.
    Nothing is returned; both scores are written to stdout.
    '''
    texts = (txt1, txt2)
    sent_embs = [get_sent_transformer_embeddings(sent_transformer, text) for text in texts]
    finetuned_embs = [get_finetuned_model_embeddings(finetuned_model, text) for text in texts]

    finetuned_score = get_similarity_score(finetuned_embs[0], finetuned_embs[1])
    print(f'Similarity score of fine-tuned model: {finetuned_score}')
    sent_score = get_similarity_score(sent_embs[0], sent_embs[1])
    print(f'Similarity score of sentence transformer: {sent_score}')

In [20]:
# computer science
txt1 = '''
Enterprise applications of Large Language Models (LLMs) hold promise for question answering on
enterprise SQL databases. However, the extent to which LLMs can accurately respond to enterprise
questions in such databases remains unclear, given the absence of suitable Text-to-SQL benchmarks
tailored to enterprise settings. Additionally, the potential of Knowledge Graphs (KGs) to enhance
LLM-based question answering by providing business context is not well understood. This study aims
to evaluate the accuracy of LLM-powered question answering systems in the context of enterprise
questions and SQL databases, while also exploring the role of knowledge graphs in improving
accuracy.
'''

# computer science
txt2 = '''
Most widely-used pre-trained language models operate on sequences of tokens corresponding to word or subword units. 
By comparison, token-free models that operate directly on raw text (bytes or characters) have
many benefits: they can process text in any
language out of the box, they are more robust
to noise, and they minimize technical debt by
removing complex and error-prone text preprocessing pipelines. Since byte or character
sequences are longer than token sequences,
past work on token-free models has often introduced new model architectures designed
to amortize the cost of operating directly on
raw text. In this paper, we show that a standard Transformer architecture can be used
with minimal modifications to process byte
sequences
'''

compare_models(txt1, txt2)

Similarity score of fine-tuned model: 0.9938491582870483
Similarity score of sentence transformer: 0.5998247265815735


In [10]:
# math
txt1 = '''
It is known that many different types of finite random subgraph models undergo quantitatively similar phase transitions around their percolation thresholds, and the proofs of
these results rely on isoperimetric properties of the underlying host graph. Recently, the authors showed that such a phase transition occurs in a large class of regular high-dimensional
product graphs, generalising a classic result for the hypercube.
In this paper we give new isoperimetric inequalities for such regular high-dimensional
product graphs, which generalise the well-known isoperimetric inequality of Harper for the
hypercube, and are asymptotically sharp for a wide range of set sizes. We then use these
isoperimetric properties to investigate the structure of the giant component L1 in supercritical percolation on these product graphs, that is, when p =
1+ϵ
d
, where d is the degree of
the product graph and ϵ > 0 is a small enough constant.
We show that typically L1 has edge-expansion Ω
1
d ln d

. Furthermore, we show that L1
likely contains a linear-sized subgraph with vertex-expansion Ω
1
d ln d

. These results are
best possible up to the logarithmic factor in d.
Using these likely expansion properties, we determine, up to small polylogarithmic factors
in d, the likely diameter of L1 as well as the typical mixing time of a lazy random walk on
L1. Furthermore, we show the likely existence of a cycle of length Ω
n
d ln d

. These results
not only generalise, but also improve substantially upon the known bounds in the case of
the hypercube, where in particular the likely diameter and typical mixing time of L1 were
previously only known to be polynomial in 
'''

# computer science
txt2 = '''
Most widely-used pre-trained language models operate on sequences of tokens corresponding to word or subword units. 
By comparison, token-free models that operate directly on raw text (bytes or characters) have
many benefits: they can process text in any
language out of the box, they are more robust
to noise, and they minimize technical debt by
removing complex and error-prone text preprocessing pipelines. Since byte or character
sequences are longer than token sequences,
past work on token-free models has often introduced new model architectures designed
to amortize the cost of operating directly on
raw text. In this paper, we show that a standard Transformer architecture can be used
with minimal modifications to process byte
sequences
'''

compare_models(txt1, txt2)

Similarity score of fine-tuned model: 0.3928041160106659
Similarity score of sentence transformer: 0.7885901927947998


In [11]:
# stats
txt1 = '''
Causal inference necessarily relies upon untestable assumptions; hence,
it is crucial to assess the robustness of obtained results to violations of identification
assumptions. However, such sensitivity analysis is only occasionally undertaken in
practice, as many existing methods only apply to relatively simple models and their
results are often difficult to interpret. We take a more flexible approach to sensitivity
analysis and view it as a constrained stochastic optimization problem. We focus
on linear models with an unmeasured confounder and a potential instrument. We
show how the R2
-calculus – a set of algebraic rules that relates different (partial) R2
-
values and correlations – can be applied to identify the bias of the k-class estimators
and construct sensitivity models flexibly. We further show that the heuristic “plug-in”
sensitivity interval may not have any confidence guarantees; instead, we propose a
boostrap approach to construct sensitivity intervals which perform well in numerical
simulations. We illustrate the proposed methods with a real study on the causal effect
of education on earnings and provide user-friendly visualization tools.

'''

# stats
txt2 = '''
When estimating causal effects, it is important to assess external validity, i.e., determine
how useful a given study is to inform a practical question for a specific target population. One
challenge is that the covariate distribution in the population underlying a study may be different
from that in the target population. If some covariates are effect modifiers, the average treatment
effect (ATE) may not generalize to the target population. To tackle this problem, we propose new
methods to generalize or transport the ATE from a source population to a target population, in
the case where the source and target populations have different sets of covariates. When the ATE
in the target population is identified, we propose new doubly robust estimators and establish their
rates of convergence and limiting distributions. Under regularity conditions, the doubly robust
estimators provably achieve the efficiency bound and are locally asymptotic minimax optimal.
A sensitivity analysis is provided when the identification assumptions fail. Simulation studies
show the advantages of the proposed doubly robust estimator over simple plug-in estimators.
Importantly, we also provide minimax lower bounds and higher-order estimators of the target
functionals. The proposed methods are applied in transporting causal effects of dietary intake on
adverse pregnancy outcomes from an observational study to the whole U.S. female population.
'''

compare_models(txt1, txt2)

Similarity score of fine-tuned model: 0.8682332038879395
Similarity score of sentence transformer: 0.8693523406982422


In [12]:
# stats
txt1 = '''
Given the wealth inequality worldwide, there is an urgent need to identify the mode of
wealth exchange through which it arises. To address the research gap regarding models that
combine equivalent exchange and redistribution, this study compares an equivalent market
exchange with redistribution based on power centers and a nonequivalent exchange with mutual
aid using the Polanyi, Graeber, and Karatani modes of exchange. Two new exchange models based
on multi-agent interactions are reconstructed following an econophysics approach for evaluating
the Gini index (inequality) and total exchange (economic flow). Exchange simulations indicate that
the evaluation parameter of the total exchange divided by the Gini index can be expressed by the
same saturated curvilinear approximate equation using the wealth transfer rate and time period of
redistribution and the surplus contribution rate of the wealthy and the saving rate. However,
considering the coercion of taxes and its associated costs and independence based on the morality
of mutual aid, a nonequivalent exchange without return obligation is preferred. This is oriented
toward Graeber's baseline communism and Karatani's mode of exchange D, with implications for
alternatives to the capitalist economy.
'''

# computer science
txt2 = '''
Most widely-used pre-trained language models operate on sequences of tokens corresponding to word or subword units. 
By comparison, token-free models that operate directly on raw text (bytes or characters) have
many benefits: they can process text in any
language out of the box, they are more robust
to noise, and they minimize technical debt by
removing complex and error-prone text preprocessing pipelines. Since byte or character
sequences are longer than token sequences,
past work on token-free models has often introduced new model architectures designed
to amortize the cost of operating directly on
raw text. In this paper, we show that a standard Transformer architecture can be used
with minimal modifications to process byte
sequences
'''

compare_models(txt1, txt2)

Similarity score of fine-tuned model: 0.3753003180027008
Similarity score of sentence transformer: 0.6677366495132446


In [13]:
# stats
txt1 = '''
Given the wealth inequality worldwide, there is an urgent need to identify the mode of
wealth exchange through which it arises. To address the research gap regarding models that
combine equivalent exchange and redistribution, this study compares an equivalent market
exchange with redistribution based on power centers and a nonequivalent exchange with mutual
aid using the Polanyi, Graeber, and Karatani modes of exchange. Two new exchange models based
on multi-agent interactions are reconstructed following an econophysics approach for evaluating
the Gini index (inequality) and total exchange (economic flow). Exchange simulations indicate that
the evaluation parameter of the total exchange divided by the Gini index can be expressed by the
same saturated curvilinear approximate equation using the wealth transfer rate and time period of
redistribution and the surplus contribution rate of the wealthy and the saving rate. However,
considering the coercion of taxes and its associated costs and independence based on the morality
of mutual aid, a nonequivalent exchange without return obligation is preferred. This is oriented
toward Graeber's baseline communism and Karatani's mode of exchange D, with implications for
alternatives to the capitalist economy.
'''

# physics
txt2 = '''
We develop a neural network based pipeline to estimate masses of galaxy clusters with a 
known redshift directly from photon information in X-rays. Our neural networks are trained 
using supervised learning on simulations of eROSITA observations, focusing in this
paper on the Final Equatorial Depth Survey (eFEDS). We use convolutional neural networks which are modified to include additional
information of the cluster, in particular its redshift. In contrast to existing work, we utilize simulations including background and point
sources to develop a tool which is usable directly on observational eROSITA data for an extended mass range from group size halos to
massive clusters with masses in between 1013M < M < 1015M. Using this method, we are able to provide for the first time neural
network mass estimation for the observed eFEDS cluster sample from Spectrum-Roentgen-Gamma/eROSITA observations and we
find consistent performance with weak lensing calibrated masses. In this measurement, we do not use weak lensing information and
we only use previous cluster mass information which was used to calibrate the cluster properties in the simulations. When compared
to simulated data, we observe a reduced scatter with respect to luminosity and count-rate based scaling relations. We comment on the
application for other upcoming eROSITA All-Sky Survey observations.
'''

compare_models(txt1, txt2)

Similarity score of fine-tuned model: 0.18251608312129974
Similarity score of sentence transformer: 0.8016979098320007


In [14]:
# physics
txt1 = '''
This search for Magnetic Monopoles (MMs) and High Electric Charge Objects (HECOs) with
spins 0, 1/2 and 1, uses for the first time the full MoEDAL detector, exposed to 6.6 fb−1 
protonproton collisions at 13 TeV. The results are interpreted in terms of Drell-Yan and photon-fusion
pair production. Mass limits on direct production of MMs of up to 10 Dirac magnetic charges and
HECOs with electric charge in the range 5e to 350e, were achieved. The charge limits placed on
MM and HECO production are currently the strongest in the world. MoEDAL is the only LHC
experiment capable of being directly calibrated for highly-ionizing particles using heavy ions and
with a detector system dedicated to definitively measuring magnetic charge.

'''

# physics
txt2 = '''
Recently, a thermodynamic definition of time has been introduced. This definition is useful to find approach some open
problems in physics. But, it was obtained by a phenomenological approach and a logical inconsistency appears in the
definition. In particular, the definition was based on the ratio of two quantities, the entropy production and its rate, linked one
another just by the definition of time. In this paper, this inconsistency is overcome, by using the second law of thermodynamics
and Barbour’s mathematical methods, obtaining an analytical result that brings to the same equation of the phenomenological
method, but without any logical inconsistency.
'''

compare_models(txt1, txt2)

Similarity score of fine-tuned model: 0.7753373384475708
Similarity score of sentence transformer: 0.6930670142173767


In [15]:
# physics
txt1 = '''
This search for Magnetic Monopoles (MMs) and High Electric Charge Objects (HECOs) with
spins 0, 1/2 and 1, uses for the first time the full MoEDAL detector, exposed to 6.6 fb−1 
protonproton collisions at 13 TeV. The results are interpreted in terms of Drell-Yan and photon-fusion
pair production. Mass limits on direct production of MMs of up to 10 Dirac magnetic charges and
HECOs with electric charge in the range 5e to 350e, were achieved. The charge limits placed on
MM and HECO production are currently the strongest in the world. MoEDAL is the only LHC
experiment capable of being directly calibrated for highly-ionizing particles using heavy ions and
with a detector system dedicated to definitively measuring magnetic charge.

'''

# economics
txt2 = '''
In this paper, we propose Forest-PLS, a feature selection method for analyzing policy 
effect heterogeneity in a more flexible and comprehensive manner than is typically
available with conventional methods. In particular, our method is able to capture policy 
effect heterogeneity both within and across subgroups of the population defined
by observable characteristics. To achieve this, we employ partial least squares to identify 
target components of the population and causal forests to estimate personalized
policy effects across these components. We show that the method is consistent and
leads to asymptotically normally distributed policy effects. To demonstrate the efficacy 
of our approach, we apply it to the data from the Pennsylvania Reemployment
Bonus Experiments, which were conducted in 1988-1989. The analysis reveals that financial 
incentives can motivate some young non-white individuals to enter the labor
market. However, these incentives may also provide a temporary financial cushion
for others, dissuading them from actively seeking employment. Our findings highlight the 
need for targeted, personalized measures for young non-white male participants.

'''

compare_models(txt1, txt2)

Similarity score of fine-tuned model: 0.3002638518810272
Similarity score of sentence transformer: 0.6593513488769531


In [16]:
# economics
txt1 = '''
Given an initial matching and a policy objective on the distribution of agent types to institutions, we study the existence of a mechanism
that weakly improves the distributional objective and satisfies constrained
efficiency, individual rationality, and strategy-proofness. We show that
such a mechanism need not exist in general. We introduce a new notion
of discrete concavity, which we call pseudo M♮
-concavity, and construct a
mechanism with the desirable properties when the distributional objective
satisfies this notion. We provide several practically relevant distributional
objectives that are pseudo M♮
-concave.
'''

# economics
txt2 = '''
An agent may strategically employ a vague message to mislead an audience’s belief
about the state of the world, but this may cause the agent to feel guilt or negatively impact
how the audience perceives the agent. Using a novel experimental design that allows
participants to be vague while at the same time isolating the internal cost of lying from
the social identity cost of appearing dishonest, we explore the extent to which these two
types of lying costs affect communication. We find that participants exploit vagueness to be
consistent with the truth, while at the same time leveraging the imprecision to their own
benefit. More participants use vague messages in treatments where concern with social
identity is relevant. In addition, we find that social identity concerns substantially affect
the length and patterns of vague messages used across the treatments.

'''

compare_models(txt1, txt2)

Similarity score of fine-tuned model: 0.7802478075027466
Similarity score of sentence transformer: 0.6077088713645935


In [17]:
# economics
txt1 = '''
This paper examines the dynamics of Tether, the stablecoin with the largest
market capitalization. We show that the distributional and dynamic properties of Tether/USD rates have been evolving from 2017 to 2021. We use local
analysis methods to detect and describe the local patterns, such as short-lived
trends, time-varying volatility and persistence. To accommodate these patterns, we consider a time varying parameter Double Autoregressive tvDAR(1)
model under the assumption of local stationarity of Tether/USD rates. We estimate the tvDAR model non-parametrically and test hypotheses on the functional parameters. In the application to Tether, the model provides a good fit
and reliable out-of-sample forecasts at short horizons, while being robust to
time-varying persistence and volatility. In addition, the model yields a simple
plug-in measure of stability for Tether and other stablecoins for assessing and
comparing their stability.

'''

# electrical engineering
txt2 = '''
In this paper, we optimize a Wireless Powered Communication (WPC) system including multiple
pair of users, where transmitters employ single-antenna to transmit their information and power to their
receivers with the help of one multiple-antennas Amplify-and-Forward (AF) relay or an active Intelligent
Reflecting Surface (IRS). We propose a joint Time Switching (TS) scheme in which transmitters,
receivers, and the relay/IRS are either in their energy or information transmission/reception modes.
The transmitted multi-carrier unmodulated and modulated waveforms are used for Energy Harvesting
(EH) and Information Decoding (ID) modes, respectively. In order to design an optimal fair system, we
maximize the minimum rate of all pairs for both relay and IRS systems through a unified framework.
This framework allows us to simultaneously design energy waveforms, find optimal relay/IRS amplification
/reflection matrices, allocate powers for information waveforms, and allocate time durations for
various phases. In addition, we take into account the non-linearity of the EH circuits in our problem. This
problem turns out to be non-convex. Thus, we propose an iterative algorithm by using the MinorizationMaximization (MM) 
technique, which quickly converges to the optimal solution. Numerical examples
show that the proposed method improves the performance of the multi-pair WPC relay/IRS system
under various setups.
'''

compare_models(txt1, txt2)

Similarity score of fine-tuned model: 0.4304705262184143
Similarity score of sentence transformer: 0.716398298740387


In [18]:
# math
txt1 = '''
Let either GL(E) × SO(F) or GL(E) × Sp(F) act naturally on the space of
matrices E ⊗ F. There are only finitely many orbits, and the orbit closures are orthogonal
and symplectic generalizations of determinantal varieties, which can be described similarly
using rank conditions. In this paper, we study the singularities of these varieties and
describe their defining equations. We prove that in the symplectic case, the orbit closures
are normal with good filtrations, and in characteristic 0 have rational singularities. In
the orthogonal case we show that most orbit closures will have the same properties, and
determine precisely the exceptions to this.
'''

# math
txt2 = '''
 In this paper, we initiate the study of a triple (X, ∆, D) which consists of
a pair (X, ∆) and a polarizing pseudoeffective divisor D. The adjoint asymptotic 
multiplier ideal sheaf J (X, ∆; kDk) associated to the triple gives a simultaneous 
generalization of the multiplier ideal sheaf J (D) and asymptotic multiplier ideal sheaf J (kDk).
We describe the closed set defined by the ideal sheaf J (X, ∆; kDk) in terms of the 
minimal model program. We also characterize the case where J (X, ∆; kDk) = OX. Lastly,
we also prove a Nadel type vanishing theorem of cohomology using J (X, ∆; kDk).
'''

compare_models(txt1, txt2)

Similarity score of fine-tuned model: 0.9861209392547607
Similarity score of sentence transformer: 0.720016598701477


In [19]:
# computer science
txt1 = '''
Influence Maximization (IM) is a crucial problem in data science.
The goal is to find a fixed-size set of highly-influential seed vertices
on a network to maximize the influence spread along the edges.
While IM is NP-hard on commonly-used diffusion models, a greedy
algorithm can achieve (1−1/𝑒)-approximation, repeatedly selecting
the vertex with the highest marginal gain in influence as the seed.
Due to theoretical guarantees, rich literature focuses on improving
the performance of the greedy algorithm. To estimate the marginal
gain, existing work either runs Monte Carlo (MC) simulations of
influence spread or pre-stores hundreds of sketches (usually pervertex information). However, these approaches can be inefficient
in time (MC simulation) or space (storing sketches), preventing the
ideas from scaling to today’s large-scale graphs.
This paper significantly improves the scalability of IM using
two key techniques. The first is a sketch-compression technique for
the independent cascading model on undirected graphs. It allows
combining the simulation and sketching approaches to achieve
a time-space tradeoff. The second technique includes new data
structures for parallel seed selection. Using our new approaches,
we implemented PaC-IM: Parallel and Compressed IM.
We compare PaC-IM with state-of-the-art parallel IM systems
on a 96-core machine with 1.5TB memory. PaC-IM can process
large-scale graphs with up to 900M vertices and 74B edges in about
2 hours. On average across all tested graphs, our uncompressed
version is 5–18× faster and about 1.4× more space-efficient than
existing parallel IM systems. Using compression further saves 3.8×
space with only 70% overhead in time on average.

'''

# computer science
txt2 = '''
The planted clique problem is a paradigmatic model of statistical-to-computational
gaps: the planted clique is information-theoretically detectable if its size k ≥ 2 log2 n but 
polynomialtime algorithms only exist for the recovery task when k = Ω(√
n). By now, there are many simple
and fast algorithms that succeed as soon as k = Ω(√
n). Glaringly, however, no MCMC approach
to the problem had been shown to work, including the Metropolis process on cliques studied by
Jerrum since 1992. In fact, Chen, Mossel, and Zadik recently showed that any Metropolis process
whose state space is the set of cliques fails to find any sub-linear sized planted clique in polynomial
time if initialized naturally from the empty set. Here, we redeem MCMC performance for the
planted clique problem by relaxing the state space to all vertex subsets and adding a corresponding
energy penalty for missing edges. With that, we prove that energy-minimizing Markov chains 
(gradient descent and a low-temperature relaxation of it) succeed at recovering planted cliques of size
k = Ω(√
n) if initialized from the full graph. Importantly, initialized from the empty set, the 
relaxation still does not help the gradient descent find sub-linear planted cliques. We also demonstrate
robustness of these Markov chain approaches under a natural contamination model.
'''

compare_models(txt1, txt2)

Similarity score of fine-tuned model: 0.9069778919219971
Similarity score of sentence transformer: 0.7196155786514282
