In [1]:
# !pip install transformers==4.30.2
# !pip install datasets==2.12.0

In [None]:
from datasets import load_dataset, Dataset
import pandas as pd
import numpy as np
import string
from unidecode import unidecode
import tensorflow as tf 
from sklearn.utils import class_weight
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import cloudpickle

In [2]:
class TextPreprocessor:
    def __init__(self, remove_punct: bool = True, remove_digits: bool = True,
                 remove_stop_words: bool = True,
                 remove_short_words: bool = True, minlen: int = 1, maxlen: int = 1, top_p: float = None,
                 bottom_p: float = None):
        self.remove_punct = remove_punct
        self.remove_digits = remove_digits
        self.remove_stop_words = remove_stop_words
        self.remove_short_words = remove_short_words
        self.minlen = minlen
        self.maxlen = maxlen
        self.top_p = top_p
        self.bottom_p = bottom_p
        self.words_to_remove = []
        self.stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
                           'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself',
                           'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them',
                           'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that',
                           'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has',
                           'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'if', 'or',
                           'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about',
                           'into', 'through', 'during', 'before', 'after', 'to', 'from',
                           'in', 'out', 'on', 'off', 'further', 'then', 'once',
                           'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each',
                           'other', 'such', 'only', 'own', 'same', 'so', 'than',
                           'too', 'can', 'will', 'just', 'should',
                           'now']

        

    @staticmethod
    def __remove_double_whitespaces(string: str):
        return " ".join(string.split())
    

    def __remove_punct(self, string_series: pd.Series):
        """
       Removes punctuations from the input string.
       :param string_series: pd.Series, input string series
       :return: pd.Series, cleaned string series
       """
        clean_string_series = string_series.copy()
        puncts = [r'\n', r'\r', r'\t']
        puncts.extend(list(string.punctuation))
        for i in puncts:
            clean_string_series = clean_string_series.str.replace(pat=i, repl=" ", regex=False).copy()
        return clean_string_series.map(self.__remove_double_whitespaces)

    def __remove_digits(self, string_series: pd.Series):
        """
       Removes digits from the input string.
       :param string_series: pd.Series, input string series
       :return: pd.Series, cleaned string series
       """
        clean_string_series = string_series.str.replace(pat=r'\d', repl=" ", regex=True).copy()
        return clean_string_series.map(self.__remove_double_whitespaces)
 

    def __remove_stop_words(self, string_series: pd.Series):
        """
       Removes stop words from the input string.
       :param string_series: pd.Series, input string series
       :return: pd.Series, cleaned string series
       """
        def str_remove_stop_words(string: str):
            stops = self.stop_words
            return " ".join([token for token in string.split() if token not in stops])

        return string_series.map(str_remove_stop_words)

    

    def preprocess(self, string_series: pd.Series, dataset: str = "train"):
        """
        Entry point.
        :param string_series: pd.Series, input string series
        :param dataset: str, "train" for training set, "tesrt" for val/dev/test set.
        :return: pd.Series, cleaned string series
        """
        string_series = string_series.str.lower()
        if self.remove_punct:
            string_series = self.__remove_punct(string_series=string_series)
        if self.remove_digits:
            string_series = self.__remove_digits(string_series=string_series)
        if self.remove_stop_words:
            string_series = self.__remove_stop_words(string_series=string_series)
        

        string_series = string_series.str.strip()
        string_series.replace(to_replace="", value="this is an empty message", inplace=True)

        return string_series

In [3]:
# https://www.kaggle.com/datasets/bigbuddha47/text-classiciation?select=arxiv100.csv
data = pd.read_csv("arxiv100.csv")

In [4]:
data

Unnamed: 0,title,abstract,label
0,The Pre-He White Dwarfs in Eclipsing Binaries....,We report the first $BV$ light curves and hi...,astro-ph
1,A Possible Origin of kHZ QPOs in Low-Mass X-ra...,A possible origin of kHz QPOs in low-mass X-...,astro-ph
2,The effects of driving time scales on heating ...,Context. The relative importance of AC and D...,astro-ph
3,A new hard X-ray selected sample of extreme hi...,Extreme high-energy peaked BL Lac objects (E...,astro-ph
4,The baryon cycle of Seven Dwarfs with superbub...,"We present results from a high-resolution, c...",astro-ph
...,...,...,...
99995,Semiparametric estimation for space-time max-s...,Max-stable processes have been expanded to q...,stat
99996,A spatial causal analysis of wildland fire-con...,Wildland fire smoke contains hazardous level...,stat
99997,Neural Conditional Event Time Models,Event time models predict occurrence times o...,stat
99998,Efficient Estimation of COM-Poisson Regression...,The Conway-Maxwell-Poisson (CMP) or COM-Pois...,stat


In [5]:
data ['text'] = data['title'] + " " + data['abstract']
data.rename(columns={'label': 'category'}, inplace=True)
data = data[['text', 'category']].copy()

In [6]:
data

Unnamed: 0,text,category
0,The Pre-He White Dwarfs in Eclipsing Binaries....,astro-ph
1,A Possible Origin of kHZ QPOs in Low-Mass X-ra...,astro-ph
2,The effects of driving time scales on heating ...,astro-ph
3,A new hard X-ray selected sample of extreme hi...,astro-ph
4,The baryon cycle of Seven Dwarfs with superbub...,astro-ph
...,...,...
99995,Semiparametric estimation for space-time max-s...,stat
99996,A spatial causal analysis of wildland fire-con...,stat
99997,Neural Conditional Event Time Models Event t...,stat
99998,Efficient Estimation of COM-Poisson Regression...,stat


In [7]:
data['text'][0]

"The Pre-He White Dwarfs in Eclipsing Binaries. I. WASP 0131+28   We report the first $BV$ light curves and high-resolution spectra of the\npost-mass transfer binary star WASP 0131+28 to study the absolute properties of\nextremely low-mass white dwarfs. From the observed spectra, the double-lined\nradial velocities were derived, and the effective temperature and rotational\nvelocity of the brighter, more massive primary were found to be $T_{\\rm eff,1}\n= 10,000 \\pm 200$ K and $v_1\\sin$$i$ = 55 $\\pm$ 10 km s$^{-1}$, respectively.\nThe combined analysis of the {\\it TESS} archive data and ours yielded the\naccurate fundamental parameters of the program target. The masses were derived\nto about 1.0 \\% accuracy and the radii to 0.6 \\%, or better. The secondary\ncomponent's parameters of $M_2 = 0.200 \\pm 0.002$ M$_\\odot$, $R_2 = 0.528 \\pm\n0.003$ R$_\\odot$, $T_{\\rm eff,2}$ = 11,186 $\\pm$ 235 K, and $L_2 = 3.9 \\pm 0.3$\nL$_\\odot$ are in excellent agreement with the evolutionary

In [8]:
tp = TextPreprocessor()
data['text'] = tp.preprocess(data['text'])

In [9]:
data['text'][0]

'pre white dwarfs eclipsing binaries wasp report first bv light curves high resolution spectra post mass transfer binary star wasp study absolute properties extremely low mass white dwarfs observed spectra double lined radial velocities derived effective temperature rotational velocity brighter more massive primary found t m eff pm k v sin pm km s respectively combined analysis tess archive data yielded accurate fundamental parameters program target masses derived accuracy radii better secondary component s parameters m pm m odot r pm r odot t m eff pm k l pm l odot excellent agreement evolutionary sequence helium core white dwarf mass m odot indicates star halfway constant luminosity phase results presented article demonstrate wasp el cvn eclipsing binary thin disk formed stable roche lobe overflow channel composed main sequence dwarf spectral type pre white dwarf'

In [10]:
label_dist = pd.Series(data['category']).value_counts()
label_dist

category
astro-ph    10000
cond-mat    10000
cs          10000
eess        10000
hep-ph      10000
hep-th      10000
math        10000
physics     10000
quant-ph    10000
stat        10000
Name: count, dtype: int64

In [11]:
le = LabelEncoder()
data['target'] = le.fit_transform(data['category'])

In [12]:
with open("arxiv_category_preprocessor_labelencoder.bin", "wb") as model_file_obj:
    cloudpickle.dump((tp, le), model_file_obj)

In [13]:
data

Unnamed: 0,text,category,target
0,pre white dwarfs eclipsing binaries wasp repor...,astro-ph,0
1,possible origin khz qpos low mass x ray binari...,astro-ph,0
2,effects driving time scales heating coronal ar...,astro-ph,0
3,new hard x ray selected sample extreme high en...,astro-ph,0
4,baryon cycle seven dwarfs superbubble feedback...,astro-ph,0
...,...,...,...
99995,semiparametric estimation space time max stabl...,stat,9
99996,spatial causal analysis wildland fire contribu...,stat,9
99997,neural conditional event time models event tim...,stat,9
99998,efficient estimation com poisson regression ge...,stat,9


In [14]:
x = data['text'].copy()
y = data['target'].copy()

In [15]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

In [16]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((80000,), (20000,), (80000,), (20000,))

In [17]:
x_train, x_test, y_train, y_test = x_train.to_list(), x_test.to_list(), y_train.to_list(), y_test.to_list()

In [18]:
classes_ = sorted([*y.unique()]).copy()
classes_

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [19]:
from transformers import RobertaTokenizerFast

In [20]:
model_checkpoint = "roberta-base"
tokenizer = RobertaTokenizerFast.from_pretrained(model_checkpoint)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [21]:
print(x_train[0])
print(tokenizer.tokenize(x_train[0]))
print(tokenizer(x_train[0]))

exploring public reaction covid news social media portugal outburst proliferation covid pandemic together subsequent social distancing measures raised massive challenges almost domains public private life around globe stay home movement pushed news audiences social networks turn become most prolific field receiving sharing news updates well public expression opinions concerns feelings pandemic public opinion critical aspect analysing information events impact peoples lives research shown social media data may promising understanding people respond health risks social crisis feelings tend share adapting unforeseen circumstances threaten almost societal spheres paper presents results social media analysis news headlines posted major daily news outlet portugal sic noticias facebook january december focusing issues attention cycle audiences emotional response covid news outburst work adds emergent body studies examining public response coronavirus pandemic social media data
['expl', 'oring

In [22]:
strategy = tf.distribute.MirroredStrategy()

In [24]:
BATCH_SIZE = 32 * strategy.num_replicas_in_sync
N_TOKENS = 200
N_CLASSES = len(classes_)

In [25]:
train_tokens = tokenizer(x_train, max_length=N_TOKENS, padding="max_length", truncation=True, return_tensors="tf", return_attention_mask=True)
test_tokens = tokenizer(x_test, max_length=N_TOKENS, padding="max_length", truncation=True, return_tensors="tf", return_attention_mask=True)

In [26]:
train_tokens[:5]

[Encoding(num_tokens=200, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=200, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=200, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=200, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=200, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])]

In [27]:
train_tf_data = tf.data.Dataset.from_tensor_slices((dict(train_tokens), to_categorical(y_train)))
test_tf_data = tf.data.Dataset.from_tensor_slices((dict(test_tokens), to_categorical(y_test)))

In [28]:
del(data)
del(train_tokens)
del(test_tokens)

In [29]:
train_tf_data=train_tf_data.prefetch(tf.data.AUTOTUNE)
test_tf_data=test_tf_data.prefetch(tf.data.AUTOTUNE)

In [30]:
for i in train_tf_data.take(1):
    print(i)

({'input_ids': <tf.Tensor: shape=(200,), dtype=int32, numpy=
array([    0, 23242,  5137,   285,  4289, 47268,   808,   340,   592,
         433,  4103, 39029, 28999, 24739, 47268,   808, 23387, 14414,
         561,  7757,   592,  7018,  7710,  1797,  1179,  2232,  2019,
         818, 30700,   285,   940,   301,   198,  7183,  1095,   184,
        2079,  3148,   340,  7768,   592,  4836,  1004,   555,   144,
       17038,   882,  2806,  3565,   340,  3496,   157,   285,  8151,
        5086,  1379,  6453, 23387, 14414,   285,  2979,  2008,  6659,
       34273,   154,   335,  1061,   913, 16592,  1074,   557,  2343,
         592,   433,   414,   189,  6177,  2969,    82,  2519,   474,
        2476,   592,  1486,  6453,  3805,   458, 26493, 33257,  4215,
       10508,   818, 24032, 35962,  2225,  6822,   775,   592,   433,
        1966,   340,  6337,  1278,   538,  1230,   340,  9455,  4103,
       39029,   579,   636,    45,   636,  5003, 10660, 10408, 16705,
         263, 47153,  5650,  

In [33]:
from transformers import TFRobertaModel, RobertaConfig
from tensorflow.keras.layers import Input, Dense, Dropout, Average, BatchNormalization

In [35]:
config = RobertaConfig.from_pretrained(model_checkpoint, output_hidden_states=False)

In [63]:
with strategy.scope():
    model = TFRobertaModel.from_pretrained(model_checkpoint, config=config)
    input_ids = Input(shape=(N_TOKENS,), dtype=tf.int32, name="input_ids")
    attention_mask = Input(shape=(N_TOKENS,), dtype=tf.int32, name="attention_mask")
    x = model([input_ids, attention_mask])[0][:,0,:] # [CLS] token of last hidden state
    x = Dropout(0.3)(x)
    x = BatchNormalization()(x)
    x = Dense(512, activation="relu")(x)
    x = Dropout(0.3)(x)
    x = BatchNormalization()(x)
    x = Dense(256, activation="relu")(x)
    x = Dropout(0.3)(x)
    x = BatchNormalization()(x)
    output = Dense(N_CLASSES, activation="softmax", name="output")(x)
    model = tf.keras.Model(inputs=[input_ids, attention_mask],outputs=output)
    model.compile(optimizer=tf.keras.optimizers.Adam(5e-5), metrics=["categorical_accuracy"], loss="categorical_crossentropy")

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'roberta.embeddings.position_ids', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaModel were not initialized from the PyTorch model and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infe

In [64]:
model.summary()

Model: "model_9"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 200)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 200)]        0           []                               
                                                                                                  
 tf_roberta_model_9 (TFRobertaM  TFBaseModelOutputWi  124645632  ['input_ids[0][0]',              
 odel)                          thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 200,                                         

In [65]:
from tensorflow.keras.callbacks import EarlyStopping
early_stop = EarlyStopping(monitor="val_loss",patience=1,mode="min")

In [66]:
model.fit(train_tf_data.shuffle(len(train_tf_data)).batch(BATCH_SIZE), validation_data=test_tf_data.shuffle(len(test_tf_data)).batch(BATCH_SIZE), 
          epochs=10, callbacks=[early_stop])

Epoch 1/10
Epoch 2/10
Epoch 3/10


<keras.callbacks.History at 0x7e67fda54580>

In [67]:
del(train_tf_data)
del(test_tf_data)

In [68]:
model.save("arxiv_classifier_hf_roberta.h5")

In [69]:
del(model)

**INFERENCE**

In [129]:
import transformers
import tensorflow as tf
import cloudpickle
from transformers import RobertaTokenizerFast

with open("arxiv_category_preprocessor_labelencoder.bin", "rb") as model_file_obj:
    text_preprocessor, label_encoder = cloudpickle.load(model_file_obj)
model = tf.keras.models.load_model('arxiv_classifier_hf_roberta.h5', custom_objects={"TFRobertaModel": transformers.TFRobertaModel})
model_checkpoint = "roberta-base"
tokenizer = RobertaTokenizerFast.from_pretrained(model_checkpoint)

In [130]:
import numpy as np
import pandas as pd
def inference(text: str):
    text = text_preprocessor.preprocess(pd.Series(text))[0]
    tokens = tokenizer([text], max_length=200, padding="max_length", truncation=True, return_tensors="tf")
    input_ = [tokens['input_ids'], tokens['attention_mask']]
    pred = model.predict(input_, verbose=0)
    arg_max = np.argmax(pred[0])
    return [label_encoder.inverse_transform([arg_max])[0],  pred[0][arg_max]]

In [131]:
txt = '''
Most widely-used pre-trained language models operate on sequences of tokens corresponding to word or subword units. By comparison, token-free models that operate directly on raw text (bytes or characters) have
many benefits: they can process text in any
language out of the box, they are more robust
to noise, and they minimize technical debt by
removing complex and error-prone text preprocessing pipelines. Since byte or character
sequences are longer than token sequences,
past work on token-free models has often introduced new model architectures designed
to amortize the cost of operating directly on
raw text. In this paper, we show that a standard Transformer architecture can be used
with minimal modifications to process byte
sequences
'''
inference(txt)

['cs', 0.9987519]

In [132]:
txt = '''
Measurements of coherent charmonium production cross sections together with their ratio in ultra-peripheral PbPb collisions are studied at
a nucleon-nucleon centre-of-mass energy of 5.02 TeV, the differential crosssections are measured as a function of rapidity and transverse momentum,
separately. The photo-production of J/ψ mesons at low transverse momentum is studied in peripheral PbPb collisions, which confirms coherent J/ψ
production in hadronic collisions. These latest results significantly improve
previous measurements and are compared with some theoretical predictions.
'''
inference(txt)

['hep-ph', 0.999938]

In [133]:
txt = '''
Let p be a prime, W the ring of Witt vectors of a perfect field k of characteristic p and ζ a primitive pth root of unity. We introduce a new notion of calculus over W that we call absolute calculus. It may be seen as a singular version of the q-calculus used in previous work, in the sense that the role of the coordinate is now played by q itself. 
We show that what we call a weakly nilpotent absolute connection on a finite free module is equivalent to a prismatic vector bundle on W[ζ]. As a corollary of a theorem of Bhatt and Scholze, we finally obtain that an absolute connection with a frobenius structure on a finite free module is equivalent to a lattice in a crystalline representation. We also consider the case of de Rham prismatic crystals as well as Hodge-Tate prismatic crystals.'''
inference(txt)

['math', 0.9949458]

In [134]:
txt = '''
Stabilizing proteins is a foundational step in protein engineering. However, the
evolutionary pressure of all extant proteins makes identifying the scarce number of
mutations that will improve thermodynamic stability challenging. Deep learning
has recently emerged as a powerful tool for identifying promising mutations. Existing approaches, however, are computationally expensive, as the number of model
inferences scales with the number of mutations queried. Our main contribution is
a simple, parallel decoding algorithm. Our Mutate Everything is capable of predicting the effect of all single and double mutations in one forward pass. It is even
versatile enough to predict higher-order mutations with minimal computational
overhead. We build our Mutate Everything on top of ESM2 and AlphaFold, neither
of which were trained to predict thermodynamic stability. We trained on the MegaScale cDNA proteolysis dataset and achieved state-of-the-art performance on single
and higher-order mutations on S669, ProTherm, and ProteinGym datasets'''
inference(txt)

['stat', 0.48001435]

In [135]:
txt = '''
We incorporate a version of a spike and slab prior, comprising a pointmass at zero ("spike") and a Normal distribution around zero ("slab") into a dynamic panel data framework to model coefficient heterogeneity. In addition to homogeneity and full heterogeneity, our specification can also capture sparse heterogeneity, that is, there is a core group of units that share common parameters and a set of deviators with idiosyncratic parameters. We fit a model with unobserved components to income data from the Panel Study of Income Dynamics. 
We find evidence for sparse heterogeneity for balanced panels composed of individuals with long employment histories.'''
inference(txt)

['stat', 0.8867156]

In [137]:
txt = '''
Despite investments in multiple space and ground-based solar observatories by the
global community, the Sun’s polar regions remain unchartered territory – the last great
frontier for solar observations. Breaching this frontier is fundamental to understanding
the solar cycle – the ultimate driver of short-to-long term solar activity that encompasses
space weather and space climate. Magnetohydrodynamic dynamo models and
empirically observed relationships have established that the polar field is the primary
determinant of future solar cycle amplitude. Models of solar surface evolution of tilted
active regions indicate that the mid-to-high latitude surges of magnetic flux govern the
dynamics leading to the reversal and build-up of polar field. Our theoretical
understanding and numerical models of this high latitude magnetic field dynamics and
plasma flows – that are a critical component of the sunspot cycle – lack precise
observational constraints, currently limited by large projection effects due to our location
in the plane of the ecliptic.'''
inference(txt)

['astro-ph', 0.9994288]