## Get sentence embeddings from a DistilBERT model fine-tuned on Wikipedia (DBpedia) data for article classification

In [1]:
import transformers
import numpy as np
from numpy.linalg import norm
import pandas as pd
import string
import tensorflow as tf
import cloudpickle
from transformers import DistilBertTokenizerFast
from tensorflow.keras.models import load_model, Model

In [2]:
class TextPreprocessor:
    """
    Normalises a series of raw strings: lower-casing followed by optional
    removal of punctuation, digits and stop words.

    Only ``remove_punct``, ``remove_digits`` and ``remove_stop_words`` influence
    :meth:`preprocess`. The remaining constructor arguments are stored on the
    instance but are not read by any method of this class.
    """

    # Translation table mapping every character of ``string.punctuation`` to a
    # single space. Newlines, carriage returns and tabs need no entry: they are
    # whitespace and are collapsed by ``__remove_double_whitespaces``.
    _PUNCT_TABLE = str.maketrans(dict.fromkeys(string.punctuation, " "))

    def __init__(self, remove_punct: bool = True, remove_digits: bool = True,
                 remove_stop_words: bool = True,
                 remove_short_words: bool = True, minlen: int = 1, maxlen: int = 1, top_p: float = None,
                 bottom_p: float = None):
        """
        :param remove_punct: bool, replace punctuation characters with spaces
        :param remove_digits: bool, replace digits with spaces
        :param remove_stop_words: bool, drop tokens listed in ``self.stop_words``
        :param remove_short_words: bool, stored only (unused in this class)
        :param minlen: int, stored only (unused in this class)
        :param maxlen: int, stored only (unused in this class)
        :param top_p: float, stored only (unused in this class)
        :param bottom_p: float, stored only (unused in this class)
        """
        self.remove_punct = remove_punct
        self.remove_digits = remove_digits
        self.remove_stop_words = remove_stop_words
        self.remove_short_words = remove_short_words
        self.minlen = minlen
        self.maxlen = maxlen
        self.top_p = top_p
        self.bottom_p = bottom_p
        self.words_to_remove = []
        # Lower-case stop words; compared against tokens after lower-casing.
        self.stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
                           'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself',
                           'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them',
                           'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that',
                           'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has',
                           'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'if', 'or',
                           'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about',
                           'into', 'through', 'during', 'before', 'after', 'to', 'from',
                           'in', 'out', 'on', 'off', 'further', 'then', 'once',
                           'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each',
                           'other', 'such', 'only', 'own', 'same', 'so', 'than',
                           'too', 'can', 'will', 'just', 'should',
                           'now']

    @staticmethod
    def __remove_double_whitespaces(string: str):
        """
        Collapses every run of whitespace into one space and trims both ends.
        :param string: str, input string
        :return: str, whitespace-normalised string
        """
        return " ".join(string.split())

    def __remove_punct(self, string_series: pd.Series):
        """
        Replaces punctuation characters with spaces and normalises whitespace.
        :param string_series: pd.Series, input string series
        :return: pd.Series, cleaned string series
        """
        # One translate pass instead of one literal replace per character.
        # This also avoids matching the two-character sequences backslash+n/r/t,
        # which used to swallow the letter following a backslash.
        clean_string_series = string_series.str.translate(self._PUNCT_TABLE)
        return clean_string_series.map(self.__remove_double_whitespaces)

    def __remove_digits(self, string_series: pd.Series):
        """
        Replaces every digit with a space and normalises whitespace.
        :param string_series: pd.Series, input string series
        :return: pd.Series, cleaned string series
        """
        clean_string_series = string_series.str.replace(pat=r'\d', repl=" ", regex=True)
        return clean_string_series.map(self.__remove_double_whitespaces)

    def __remove_stop_words(self, string_series: pd.Series):
        """
        Drops whitespace-separated tokens that appear in ``self.stop_words``.
        :param string_series: pd.Series, input string series
        :return: pd.Series, cleaned string series
        """
        # Build the lookup set once per call; membership tests on the list
        # would be linear for every token.
        stops = frozenset(self.stop_words)

        def str_remove_stop_words(text: str):
            return " ".join(token for token in text.split() if token not in stops)

        return string_series.map(str_remove_stop_words)

    def preprocess(self, string_series: pd.Series, dataset: str = "train"):
        """
        Entry point: lower-cases the input and applies the enabled cleaning steps.
        :param string_series: pd.Series, input string series
        :param dataset: str, "train" for training set, "test" for val/dev/test set.
                        Accepted for interface compatibility; not used by this method.
        :return: pd.Series, cleaned string series; strings that end up empty are
                 replaced by the placeholder "this is an empty message"
        """
        string_series = string_series.str.lower()
        if self.remove_punct:
            string_series = self.__remove_punct(string_series=string_series)
        if self.remove_digits:
            string_series = self.__remove_digits(string_series=string_series)
        if self.remove_stop_words:
            string_series = self.__remove_stop_words(string_series=string_series)

        string_series = string_series.str.strip()
        # Substitute a placeholder so downstream consumers never get an empty string.
        return string_series.replace(to_replace="", value="this is an empty message")

In [3]:
def get_tokenizer_model(model_path='dbpedia_classifier_hf_distilbert.h5',
                        model_checkpoint="distilbert-base-uncased",
                        embedding_layer='average'):
    """
    Loads the saved Keras model and its tokenizer, and truncates the model so
    that it outputs the activations of ``embedding_layer``.

    :param model_path: str, path of the saved Keras model file
    :param model_checkpoint: str, Hugging Face checkpoint name used to load the tokenizer
    :param embedding_layer: str, name of the layer whose output becomes the new model output
    :return: tuple, (tokenizer, truncated Keras model)
    """
    # TFDistilBertModel is a custom layer from Keras' point of view, so it has
    # to be supplied explicitly when deserialising the saved model.
    classifier = tf.keras.models.load_model(
        model_path,
        custom_objects={"TFDistilBertModel": transformers.TFDistilBertModel})
    tokenizer = DistilBertTokenizerFast.from_pretrained(model_checkpoint)
    # Drop everything after the chosen layer so predict() returns embeddings.
    embedding_output = classifier.get_layer(embedding_layer).output
    embedding_model = Model(inputs=classifier.input, outputs=embedding_output)
    return tokenizer, embedding_model

In [4]:
def get_sentence_embeddings(txt, tokenizer, model):
    """
    Computes the embedding of a single text with the given tokenizer and model.

    The text is first cleaned with the module-level ``text_preprocessor``, so
    that object must exist before this function is called.

    :param txt: input text; it is wrapped in a pd.Series and only the element
                at index 0 of the preprocessed series is used
    :param tokenizer: tokenizer called with max_length=200, padding and truncation
    :param model: object exposing ``predict``; its first output row is returned
    :return: first element of ``model.predict`` for the encoded text
    """
    cleaned = text_preprocessor.preprocess(pd.Series(txt))[0]
    # Encode to fixed-length (200) TensorFlow tensors.
    encoded = tokenizer([cleaned],
                        max_length=200, padding="max_length",
                        truncation=True, return_tensors="tf")
    # NOTE(review): the dataset is built by slicing and is not batched here;
    # confirm model.predict receives the intended input shape.
    dataset = tf.data.Dataset.from_tensor_slices(dict(encoded))
    predictions = model.predict(dataset, verbose=0)
    return predictions[0]

In [5]:
def get_similarity_score(txt1, txt2, tokenizer, model):
    """
    Embeds two texts and reports the cosine similarity of their embeddings.

    Prints both original texts together with the score, then returns the score.

    :param txt1: first text
    :param txt2: second text
    :param tokenizer: tokenizer forwarded to get_sentence_embeddings
    :param model: model forwarded to get_sentence_embeddings
    :return: cosine similarity of the two embeddings
    """
    first_vec = get_sentence_embeddings(txt1, tokenizer, model)
    second_vec = get_sentence_embeddings(txt2, tokenizer, model)

    # cosine = dot product divided by the product of the Euclidean norms
    dot_product = np.dot(first_vec, second_vec)
    norm_product = norm(first_vec) * norm(second_vec)
    cos_sim = dot_product / norm_product

    print(f"Text 1:\n{txt1}\n\nText 2:\n{txt2}\n\nCosine similarity score: {cos_sim}")
    return cos_sim

In [6]:
# Module-level preprocessor with default settings; get_sentence_embeddings
# reads this global by name.
text_preprocessor = TextPreprocessor()
# Load the tokenizer and the truncated embedding model once and reuse them in
# every comparison below.
tokenizer, model = get_tokenizer_model()

In [7]:
# Print the layer structure of the truncated embedding model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 200)]                0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, 200)]                0         []                            
 )                                                                                                
                                                                                                  
 tf_distil_bert_model (TFDi  TFBaseModelOutput(last_hid   6636288   ['input_ids[0][0]',           
 stilBertModel)              den_state=(None, 200, 768)   0          'attention_mask[0][0]']      
                             , hidden_states=((None, 20                                       

In [8]:
# Demo: compare two technology-company descriptions (Microsoft vs. Google).
txt1 = '''
Microsoft Corporation is an American multinational technology corporation headquartered in Redmond, Washington. Microsoft's best-known software products are the Windows line of operating systems, the Microsoft 365 suite of productivity applications, and the Edge web browser. Its flagship hardware products are the Xbox video game consoles and the Microsoft Surface lineup of touchscreen personal computers. Microsoft ranked No. 14 in the 2022 Fortune 500 rankings of the largest United States corporations by total revenue;[2] it was the world's largest software maker by revenue as of 2022. It is considered one of the Big Five American information technology companies, alongside Alphabet (parent company of Google), Amazon, Apple, and Meta Platforms.'''

txt2 = '''
Google LLC (/ˈɡuːɡəl/ ⓘ) is an American multinational technology company focusing on artificial intelligence,[9] online advertising, search engine technology, cloud computing, computer software, quantum computing, e-commerce, and consumer electronics. It has been referred to as "the most powerful company in the world"[10] and as one of the world's most valuable brands due to its market dominance, data collection, and technological advantages in the field of artificial intelligence.[11][12][13] Alongside Amazon, Apple Inc., Meta Platforms, and Microsoft, Google's parent company Alphabet Inc. is one of the five Big Tech companies.
'''

# The return value is discarded: get_similarity_score already prints the score.
_ = get_similarity_score(txt1, txt2, tokenizer, model)

Text 1:

Microsoft Corporation is an American multinational technology corporation headquartered in Redmond, Washington. Microsoft's best-known software products are the Windows line of operating systems, the Microsoft 365 suite of productivity applications, and the Edge web browser. Its flagship hardware products are the Xbox video game consoles and the Microsoft Surface lineup of touchscreen personal computers. Microsoft ranked No. 14 in the 2022 Fortune 500 rankings of the largest United States corporations by total revenue;[2] it was the world's largest software maker by revenue as of 2022. It is considered one of the Big Five American information technology companies, alongside Alphabet (parent company of Google), Amazon, Apple, and Meta Platforms.

Text 2:

Google LLC (/ˈɡuːɡəl/ ⓘ) is an American multinational technology company focusing on artificial intelligence,[9] online advertising, search engine technology, cloud computing, computer software, quantum computing, e-commerce, 

In [9]:
# Demo: compare a company description with a politician biography
# (Microsoft vs. Joe Biden).
txt1 = '''
Microsoft Corporation is an American multinational technology corporation headquartered in Redmond, Washington. Microsoft's best-known software products are the Windows line of operating systems, the Microsoft 365 suite of productivity applications, and the Edge web browser. Its flagship hardware products are the Xbox video game consoles and the Microsoft Surface lineup of touchscreen personal computers. Microsoft ranked No. 14 in the 2022 Fortune 500 rankings of the largest United States corporations by total revenue;[2] it was the world's largest software maker by revenue as of 2022. It is considered one of the Big Five American information technology companies, alongside Alphabet (parent company of Google), Amazon, Apple, and Meta Platforms.'''

txt2 = '''
Joseph Robinette Biden Jr. (/ˈbaɪdən/ ⓘ BY-dən; born November 20, 1942) is an American politician who is the 46th and current president of the United States. Ideologically a moderate member of the Democratic Party, he previously served as the 47th vice president from 2009 to 2017 under President Barack Obama and represented Delaware in the United States Senate from 1973 to 2009.
'''

# The return value is discarded: get_similarity_score already prints the score.
_ = get_similarity_score(txt1, txt2, tokenizer, model)

Text 1:

Microsoft Corporation is an American multinational technology corporation headquartered in Redmond, Washington. Microsoft's best-known software products are the Windows line of operating systems, the Microsoft 365 suite of productivity applications, and the Edge web browser. Its flagship hardware products are the Xbox video game consoles and the Microsoft Surface lineup of touchscreen personal computers. Microsoft ranked No. 14 in the 2022 Fortune 500 rankings of the largest United States corporations by total revenue;[2] it was the world's largest software maker by revenue as of 2022. It is considered one of the Big Five American information technology companies, alongside Alphabet (parent company of Google), Amazon, Apple, and Meta Platforms.

Text 2:

Joseph Robinette Biden Jr. (/ˈbaɪdən/ ⓘ BY-dən; born November 20, 1942) is an American politician who is the 46th and current president of the United States. Ideologically a moderate member of the Democratic Party, he previous

In [10]:
# Demo: compare two politician biographies (Vladimir Putin vs. Joe Biden).
txt1 = '''
Vladimir Vladimirovich Putin[c] (born 7 October 1952) is a Russian politician and former intelligence officer who has been President of Russia since 2012. Putin has held continuous positions as president or prime minister since 1999:[d] as prime minister from 1999 to 2000 and from 2008 to 2012, and as president from 2000 to 2008 and since 2012.[e][7]'''

txt2 = '''
Joseph Robinette Biden Jr. (/ˈbaɪdən/ ⓘ BY-dən; born November 20, 1942) is an American politician who is the 46th and current president of the United States. Ideologically a moderate member of the Democratic Party, he previously served as the 47th vice president from 2009 to 2017 under President Barack Obama and represented Delaware in the United States Senate from 1973 to 2009.
'''

# The return value is discarded: get_similarity_score already prints the score.
_ = get_similarity_score(txt1, txt2, tokenizer, model)

Text 1:

Vladimir Vladimirovich Putin[c] (born 7 October 1952) is a Russian politician and former intelligence officer who has been President of Russia since 2012. Putin has held continuous positions as president or prime minister since 1999:[d] as prime minister from 1999 to 2000 and from 2008 to 2012, and as president from 2000 to 2008 and since 2012.[e][7]

Text 2:

Joseph Robinette Biden Jr. (/ˈbaɪdən/ ⓘ BY-dən; born November 20, 1942) is an American politician who is the 46th and current president of the United States. Ideologically a moderate member of the Democratic Party, he previously served as the 47th vice president from 2009 to 2017 under President Barack Obama and represented Delaware in the United States Senate from 1973 to 2009.


Cosine similarity score: 0.9945241212844849


In [11]:
# Demo: compare a broadcaster description with an IT-services company
# description (BBC vs. Cognizant).
txt1 = '''
The British Broadcasting Corporation (BBC) is a British public service broadcaster headquartered at the Broadcasting House in London, originally established in 1922 as the British Broadcasting Company and evolved into its current state with its current name on New Year's Day 1927. The oldest and largest local and global broadcaster by stature and by number of employees, it employs over 21,000 staff in total, of whom approximately 17,900 are in public-sector broadcasting.[1][2][3][4][5]'''

txt2 = '''
Cognizant is an American multinational information technology services and consulting company. It is headquartered in Teaneck, New Jersey, U.S. Cognizant is part of the NASDAQ-100 and trades under CTSH. It was founded as an in-house technology unit of Dun & Bradstreet in 1994,[6] and started serving external clients in 1996.[6]

After a series of corporate re-organizations, there was an initial public offering in 1998.[7]

Cognizant had a period of fast growth during the 2000s and became a Fortune 500 company in 2011; as of 2021, it is ranked 185.[8]
'''

# The return value is discarded: get_similarity_score already prints the score.
_ = get_similarity_score(txt1, txt2, tokenizer, model)

Text 1:

The British Broadcasting Corporation (BBC) is a British public service broadcaster headquartered at the Broadcasting House in London, originally established in 1922 as the British Broadcasting Company and evolved into its current state with its current name on New Year's Day 1927. The oldest and largest local and global broadcaster by stature and by number of employees, it employs over 21,000 staff in total, of whom approximately 17,900 are in public-sector broadcasting.[1][2][3][4][5]

Text 2:

Cognizant is an American multinational information technology services and consulting company. It is headquartered in Teaneck, New Jersey, U.S. Cognizant is part of the NASDAQ-100 and trades under CTSH. It was founded as an in-house technology unit of Dun & Bradstreet in 1994,[6] and started serving external clients in 1996.[6]

After a series of corporate re-organizations, there was an initial public offering in 1998.[7]

Cognizant had a period of fast growth during the 2000s and beca

In [12]:
# Demo: compare two news-broadcaster descriptions (BBC vs. CNN).
txt1 = '''
The British Broadcasting Corporation (BBC) is a British public service broadcaster headquartered at the Broadcasting House in London, originally established in 1922 as the British Broadcasting Company and evolved into its current state with its current name on New Year's Day 1927. The oldest and largest local and global broadcaster by stature and by number of employees, it employs over 21,000 staff in total, of whom approximately 17,900 are in public-sector broadcasting.[1][2][3][4][5]'''

txt2 = '''
The Cable News Network (CNN) is a multinational news channel and website headquartered in Atlanta, Georgia, U.S.[2][3][4] Founded in 1980 by American media proprietor Ted Turner and Reese Schonfeld as a 24-hour cable news channel, and presently owned by the Manhattan-based media conglomerate Warner Bros. Discovery (WBD),[5] CNN was the first television channel to provide 24-hour news coverage and the first all-news television channel in the United States.[6][7][8][9][10]'''

# The return value is discarded: get_similarity_score already prints the score.
_ = get_similarity_score(txt1, txt2, tokenizer, model)

Text 1:

The British Broadcasting Corporation (BBC) is a British public service broadcaster headquartered at the Broadcasting House in London, originally established in 1922 as the British Broadcasting Company and evolved into its current state with its current name on New Year's Day 1927. The oldest and largest local and global broadcaster by stature and by number of employees, it employs over 21,000 staff in total, of whom approximately 17,900 are in public-sector broadcasting.[1][2][3][4][5]

Text 2:

The Cable News Network (CNN) is a multinational news channel and website headquartered in Atlanta, Georgia, U.S.[2][3][4] Founded in 1980 by American media proprietor Ted Turner and Reese Schonfeld as a 24-hour cable news channel, and presently owned by the Manhattan-based media conglomerate Warner Bros. Discovery (WBD),[5] CNN was the first television channel to provide 24-hour news coverage and the first all-news television channel in the United States.[6][7][8][9][10]

Cosine simila

In [13]:
# Demo: compare an actor biography with a business-magnate biography
# (Morgan Freeman vs. Bill Gates).
txt1 = '''
Morgan Freeman[2] (born June 1, 1937) is an American actor and producer. He is known for his distinctive deep voice and various roles in a wide variety of film genres. Throughout his career spanning over five decades, he has received numerous accolades, including an Academy Award, a Screen Actors Guild Award, and a Golden Globe Award. He is the recipient of the Kennedy Center Honor in 2008, the AFI Life Achievement Award in 2011, the Cecil B. DeMille Award in 2012, and the Screen Actors Guild Life Achievement Award in 2018.
'''

txt2 = '''
William Henry Gates III (born October 28, 1955) is an American business magnate, investor, philanthropist, and writer best known for co-founding the software giant Microsoft, along with his childhood friend Paul Allen.[2][3] During his career at Microsoft, Gates held the positions of chairman, chief executive officer (CEO), president, and chief software architect, while also being its largest individual shareholder until May 2014.[4] He was a major entrepreneur of the microcomputer revolution of the 1970s and 1980s.
'''

# The return value is discarded: get_similarity_score already prints the score.
_ = get_similarity_score(txt1, txt2, tokenizer, model)

Text 1:

Morgan Freeman[2] (born June 1, 1937) is an American actor and producer. He is known for his distinctive deep voice and various roles in a wide variety of film genres. Throughout his career spanning over five decades, he has received numerous accolades, including an Academy Award, a Screen Actors Guild Award, and a Golden Globe Award. He is the recipient of the Kennedy Center Honor in 2008, the AFI Life Achievement Award in 2011, the Cecil B. DeMille Award in 2012, and the Screen Actors Guild Life Achievement Award in 2018.


Text 2:

William Henry Gates III (born October 28, 1955) is an American business magnate, investor, philanthropist, and writer best known for co-founding the software giant Microsoft, along with his childhood friend Paul Allen.[2][3] During his career at Microsoft, Gates held the positions of chairman, chief executive officer (CEO), president, and chief software architect, while also being its largest individual shareholder until May 2014.[4] He was a maj

In [14]:
# Demo: compare two Indian film-actor biographies
# (Salman Khan vs. Shah Rukh Khan).
txt1 = '''
Abdul Rashid Salim Salman Khan (pronounced [səlˈmɑːn xɑːn]; born 27 December 1965)[3] is an Indian actor, film producer, and television personality who works predominantly in Hindi films. In a career spanning over three decades, Khan has received numerous awards, including two National Film Awards as a film producer, and two Filmfare Awards as an actor.[4] He is cited in the media as one of the most commercially successful actors of Indian cinema.[5][6] Forbes has included Khan in listings of the highest-paid celebrities in the world, in 2015 and 2018, with him being the highest-ranked Indian in the latter year.[7][8][9][10]
'''

txt2 = '''
Shah Rukh Khan (pronounced [ˈʃɑːɦɾʊx xɑːn]; born 2 November 1965), also known by the initialism SRK, is an Indian actor and film producer who works in Hindi films. Referred to in the media as the "Baadshah of Bollywood" and "King Khan",[a] he has appeared in more than 90 films, and earned numerous accolades, including 14 Filmfare Awards. He has been awarded the Padma Shri by the Government of India, as well as the Ordre des Arts et des Lettres and Legion of Honour by the Government of France. Khan has a significant following in Asia and the Indian diaspora worldwide. In terms of audience size and income, several media outlets have described him as one of the most successful film stars in the world.[b] Many of his films thematise Indian national identity and connections with diaspora communities, or gender, racial, social and religious differences and grievances.
'''

# The return value is discarded: get_similarity_score already prints the score.
_ = get_similarity_score(txt1, txt2, tokenizer, model)

Text 1:

Abdul Rashid Salim Salman Khan (pronounced [səlˈmɑːn xɑːn]; born 27 December 1965)[3] is an Indian actor, film producer, and television personality who works predominantly in Hindi films. In a career spanning over three decades, Khan has received numerous awards, including two National Film Awards as a film producer, and two Filmfare Awards as an actor.[4] He is cited in the media as one of the most commercially successful actors of Indian cinema.[5][6] Forbes has included Khan in listings of the highest-paid celebrities in the world, in 2015 and 2018, with him being the highest-ranked Indian in the latter year.[7][8][9][10]


Text 2:

Shah Rukh Khan (pronounced [ˈʃɑːɦɾʊx xɑːn]; born 2 November 1965), also known by the initialism SRK, is an Indian actor and film producer who works in Hindi films. Referred to in the media as the "Baadshah of Bollywood" and "King Khan",[a] he has appeared in more than 90 films, and earned numerous accolades, including 14 Filmfare Awards. He has 

In [15]:
# Demo: compare two technology-founder biographies
# (Mark Zuckerberg vs. Bill Gates).
txt1 = '''
Mark Elliot Zuckerberg (/ˈzʌkərbɜːrɡ/; born May 14, 1984) is an American business magnate, computer programmer, internet entrepreneur, and philanthropist. He co-founded the social media service Facebook and its parent company Meta Platforms (formerly Facebook, Inc.), of which he is executive chairman, chief executive officer and controlling shareholder.[1][2]

Zuckerberg attended Harvard University, where he launched Facebook in February 2004 with his roommates Eduardo Saverin, Andrew McCollum, Dustin Moskovitz and Chris Hughes. Originally launched in only select college campuses, the site expanded rapidly and eventually beyond colleges, reaching one billion users in 2012. Zuckerberg took the company public in May 2012 with majority shares. In 2007, at age 23, he became the world's youngest self-made billionaire. He has since used his funds to organize multiple philanthropic endeavors, including the establishment of the Chan Zuckerberg Initiative.
'''

txt2 = '''
William Henry Gates III (born October 28, 1955) is an American business magnate, investor, philanthropist, and writer best known for co-founding the software giant Microsoft, along with his childhood friend Paul Allen.[2][3] During his career at Microsoft, Gates held the positions of chairman, chief executive officer (CEO), president, and chief software architect, while also being its largest individual shareholder until May 2014.[4] He was a major entrepreneur of the microcomputer revolution of the 1970s and 1980s.
'''

# The return value is discarded: get_similarity_score already prints the score.
_ = get_similarity_score(txt1, txt2, tokenizer, model)

Text 1:

Mark Elliot Zuckerberg (/ˈzʌkərbɜːrɡ/; born May 14, 1984) is an American business magnate, computer programmer, internet entrepreneur, and philanthropist. He co-founded the social media service Facebook and its parent company Meta Platforms (formerly Facebook, Inc.), of which he is executive chairman, chief executive officer and controlling shareholder.[1][2]

Zuckerberg attended Harvard University, where he launched Facebook in February 2004 with his roommates Eduardo Saverin, Andrew McCollum, Dustin Moskovitz and Chris Hughes. Originally launched in only select college campuses, the site expanded rapidly and eventually beyond colleges, reaching one billion users in 2012. Zuckerberg took the company public in May 2012 with majority shares. In 2007, at age 23, he became the world's youngest self-made billionaire. He has since used his funds to organize multiple philanthropic endeavors, including the establishment of the Chan Zuckerberg Initiative.


Text 2:

William Henry Gate