# **Notes:**

The algorithm should be optimized in terms of both speed and performance. One way to increase its speed is to set limits and restrictions on the URLs to be searched. To improve performance, on the other hand, the embedding model can be changed, the parameters of the dimensionality reduction and clustering algorithms can be adjusted, and so on.

Additionally, there are different ways of extracting the topics of a text, such as KeyBERT, c-TF-IDF and BERTopic. I wanted to use BERT transformers; however, this was a computationally heavy process and I could not do any fine-tuning, so I did not go this way. For the preprocessing of the data, there are many more things that can be done to process and clean the data, such as lemmatization, yet I could not find a well-performing lemmatizer. I tried 'Zeyrek', but, as I said, it did not work well.

Finally, the code can be made available for use from the command line using the 'argparse' library.

In [2]:
import selenium
from selenium import webdriver
from warnings import filterwarnings
import numpy as np
from nltk.corpus import stopwords
import nltk
import requests
from bs4 import BeautifulSoup
import torch
from transformers import AutoModel, AutoTokenizer
from scipy.spatial.distance import cosine
from operator import itemgetter
from flair.embeddings import TransformerDocumentEmbeddings
from sentence_transformers import SentenceTransformer
from bs4 import BeautifulSoup
import json
import numpy as np
import requests
from requests.models import MissingSchema
import spacy
import trafilatura
import psycopg2 as pg2
import gensim

# Silence library warnings so that the cell outputs stay readable.
filterwarnings('ignore')

# Download the two NLTK resources requested by this notebook.
for _resource in ('stopwords', 'punkt'):
    nltk.download(_resource)

# NLTK's Turkish stop-word list with 'bir' added explicitly.
# NOTE: this rebinds the name `stopwords` that was imported from nltk.corpus.
stopwords = set(nltk.corpus.stopwords.words('turkish')) | {'bir'}

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hasan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hasan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
# The tokenizer and the encoder are loaded from the same checkpoint.
_checkpoint = "dbmdz/bert-base-turkish-128k-uncased"

# NOTE(review): `truncate=True` is forwarded as given -- confirm that the
# tokenizer actually honours a keyword with this name.
tokenizer = AutoTokenizer.from_pretrained(
    _checkpoint,
    do_lower_case=True,
    truncate=True,
)

# output_hidden_states=True is required by get_sentence_embeddings, which
# reads the per-layer hidden states from the model output.
model = AutoModel.from_pretrained(_checkpoint, output_hidden_states=True)

Some weights of the model checkpoint at dbmdz/bert-base-turkish-128k-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# The first 100_000 sentences of the TsCorpora; the sample paragraphs used to
# test the algorithm are taken from it.
data_path = 'C:/Users/Hasan/Desktop/first_100_000.txt'

# NOTE(review): no explicit encoding is passed, so the platform default is
# used -- confirm that it matches the file.
with open(data_path, 'r') as f:
    # One entry per line of the file; the first entry is discarded.
    data = f.read().split('\n')[1:]

In [7]:
# str.lower() does not follow Turkish casing: 'I' must become 'ı', and 'İ'
# must become a plain 'i' (str.lower() yields 'i' + U+0307, a combining mark
# that str.isalpha() rejects, so the whole word used to be thrown away).
_TURKISH_LOWER_TABLE = str.maketrans('Iİ', 'ıi')


def preprocess_text(data):

    """
    Function to process text data.

    Every sentence is split on single spaces, each word is lower-cased using
    Turkish casing rules, and words that are stop words or that contain any
    non-alphabetic character (digits, punctuation, apostrophes) are dropped.

    args:
        data: text data to be processed as a list of sentences of type string.

    returns:
        list of cleaned sentences (strings), one per input sentence and in the
        same order; a sentence with no surviving word becomes ''.
    """

    cleaned = []
    for sentence in data:
        lowered = (
            word.translate(_TURKISH_LOWER_TABLE).lower()
            for word in sentence.split(' ')
        )
        # Stop-word and alphabetic filtering in a single pass over the words.
        cleaned.append(
            ' '.join(w for w in lowered if w not in stopwords and w.isalpha())
        )

    return cleaned
    

def get_sentence_embeddings(processed_data, max_length=512):

    """
    Function to get sentence embeddings to be used to calculate cosine similarity.

    All sentences are joined into one sequence, wrapped in [CLS] ... [SEP],
    run through the module-level `model` with the module-level `tokenizer`,
    and the token vectors of the second-to-last hidden layer are averaged.

    args:
        processed_data: the output from the preprocess_text.
        max_length: maximum number of tokens fed to the model, counting the
            two special tokens.

    returns:
        1-D torch tensor: the mean over all tokens of the second-to-last
        hidden layer.
    """

    # Truncate the content first and add the special tokens afterwards, so an
    # over-long input keeps its closing [SEP] instead of having it sliced off.
    content_tokens = tokenizer.tokenize(' '.join(processed_data))[:max_length - 2]
    tokenized_text = ['[CLS]'] + content_tokens + ['[SEP]']
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    tokens_tensor = torch.tensor([indexed_tokens])
    # There is no padding, so every position is attended to. The same all-ones
    # tensor used to be passed positionally (under a "segment ids" name) into
    # the model's second parameter; the keyword makes the intent explicit.
    attention_mask = torch.ones_like(tokens_tensor)

    model.eval()

    with torch.no_grad():
        outputs = model(tokens_tensor, attention_mask=attention_mask)
        # Index 2 holds the per-layer hidden states (the model is loaded with
        # output_hidden_states=True).
        hidden_states = outputs[2]

    # Second-to-last layer, first (and only) sequence of the batch.
    token_vecs = hidden_states[-2][0]

    sentence_embedding = torch.mean(token_vecs, dim=0)
    return sentence_embedding

def Extract(lst):

    """
    Function to get the first element of every entry of a nested list.

    args:
        lst: nested list (each entry must support indexing with 0).

    returns:
        list holding entry[0] for every entry of lst, in the same order.
    """

    return [entry[0] for entry in lst]

In [4]:
# taking a sample from the corpora: 17 consecutive entries of `data`,
# starting at index 25050
ix = 25050
inp = data[ix:ix+17]
# bare expression so that the notebook displays the sample as the cell output
inp

['Kronik Ürtiker’in çoğunluğunun tanımlaması zordur',
 '—Belli Paraziter hastalıklar , —Tansiyon ilaçları veya stres tetikleyicidir —Kronik Hastalıklar ( idrar yolu enfeksiyonları , sinüzit , akciğer hastalıkları ) Kronik ürtiker ; derinin ısıya maruz kalması , soğuk , güneş ışığı , titreme veya basınç gibi fiziksel faktörler tarafından da tetiklenebilir',
 'Bazı kişiler o kadar hassas cilde sahiptir ki herhangi bir ovalama dahi dermografizm diye adlandırılan ve deriden kabarık kırmızı çizgilere neden olan kabartılara sebep olabilir',
 'Buda bir çeşit ürtikeryal oluşumdur bağışıklık sistemini tutan hastalıklar Ürtiker’i tetikleyebilir',
 'Belirtileri Nelerdir ? Döküntüye bağlı bir sayıda kırmızı çevrilmiş cilt lezyonları',
 'Bu vücuduna yayılmaya meyillidir Kronik olgularda kaşıntılı uykusuz gecelere neden olur bu durum depresyona bile neden olabilir',
 'Hastalığın bulguları arasında kaşıntılı , deriden kabarık , kızarık 0,5cm ila çok büyük ölçülerde deride plaklar bulunur',
 'Bu plakl

In [8]:
# preprocessing input: clean every sentence of the sample with preprocess_text
processed_input = preprocess_text(inp)
# bare expression so that the notebook displays the cleaned sentences
processed_input

['kronik çoğunluğunun tanımlaması zordur',
 'paraziter hastalıklar ilaçları stres tetikleyicidir hastalıklar idrar yolu enfeksiyonları sinüzit akciğer hastalıkları kronik ürtiker derinin ısıya maruz kalması soğuk güneş ışığı titreme basınç fiziksel faktörler tarafından tetiklenebilir',
 'kişiler kadar hassas cilde sahiptir herhangi ovalama dahi dermografizm adlandırılan deriden kabarık kırmızı çizgilere olan kabartılara sebep olabilir',
 'buda çeşit ürtikeryal oluşumdur bağışıklık sistemini tutan hastalıklar tetikleyebilir',
 'belirtileri nelerdir döküntüye bağlı sayıda kırmızı çevrilmiş cilt lezyonları',
 'vücuduna yayılmaya meyillidir kronik olgularda kaşıntılı uykusuz gecelere olur durum depresyona bile olabilir',
 'hastalığın bulguları arasında kaşıntılı deriden kabarık kızarık ila büyük ölçülerde deride plaklar bulunur',
 'plakların bazıları birleşme eğilimindedir',
 'plakların sınırlarını net olarak çizmek zaman mümkün olmaz lezyonlar genellikle saat içerisinde solar yerine başka

In [11]:
# getting the embedding of the input: one vector for the whole cleaned sample,
# later compared against the embedding of every downloaded page
inp_embed = get_sentence_embeddings(processed_input)
# bare expression so that the notebook displays the shape of the embedding
inp_embed.shape

torch.Size([768])

In [14]:
# extracting keywords from the text to make a google search

# Flatten the cleaned sentences into a single token list: the whole input is
# handled as one document.
processed_text = ' '.join(processed_input).split()
dictionary = gensim.corpora.Dictionary([processed_text])
bow_corpus = [dictionary.doc2bow(processed_text)]

lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=3,
    id2word=dictionary,
    passes=10,
    workers=2,
)

# The search query is built from the first topic returned by top_topics():
# the second field of each of its entries, joined with spaces.
pred = ' '.join(entry[1] for entry in lda_model.top_topics(bow_corpus)[0][0])
pred

'hastalarda hastalıklar ürtiker kronik olarak ancak plakların olabilir kabarık hastalığın sedatif reseptör çıkabilir kaşıntılı dahi blokerleri tekrar hastalık deriden stres'

In [53]:
# making a google search and extracting urls from the result

urls = []
url = 'https://www.google.com'

options = webdriver.ChromeOptions()
options.add_argument('headless')
options.add_argument('window-size=1920x1080')
options.add_argument("disable-gpu")

driver = webdriver.Chrome('chromedriver', chrome_options=options)

# Whatever goes wrong while driving the page (element not found, navigation
# error, ...), the browser must still be shut down; otherwise the headless
# Chrome process is left running.
try:
    driver.get(url)
    # NOTE(review): both XPaths are absolute, so they depend on the exact
    # layout of the search page.
    search = driver.find_element_by_xpath('/html/body/div[1]/div[3]/form/div[1]/div[1]/div[1]/div/div[2]/input')
    search.send_keys(pred)
    btn = driver.find_element_by_xpath('/html/body/div[1]/div[3]/form/div[1]/div[1]/div[3]/center/input[1]')
    btn.click()
    links = driver.find_elements_by_css_selector('.yuRUbf a')
    # Anchors without an href are skipped: there is nothing to download.
    urls.extend(
        href for href in (link.get_attribute('href') for link in links) if href
    )
finally:
    driver.quit()

In [54]:
def bs_extract_text_fallback(response_content):

    """
    Secondary extractor, used when trafilatura yields no text for a url.

    The content is parsed with BeautifulSoup and every text node whose direct
    parent tag is not blacklisted is kept.

    args:
        response_content: content of the url to be parsed.

    returns:
        the kept text nodes, each followed by one space, with tab characters
        removed and leading/trailing whitespace stripped.
    """

    # Parent tags whose text is not page content.
    skipped_parents = (
        '[document]',
        'noscript',
        'header',
        'html',
        'meta',
        'head',
        'input',
        'script',
        'style',
    )

    # NOTE(review): the parser is told the content is utf-8 -- confirm this
    # holds for the pages being fetched.
    soup = BeautifulSoup(response_content, 'html.parser', from_encoding='utf-8')

    kept_nodes = (
        node
        for node in soup.find_all(text=True)
        if node.parent.name not in skipped_parents
    )
    joined = ''.join('{} '.format(node) for node in kept_nodes)

    return joined.replace('\t', '').strip()



def extract_text_from_single_web_page(url, timeout=10):

    """
    Primary function to be used to process the given url.

    The page is first downloaded and parsed with trafilatura. When that gives
    no text, the url is requested directly and the response is parsed with
    bs_extract_text_fallback.

    args:
        url: url to be processed as a string.
        timeout: seconds to wait for the fallback request before giving up.

    returns:
        the extracted text as a string, or np.nan when the page could not be
        retrieved (any request error, or a status code other than 200).
    """

    extracted = None
    downloaded_url = trafilatura.fetch_url(url)

    # Only hand the download to the extractor when there is something to parse.
    if downloaded_url:
        try:
            extracted = trafilatura.extract(downloaded_url,
                                            output_format='json',
                                            with_metadata=False,
                                            include_comments=False,
                                            date_extraction_params={'extensive_search':True,
                                                                    'original_date':True})
        except Exception:
            # Best effort: any extractor failure simply triggers the fallback.
            extracted = None

    if extracted:
        return json.loads(extracted)['text']

    try:
        # NOTE(review): verify=False disables TLS certificate verification;
        # kept as-is because pages with broken certificates are still wanted.
        resp = requests.get(url, verify=False, timeout=timeout)
    except requests.exceptions.RequestException:
        # Covers connection errors, timeouts and malformed urls alike, so
        # every failure is reported with the same np.nan marker.
        return np.nan

    if resp.status_code != 200:
        return np.nan

    return bs_extract_text_fallback(resp.content)
            
# extracting text from multiple web pages: one result per url, in the same order
text_content = list(map(extract_text_from_single_web_page, urls))

ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree for URL None
ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://zgmy.rpmcustomshop.pl/%C3%BCrtiker-anjio%C3%B6dem-bitkisel-tedavisi.html
ERROR:trafilatura.downloads:not a 200 response: 403 for URL https://istanbulalerjimerkezi.com.tr/yetiskinlerde-urtiker-kurdesen-nedir/
ERROR:trafilatura.utils:parsed tree length: 0, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree for URL None


In [56]:
from math import isnan, nan

# cleaning up nan contents
# nan never compares equal to anything, so the np.nan markers are matched by
# identity.
text_content = [content for content in text_content if content is not np.nan]


# getting cosine similarities of the results found and storing the most relevant ones in the ret list with their similarity probabilities.
# An entry is stored only when it beats the best similarity seen so far, so
# the last entry of `ret` is always the most similar one.
ret = []
prob = 0
for resp in text_content:
    # Skip empty results.
    if not resp:
        continue

    # Drop blank lines and treat the page as one long sentence.
    processed_out = preprocess_text(
        [' '.join(line for line in resp.split('\n') if line.strip())]
    )
    out_embed = get_sentence_embeddings(processed_out)
    similarity = 1 - cosine(out_embed, inp_embed)

    # Pages whose text contains 'http' are never stored.
    if similarity > prob and 'http' not in resp:
        prob = similarity
        ret.append((resp, similarity))

Token indices sequence length is longer than the specified maximum sequence length for this model (625 > 512). Running this sequence through the model will result in indexing errors


In [57]:
# getting the most relevant text's probability
# (`ret` only grows when a higher similarity is found, so its last entry is
# the best one; this raises IndexError when no page was stored)
ret[-1][1]

0.9500774145126343

In [58]:
# printing the most relevant content
# (text of the last, i.e. most similar, entry of `ret`)
print(ret[-1][0])

'Yaşam kalitesini oldukça etkileyen, sık görülen alerjik otoimmün hastalıklar arasında yer alır. Ciltte kabarıklık, kızarıklık ve kaşıntıya sebep olan bu hastalığın tanısında hastanın hikayesi önemlidir. Ürtikerin hangi şartlarda ortaya çıktığı (stres, besinler, ilaçlar, enfeksiyonlar ve fiziksel nedenler) iyice tanımlanmalıdır.\nÜrtiker, bazı hücrelerimizden salınan Histamin’in etkisiyle cilt üzerindeki kızarıklıklara neden olur. Daha derin dokularda gerçekleşen ürtiker, anjioödem olarak tanımlanır. Özellikle dudaklarda, göz kapaklarında, ellerde ve ayakta şiddetli şişlikler olarak görülürler. Ağız, dil ve sonrasında solunum yollarını etkileyebilen şişlikler nefes alınmasını güçleştirebilir. Anjioödem ciddi ve acil müdahele edilmesi gereken bir hastalıktır.\nÜrtiker ve anjioödemin tedavisi çoğunlukla hastalığın seyrine bağlı olarak planlanır. Erken dönem tedavide antihistaminik ilaçlar başarılıdır. Ancak daha komplike hastalarda çoklu ilaç tedavileri gerekmektedir.\nÜrtiker nedir?\nHa

In [59]:
# connecting to the database and storing outputs

conn = pg2.connect(database='chatbot', user='postgres', password='')
try:
    cur = conn.cursor()
    query = 'INSERT INTO history(user_input, output, main_topic,group_id) VALUES(%s,%s,%s,%s)'
    # Stored row: the raw input sample, the most similar page text (the last
    # entry of `ret`), the search keywords, and a fixed group id of 1.
    cur.execute(query, (inp, ret[-1][0], pred, 1))
    conn.commit()
finally:
    # Release the connection even when the insert (or ret[-1]) fails.
    conn.close()

In [60]:
# querying the database to extract general topics

import os

# SECURITY: the database password used to be written in plain text in this
# cell. It is now read from the PGPASSWORD environment variable (empty when
# unset); the previously exposed password should be rotated.
conn = pg2.connect(database='chatbot', user='postgres',
                   password=os.environ.get('PGPASSWORD', ''))
try:
    cur = conn.cursor()
    query = 'SELECT * FROM history'
    cur.execute(query)
    raw_db_data = cur.fetchall()
finally:
    # Release the connection even when the query fails.
    conn.close()

In [61]:
# cleaning the output from the query
# The first column of every row is kept, with the characters {, } and "
# removed from it.
db_texts = [
    row[0].replace('{', '').replace('}', '').replace('"', '')
    for row in raw_db_data
]

In [62]:
# preparing the models involved in extracting topics and sub-topics with BerTopic

import umap.umap_ as umap
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer('dbmdz/bert-base-turkish-128k-uncased')

# hyperparameters should be optimized for better performance!
umap_model = umap.UMAP(
    init='spectral',
    n_neighbors=2,
    n_components=3,
    min_dist=0.05,
)

# hyperparameters should be optimized for better performance!
hdbscan_model = HDBSCAN(gen_min_span_tree=True, prediction_data=True)

# Extra tokens to ignore on top of the existing stop words.
# NOTE: this rebinds the global `stopwords` to a list, and the extra words
# are appended again every time the cell is re-run.
stopwords = [
    *stopwords,
    'http', 'https', 'amp', 'com', 'olarak', 'olan', 'olacak', 'oldu', 'olmuş', 'oluyor',
]

# we add this to remove stopwords that can pollute topics
vectorizer_model = CountVectorizer(stop_words=stopwords, ngram_range=(1, 2))

Some weights of the model checkpoint at C:\Users\Hasan/.cache\torch\sentence_transformers\dbmdz_bert-base-turkish-128k-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [63]:
# extracting topics

from bertopic import BERTopic

# NOTE(review): this rebinds the global `model`, which until now held the BERT
# encoder that get_sentence_embeddings reads -- that function should not be
# called again after this cell without reloading the encoder.
model = BERTopic(
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    top_n_words=5,
    language='turkish',
    calculate_probabilities=True,
    verbose=True
)
# Fit on the cleaned database texts; the two returned values are kept as
# `topics` and `probs`.
topics, probs = model.fit_transform(db_texts)

Batches: 100%|██████████| 1/1 [00:08<00:00,  8.84s/it]
2022-09-23 09:41:24,875 - BERTopic - Transformed documents to Embeddings
2022-09-23 09:41:29,931 - BERTopic - Reduced dimensionality
2022-09-23 09:41:29,974 - BERTopic - Clustered reduced embeddings


In [64]:
# NOTE(review): `topics` and `probs` are the two values returned by
# fit_transform -- confirm that visualize_barchart expects them in these
# two positions.
model.visualize_barchart(topics, probs)