In [1]:
import pickle
import re
import string
import time
from typing import List

import krovetz
import nltk
import pyLDAvis.gensim
import requests
from bs4 import BeautifulSoup
from gensim import corpora
from gensim import models
from nltk.corpus import stopwords

  from collections import Sequence
  from collections import Iterable


## Что еще можно сделать:
1) Попробовать NMF
2) Добавить n-граммы
3) Optimize choice for number of topics through coherence measure
4)

# 1. Scraping

В качестве объекта скрейпинга выбран ресурс PubMed с биологическими статьями. Со страницы каждой статьи извлекаются название и аннотация (abstract).

In [2]:
def get_url_from_id(idx) -> str:
    """Build the PubMed article URL for the given article identifier.

    Args:
        idx: Article identifier; it is converted with ``str()``, so both
            integers and strings are accepted.

    Returns:
        A URL of the form ``https://pubmed.ncbi.nlm.nih.gov/<idx>/``.
    """
    return "https://pubmed.ncbi.nlm.nih.gov/" + str(idx) + "/"

In [3]:
def get_text_from_page(idx: str, timeout: float = 10.0):
    """Download one PubMed article page and return its title plus abstract.

    Args:
        idx: Article identifier; passed to ``get_url_from_id``, which converts
            it with ``str()`` (so an int works as well).
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The page title, one space, and the text of the first ``<div>`` whose
        class list contains ``abstract-content``.

    Raises:
        requests.RequestException: On network errors, timeouts or a non-2xx
            HTTP status.
        ValueError: If the page has no ``<title>`` or no abstract block.
    """
    response = requests.get(get_url_from_id(idx), timeout=timeout)
    # Fail early on 404/5xx instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    if soup.title is None:
        raise ValueError(f"No <title> found for article {idx}")
    title = soup.title.text

    # class_ matches when 'abstract-content' is any one of the div's classes.
    abstract = soup.find('div', class_='abstract-content')
    if abstract is None:
        raise ValueError(f"No abstract found for article {idx}")

    return title + ' ' + abstract.text

In [4]:
# PubMed article ID at which scraping starts; the scraping loop counts upward from it.
start_article = 29949996

In [None]:
N_ARTICLES = 300         # number of article texts to collect
MAX_ATTEMPTS = 3000      # hard cap on IDs tried, so a dead connection cannot loop forever
REQUEST_DELAY_SEC = 0.5  # pause between requests to avoid hammering the server

cntr = 0       # successfully scraped articles
attempts = 0   # article IDs tried so far (successes and failures)
articles = []
idx = start_article
while cntr < N_ARTICLES and attempts < MAX_ATTEMPTS:
    attempts += 1
    try:
        print(f"Scrapping {idx}")
        articles.append(get_text_from_page(idx))
        cntr += 1
    except Exception as exc:
        # Best-effort scraping: skip IDs that cannot be fetched or parsed,
        # but keep the reason visible instead of discarding it.
        print(f"Failed: {idx} ({exc!r})")
    # Move on to the next ID whether or not this one succeeded.
    idx += 1
    time.sleep(REQUEST_DELAY_SEC)

In [130]:
# Persist the scraped texts so later stages can reload them without re-scraping.
with open('scrapped_data.pickle', 'wb') as pickle_file:
    pickle.dump(articles, pickle_file)

# 2. Text preprocessing

### Plan:
1. Tokenize
2. Remove punctuation
3. Hybrid stemming
4. Remove stop words

In [13]:
# Read the word list through the fully qualified `nltk.corpus.stopwords` reader.
# This assignment rebinds the notebook-level name `stopwords` to a set, so calling
# `stopwords.words(...)` on that name would raise AttributeError if the cell ran twice.
stopwords = set(nltk.corpus.stopwords.words('english'))

In [2]:
# Work on a shallow copy so the scraped `articles` list itself is left untouched.
corpus = articles.copy()

# Alternative when starting from a fresh kernel: reload the scraped texts from disk.
# with open('scrapped_data.pickle', 'rb') as f:
#     corpus = pickle.load(f)

## 1. Tokenize

In [3]:
# Split every document into a list of tokens (rebinds `corpus` from texts to token lists).
corpus = list(map(nltk.word_tokenize, corpus))

## 2. Remove punctuation tokens

In [4]:
# One string holding ASCII punctuation followed by the quote and ellipsis tokens to drop.
punc = "".join([string.punctuation, "``", "\'\'", "...", "...."])

In [5]:
# Keep only tokens that do not occur inside `punc`.
# `punc` is a single string, so `in` is a substring test here.
corpus = [[token for token in text if token not in punc] for text in corpus]

## Check whether all punctuation symbols were removed

In [6]:
# Sorted list of the distinct tokens found anywhere in the corpus.
words = sorted({token for text in corpus for token in text})

In [7]:
# Show a ten-item slice of the sorted vocabulary to see what kind of tokens remain.
words[20:30]

['-0.005',
 '-0.675',
 '-1-aminopropan-2-ol',
 '-10',
 '-13',
 '-2',
 '-25',
 '-4',
 '-7',
 '-Editorial']

Можно заметить, что осталось много специфических символов, чисел, поскольку они часто встречаются в статьях, но не несут почти никакого смысла. Поэтому просто уберу их.

In [8]:
# Matches every character that is a digit or a non-word character.
# Raw string: '\d' / '\W' in a plain literal are invalid escape sequences and trigger warnings.
_DIGIT_OR_NONWORD_RE = re.compile(r'[\d\W]')


def filter_text_from_punct(txt: List[str]) -> List[str]:
    """Strip digits and non-word characters from each token and drop empty results.

    Args:
        txt: Tokens of one document.

    Returns:
        A new list with the cleaned tokens in their original order; tokens that
        become empty after cleaning are omitted.
    """
    cleaned_tokens = (_DIGIT_OR_NONWORD_RE.sub("", token) for token in txt)
    return [token for token in cleaned_tokens if token]

  word = re.sub('\d', "", wrd)
  word = re.sub('\W', "", word)


In [9]:
# Clean the tokens of every document with filter_text_from_punct.
corpus = list(map(filter_text_from_punct, corpus))

## 3. Stemming

In [10]:
# Stemmer instance used by the stemming step; `krovetz` is imported in the
# notebook's top import cell together with the other dependencies.
ks = krovetz.PyKrovetzStemmer()

In [11]:
# Replace every token of every document with its stem.
corpus = [list(map(ks.stem, text)) for text in corpus]

## 4. Drop stop-words

In [14]:
# Keep only the tokens that are not members of the `stopwords` collection.
corpus = [list(filter(lambda word: word not in stopwords, text)) for text in corpus]

## 5. Drop short words

In [15]:
# Distinct tokens that remain in the corpus at this point (unordered).
words = list({token for text in corpus for token in text})

In [16]:
# Show the 30 shortest tokens of the vocabulary.
sorted(words, key=len)[:30]

['e',
 'p',
 'b',
 'γ',
 'λ',
 'z',
 'β',
 'f',
 'x',
 'r',
 'c',
 '⁶',
 'l',
 'q',
 'κ',
 'j',
 'Φ',
 'Å',
 'n',
 'k',
 'u',
 'h',
 'v',
 'g',
 'et',
 'yi',
 'pq',
 'ah',
 'gm',
 'sv']

Довольно много одиночных букв (которые точно надо убрать) и слов длины два. Часть из слов длины 2 может быть важна, но среди них может быть и мусор, поэтому почищу их все.

In [17]:
def filter_text_from_short(txt: List[str], min_length: int = 3) -> List[str]:
    """Drop tokens that are shorter than ``min_length`` characters.

    Args:
        txt: Tokens of one document.
        min_length: Minimum number of characters a token needs to be kept.
            The default of 3 removes one- and two-character tokens.

    Returns:
        A new list with the remaining tokens in their original order.
    """
    return [token for token in txt if len(token) >= min_length]

In [18]:
# Remove the short tokens from every document with filter_text_from_short.
corpus = list(map(filter_text_from_short, corpus))

## Создаем словарь и векторизуем корпус

In [19]:
# Build the vocabulary (dictionary) from the tokenized documents.
dictionary = corpora.Dictionary(corpus)
# Re-encode every document as a bag-of-words vector (rebinds `corpus`).
corpus = list(map(dictionary.doc2bow, corpus))
# Fit a TF-IDF model on the bag-of-words corpus and apply it to that corpus.
tfidf_model = models.TfidfModel(corpus)
tfidf = tfidf_model[corpus]

## Topic modeling
## 1. LDA

In [20]:
NUM_TOPICS = 5
RANDOM_SEED = 42  # fixed seed so the fitted topics are reproducible between runs

# NOTE(review): the model is fit on TF-IDF weights rather than raw counts;
# confirm that this is intended for LDA.
ldamodel = models.LdaModel(
    tfidf,
    id2word=dictionary,
    num_topics=NUM_TOPICS,
    random_state=RANDOM_SEED,
)

In [21]:
# Fetch the LDA topics, described by 4 words each, and print them one per line.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.001*"cell" + 0.001*"patient" + 0.001*"expression" + 0.001*"disease"')
(1, '0.001*"group" + 0.001*"treatment" + 0.001*"patient" + 0.001*"health"')
(2, '0.001*"cell" + 0.001*"gene" + 0.001*"patient" + 0.001*"health"')
(3, '0.001*"ato" + 0.001*"cell" + 0.001*"patient" + 0.001*"health"')
(4, '0.001*"patient" + 0.001*"aml" + 0.001*"mutation" + 0.001*"care"')


In [22]:
def get_topics(model, tfidf, doc_index: int = 1):
    """Return the id of the highest-scoring topic for one document.

    Args:
        model: Object for which ``model[document]`` yields ``(topic_id, score)``
            pairs.
        tfidf: Indexable collection of document vectors.
        doc_index: Position of the document inside ``tfidf``. Defaults to 1,
            the document this function always inspected before the parameter
            was added.

    Returns:
        The topic id with the largest score; on ties, the first such pair as
        returned by the model wins.

    Raises:
        ValueError: If the model returns no topics for the document.
    """
    topic_scores = list(model[tfidf[doc_index]])
    if not topic_scores:
        raise ValueError(f"Model returned no topics for document {doc_index}")
    best_topic, _ = max(topic_scores, key=lambda pair: pair[1])
    return best_topic

In [24]:
topics_model = []
# Describe, with its 4 top words, the topic that get_topics selects.
ldamodel.print_topic(get_topics(ldamodel, tfidf), topn=4)

'0.001*"cell" + 0.001*"gene" + 0.001*"patient" + 0.001*"health"'

In [27]:
# Enable notebook rendering for pyLDAvis, then prepare the visualization data from
# the fitted LDA model, the bag-of-words `corpus` and the model's own vocabulary.
# NOTE(review): the recorded run of this cell ended in a RuntimeError, and the model
# was trained on `tfidf` while `corpus` is passed here — confirm inputs and library versions.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary=ldamodel.id2word)

RuntimeError: invalid : opcode signature doesn't match buffer (c vs l) at 1

## 2. LSI

In [25]:
# `models` is already imported in the notebook's top import cell, so it is not re-imported here.
# Fit LSI on the TF-IDF corpus with the same number of topics as the LDA model (NUM_TOPICS).
lsi = models.LsiModel(tfidf, id2word=dictionary, num_topics=NUM_TOPICS)
# Display each topic with its 3 highest-weighted words.
lsi.show_topics(num_words=3)

[(0, '0.231*"patient" + 0.193*"cell" + 0.174*"expression"'),
 (1, '-0.290*"health" + 0.226*"cell" + 0.208*"expression"'),
 (2, '-0.272*"patient" + 0.193*"cell" + -0.159*"aml"'),
 (3, '-0.359*"health" + -0.250*"mental" + -0.165*"care"'),
 (4, '0.281*"decoction" + 0.203*"disease" + 0.181*"guizhi"')]