In [129]:
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
import string
from gensim import corpora
from gensim import models
import pyLDAvis.gensim
import re
from typing import List
import pickle

## Что еще можно сделать:
1) Попробовать NMF
2) Добавить n-граммы
3) Optimize choice for number of topics through coherence measure
4)

# 1. Scrapping

В качестве объекта скреппинга выбран ресурс PubMed с биологическими статьями. Вытаскивать со странички буду название и abstract.

In [2]:
get_url_from_id = lambda idx: "https://pubmed.ncbi.nlm.nih.gov/" + str(idx) + "/"

In [3]:
def get_text_from_page(idx: str):
    page = requests.get(get_url_from_id(idx)).text
    soup = BeautifulSoup(page, 'html.parser')
    title = soup.title.text
    
    spans = soup.findAll('div')
    abstract = None
    classes = []
    for span in spans:
        try:
            classes.extend(span['class'])
            if 'abstract-content' in span['class']:
                abstract = span
                break
        except KeyError:
            pass
    
    abstract = abstract.text
    return title + ' ' + abstract

In [4]:
start_article = 29949996

In [5]:
cntr = 0
articles = []
idx = start_article
while cntr < 300:
#     time.sleep(1)
    try:
        print(f"Scrapping {idx}")
        txt = get_text_from_page(idx)
        articles.append(txt)
        cntr+=1
        idx+=1
    except Exception:
        print(f"Failed: {idx}")
        idx += 1

Scrapping 29949996
Failed: 29949996
Scrapping 29949997
Failed: 29949997
Scrapping 29949998
Scrapping 29949999
Scrapping 29950000
Scrapping 29950001
Scrapping 29950002
Scrapping 29950003
Scrapping 29950004
Scrapping 29950005
Scrapping 29950006
Scrapping 29950007
Scrapping 29950008
Scrapping 29950009
Scrapping 29950010
Scrapping 29950011
Scrapping 29950012
Scrapping 29950013
Scrapping 29950014
Scrapping 29950015
Failed: 29950015
Scrapping 29950016
Scrapping 29950017
Scrapping 29950018
Scrapping 29950019
Scrapping 29950020
Scrapping 29950021
Scrapping 29950022
Failed: 29950022
Scrapping 29950023
Failed: 29950023
Scrapping 29950024
Failed: 29950024
Scrapping 29950025
Failed: 29950025
Scrapping 29950026
Failed: 29950026
Scrapping 29950027
Failed: 29950027
Scrapping 29950028
Failed: 29950028
Scrapping 29950029
Scrapping 29950030
Scrapping 29950031
Scrapping 29950032
Scrapping 29950033
Scrapping 29950034
Scrapping 29950035
Scrapping 29950036
Failed: 29950036
Scrapping 29950037
Scrapping 29950

In [130]:
with open('scrapped_data.pickle', 'wb') as f:
    pickle.dump(articles, f)

# 2. Text preprocessing

### Plan:
1. Tokenize
2. Remove punctuation
3. Hybride stemming
4. Remmove stop words

In [94]:
stopwords = set(stopwords.words('english'))

In [102]:
corpus = articles.copy()

## 1. Tokenize

In [103]:
corpus = [nltk.word_tokenize(text) for text in corpus]

## 2. Remove puctuation tokens

In [104]:
punc = string.punctuation + "``" + "\'\'" + "..." + "...."

In [105]:
corpus = [list(filter(lambda token: token not in punc, text)) for text in corpus]

## Chech whether all punctuation symbols removed

In [106]:
words = []
for text in corpus:
    words.extend(text)
words = list(set(words))
words.sort()

In [107]:
words[20:30]

['-0.005',
 '-0.675',
 '-1-aminopropan-2-ol',
 '-10',
 '-13',
 '-2',
 '-25',
 '-4',
 '-7',
 '-Editorial']

Можно заметить, что осталось много специфических символов, чисел, поскольку они часто встречаются в статьях, но не несут почти никакого смысла. Поэтому просто уберу их.

In [108]:
def filter_text_from_punct(txt: List[str]) -> List[str]:
    result = []
    for wrd in txt:
        word = re.sub('\d', "", wrd)
        word = re.sub('\W', "", word)
        if (len(word) > 0):
            result.append(word)
    return result

  word = re.sub('\d', "", wrd)
  word = re.sub('\W', "", word)


In [109]:
corpus = [filter_text_from_punct(text) for text in corpus]

## 3. Stemming

In [110]:
import krovetz
ks = krovetz.PyKrovetzStemmer()

In [111]:
corpus = [[ks.stem(i) for i in text] for text in corpus]

## 4. Drop stop-words

In [112]:
corpus = [[word for word in text if word not in stopwords] for text in corpus]

# 5. Drop short word

In [113]:
words = []
for text in corpus:
    words.extend(text)
words = list(set(words))

In [118]:
sorted(words, key=len)[:30]

['b',
 'c',
 'e',
 'f',
 'g',
 'h',
 'j',
 'k',
 'l',
 'n',
 'p',
 'q',
 'r',
 'u',
 'v',
 'x',
 'z',
 'Å',
 'Φ',
 'β',
 'γ',
 'κ',
 'λ',
 '⁶',
 'aa',
 'ac',
 'ad',
 'ag',
 'ah',
 'ai']

Довольно много одиночных букв (которые точно надо убрать) и слов длины два. Часть из слов длины 2 может быть важна, но среди них может быть и мусор, поэтому почищу их все.

In [121]:
def filter_text_from_short(txt: List[str]) -> List[str]:
    result = []
    for wrd in txt:
        if (len(wrd) > 2):
            result.append(wrd)
    return result

In [122]:
corpus = [filter_text_from_short(text) for text in corpus]

## Создаем словарь и векторизуем его

In [123]:
#  создаем словарь 
dictionary = corpora.Dictionary(corpus)
corpus = [dictionary.doc2bow(text) for text in corpus]
tfidf_model = models.TfidfModel(corpus)
tfidf = tfidf_model[corpus]

## Topic modeling
## 1. LDA

In [124]:
NUM_TOPICS = 5
ldamodel = models.LdaModel(tfidf, id2word=dictionary, num_topics=NUM_TOPICS)

In [126]:
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.001*"health" + 0.001*"patient" + 0.001*"bone" + 0.001*"cell"')
(1, '0.001*"patient" + 0.001*"cell" + 0.001*"expression" + 0.001*"women"')
(2, '0.001*"patient" + 0.001*"gene" + 0.001*"group" + 0.001*"cancer"')
(3, '0.001*"group" + 0.001*"patient" + 0.001*"cell" + 0.001*"expression"')
(4, '0.001*"cell" + 0.001*"hiv" + 0.001*"ato" + 0.001*"health"')


In [34]:
def get_topics(model, tfidf):
    m_score = 0
    for index, score in sorted(model[tfidf[1]], key=lambda tup: -1*tup[1]):
        if score > m_score:
            m_score = score
            m_topic = index
    return m_topic

In [35]:
topics_model = []
lda.print_topic(get_topics(ldamodel, tfidf), 4)

'0.001*"patient" + 0.001*"group" + 0.001*"case" + 0.001*"ato"'

In [23]:
# pyLDAvis.enable_notebook()
# vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary=ldamodel.id2word)

## 2. LSI

In [127]:
from gensim import models
lsi = models.LsiModel(tfidf, id2word=dictionary, num_topics=5)
lsi.show_topics(num_words=3)

[(0, '0.231*"patient" + 0.194*"cell" + 0.174*"expression"'),
 (1, '0.289*"health" + -0.223*"cell" + -0.207*"expression"'),
 (2, '-0.263*"patient" + 0.197*"cell" + -0.156*"aml"'),
 (3, '-0.354*"health" + -0.241*"mental" + -0.160*"cell"'),
 (4, '0.289*"decoction" + 0.189*"guizhi" + 0.183*"disease"')]

In [128]:
tfidf

<gensim.interfaces.TransformedCorpus at 0x7fa7ecd70950>