# Data parsing 

In [77]:
import requests
from bs4 import BeautifulSoup as bs

In [78]:
def datascraping (url):
    r = requests.get(url)
    if r.status_code == 200:
        soup = bs(r.text, 'html.parser')
        #add all tags about text
        allp = soup.findAll('p')
        sitetext = ''
        for p in allp:
            sitetext += p.text 
        return sitetext
    else:
        print ('No server connection')

In [79]:
sitetext = datascraping('https://oxylabs.io/blog/python-web-scraping')

# Data preprocessing

In [80]:
import spacy
import re
nlp = spacy.load('en_core_web_sm')

In [81]:
def datapreprocessing(sitetext):
    sitetext = re.sub(r'[^\w\s]','', sitetext)
    sitetext = nlp(sitetext)
    tokens = []
    tokens.append([token.lemma_ for token in sitetext if token.is_stop == False])
    return tokens

In [82]:
sitetext = datapreprocessing(sitetext)

# Topics modeling

In [101]:
import gensim
import gensim.corpora as corpora
dictionary  = corpora.Dictionary(sitetext)
corpus = [dictionary.doc2bow(text) for text in sitetext]    

In [106]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=4, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [117]:
dictionary[100]

'absolute'

In [107]:
lda_model.print_topics()

[(0,
  '0.021*"datum" + 0.020*"web" + 0.016*"scrape" + 0.015*"library" + 0.012*"Python" + 0.010*"use" + 0.009*"browser" + 0.008*"create" + 0.008*"need" + 0.007*"code"'),
 (1,
  '0.002*"datum" + 0.002*"scrape" + 0.002*"web" + 0.001*"library" + 0.001*"use" + 0.001*"Python" + 0.001*"require" + 0.001*"create" + 0.001*"browser" + 0.001*"code"'),
 (2,
  '0.002*"datum" + 0.002*"web" + 0.002*"library" + 0.002*"scrape" + 0.002*"Python" + 0.002*"use" + 0.002*"create" + 0.002*"page" + 0.002*"need" + 0.002*"browser"'),
 (3,
  '0.001*"datum" + 0.001*"library" + 0.001*"scrape" + 0.001*"web" + 0.001*"Python" + 0.001*"browser" + 0.001*"create" + 0.001*"use" + 0.001*"require" + 0.001*"list"')]