# Data parsing 

For extracting all the useful and significant text from the web page, we need to find all the tags which are created to store it. 
Such as: 
1. All levels ```<h>``` for headlines
2. ```<p>``` for paragraph of text
Another tags for text are wrappers for ```<p>```, so we will take the information out by using  ```<p>``` tag. 

In [19]:
import requests
from bs4 import BeautifulSoup as bs

In [32]:
def datascraping (url):
    r = requests.get(url)
    if r.status_code == 200:
        soup = bs(r.text, 'html.parser')
        #add all tags about text
        allp = soup.findAll('p')
        sitetext = ''
        for i in range(1,7):
            allh = soup.findAll(f'h{i}')
            for h in allh:
                sitetext += h.text
        for p in allp:
            sitetext += p.text 
        return sitetext
    else:
        print ('No server connection')

In [33]:
sitetext = datascraping('https://oxylabs.io/blog/python-web-scraping')

# Data preprocessing

In [34]:
import spacy
import re
nlp = spacy.load('en_core_web_sm')

In [35]:
def datapreprocessing(sitetext):
    sitetext = re.sub(r'[^\w\s]','', sitetext)
    sitetext = nlp(sitetext)
    tokens = []
    tokens.append([token.lemma_ for token in sitetext if token.is_stop == False])
    return tokens

In [36]:
sitetext = datapreprocessing(sitetext)

# Topics modeling

In [37]:
import gensim
import gensim.corpora as corpora
dictionary  = corpora.Dictionary(sitetext)
corpus = [dictionary.doc2bow(text) for text in sitetext]    

In [38]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=4, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [41]:
lda_model.get_document_topics()

TypeError: get_document_topics() missing 1 required positional argument: 'bow'