In [1]:
import itertools
import os
import random
import re
import time
from collections import Counter
from string import digits

import gensim
import gensim.corpora as corpora
import nltk
import pandas as pd
from bs4 import BeautifulSoup
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from tqdm.notebook import tqdm, trange
from wordcloud import WordCloud

In [2]:
# Put your HTML files in the ./web_datasets/ folder

# Collect the input files from ./web_datasets/.
# os.listdir returns entries in arbitrary order and also lists
# sub-directories, so keep only regular files (a directory cannot be opened
# for reading) and sort the names so every run processes files in the same
# order.
html_files = sorted(
    entry for entry in os.listdir('./web_datasets/')
    if os.path.isfile(os.path.join('./web_datasets/', entry))
)
files_number = len(html_files)

In [3]:
# Function for cleaning an HTML file

def cleanHTMLDocument(html):
    """Extract the text of an HTML document as newline-separated fragments.

    The markup is parsed with BeautifulSoup's "html.parser", every <script>
    and <style> element is removed, and the remaining text is split into
    lines.  Each line is stripped and further split on double spaces; the
    non-empty stripped pieces are joined with '\\n'.

    Parameters
    ----------
    html : str
        Raw HTML markup.

    Returns
    -------
    str
        The cleaned text, one non-empty fragment per line.
    """
    parsed = BeautifulSoup(html, "html.parser")

    # Script and style bodies would otherwise show up in get_text().
    for unwanted in parsed.find_all(["script", "style"]):
        unwanted.extract()

    fragments = []
    for raw_line in parsed.get_text().splitlines():
        # A double space is treated as a separator between phrases.
        for piece in raw_line.strip().split("  "):
            piece = piece.strip()
            if piece:
                fragments.append(piece)

    return '\n'.join(fragments)

In [4]:
# Read the necessary files

# Read every file listed in html_files, clean it, and collect the texts in
# the single-column DataFrame `df` (column 'text').
start_time = time.time()

# files[i] is set to " " when file i could not be processed and stays 0
# otherwise.  Nothing else in this cell reads it; it is kept in case other
# cells do.
files = [0] * files_number
collected_texts = []

for i in range(files_number):
    try:
        # open() is inside the try so that an unreadable file is reported and
        # skipped like any other per-file failure instead of aborting the run.
        with open("./web_datasets/" + html_files[i], "r", encoding='utf-8') as f:
            text = cleanHTMLDocument(f.read())
        # Drop words of one or two characters.
        text = re.sub(r'\b\w{1,2}\b', '', text)
        collected_texts.append(text)
    except Exception as err:
        # Best effort: report the file and the reason, then carry on.
        # `Exception` (rather than a bare except) lets KeyboardInterrupt
        # and SystemExit propagate.
        files[i] = " "
        print("problem with file " + html_files[i] + ": " + repr(err))

# Build the frame once; DataFrame.append was removed in pandas 2.0 and
# appending row by row copies the whole frame on every iteration.
df = pd.DataFrame({'text': collected_texts}, columns=['text'])

print("--- %s seconds ---" % (time.time() - start_time))

--- 0.857234001159668 seconds ---


In [5]:
# Additional stop words

# Extra words to exclude on top of the NLTK English stop-word list.
other_stop_words = [
    'contact', 'cookies', 'confirm', 'website',
    'share', 'login', 'password',
    'course', 'learn', 'programming', 'lpa',
    'code', 'courses', 'java', 'learning',
    'get', 'one', 'python', 'see',
    'pluralsight', 'web',
    'org', 'become', 'javascript', 'android',
    'may', 'site', 'developer',
]

In [6]:
# Clean datasets

# Make sure the NLTK stop-word corpus is available (called on every run).
nltk.download('stopwords')
# English stop words first, followed by the notebook's own additions.
stop_words = [*stopwords.words('english'), *other_stop_words]

def sent_to_words(sentences):
    """Lazily tokenise each item of ``sentences``.

    Every item is converted with ``str()`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``.

    Parameters
    ----------
    sentences : iterable
        Items to tokenise.

    Yields
    ------
    The result of ``simple_preprocess`` for each item, in input order.
    """
    tokenised = (
        gensim.utils.simple_preprocess(str(item), deacc=True)
        for item in sentences
    )
    yield from tokenised
        
def delete_stopwords(texts):
    """Tokenise each document and drop the words listed in ``stop_words``.

    Each element of ``texts`` is converted with ``str()`` and run through
    ``simple_preprocess``; tokens found in the module-level ``stop_words``
    are removed.

    Parameters
    ----------
    texts : iterable
        Documents to filter.  NOTE(review): this notebook passes lists of
        tokens, so ``str(doc)`` is the list's repr and ``simple_preprocess``
        re-tokenises it -- confirm this round trip is intended.

    Returns
    -------
    list of list
        One list of kept tokens per input document, in input order.
    """
    # Snapshot the stop words into a set once per call: testing membership
    # against the list would scan it for every single token.
    excluded = set(stop_words)
    return [[word for word in simple_preprocess(str(doc))
             if word not in excluded] for doc in texts]

# Page texts collected in the 'text' column, as a plain Python list.
data = df['text'].tolist()
# Tokenise every text, then remove stop words from the resulting tokens.
data_words = delete_stopwords(list(sent_to_words(data)))

[nltk_data] Downloading package stopwords to /Users/beon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# LDA

# Map every distinct token to an integer id.
id_2_word = corpora.Dictionary(data_words)
texts = data_words
# Bag-of-words representation of each document.
corpus = [id_2_word.doc2bow(text) for text in texts]

number_topics = 10
# LDA training is stochastic; without a fixed seed the printed topics change
# on every run.  NOTE(review): with several worker processes the result may
# still vary slightly between runs -- confirm against the gensim docs.
lda_random_state = 42

# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id_2_word,
                                       num_topics=number_topics,
                                       random_state=lda_random_state)

print(lda_model.print_topics())

[(0, '0.009*"students" + 0.009*"read" + 0.007*"team" + 0.007*"information" + 0.006*"review" + 0.006*"free" + 0.006*"data" + 0.005*"full" + 0.005*"hours" + 0.005*"weekgraduate"'), (1, '0.011*"students" + 0.009*"team" + 0.008*"read" + 0.008*"review" + 0.007*"free" + 0.005*"reviews" + 0.005*"projects" + 0.005*"step" + 0.004*"way" + 0.004*"build"'), (2, '0.007*"free" + 0.007*"read" + 0.005*"path" + 0.005*"students" + 0.005*"data" + 0.005*"career" + 0.004*"team" + 0.004*"review" + 0.004*"skills" + 0.004*"experience"'), (3, '0.012*"students" + 0.011*"review" + 0.010*"team" + 0.009*"read" + 0.007*"free" + 0.006*"skills" + 0.004*"reviews" + 0.004*"month" + 0.004*"using" + 0.004*"way"'), (4, '0.008*"team" + 0.008*"skills" + 0.007*"students" + 0.006*"read" + 0.006*"data" + 0.005*"free" + 0.005*"review" + 0.005*"skill" + 0.005*"information" + 0.004*"hours"'), (5, '0.009*"review" + 0.009*"students" + 0.008*"read" + 0.008*"free" + 0.007*"team" + 0.005*"projects" + 0.005*"build" + 0.004*"reviews" + 

In [8]:
# Word counter

def freq(data_words):
    """Count how often each distinct word occurs.

    Parameters
    ----------
    data_words : iterable of hashable
        The words to count.  Any iterable is accepted, including one-shot
        iterators.

    Returns
    -------
    pandas.DataFrame
        One row per distinct word with columns ``'Word'`` and
        ``'Frequency'``.  Rows appear in order of each word's first
        occurrence.  An empty input yields an empty frame with the same two
        columns.
    """
    # A single pass with Counter replaces calling list.count() once per
    # unique word (quadratic) and growing the frame with DataFrame.append,
    # which was removed in pandas 2.0.
    counts = Counter(data_words)
    return pd.DataFrame(
        {'Word': list(counts.keys()), 'Frequency': list(counts.values())},
        columns=['Word', 'Frequency'],
    )

# Flatten the per-document token lists into one list of words.
# NOTE: this rebinds data_words, so the cell is not idempotent -- running it
# again without re-running the cleaning cell would iterate over the
# individual words instead of over token lists.
data_words = [*itertools.chain.from_iterable(data_words)]

# Count the words, order them from most to least frequent, and export.
frequency = freq(data_words).sort_values(by="Frequency", ascending=False)
frequency.to_csv('frequency.csv')