In [1]:
# Scraping
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

# Plotting
import matplotlib.pyplot as plt
import PyQt5
%matplotlib qt

# ML Things
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import nltk
import string
import re
# Matches a run of one or more ASCII punctuation characters, so a whole run
# (e.g. "..." or "?!") can be replaced by a single separator.
RE_PUNCT = re.compile('([%s])+' % re.escape(string.punctuation), re.UNICODE)

# Make sure the NLTK stop-word corpus is available locally before reading it.
nltk.download('stopwords')

# NOTE(review): words() is called without a language, which per the NLTK corpus
# reader API should return the stop words of every language in the corpus, not
# only English -- confirm this is intended.
# Kept as a frozenset because the collection is only used for membership
# tests, which are constant-time on a set but a linear scan on the list that
# words() returns.
STOP_WORDS = frozenset(nltk.corpus.stopwords.words())

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/josephepstein/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# HTTP headers passed along with every request made while scraping.
headers = {
    "Accept-Language": "en-US, en;q=0.5",
    "User-Agent": "Mozilla/5.0",
}

def clean_sentence(sentence, min_length=3):
    """Tokenise a piece of text into the lower-case words worth keeping.

    Every run of punctuation is replaced by a space, the text is lower-cased
    and split on whitespace. Words that appear in ``STOP_WORDS`` or that are
    shorter than ``min_length`` characters are dropped.

    Args:
        sentence: Raw text (str) to clean.
        min_length: Minimum number of characters a word needs in order to be
            kept. Defaults to 3, the threshold that used to be hard-coded.

    Returns:
        list[str]: The surviving words in their original order; a word that
        occurs several times is kept each time.
    """
    words = RE_PUNCT.sub(" ", sentence).lower().split()

    # Single pass over the words. The previous version re-filtered the whole
    # list once for every rejected word, which was quadratic on long posts
    # while producing exactly this result.
    return [
        word
        for word in words
        if len(word) >= min_length and word not in STOP_WORDS
    ]

In [3]:
def count_words(split_sentences):
    """Tally how often each word occurs across a collection of token lists.

    Args:
        split_sentences: Iterable of sentences, each an iterable of words.

    Returns:
        dict: Maps every word to its total number of occurrences. Keys appear
        in the order in which the words were first seen.
    """
    tally = {}
    for tokens in split_sentences:
        for token in tokens:
            # Unseen words start from zero before being incremented.
            tally[token] = tally.get(token, 0) + 1
    return tally

In [4]:
def blog_scraper(blog_url, post_link_selector, post_content_selector,
                 max_posts=100, pages=range(2, 50), timeout=30):
    """Collect the text of blog posts by walking a blog's paginated listing.

    For every page number in ``pages`` the listing at ``blog_url + str(page)``
    is fetched, the post links on it are located with ``post_link_selector``,
    and each linked post is fetched in turn. The text of the first element
    matching ``post_content_selector`` is stored for every post.

    Args:
        blog_url: Listing URL prefix; the page number is appended to it.
        post_link_selector: CSS selector matching the anchors that link to
            individual posts on a listing page.
        post_content_selector: CSS selector matching the element that holds a
            post's body on the post page.
        max_posts: Stop once this many posts have been collected. Defaults to
            100, the limit that used to be hard-coded.
        pages: Iterable of page numbers to visit. Defaults to 2..49, the range
            that used to be hard-coded.
            NOTE(review): the default starts at page 2, so page 1 is never
            visited -- confirm that this is intended.
        timeout: Seconds to wait for each HTTP request before giving up, so a
            stalled server cannot hang the scrape indefinitely.

    Returns:
        list[str]: Text of the collected posts, at most ``max_posts`` entries.

    Notes:
        Listing pages or posts that answer with an HTTP error status, anchors
        without an ``href`` and posts without a matching content element are
        skipped rather than aborting the scrape. Network errors raised by
        ``requests`` (including timeouts) still propagate to the caller.
    """
    # Imported locally so the function needs no extra notebook-level import.
    from urllib.parse import urljoin

    post_text_array = []

    for page in pages:
        # Also checked before each listing request, so that no further page is
        # downloaded once the quota has been reached.
        if len(post_text_array) >= max_posts:
            break

        listing = requests.get(blog_url + str(page), headers=headers, timeout=timeout)
        if not listing.ok:
            # An error page cannot contain usable post links.
            continue

        listing_soup = BeautifulSoup(listing.text, 'html.parser')

        for post_link in listing_soup.select(post_link_selector):
            if len(post_text_array) >= max_posts:
                return post_text_array

            href = post_link.get('href')
            if not href:
                continue

            # Resolve relative links against the listing page; absolute links
            # pass through urljoin unchanged.
            post = requests.get(urljoin(listing.url, href), headers=headers, timeout=timeout)
            if not post.ok:
                continue

            post_soup = BeautifulSoup(post.text, 'html.parser')

            # select_one returns None when nothing matches the selector.
            content = post_soup.select_one(post_content_selector)
            if content is None:
                continue

            post_text_array.append(content.text)

    return post_text_array
            
        
    

In [30]:
def word_vector_plot(model, vocab, top_n=-1, plot=PCA):
    """Project word vectors to 2-D and draw them as a labelled scatter plot.

    Args:
        model: Trained gensim Word2Vec model; vectors are read via ``model.wv``.
        vocab: dict mapping word -> occurrence count. Used to rank the words
            when ``top_n`` is given. It is not modified.
        top_n: Number of most frequent words (according to ``vocab``) to plot.
            -1 (the default), any other negative value, or None plots every
            word in the model's vocabulary instead.
        plot: Pass the ``PCA`` class (the default) for a PCA projection; any
            other value selects t-SNE.

    Raises:
        ValueError: If fewer than two words are selected, since a 2-D
            projection cannot be built from them.
        KeyError: If a selected word from ``vocab`` has no vector in the model.
    """
    # model[word] is deprecated in gensim; the vectors live on model.wv.
    word_vectors = model.wv

    if top_n is None or top_n < 0:
        # Newer gensim releases expose the vocabulary as key_to_index, older
        # ones as vocab; use whichever this model provides.
        model_vocab = getattr(word_vectors, 'key_to_index', None)
        if model_vocab is None:
            model_vocab = word_vectors.vocab
        labels = list(model_vocab)
    else:
        # The sort is stable, so ties keep vocab's order -- the same ranking
        # the repeated max()/pop() loop produced -- and asking for more words
        # than vocab holds simply yields all of them instead of failing.
        labels = sorted(vocab, key=vocab.get, reverse=True)[:top_n]

    if len(labels) < 2:
        raise ValueError("word_vector_plot needs at least two words to plot")

    tokens = [word_vectors[word] for word in labels]

    if plot == PCA:
        new_values = PCA(random_state=23).fit_transform(tokens)[:, :2]
    else:
        # scikit-learn documents that the perplexity has to be smaller than the
        # number of samples, so the preferred value of 100 is clamped when only
        # a few words are plotted.
        perplexity = min(100, len(tokens) - 1)
        tsne_model = TSNE(perplexity=perplexity, n_components=2, init='pca', n_iter=1000, random_state=23)
        new_values = tsne_model.fit_transform(tokens)

    plt.figure(figsize=(16, 16))
    for label, point in zip(labels, new_values):
        x, y = point[0], point[1]
        # Kept as one scatter call per point, as before, so each word is drawn
        # as its own series.
        plt.scatter(x, y)
        plt.annotate(label,
            xy=(x, y),
            xytext=(5, 2),
            textcoords='offset points',
            ha='right',
            va='bottom')

    plt.show()

In [6]:
# One (listing URL prefix, post-link selector, post-content selector) triple
# per blog. blog_scraper appends the page number to the URL prefix.
datasciencecentral_url = 'https://www.datasciencecentral.com/profiles/blog/list?promoted=1&page='
datasciencecentral_link_selector = '#xg .xg_module_body .title a:nth-child(2)'
datasciencecentral_content_selector = '#xg .postbody'

smartdatacollective_url = 'https://www.smartdatacollective.com/page/'
smartdatacollective_link_selector = '.content-inner .p-outer .p-footer a.btn'
smartdatacollective_content_selector = '.single-content .entry-content'

starbridgepartners_url = 'https://starbridgepartners.com/data-science-report/page/'
starbridgepartners_link_selector = 'article.post .entry-title-link'
starbridgepartners_content_selector = 'article'

# Scrape the blogs in this fixed order and pool all post texts in one list.
blog_sources = (
    (datasciencecentral_url, datasciencecentral_link_selector, datasciencecentral_content_selector),
    (smartdatacollective_url, smartdatacollective_link_selector, smartdatacollective_content_selector),
    (starbridgepartners_url, starbridgepartners_link_selector, starbridgepartners_content_selector),
)

data_science_blogs = []
for source_url, link_selector, content_selector in blog_sources:
    data_science_blogs.extend(blog_scraper(source_url, link_selector, content_selector))

In [7]:
# Tokenise every scraped post, then count word occurrences over all of them.
data_science_blogs_split = list(map(clean_sentence, data_science_blogs))
word_count_dict = count_words(data_science_blogs_split)

In [8]:
# Train skip-gram (sg=1) word embeddings on the tokenised posts: 50-dimensional
# vectors, a context window of 3, and min_count=1 so no word is discarded for
# being rare.
model = Word2Vec(
    data_science_blogs_split,
    size=50,
    window=3,
    min_count=1,
    workers=3,
    sg=1,
)

In [52]:
# Plot the 10 most frequent words, projected with t-SNE rather than PCA.
word_vector_plot(model, word_count_dict, top_n=10, plot=TSNE)

  from ipykernel import kernelapp as app
