In [1]:
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from gensim.models import Word2Vec
from sklearn.manifold import TSNE
import nltk

import re

import PyQt5

%matplotlib qt

In [2]:
nltk.download('stopwords')

STOP_WORDS = nltk.corpus.stopwords.words()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/josephepstein/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
headers = {"Accept-Language": "en-US, en;q=0.5"}

def clean_sentence(val):
    "remove chars that are not letters or numbers, downcase, then remove stop words"
    regex = re.compile('([^\s\w]|_)+')
    sentence = regex.sub('', val).lower()
    sentence = sentence.split(" ")
    
    for word in list(sentence):
        if word in STOP_WORDS:
            sentence.remove(word)
            continue

        if len(word) < 3:
            sentence.remove(word)
            continue
            
    return sentence

In [4]:
def blog_scraper( blog_url, post_link_selector, post_content_selector):
    
    post_text_array = []
    
    pages = np.arange(2, 50)

    for page in pages:

        blog = requests.get( blog_url + str(page), headers=headers)

        soup = BeautifulSoup(blog.text, 'html.parser')

        post_links = soup.select( post_link_selector )

        for post_link in post_links:
            
            if len(post_text_array) == 50:
                
                return post_text_array
            
            blog = requests.get(post_link['href'])
                        
            post_soup = BeautifulSoup(blog.text, 'html.parser')
            
            post_text = post_soup.select_one( post_content_selector ).text
                        
            post_text_array.append(post_text)
        
            
        
    

In [5]:
blog_url = 'https://www.datasciencecentral.com/profiles/blog/list?promoted=1&page='
post_link_selector = '#xg .xg_module_body .title a:nth-child(2)'
post_content_selector = '#xg .postbody'

data_science_blogs = blog_scraper( blog_url, post_link_selector, post_content_selector )

In [6]:
data_science_blogs_split = [clean_sentence(row) for row in data_science_blogs]

In [7]:
data_science_blogs_split[:2]

[['\n\nin',
  'blog',
  'introduce',
  'package',
  'heterogeneous',
  'ensemble',
  'learning',
  'classification',
  'regression',
  'fully',
  'automated\xa0it',
  'significantly',
  'lowers',
  'barrier',
  'practitioners',
  'apply',
  'heterogeneous',
  'ensemble',
  'learning',
  'techniques',
  'amateur',
  'fashion',
  'everyday',
  'predictive',
  'problems\nbefore',
  'dwell',
  'package',
  'details',
  'lets',
  'start',
  'understanding',
  'basic',
  'concepts\n\nwhy',
  'ensemble',
  'learning\ngenerally\xa0predictions',
  'become',
  'unreliable',
  'input',
  'sample',
  'training',
  'distribution',
  'bias',
  'data',
  'distribution',
  'error',
  'prone',
  'noise',
  'approaches',
  'require',
  'changes',
  'network',
  'architecture',
  'fine',
  'tuning',
  'balanced',
  'data',
  'increasing',
  'model',
  'size',
  'etc',
  'selection',
  'algorithm',
  'plays',
  'vital',
  'role',
  'scalability',
  'learning',
  'ability',
  'decrease',
  'complex',
  'da

In [8]:
model = Word2Vec(data_science_blogs_split, min_count=1,size= 50,workers=3, window =3, sg = 1)

In [9]:
def tsne_plot(model):
    "Creates and TSNE model and plots it"
    labels = []
    tokens = []
    for word in model.wv.vocab:
        tokens.append(model[word])
        labels.append(word)
        
    tsne_model = TSNE(perplexity=30, n_components=2, init='pca', n_iter=250, random_state=23)
    new_values = tsne_model.fit_transform(tokens)
    
    x = []
    y = []
    for value in new_values:
        x.append(value[0])
        y.append(value[1])
        
    plt.figure(figsize=(16, 16))
    for i in range(len(x)):
        plt.scatter(x[i],y[i])
        plt.annotate(labels[i],
            xy=(x[i], y[i]),
            xytext=(5, 2),
            textcoords='offset points',
            ha='right',
            va='bottom')
        
    plt.show()
        
tsne_plot(model)

  


In [None]:
### Maybe refine this to top 250 most common words
### Improve word cleaning
### Add PCA option to plotting