In [1]:
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from gensim.models import Word2Vec
from sklearn.manifold import TSNE
import nltk

import string
import re
RE_PUNCT = re.compile('([%s])+' % re.escape(string.punctuation), re.UNICODE)

import PyQt5

%matplotlib qt

In [2]:
nltk.download('stopwords')

STOP_WORDS = nltk.corpus.stopwords.words()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/josephepstein/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
headers = {"Accept-Language": "en-US, en;q=0.5", "User-Agent": "Mozilla/5.0"}

def clean_sentence(sentence):

    sentence = RE_PUNCT.sub(" ", sentence).lower().split()
    return_sentence = sentence
    
    #print(sentence)
    
    for word in sentence:
        #print(word)
        if word in STOP_WORDS or len(word) < 3:
            return_sentence = list(filter((word).__ne__, return_sentence))
#             if word == 'go':
#                 print(return_sentence)
#                 print('\n\n\n\n\n\n\n\n\n\n\n\n')
#             return_sentence.remove(word)
#             if word == 'go':
#                 print(return_sentence)
#                 print('\n\n\n\n\n\n\n\n\n\n\n\n')
    return return_sentence

In [4]:
def count_words(split_sentences):
    word_count_dict = {}
    
    for sentence in split_sentences:
        for word in sentence:
            if word not in word_count_dict:
                word_count_dict[word] = 0
        
            word_count_dict[word] += 1
        
    return word_count_dict

In [5]:
def blog_scraper( blog_url, post_link_selector, post_content_selector):
    
    post_text_array = []
    
    pages = np.arange(2, 50)

    for page in pages:
        
        blog = requests.get( blog_url + str(page), headers=headers)

        soup = BeautifulSoup(blog.text, 'html.parser')

        post_links = soup.select( post_link_selector )

        for post_link in post_links:
                        
            if len(post_text_array) >= 100:
                
                return post_text_array
            
            blog = requests.get(post_link['href'], headers=headers)
            
            post_soup = BeautifulSoup(blog.text, 'html.parser')
            
            post_text = post_soup.select_one( post_content_selector ).text
                        
            post_text_array.append(post_text)
    
    return post_text_array
            
        
    

In [6]:
##/# Recreate using a dictionary of sites, and their link/content selectors

datasciencecentral_url = 'https://www.datasciencecentral.com/profiles/blog/list?promoted=1&page='
datasciencecentral_link_selector = '#xg .xg_module_body .title a:nth-child(2)'
datasciencecentral_content_selector = '#xg .postbody'

smartdatacollective_url = 'https://www.smartdatacollective.com/page/'
smartdatacollective_link_selector = '.content-inner .p-outer .p-footer a.btn'
smartdatacollective_content_selector = '.single-content .entry-content'

starbridgepartners_url = 'https://starbridgepartners.com/data-science-report/page/'
starbridgepartners_link_selector = 'article.post .entry-title-link'
starbridgepartners_content_selector = 'article'

data_science_blogs = blog_scraper( datasciencecentral_url, datasciencecentral_link_selector, datasciencecentral_content_selector )
print(len(data_science_blogs))
data_science_blogs.extend(blog_scraper( smartdatacollective_url, smartdatacollective_link_selector, smartdatacollective_content_selector ))
print(len(data_science_blogs))
data_science_blogs.extend(blog_scraper( starbridgepartners_url, starbridgepartners_link_selector, starbridgepartners_content_selector ))
print(len(data_science_blogs))


100
200
300


In [7]:
data_science_blogs_split = [clean_sentence(row) for row in data_science_blogs]

word_count_dict = count_words(data_science_blogs_split)

In [8]:
print(data_science_blogs_split[0])

['start', 'incorporating', 'machine', 'learning', 'models', 'user', 'applications', 'question', 'comes', '“when', 'model', 'good', 'enough', 'deploy', 'simply', 'single', 'right', 'answer', 'clear', 'cut', 'measure', 'machine', 'learning', 'model', 'ready', 'put', 'production', 'set', 'thought', 'experiments', 'new', 'model', 'identify', 'goal', 'machine', 'learning', 'model', 'trying', 'decide', 'machine', 'learning', 'model', 'ready', 'deployment', 'helpful', 'circle', 'back', 'algorithm’s', 'original', 'goal', 'trying', 'predict', 'customer', 'churn', 'reach', 'client', 'intent', 'automatically', 'approve', 'deny', 'someone’s', 'credit', 'application', 'use', 'case', 'model', 'determine', 'stringent', 'requirements', 'deployment', 'instance', 'machine', 'learning', 'model', 'simply', 'suggesting', 'things', 'user', 'deployment', 'requirements', 'wildly', 'different', 'compared', 'algorithm', 'designed', 'make', 'decisions', 'automatically', 'autonomous', 'driving', 'space', 'example

In [9]:
print(len(data_science_blogs_split))

300


In [10]:
model = Word2Vec(data_science_blogs_split, min_count=1,size= 50,workers=3, window =3, sg = 1)

In [11]:
### Make general plotting function with PCA

def tsne_plot(model, top_n=-1, vocab=model.wv.vocab):
    "Creates and TSNE model and plots it"
    labels = []
    tokens = []
    
    if top_n != -1 and vocab!=model.wv.vocab:
        
        counter = 0
        
        for counter in range(0, top_n):
            tokens.append(model[max(vocab, key=vocab.get)])
            labels.append(max(vocab, key=vocab.get))
            vocab.pop(max(vocab, key=vocab.get))
    else:
        for word in model.wv.vocab:
            tokens.append(model[word])
            labels.append(word)
        
    tsne_model = TSNE(perplexity=100, n_components=2, init='pca', n_iter=1000, random_state=23)
    new_values = tsne_model.fit_transform(tokens)
    
    x = []
    y = []
    for value in new_values:
        x.append(value[0])
        y.append(value[1])
        
    plt.figure(figsize=(16, 16))
    for i in range(len(x)):
        plt.scatter(x[i],y[i])
        plt.annotate(labels[i],
            xy=(x[i], y[i]),
            xytext=(5, 2),
            textcoords='offset points',
            ha='right',
            va='bottom')
        
    plt.show()
        


In [12]:
### Maybe refine this to top 250 most common words
### Improve word cleaning
### Add PCA option to plotting

In [13]:
model.wv.vocab

{'start': <gensim.models.keyedvectors.Vocab at 0x13a4f82d0>,
 'incorporating': <gensim.models.keyedvectors.Vocab at 0x13a4f8350>,
 'machine': <gensim.models.keyedvectors.Vocab at 0x13a4f8410>,
 'learning': <gensim.models.keyedvectors.Vocab at 0x13a4f8450>,
 'models': <gensim.models.keyedvectors.Vocab at 0x13a4f84d0>,
 'user': <gensim.models.keyedvectors.Vocab at 0x13a4f8550>,
 'applications': <gensim.models.keyedvectors.Vocab at 0x13a4f8590>,
 'question': <gensim.models.keyedvectors.Vocab at 0x13a4f85d0>,
 'comes': <gensim.models.keyedvectors.Vocab at 0x13a4f8490>,
 '“when': <gensim.models.keyedvectors.Vocab at 0x13a4f8510>,
 'model': <gensim.models.keyedvectors.Vocab at 0x13a4f8610>,
 'good': <gensim.models.keyedvectors.Vocab at 0x13a4f8650>,
 'enough': <gensim.models.keyedvectors.Vocab at 0x13a4f8690>,
 'deploy': <gensim.models.keyedvectors.Vocab at 0x13a4f86d0>,
 'simply': <gensim.models.keyedvectors.Vocab at 0x13a4f8710>,
 'single': <gensim.models.keyedvectors.Vocab at 0x13a4f8750>

In [14]:
word_count_dict

{'start': 108,
 'incorporating': 8,
 'machine': 592,
 'learning': 811,
 'models': 238,
 'user': 117,
 'applications': 216,
 'question': 50,
 'comes': 121,
 '“when': 2,
 'model': 410,
 'good': 150,
 'enough': 56,
 'deploy': 20,
 'simply': 44,
 'single': 60,
 'right': 165,
 'answer': 35,
 'clear': 45,
 'cut': 20,
 'measure': 33,
 'ready': 33,
 'put': 49,
 'production': 44,
 'set': 150,
 'thought': 24,
 'experiments': 5,
 'new': 818,
 'identify': 106,
 'goal': 36,
 'trying': 46,
 'decide': 20,
 'deployment': 43,
 'helpful': 26,
 'circle': 2,
 'back': 175,
 'algorithm’s': 2,
 'original': 19,
 'predict': 63,
 'customer': 265,
 'churn': 6,
 'reach': 61,
 'client': 42,
 'intent': 5,
 'automatically': 39,
 'approve': 3,
 'deny': 3,
 'someone’s': 1,
 'credit': 50,
 'application': 89,
 'use': 705,
 'case': 120,
 'determine': 66,
 'stringent': 6,
 'requirements': 38,
 'instance': 48,
 'suggesting': 4,
 'things': 86,
 'wildly': 2,
 'different': 262,
 'compared': 20,
 'algorithm': 88,
 'designed': 

In [15]:
sorted_word_count_dict = {k: v for k, v in sorted(word_count_dict.items(), key=lambda item: item[1])}

In [16]:
sorted_word_count_dict

{'someone’s': 1,
 'dilemma': 1,
 'mccourt': 1,
 'sigopt': 1,
 'memorized': 1,
 'cassie': 1,
 'kozyrkov': 1,
 'codified': 1,
 'baselines': 1,
 'adhered': 1,
 'retrain': 1,
 'answered': 1,
 'encouraged': 1,
 'prescribed': 1,
 'reminder': 1,
 'brimming': 1,
 'headfirst': 1,
 'acquaint': 1,
 'underpinned': 1,
 'strategizing': 1,
 'brightest': 1,
 'powers': 1,
 'ancient': 1,
 'cardinality': 1,
 'comprehensible': 1,
 'provoking': 1,
 'regressi': 1,
 'processing”': 1,
 'holder': 1,
 'patterson': 1,
 'regents': 1,
 'oklahoma': 1,
 'osu': 1,
 'evangelism': 1,
 'blending': 1,
 'flat': 1,
 'retriever': 1,
 'docx': 1,
 'crawling”': 1,
 'kni': 1,
 'igo2cb4vhphvmiqf': 1,
 'booklet': 1,
 '“tokenization”': 1,
 'whitespace': 1,
 'tokenizers': 1,
 'erasure': 1,
 'hyphens': 1,
 'tokenized': 1,
 'tagger': 1,
 'conceptually': 1,
 '“topic': 1,
 'tripadvisor': 1,
 '“interactive': 1,
 'search”': 1,
 'dirichlet': 1,
 'extractor': 1,
 '“large': 1,
 'large”': 1,
 'word2vec': 1,
 'wordembedding': 1,
 'positional'

In [17]:
len(sorted_word_count_dict)

13042

In [19]:
tsne_plot(model, 500, sorted_word_count_dict)

  # This is added back by InteractiveShellApp.init_path()


In [None]:
### Better scraping input
### Add TF-IDF
### Add PCA