In [37]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.model_selection import cross_val_score

import nltk
from nltk import *
from nltk.corpus import PlaintextCorpusReader
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import sys
import codecs
import string
import random
import re

import requests
import urllib
from bs4 import BeautifulSoup
from readability.readability import Document # https://github.com/buriy/python-readability. Tried Goose, Newspaper (python libraries on Github). Bad results.
from http.cookiejar import CookieJar # 

# Data Exploration
First, load in science article news data, which contains basic information of science articles including URL and description. I previously mined the articles from Stat News, Science Daily, and Scientific American, but they are missing a lot of crucial information for the goal of this text project.

In [10]:
news_df = pd.read_json("data/news.json")
print('Shape of data (rows, columns):', news_df.shape, '\n')
print('Column Names: ', news_df.columns) # flagged means whether or not the article is incomplete or inaccurate

Shape of data (rows, columns): (4000, 12) 

Column Names:  Index(['articleUrl', 'author', 'categories', 'createdAt', 'date',
       'description', 'flagged', 'imageUrl', 'slug', 'source', 'sourceUrl',
       'title'],
      dtype='object')


I have a couple of <b>goals</b> in mind for this text data: exploring some interesting features about the articles and extracting keywords. Although there is also a "categories" column in this data, it would be nice to create a model to categorize into categories -- for example, "Physics" or "Artificial Intelligence" -- to be able to compare these articles perhaps with new ones without the "categories" label. Also, it would be cool to see if article writers sometimes "incorrectly" categorizes an article.<br><br>
To clean up this data a bit, let's also remove a few unnecesary columns. For example, since "description" is unclean and can be replaced by the full "article, this column can be deleted.

In [11]:
# new article features to be populated
news_df['article'] = '' # full article
news_df['num_words_not_in_dictionary'] = 0 # number of words not in English dictionary (super scientific words)
news_df['stopwords2words'] = 0 # ratio of stopwords to total words
news_df['readability_score'] = 0 # readability score probably based on Flesch–Kincaid
news_df['sentiment_polarity'] = 0 # the sentiment polarity within an article (range of emotions)
news_df = news_df.drop(['author', 'createdAt', 'imageUrl', 'slug', 'date', 'description'], axis=1)
news_df = news_df.replace(np.nan, 0) # replace NaN with 0 for "flagged" column
news_df.head()

Unnamed: 0,articleUrl,categories,flagged,source,sourceUrl,title,article,num_words_not_in_dictionary,stopwords2words,readability_score,sentiment_polarity
0,https://www.statnews.com/2017/02/09/dana-farbe...,[science policy],1.0,Stat News,https://www.statnews.com,Dana-Farber will avoid ‘controversial venues’ ...,,0,0,0,0
1,https://www.statnews.com/2017/02/09/antibiotic...,[science policy],1.0,Stat News,https://www.statnews.com,Why your doctor’s advice to take all your anti...,,0,0,0,0
2,https://www.statnews.com/2017/02/07/scientist-...,[science policy],0.0,Stat News,https://www.statnews.com,A lot of Americans don’t know a single scienti...,,0,0,0,0
3,https://www.statnews.com/2017/02/09/twitter-ho...,[science policy],0.0,Stat News,https://www.statnews.com,The doctor is in: 13 clinicians to follow on T...,,0,0,0,0
4,https://www.statnews.com/2017/02/09/obamacare-...,[science policy],0.0,Stat News,https://www.statnews.com,Patient groups fear Obamacare repeal could und...,,0,0,0,0


# Scraping the Full Article Texts
First, populate the "article" column bye the whole article from the URL (will clean those full texts up later).<br> <b>NOTE:</b> The following cell will take a long time, so it is commented. The cell after it will directly load in the result of the cell.

In [8]:
# for i in range(len(news_df['articleUrl'])):
#     try:
#         url = news_df['articleUrl'][i] # Bad: 2841, 3013, 3015, 3077, 3091, 3092
#         opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor)
#         opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.77 Safari/535.7')]
#         html =  opener.open(url).read().decode('utf-8')
#         readable_article = Document(html).summary()
#         soup = BeautifulSoup(readable_article, "lxml")
#         text = soup.get_text()
#         news_df.loc[i, 'article'] = text
#         print(i, " ", end="")
#         break
#     except:
#         print(str(i) + "error", " ", end="")

In [78]:
# Run this to skip above steps
news_df = pd.read_json("data/news_with_articles.json")
news_df = news_df.sort_index(ascending=True)
news_df = news_df[['articleUrl', 'categories', 'flagged', 'source', 'sourceUrl', 'title', 'article', 
             'num_words_not_in_dictionary', 'stopwords2words', 'readability_score', 'sentiment_polarity']]
news_df.head()

Unnamed: 0,articleUrl,categories,flagged,source,sourceUrl,title,article,num_words_not_in_dictionary,stopwords2words,readability_score,sentiment_polarity
0,https://www.statnews.com/2017/02/09/dana-farbe...,[science policy],1,Stat News,https://www.statnews.com,Dana-Farber will avoid ‘controversial venues’ ...,"\nT\nhe Dana-Farber Cancer Institute, which ha...",0,0,0,0
1,https://www.statnews.com/2017/02/09/antibiotic...,[science policy],1,Stat News,https://www.statnews.com,Why your doctor’s advice to take all your anti...,\nY\nou’ve heard it many times before from you...,0,0,0,0
2,https://www.statnews.com/2017/02/07/scientist-...,[science policy],0,Stat News,https://www.statnews.com,A lot of Americans don’t know a single scienti...,\nI\n’ve recently been thinking about this: Th...,0,0,0,0
3,https://www.statnews.com/2017/02/09/twitter-ho...,[science policy],0,Stat News,https://www.statnews.com,The doctor is in: 13 clinicians to follow on T...,\nC\nlinical medicine is rapidly changing in t...,0,0,0,0
4,https://www.statnews.com/2017/02/09/obamacare-...,[science policy],0,Stat News,https://www.statnews.com,Patient groups fear Obamacare repeal could und...,\nW\nASHINGTON — As Republicans confront the t...,0,0,0,0


Web scraping is very messy, so we have to clean the "article" column. We will keep the '\n' characters to be able to split the text into paragraphs (without the use of NLTK).<br>
On closer ispection of the articles from "Science Daily," many articles did not scrape correctly. I will remove them now. This is the downsides of webscraping: so much noise in data.<br>
Let's also save the list of articles to use as a corpus.

In [80]:
news_df = news_df.replace({'\xa0': ' '}, regex=True) # clean these empty null characters.
news_df['article'][19]

'\n \n\n'

In [81]:
for i in range(len(news_df)):
    if len(news_df['article'][i]) < 10:
        news_df = news_df.drop(i)
news_df = news_df.reset_index(drop=True)
news_df

Unnamed: 0,articleUrl,categories,flagged,source,sourceUrl,title,article,num_words_not_in_dictionary,stopwords2words,readability_score,sentiment_polarity
0,https://www.statnews.com/2017/02/09/dana-farbe...,[science policy],1,Stat News,https://www.statnews.com,Dana-Farber will avoid ‘controversial venues’ ...,"\nT\nhe Dana-Farber Cancer Institute, which ha...",0,0,0,0
1,https://www.statnews.com/2017/02/09/antibiotic...,[science policy],1,Stat News,https://www.statnews.com,Why your doctor’s advice to take all your anti...,\nY\nou’ve heard it many times before from you...,0,0,0,0
2,https://www.statnews.com/2017/02/07/scientist-...,[science policy],0,Stat News,https://www.statnews.com,A lot of Americans don’t know a single scienti...,\nI\n’ve recently been thinking about this: Th...,0,0,0,0
3,https://www.statnews.com/2017/02/09/twitter-ho...,[science policy],0,Stat News,https://www.statnews.com,The doctor is in: 13 clinicians to follow on T...,\nC\nlinical medicine is rapidly changing in t...,0,0,0,0
4,https://www.statnews.com/2017/02/09/obamacare-...,[science policy],0,Stat News,https://www.statnews.com,Patient groups fear Obamacare repeal could und...,\nW\nASHINGTON — As Republicans confront the t...,0,0,0,0
5,https://www.statnews.com/2017/02/08/drug-names...,[science policy],0,Stat News,https://www.statnews.com,The creative science of coining drug names,\n‘Creation engineering’: The art and science ...,0,0,0,0
6,https://www.statnews.com/2017/02/09/alzheimers...,[science policy],0,Stat News,https://www.statnews.com,Alzheimer’s-preventing drugs may already exist...,\nT\nhe search for drugs to prevent or delay A...,0,0,0,0
7,https://www.statnews.com/2016/03/28/google-lif...,[science policy],0,Stat News,https://www.statnews.com,,"\nM\nOUNTAIN VIEW, Calif. — Google’s brash att...",0,0,0,0
8,https://www.statnews.com/2017/02/06/mental-hea...,[science policy],0,Stat News,https://www.statnews.com,A dangerous wait: Colleges can’t meet soaring ...,\nC\nolleges across the country are failing to...,0,0,0,0
9,https://www.statnews.com/2017/02/09/doctors-tr...,[science policy],0,Stat News,https://www.statnews.com,Future doctors should be trained to promote so...,\nM\nany Americans believe that we have the be...,0,0,0,0


In [82]:
articles = [str(s) for s in news_df['article']] # create a corpus of all articles
articles[0]

'\nT\nhe Dana-Farber Cancer Institute, which has faced strong criticism for its decision to hold a fundraiser this month at President Trump’s Mar-a-Lago Club, said Thursday that it would avoid “controversial venues” in the future.\nThe hospital’s chief executive, Dr. Laurie Glimcher, said in a statement that its upcoming event, scheduled for Feb. 18, had become a “lightning rod” for some.\n“In the future we will avoid controversial venues that may distract from our focus on cancer care and research,” Glimcher said, adding that this year’s event would go on.\nadvertisement\nThe announcement from the Boston hospital could increase pressure on other clinics and charities that have faced calls to cancel events at Mar-a-Lago.\nAmong them is the Cleveland Clinic, which had one of its staff doctors barred from entering the United States after President Trump’s executive order on immigration.\n\nDana-Farber has held a fundraiser at Mar-a-Lago every year since 2011, and once in 2008. It expects

### Stopwords, stemming, and tokenization to clean the text even more
tokenize_and_stem: tokenizes and also stems each token.<br>
tokenize_only: tokenizes the text only to be able to convert stemmed tokens back to the full word.<br>

In [256]:
# load nltk's English stopwords
stopwords = nltk.corpus.stopwords.words('english')
# load nltk's SnowballStemmer
stemmer = SnowballStemmer("english")

# Define a tokenizer and stemmer that returns the set of stems of the text
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if len(token) <= 1: # remove one character words, "a" apostraphe s.
            continue
        if token.isalpha():
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if len(token) <= 1: # remove one character words, "a" apostraphe s.
            continue
        if token.isalpha():
            filtered_tokens.append(token)
    return filtered_tokens

Use stemming + tokenizing and tokenizing only functions to create two vocabulary lists.<br>
These two lists create a Pandas DataFrame with the stemmed vocabulary as the index and the tokenized words as the column (to convert stem into its first full word/token, could be many, but take first for simplicity).<br>
Stemming is not the best... especially when it comes to Proper Nouns and other special words.

In [257]:
totalvocab_stemmed = []
totalvocab_tokenized = []
for i in articles:
    allwords_stemmed = tokenize_and_stem(i) #for each article in "articles", tokenize/stem
    totalvocab_stemmed.extend(allwords_stemmed) #extend the 'totalvocab_stemmed' list
    
    allwords_tokenized = tokenize_only(i)
    totalvocab_tokenized.extend(allwords_tokenized)
    
vocab_df = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed) # map stem to first actual word for simplicity
print('Shape of vocab_df (rows, columns):', vocab_df.shape, '\n')
vocab_df.head()

Shape of vocab_df (rows, columns): (1662526, 1) 



Unnamed: 0,words
he,he
cancer,cancer
institut,institute
which,which
has,has


### Also, let's start filling out the other columns since we have the full article texts now


In [87]:
stopwords = set(w.lower() for w in nltk.corpus.stopwords.words('english'))

def stopwords_to_words(s):
    raw = nltk.word_tokenize(s)
    text = nltk.Text(raw)
    text_vocab = set(w.lower() for w in text if w.isalpha())
    return len(text_vocab.intersection(stopwords)) / len(text_vocab)
    
for i in range(len(news_df['articleUrl'])):
    try:
        news_df.loc[i, 'stopwords2words'] = stopwords_to_words(news_df['article'][i])
        print(i, ':', news_df['articleUrl'][i], ":", stopwords_to_words(news_df['article'][i]))
    except:
        pass # keeps empty articles 0.0

0 : https://www.statnews.com/2017/02/09/dana-farber-mar-lago/ : 0.25
1 : https://www.statnews.com/2017/02/09/antibiotics-resistance-superbugs/ : 0.17032967032967034
2 : https://www.statnews.com/2017/02/07/scientist-march-trump/ : 0.20891364902506965
3 : https://www.statnews.com/2017/02/09/twitter-hospitals-medicine/ : 0.14606741573033707
4 : https://www.statnews.com/2017/02/09/obamacare-repeal-preexisting-conditions/ : 0.18032786885245902
5 : https://www.statnews.com/2017/02/08/drug-names-process/ : 0.11202830188679246
6 : https://www.statnews.com/2017/02/09/alzheimers-disease-statins/ : 0.18518518518518517
7 : https://www.statnews.com/2016/03/28/google-life-sciences-exodus/ : 0.08673894912427023
8 : https://www.statnews.com/2017/02/06/mental-health-college-students/ : 0.13237063778580024
9 : https://www.statnews.com/2017/02/09/doctors-training-social-change/ : 0.14678899082568808
10 : https://www.statnews.com/2017/02/09/injury-states-unusually-common/ : 0.22522522522522523
11 : https:

89 : https://www.scientificamerican.com/article/wetlands-can-help-fight-climate-change/ : 0.19230769230769232
90 : https://www.scientificamerican.com/article/march-for-science-set-for-earth-day/ : 0.296551724137931
91 : https://www.scientificamerican.com/article/science-and-the-u-s-supreme-court-the-cases-to-watch-in-2017/ : 0.1425339366515837
92 : https://www.scientificamerican.com/article/florida-has-seen-bad-effects-from-trump-like-climate-gag-orders/ : 0.16666666666666666
93 : https://www.scientificamerican.com/article/groundhog-day-is-hogwash/ : 0.4
94 : https://www.scientificamerican.com/article/beijing-vows-deep-cut-in-coal-use-in-2017-to-fight-smog/ : 0.18604651162790697
95 : https://www.scientificamerican.com/podcast/episode/the-arctics-anti-snowball-snowball-effect/ : 0.24285714285714285
96 : https://www.scientificamerican.com/article/e-u-looks-to-china-for-climate-leadership/ : 0.1640625
97 : https://www.scientificamerican.com/article/house-science-panel-to-hold-hearing-on-l

165 : http://www.the-scientist.com//?articles.view/articleNo/47611/title/Low-Social-Status-May-Weaken-Immune-System-in-Monkeys/ : 0.2556818181818182
166 : https://www.statnews.com/2017/01/25/vertex-pain-opioids/ : 0.2911392405063291
167 : https://www.statnews.com/2017/01/23/week-of-jan-23-what-to-watch-for-in-biopharma/ : 0.29850746268656714
168 : http://www.the-scientist.com//?articles.view/articleNo/48229/title/ACMG-Urges-Caution-When-Editing-Embryo-Genomes/ : 0.22297297297297297
169 : http://www.the-scientist.com//?articles.view/articleNo/47333/title/Immunity-in-the-Brain/ : 0.24096385542168675
170 : http://www.the-scientist.com//?articles.view/articleNo/47905/title/Pharma-Redo/ : 0.21428571428571427
171 : http://www.the-scientist.com//?articles.view/articleNo/48034/title/Abscisic-Acid-s-Role-in-Ferns-Finally-Determined/ : 0.1970074812967581
172 : http://www.the-scientist.com//?articles.view/articleNo/47993/title/How-Plant-Soil-Feedback-Affects-Ecological-Diversity/ : 0.196675900277

257 : http://neurosciencenews.com/brain-development-autism-6034/ : 0.14672686230248308
258 : http://neurosciencenews.com/decision-making-network-6050/ : 0.1566579634464752
259 : http://neurosciencenews.com/insomnia-sexual-function-6038/ : 0.21649484536082475
260 : https://www.statnews.com/2017/02/09/cancer-drug-benefit-price/ : 0.32941176470588235
261 : https://www.statnews.com/2017/01/12/nevada-woman-superbug-resistant/ : 0.18610421836228289
262 : https://www.statnews.com/pharmalot/2017/02/09/drug-prices-pharmacy-benefits-managers/ : 0.24175824175824176
263 : https://www.scientificamerican.com/article/republicans-offer-to-tax-carbon-emissions/ : 0.1731266149870801
264 : https://www.scientificamerican.com/article/house-science-committee-may-soon-try-to-weaken-the-epa/ : 0.1540041067761807
265 : https://www.scientificamerican.com/article/house-science-committee-may-soon-try-to-weaken-the-epa/ : 0.1540041067761807
266 : https://www.scientificamerican.com/article/republicans-offer-to-tax-

356 : https://www.statnews.com/2017/01/13/tom-price-innate-immunotherapeutics/ : 0.12055335968379446
357 : https://www.statnews.com/2017/02/12/ex-drug-company-ceo-shkreli-speak-harvard/ : 0.24468085106382978
358 : https://www.statnews.com/2017/01/18/anorexia-forced-treatment/ : 0.1649782923299566
359 : https://www.statnews.com/2016/03/17/medical-students-match-day/ : 0.181640625
360 : https://www.statnews.com/2016/03/31/dna-shape-double-helix-dekker/ : 0.225
361 : https://www.statnews.com/2017/01/26/chimera-humans-animals-ethics/ : 0.1787709497206704
362 : https://www.statnews.com/2016/07/06/epipen-prices-allergies/ : 0.16962524654832348
363 : https://www.statnews.com/2017/01/10/kennedy-vaccine-book/ : 0.12351543942992874
364 : https://www.statnews.com/2017/01/28/egg-donors-risks/ : 0.15384615384615385
365 : https://www.statnews.com/2017/02/13/california-hiv-transmission-criminal-penalties/ : 0.2026315789473684
366 : https://www.statnews.com/2017/02/13/flumist-mystery/ : 0.329787234042

486 : https://www.statnews.com/2017/02/16/mammograms-plus-personalized-treatment-best-options-fight-breast-cancer/ : 0.23493975903614459
487 : https://www.statnews.com/2017/02/15/crispr-winners-losers/ : 0.207492795389049
488 : https://www.statnews.com/2017/02/16/carfentanil-china-ban/ : 0.17277486910994763
489 : https://www.statnews.com/2017/02/16/crispr-patent-decision-six-takeaways/ : 0.2833333333333333
490 : https://www.statnews.com/2017/02/16/aaas-qa-trump-science/ : 0.19389978213507625
491 : https://www.statnews.com/2017/02/16/pregnant-women-rural-america/ : 0.14067278287461774
492 : http://www.the-scientist.com//?articles.view/articleNo/48492/title/Henrietta-Lacks-s-Family-Seeks-Compensation/ : 0.30128205128205127
493 : https://www.statnews.com/pharmalot/2017/02/16/marathon-drug-prices-phrma/ : 0.35
494 : https://www.statnews.com/pharmalot/2017/02/16/shkreli-crispr-dermatology/ : 0.27586206896551724
495 : https://www.scientificamerican.com/article/falling-buttered-toast/ : 0.221

602 : http://news.mit.edu/2016/making-big-data-manageable-1214 : 0.1807909604519774
603 : http://news.mit.edu/2016/recorded-speech-images-automated-speech-recognition-1206 : 0.168141592920354
604 : http://news.mit.edu/2016/computer-learns-recognize-sounds-video-1202 : 0.18018018018018017
605 : http://news.mit.edu/2016/machine-learning-system-brain-recognizes-faces-1201 : 0.17995444191343962
606 : http://news.mit.edu/2016/creating-videos-of-the-future-1129 : 0.1925133689839572
607 : http://news.mit.edu/2016/artificial-intelligence-system-surfs-web-improve-performance-1110 : 0.18622448979591838
608 : http://news.mit.edu/2016/driverless-scooters-1107 : 0.22118380062305296
609 : http://news.mit.edu/2016/scene-at-mit-nightmare-on-ames-street-1031 : 0.2549019607843137
610 : http://news.mit.edu/2016/making-computers-explain-themselves-machine-learning-1028 : 0.18973214285714285
611 : http://news.mit.edu/2016/finding-patterns-corrupted-data-1026 : 0.18087855297157623
612 : http://news.mit.edu/

757 : http://venturebeat.com/2017/02/20/how-well-stay-productive-in-the-connected-car/ : 0.18809980806142035
758 : http://venturebeat.com/2017/02/21/how-ai-will-help-you-connect-with-customers/ : 0.16032064128256512
759 : https://www.statnews.com/2017/02/21/testosterone-gel-men-memory/ : 0.19935691318327975
760 : https://www.bostonglobe.com/metro/2017/02/21/medical-students-face-fallout-over-trump-immigration-order/BTHCdec6aJtnbUBM178xyH/story.html : 0.20256410256410257
761 : http://www.the-scientist.com//?articles.view/articleNo/48510/title/Abundant-Sequence-Errors-in-Public-Databases/ : 0.18902439024390244
762 : https://www.scientificamerican.com/article/giant-antarctic-ice-shelf-crack-threatens-to-become-a-massive-iceberg/ : 0.18276762402088773
763 : http://neurosciencenews.com/ampa-inactivation-erases-fear-memory-mice-6142/ : 0.16083916083916083
764 : http://neurosciencenews.com/ampa-inactivation-erases-fear-memory-mice-6142/ : 0.16083916083916083
765 : http://neurosciencenews.com/

868 : http://venturebeat.com/2017/02/24/apple-extending-seattle-footprint-with-new-offices-dedicated-to-ai-and-machine-learning/ : 0.21518987341772153
869 : http://www.news-medical.net/news/20170225/Fred-Hutch-scientists-make-important-step-in-identifying-specific-T-cells-to-fight-against-cancer.aspx : 0.22988505747126436
870 : https://www.scientificamerican.com/video/watch-velvet-worms-fire-their-slime-cannons/ : 0.2549019607843137
871 : http://neurosciencenews.com/memory-impairment-cocaine-dads-6157/ : 0.13719512195121952
872 : http://neurosciencenews.com/memory-impairment-cocaine-dads-6157/ : 0.13719512195121952
873 : http://neurosciencenews.com/schizophrenia-ra-6158/ : 0.15772870662460567
874 : http://neurosciencenews.com/schizophrenia-ra-6158/ : 0.15772870662460567
875 : http://neurosciencenews.com/schizophrenia-ra-6158/ : 0.15772870662460567
876 : http://neurosciencenews.com/schizophrenia-ra-6158/ : 0.15772870662460567
877 : http://neurosciencenews.com/memory-impairment-cocaine-d

975 : https://www.statnews.com/2017/02/28/trump-fda-reaction/ : 0.26956521739130435
976 : http://www.the-scientist.com//?articles.view/articleNo/48539/title/Massively-Parallel-Perturbations/ : 0.21256038647342995
977 : http://www.the-scientist.com//?articles.view/articleNo/48619/title/Infographic--Single-Cell-CRISPR-Screens/ : 0.24675324675324675
978 : http://www.the-scientist.com//?articles.view/articleNo/48576/title/A-Selection-of-CRISPR-Proof-of-Principle-Studies/ : 0.12903225806451613
979 : https://www.statnews.com/2015/12/24/freeze-fat-weight-loss/ : 0.19036144578313252
980 : https://www.statnews.com/2017/03/01/pap-smear-cervical-cancer-hpv/ : 0.34782608695652173
981 : https://www.statnews.com/2017/03/01/grief-parents-children-loss/ : 0.21782178217821782
982 : http://www.news-medical.net/news/20170301/UNC-Lineberger-achieves-major-milestone-with-launch-of-cellular-immunotherapy-program.aspx : 0.1585014409221902
983 : https://www.scientificamerican.com/article/inside-the-quest-to-m

1099 : https://www.statnews.com/pharmalot/2017/03/03/trump-fda-crowley-rare-disease/ : 0.3142857142857143
1100 : http://venturebeat.com/2017/03/03/forget-obamacare-ai-is-driving-the-real-health-care-transformation/ : 0.18769230769230769
1101 : https://www.statnews.com/2017/03/03/fake-milk-fight/ : 0.14978902953586498
1102 : https://www.statnews.com/2017/03/03/cole-trump-budget-nih/ : 0.26136363636363635
1103 : https://www.statnews.com/2017/03/03/health-habits-neighborhood-maps/ : 0.24621212121212122
1104 : https://www.statnews.com/2016/07/18/crispr-off-target-effects/ : 0.19329896907216496
1105 : https://www.statnews.com/2016/12/09/opoid-overdose-deaths-us/ : 0.36065573770491804
1106 : https://www.statnews.com/2016/05/04/mouth-full-bacteria-blooming-beautiful/ : 0.22988505747126436
1107 : http://www.the-scientist.com//?articles.view/articleNo/48701/title/Cortical-Interneurons-Show-Layer-Specific-Activities/ : 0.19086021505376344
1108 : https://www.scientificamerican.com/article/how-the

1232 : https://www.statnews.com/2017/03/08/md-anderson-depinho-resigns/ : 0.17537313432835822
1233 : https://www.statnews.com/pharmalot/2017/03/08/grassley-epipen-kaleo-prices/ : 0.2872340425531915
1234 : https://www.statnews.com/2017/03/08/drug-ads-spending-single-drug/ : 0.3218390804597701
1235 : https://www.statnews.com/pharmalot/2017/03/08/fda-india-drug-maker-violations/ : 0.3069306930693069
1236 : https://www.statnews.com/2017/03/08/drug-ads-spending-single-drug/ : 0.3218390804597701
1237 : https://www.statnews.com/2017/03/08/brazil-yellow-fever/ : 0.22676579925650558
1238 : http://www.news-medical.net/news/20170309/UVA-researchers-use-probiotic-bacteria-in-yogurt-to-reverse-depression-symptoms.aspx : 0.23588039867109634
1239 : https://www.statnews.com/2016/01/21/als-patient-foundation-research/ : 0.19170984455958548
1240 : https://www.scientificamerican.com/article/obamacare-repeal-and-trump-rsquo-s-spending-plan-put-cdc-budget-in-peril/ : 0.18829516539440203
1241 : https://www.

1359 : https://www.statnews.com/2017/03/13/biotech-regulation/ : 0.28865979381443296
1360 : http://neurosciencenews.com/crispr-neck-pain-6241/ : 0.1675531914893617
1361 : http://neurosciencenews.com/crispr-neck-pain-6241/ : 0.1675531914893617
1362 : https://www.statnews.com/2017/03/13/genetic-privacy-wellness/ : 0.1714922048997773
1363 : http://neurosciencenews.com/locked-in-als-fnirs-6238/ : 0.16023738872403562
1364 : https://www.statnews.com/2017/03/13/gop-health-plan-takeaways/ : 0.182648401826484
1365 : http://neurosciencenews.com/neurobiology-crime-6244/ : 0.16747572815533981
1366 : http://neurosciencenews.com/locked-in-als-fnirs-6238/ : 0.16023738872403562
1367 : http://neurosciencenews.com/neurobiology-crime-6244/ : 0.16747572815533981
1368 : http://neurosciencenews.com/locked-in-als-fnirs-6238/ : 0.16023738872403562
1369 : http://neurosciencenews.com/diet-baby-blues-depression-6242/ : 0.1483375959079284
1370 : http://neurosciencenews.com/depression-mct-6237/ : 0.209677419354838

1475 : https://www.statnews.com/2017/03/16/funding-basic-science-nih/ : 0.3088235294117647
1476 : https://www.statnews.com/pharmalot/2017/03/16/fda-drug-prices-canada-importation/ : 0.21875
1477 : https://www.statnews.com/2017/03/16/funding-basic-science-nih/ : 0.3088235294117647
1478 : https://www.statnews.com/2017/03/16/user-fee-fda-budget/ : 0.4230769230769231
1479 : http://neurosciencenews.com/serotonin-psychology-6252/ : 0.1596244131455399
1480 : http://www.the-scientist.com//?articles.view/articleNo/48840/title/Unstructured-Proteins-Help-Tardigrades-Survive-Desiccation/ : 0.2392857142857143
1481 : https://www.statnews.com/2017/03/16/nih-fogarty-center/ : 0.1895424836601307
1482 : http://www.the-scientist.com//?articles.view/articleNo/48833/title/Inflammation-Drives-Gut-Bacteria-Evolution/ : 0.20249221183800623
1483 : http://www.the-scientist.com//?articles.view/articleNo/48833/title/Inflammation-Drives-Gut-Bacteria-Evolution/ : 0.20249221183800623
1484 : https://www.statnews.com/

1581 : https://www.statnews.com/pharmalot/2017/03/20/amgen-cholesterol-drug-prices/ : 0.22962962962962963
1582 : https://www.bostonglobe.com/business/2017/03/19/biopharma-facing-stiffer-opposition/EmzTSsLhlanZ33E6AOUteO/story.html : 0.18509615384615385
1583 : http://www.news-medical.net/news/20170320/Linking-RNA-to-autoimmune-diseases.aspx : 0.15917602996254682
1584 : https://www.statnews.com/2017/03/20/esperion-amgen-pcsk9/ : 0.3053435114503817
1585 : https://www.statnews.com/2017/03/20/hospitals-rural-health-care-aca-trump/ : 0.20316027088036118
1586 : https://www.youtube.com/watch?v=AVwUkBAwjgs&list=LLPEk13IV9vkbxwtDR0KpWOA&index=8 : 0.5
1587 : https://kynplex.com : 0.20512820512820512
1588 : https://kynplex.com : 0.20512820512820512
1589 : https://kynplex.com : 0.20512820512820512
1590 : https://kynplex.com : 0.20512820512820512
1591 : https://kynplex.com : 0.20512820512820512
1592 : https://www.statnews.com/2016/12/20/transgender-youth-doctors-clinics/ : 0.125
1593 : https://www.s

1687 : http://neurosciencenews.com/sensory-signals-neuroscience-6278/ : 0.14741035856573706
1688 : http://neurosciencenews.com/parkinsons-exercise-neurology-6282/ : 0.1597444089456869
1689 : http://neurosciencenews.com/immune-system-vaccines-6287/ : 0.16389548693586697
1690 : http://neurosciencenews.com/parkinsons-exercise-neurology-6282/ : 0.1597444089456869
1691 : http://neurosciencenews.com/pleasure-amygdala-neuroscience-6286/ : 0.17506631299734748
1692 : http://neurosciencenews.com/brain-rewiring-senses-blind-6276/ : 0.1455026455026455
1693 : https://www.scientificamerican.com/article/california-adopts-strict-rules-for-methane-emissions/ : 0.20233463035019456
1694 : https://www.scientificamerican.com/article/ethical-guidelines-on-lab-grown-embryos-beg-for-revamping-scientists-say/ : 0.15514018691588785
1695 : http://neurosciencenews.com/pleasure-amygdala-neuroscience-6286/ : 0.17506631299734748
1696 : http://www.news-medical.net/news/20170323/Analysis-of-antibodies-could-be-new-too

1790 : http://www.the-scientist.com//?articles.view/articleNo/48973/title/Making-CAR-T-Cell-Therapy-Safer/ : 0.11833550065019506
1791 : http://www.the-scientist.com//?articles.view/articleNo/48903/title/Starvation-Response-Triggers-Melanoma-Invasion/ : 0.19014084507042253
1792 : http://www.the-scientist.com//?articles.view/articleNo/49047/title/Cancer-Genomes/ : 0.20833333333333334
1793 : http://www.the-scientist.com//?articles.view/articleNo/48909/title/Location--Location--Location/ : 0.12278630460448642
1794 : http://www.the-scientist.com//?articles.view/articleNo/49000/title/Neoantigens-Enable-Personalized-Cancer-Immunotherapy/ : 0.09871794871794871
1795 : http://www.the-scientist.com//?articles.view/articleNo/48973/title/Making-CAR-T-Cell-Therapy-Safer/ : 0.11833550065019506
1796 : http://www.the-scientist.com//?articles.view/articleNo/49003/title/Circadian-Rhythms-Influence-Treatment-Effects/ : 0.09655172413793103
1797 : http://www.the-scientist.com//?articles.view/articleNo/48910

1897 : http://news.mit.edu/2017/csail-launches-artificial-intelligence-initiative-with-industry-0406 : 0.15384615384615385
1898 : https://www.scientificamerican.com/article/politicians-shouldn-rsquo-t-troll-through-scientists-rsquo-e-mails/ : 0.20930232558139536
1899 : https://www.scientificamerican.com/podcast/episode/extreme-storms-are-extreme-eroders/ : 0.24380165289256198
1900 : https://www.scientificamerican.com/article/earths-co2-could-spike-to-a-level-not-seen-since-the-dinosaurs/ : 0.22012578616352202
1901 : https://www.scientificamerican.com/article/a-tiny-detection-chip-could-find-methane-leaks-autonomously/ : 0.1590909090909091
1902 : https://www.scientificamerican.com/article/most-americans-oppose-climate-science-cuts/ : 0.24630541871921183
1903 : https://www.scientificamerican.com/article/most-americans-oppose-climate-science-cuts/ : 0.24630541871921183
1904 : https://www.scientificamerican.com/article/astronomers-observe-milky-way-like-galaxies-in-early-universe/ : 0.1695

2010 : http://neurosciencenews.com/alzheimers-music-therapy-6414/ : 0.2247557003257329
2011 : http://neurosciencenews.com/stem-cell-brain-tissue-6407/ : 0.16159250585480095
2012 : http://neurosciencenews.com/chemobrain-cancer-6397/ : 0.15085158150851583
2013 : http://neurosciencenews.com/artificial-intelligence-human-prejudice-6411/ : 0.12382739212007504
2014 : http://neurosciencenews.com/immunotherapy-glioblastoma-6416/ : 0.11555555555555555
2015 : http://neurosciencenews.com/alcohol-dependence-amygdala-6399/ : 0.132013201320132
2016 : http://neurosciencenews.com/artificial-intelligence-human-prejudice-6411/ : 0.12382739212007504
2017 : http://neurosciencenews.com/oxytocin-ptsd-addiction-6412/ : 0.2197452229299363
2018 : http://neurosciencenews.com/cognitive-map-scaling-6403/ : 0.13852813852813853
2019 : http://neurosciencenews.com/sensory-cell-regeneration-hearing-6391/ : 0.14772727272727273
2020 : http://neurosciencenews.com/oxytocin-ptsd-addiction-6412/ : 0.2197452229299363
2021 : 

In [92]:
# WARNING: Takes a long time. Might not be accurate but relatively should determine which article has more complex words.

# def num_unusual_words(s):
#     raw = nltk.word_tokenize(s)
#     text = nltk.Text(raw)
#     text_vocab = set(w.lower() for w in text if w.isalpha())
#     english_vocab = set(w.lower() for w in nltk.corpus.words.words())
#     unusual = text_vocab - english_vocab
#     return len(sorted(unusual))

# for i in range(len(news_df['articleUrl'])):
#     try:
#         news_df.loc[i, 'num_words_not_in_dictionary'] = num_unusual_words(news_df['article'][i])
#         print(i, ':', news_df['articleUrl'][i], ":", num_unusual_words(news_df['article'][i]))
#     except:
#         pass # keeps empty articles 0.0

0 : https://www.statnews.com/2017/02/09/dana-farber-mar-lago/ : 34
1 : https://www.statnews.com/2017/02/09/antibiotics-resistance-superbugs/ : 116
2 : https://www.statnews.com/2017/02/07/scientist-march-trump/ : 76
3 : https://www.statnews.com/2017/02/09/twitter-hospitals-medicine/ : 105
4 : https://www.statnews.com/2017/02/09/obamacare-repeal-preexisting-conditions/ : 84
5 : https://www.statnews.com/2017/02/08/drug-names-process/ : 231
6 : https://www.statnews.com/2017/02/09/alzheimers-disease-statins/ : 85
7 : https://www.statnews.com/2016/03/28/google-life-sciences-exodus/ : 311
8 : https://www.statnews.com/2017/02/06/mental-health-college-students/ : 192
9 : https://www.statnews.com/2017/02/09/doctors-training-social-change/ : 102
10 : https://www.statnews.com/2017/02/09/injury-states-unusually-common/ : 27
11 : https://www.statnews.com/2017/02/08/social-media-compulsion-mental-illness/ : 130
12 : https://www.statnews.com/2017/02/09/harvard-reunion-sugar-study/ : 72
13 : https://ww

89 : https://www.scientificamerican.com/article/wetlands-can-help-fight-climate-change/ : 63
90 : https://www.scientificamerican.com/article/march-for-science-set-for-earth-day/ : 26
91 : https://www.scientificamerican.com/article/science-and-the-u-s-supreme-court-the-cases-to-watch-in-2017/ : 107
92 : https://www.scientificamerican.com/article/florida-has-seen-bad-effects-from-trump-like-climate-gag-orders/ : 102
93 : https://www.scientificamerican.com/article/groundhog-day-is-hogwash/ : 4
94 : https://www.scientificamerican.com/article/beijing-vows-deep-cut-in-coal-use-in-2017-to-fight-smog/ : 39
95 : https://www.scientificamerican.com/podcast/episode/the-arctics-anti-snowball-snowball-effect/ : 39
96 : https://www.scientificamerican.com/article/e-u-looks-to-china-for-climate-leadership/ : 74
97 : https://www.scientificamerican.com/article/house-science-panel-to-hold-hearing-on-ldquo-making-the-epa-great-again-rdquo/ : 37
98 : https://www.scientificamerican.com/article/house-science-

169 : http://www.the-scientist.com//?articles.view/articleNo/47333/title/Immunity-in-the-Brain/ : 36
170 : http://www.the-scientist.com//?articles.view/articleNo/47905/title/Pharma-Redo/ : 8
171 : http://www.the-scientist.com//?articles.view/articleNo/48034/title/Abscisic-Acid-s-Role-in-Ferns-Finally-Determined/ : 100
172 : http://www.the-scientist.com//?articles.view/articleNo/47993/title/How-Plant-Soil-Feedback-Affects-Ecological-Diversity/ : 77
173 : http://www.the-scientist.com//?articles.view/articleNo/48376/title/Toward-Killing-Cancer-with-Bacteria/ : 77
174 : https://www.statnews.com/2017/02/03/refugees-detroit-stress/ : 120
175 : http://www.bostonglobe.com/metro/2017/02/04/dressed-scrubs-she-roamed-hospital-but-she-wasn-supposed-there/3OkuPYs4PklE3MGdeLirhM/story.html : 134
176 : https://www.statnews.com/2017/02/09/cuba-doctors-us/ : 82
177 : https://www.statnews.com/2017/02/03/super-bowl-can-make-you-sick/ : 78
178 : https://www.statnews.com/2017/02/06/doctors-work-life-balanc

273 : https://www.statnews.com/2017/02/07/let-patients-talk/ : 95
274 : https://www.statnews.com/2016/02/11/stat-harvard-poll-gene-editing/ : 92
275 : https://www.statnews.com/2017/02/09/malaria-odor-mosquitoes/ : 67
276 : https://www.statnews.com/2016/09/01/optogenetics/ : 278
277 : https://www.statnews.com/pharmalot/2017/02/10/trump-fda-drug-prices-cancer/ : 15
278 : https://www.statnews.com/2017/02/10/pharma-acquisitions/ : 19
279 : https://www.statnews.com/2017/02/10/pharma-tax-break-jobs/ : 16
280 : https://www.statnews.com/2017/02/10/syria-refugees-boston-childrens/ : 37
281 : https://www.scientificamerican.com/article/arctic-2-0-what-happens-after-all-the-ice-goes/ : 210
282 : https://www.statnews.com/2017/02/10/price-hhs-confirmation/ : 100
283 : https://www.statnews.com/2017/02/10/medicaid-tinkering-trump-verma/ : 82
284 : https://www.statnews.com/2017/02/10/marathons-move-on-rare-disease-drug-martin-shrkeli-all-over-again/ : 16
285 : https://www.statnews.com/pharmalot/2017/02

378 : https://www.statnews.com/pharmalot/2017/02/13/sec-pharma-accounting-allergan/ : 21
379 : http://www.the-scientist.com//?articles.view/articleNo/47973/title/First-Bumblebee-Species-Declared-Endangered-in-U-S-/ : 40
380 : http://neurosciencenews.com/cognition-older-mothers-6107/ : 68
381 : http://neurosciencenews.com/big-brother-brain-scan-6105/ : 95
382 : http://neurosciencenews.com/cognition-older-mothers-6107/ : 68
383 : http://neurosciencenews.com/big-brother-brain-scan-6105/ : 95
384 : http://neurosciencenews.com/ptsd-genetics-memory-6107/ : 82
385 : http://neurosciencenews.com/big-brother-brain-scan-6105/ : 95
386 : http://neurosciencenews.com/ptsd-genetics-memory-6107/ : 82
387 : http://neurosciencenews.com/neurons-cancer-growth-6106/ : 98
388 : http://neurosciencenews.com/neurons-cancer-growth-6106/ : 98
389 : http://neurosciencenews.com/ptsd-genetics-memory-6107/ : 82
390 : http://neurosciencenews.com/neurons-cancer-growth-6106/ : 98
391 : http://neurosciencenews.com/big-b

484 : https://www.statnews.com/2017/01/20/david-byrne-neuroscience/ : 109
485 : https://www.statnews.com/2016/09/21/chronic-fatigue-syndrome-pace-trial/ : 189
486 : https://www.statnews.com/2017/02/16/mammograms-plus-personalized-treatment-best-options-fight-breast-cancer/ : 68
487 : https://www.statnews.com/2017/02/15/crispr-winners-losers/ : 84
488 : https://www.statnews.com/2017/02/16/carfentanil-china-ban/ : 101
489 : https://www.statnews.com/2017/02/16/crispr-patent-decision-six-takeaways/ : 24
490 : https://www.statnews.com/2017/02/16/aaas-qa-trump-science/ : 87
491 : https://www.statnews.com/2017/02/16/pregnant-women-rural-america/ : 159
492 : http://www.the-scientist.com//?articles.view/articleNo/48492/title/Henrietta-Lacks-s-Family-Seeks-Compensation/ : 29
493 : https://www.statnews.com/pharmalot/2017/02/16/marathon-drug-prices-phrma/ : 11
494 : https://www.statnews.com/pharmalot/2017/02/16/shkreli-crispr-dermatology/ : 29
495 : https://www.scientificamerican.com/article/falli

586 : http://feedproxy.google.com/~r/RenewableEnergyNewsRssFeed/~3/pbniUEcoTdc/pruitt-confirmed-as-epa-administrator.html : 23
587 : http://feedproxy.google.com/~r/RenewableEnergyNewsRssFeed/~3/uqxDj_TZyuI/xcel-energy-partners-with-faa-on-utility-drone-technology.html : 38
588 : http://feedproxy.google.com/~r/RenewableEnergyNewsRssFeed/~3/3vvxk33d9So/china-widens-wind-power-lead-over-us.html : 30
589 : http://feedproxy.google.com/~r/RenewableEnergyNewsRssFeed/~3/CQSy-VBzgR8/wind-power-first-electricity-delivered-from-dudgeon-offshore-wind-farm.html : 38
590 : http://feedproxy.google.com/~r/RenewableEnergyNewsRssFeed/~3/ry5wbOW52mA/australian-renewable-energy-agency-commits-4-1m-to-advance-energy-storage.html : 29
591 : http://feedproxy.google.com/~r/RenewableEnergyNewsRssFeed/~3/YM9vBoo4oEI/spp-sets-north-american-record-for-wind-power.html : 16
592 : http://feedproxy.google.com/~r/RenewableEnergyNewsRssFeed/~3/E9IZtT62_nE/national-lab-studies-solar-panel-corrosion.html : 47
593 : http

673 : http://venturebeat.com/2017/01/21/5-industries-ripe-for-human-machine-learning/ : 108
674 : http://venturebeat.com/2017/02/04/how-ai-will-transform-education-in-2017/ : 87
675 : http://venturebeat.com/2017/02/01/beyond-the-gimmick-implementing-effective-machine-learning-vb-live/ : 47
676 : http://venturebeat.com/2017/01/26/how-silicon-valley-is-teaching-language-to-machines/ : 120
677 : http://venturebeat.com/2017/02/07/3-common-jobs-ai-will-augment-or-displace/ : 90
678 : http://venturebeat.com/2017/01/31/ai-is-going-mobile/ : 107
679 : http://venturebeat.com/2017/01/24/why-we-need-pioneers-in-cognitive-computing/ : 96
680 : http://venturebeat.com/2017/01/23/5-super-smart-gadgets-that-debuted-at-ces-2017/ : 63
681 : http://venturebeat.com/2017/01/31/patients-are-about-to-see-a-new-doctor-artificial-intelligence/ : 112
682 : http://venturebeat.com/2017/01/19/3-questions-marketers-should-ask-before-investing-in-ai/ : 78
683 : https://www.statnews.com/2017/02/20/aaas-politics-trump

756 : http://venturebeat.com/2017/02/20/outsource-your-boring-back-office-paperwork-to-ai/ : 123
757 : http://venturebeat.com/2017/02/20/how-well-stay-productive-in-the-connected-car/ : 96
758 : http://venturebeat.com/2017/02/21/how-ai-will-help-you-connect-with-customers/ : 131
759 : https://www.statnews.com/2017/02/21/testosterone-gel-men-memory/ : 67
760 : https://www.bostonglobe.com/metro/2017/02/21/medical-students-face-fallout-over-trump-immigration-order/BTHCdec6aJtnbUBM178xyH/story.html : 79
761 : http://www.the-scientist.com//?articles.view/articleNo/48510/title/Abundant-Sequence-Errors-in-Public-Databases/ : 84
762 : https://www.scientificamerican.com/article/giant-antarctic-ice-shelf-crack-threatens-to-become-a-massive-iceberg/ : 87
763 : http://neurosciencenews.com/ampa-inactivation-erases-fear-memory-mice-6142/ : 78
764 : http://neurosciencenews.com/ampa-inactivation-erases-fear-memory-mice-6142/ : 78
765 : http://neurosciencenews.com/ampa-inactivation-erases-fear-memory-m

856 : http://neurosciencenews.com/complex-learning-bees-6155/ : 85
857 : http://neurosciencenews.com/complex-learning-bees-6155/ : 85
858 : http://neurosciencenews.com/complex-learning-bees-6155/ : 85
859 : http://neurosciencenews.com/mci-virtual-reality-6154/ : 108
860 : http://neurosciencenews.com/alcohol-brain-aging-6156/ : 174
861 : http://neurosciencenews.com/alcohol-brain-aging-6156/ : 174
862 : http://neurosciencenews.com/alcohol-brain-aging-6156/ : 174
863 : http://neurosciencenews.com/mci-virtual-reality-6154/ : 108
864 : http://neurosciencenews.com/mci-virtual-reality-6154/ : 108
865 : http://venturebeat.com/2017/02/24/conversation-design-is-the-next-big-ux-challenge-for-capital-one/ : 75
866 : http://www.news-medical.net/news/20170224/New-research-pinpoints-how-bacteria-get-into-lungs-of-healthy-people.aspx : 102
867 : https://www.statnews.com/2016/09/21/hippocratic-oath-medical-students-doctors/ : 63
868 : http://venturebeat.com/2017/02/24/apple-extending-seattle-footprint-

957 : http://venturebeat.com/2017/02/28/machine-learning-in-microsoft-words-new-editor-gave-me-the-frights/ : 46
958 : https://www.statnews.com/2017/02/28/vision-blurry-middle-age/ : 79
959 : http://neurosciencenews.com/neuroscience-news-virality-6170/ : 124
960 : http://neurosciencenews.com/memories-music-6170/ : 61
961 : http://neurosciencenews.com/music-physical-commitment-6169/ : 119
962 : http://neurosciencenews.com/neuroscience-news-virality-6170/ : 124
963 : http://neurosciencenews.com/alzheimers-tmc-6171/ : 118
964 : http://neurosciencenews.com/music-physical-commitment-6169/ : 119
965 : http://neurosciencenews.com/neuroscience-news-virality-6170/ : 124
966 : http://neurosciencenews.com/music-physical-commitment-6169/ : 119
967 : http://neurosciencenews.com/memories-music-6170/ : 61
968 : http://neurosciencenews.com/alzheimers-tmc-6171/ : 118
969 : http://neurosciencenews.com/memories-music-6170/ : 61
970 : https://www.statnews.com/2017/02/28/colorectal-cancer-millennials-gen-x

1055 : http://neurosciencenews.com/mirror-movement-brain-abnormality-6186/ : 125
1056 : http://neurosciencenews.com/information-flow-neuroscience-6187/ : 86
1057 : http://neurosciencenews.com/information-flow-neuroscience-6187/ : 86
1058 : http://neurosciencenews.com/music-speech-neuroscience-6289/ : 102
1059 : http://neurosciencenews.com/mirror-movement-brain-abnormality-6186/ : 125
1060 : http://neurosciencenews.com/information-flow-neuroscience-6187/ : 86
1061 : http://neurosciencenews.com/cte-neurology-6188/ : 75
1062 : http://neurosciencenews.com/genetics-red-hair-parkinsons-cancer-6185/ : 111
1063 : http://neurosciencenews.com/cte-neurology-6188/ : 75
1064 : http://neurosciencenews.com/music-speech-neuroscience-6289/ : 102
1065 : http://neurosciencenews.com/mirror-movement-brain-abnormality-6186/ : 125
1066 : http://www.news-medical.net/news/20170302/Researchers-uncover-mechanism-that-contributes-to-joint-inflammationc2a0in-rheumatoid-arthritis-patients.aspx : 33
1067 : http://ne

1155 : http://neurosciencenews.com/tbi-genetics-brain-damage-6204/ : 112
1156 : http://neurosciencenews.com/ketamine-no-wonder-drug-depression/ : 73
1157 : http://neurosciencenews.com/visual-memory-perception-6207/ : 96
1158 : http://neurosciencenews.com/ketamine-no-wonder-drug-depression/ : 73
1159 : http://neurosciencenews.com/ketamine-no-wonder-drug-depression/ : 73
1160 : https://www.statnews.com/pharmalot/2017/03/06/states-epipen-alternatives/ : 19
1161 : https://www.statnews.com/2016/04/25/scribes-emergency-room/ : 124
1162 : https://www.statnews.com/2017/03/06/medicaid-gop-bill/ : 73
1163 : https://www.statnews.com/2016/08/05/lung-cancer-cuba-biotech/ : 199
1164 : http://www.news-medical.net/news/20170307/Researchers-develop-skin-tests-decision-support-tool-to-increase-antibiotic-use-in-hospitalized-patients.aspx : 86
1165 : http://www.news-medical.net/news/20170306/New-technology-provides-extracorporeal-support-and-enables-recovery-of-damaged-donor-lungs.aspx : 80
1166 : https:

1250 : https://www.scientificamerican.com/podcast/episode/forensic-science-trials-with-errors/ : 20
1251 : https://www.statnews.com/2017/03/09/colon-cancer-young-adults/ : 54
1252 : http://www.the-scientist.com//?articles.view/articleNo/48768/title/Five-More-Synthetic-Yeast-Chromosomes-Completed/ : 95
1253 : https://www.statnews.com/2017/03/09/cheese-recall-sargento-indiana/ : 74
1254 : https://www.statnews.com/pharmalot/2017/03/09/regeneron-ceo-drug-prices/ : 20
1255 : https://www.scientificamerican.com/article/alaskan-caribou-are-adapting-to-warming/ : 47
1256 : https://www.statnews.com/2017/03/09/learning-doctor-shadow-trumps-wall/ : 85
1257 : https://www.statnews.com/2017/03/09/synthetic-genome-yeast/ : 102
1258 : http://neurosciencenews.com/memory-training-longevity-6221/ : 122
1259 : http://neurosciencenews.com/brain-activity-neurobiology-6224/ : 111
1260 : http://neurosciencenews.com/alzheimers-jumping-genes-apoptosis-6219/ : 129
1261 : http://neurosciencenews.com/dopamine-decis

1349 : http://www.the-scientist.com//?articles.view/articleNo/48776/title/Opinion--Redefining-Species-Based-on-Compatibility-of-Nuclear-and-Mitochondrial-Genes/ : 83
1350 : https://www.statnews.com/pharmalot/2017/03/13/fda-scott-gottlieb-valeant-amgen/ : 19
1351 : https://www.statnews.com/2017/03/13/week-biopharma-2/ : 6
1352 : https://www.statnews.com/2017/03/13/death-patients-diane-meier/ : 73
1353 : https://www.statnews.com/2016/12/13/actuaries-health-care-precision-medicine/ : 63
1354 : https://www.statnews.com/2017/03/13/tuberculosis-who-bacteria-antibiotics/ : 68
1355 : https://www.statnews.com/2017/03/13/tuberculosis-who-antibiotic-resistance/ : 82
1356 : https://www.statnews.com/2017/03/13/price-employers-genetic-information/ : 45
1357 : https://www.statnews.com/2016/03/15/walmart-house-calls-hospitals/ : 82
1358 : https://www.statnews.com/2016/02/19/brain-hackers-nootropics/ : 188
1359 : https://www.statnews.com/2017/03/13/biotech-regulation/ : 19
1360 : http://neurosciencenew

1446 : http://neurosciencenews.com/speech-processing-dementia-6246/ : 123
1447 : http://neurosciencenews.com/speech-processing-dementia-6246/ : 123
1448 : http://neurosciencenews.com/genetics-brain-aging-6250/ : 75
1449 : http://neurosciencenews.com/genetics-brain-aging-6250/ : 75
1450 : http://neurosciencenews.com/genetics-brain-aging-6250/ : 75
1451 : https://www.statnews.com/2015/11/17/gene-drive-hijack-evolution/ : 243
1452 : https://www.statnews.com/2017/01/20/obamacare-single-payer-national-health-insurance/ : 81
1453 : https://www.statnews.com/2017/03/15/death-end-of-life-states/ : 95
1454 : https://www.statnews.com/2016/11/30/fda-califf-stem-cell-nejm/ : 79
1455 : https://www.statnews.com/pharmalot/2017/03/15/fda-sanders-marathon-drug-prices/ : 13
1456 : https://www.statnews.com/pharmalot/2017/03/15/gottlieb-markey-fda-opioids/ : 13
1457 : https://www.statnews.com/pharmalot/2017/03/15/california-bill-transparency-drug-pricing/ : 20
1458 : http://www.news-medical.net/news/201703

1544 : https://www.statnews.com/pharmalot/2017/03/17/lilly-patents-canada-tribunal/ : 18
1545 : https://www.scientificamerican.com/article/triumph-of-the-city-engines-of-innovation/ : 21
1546 : https://www.statnews.com/2016/08/02/hospital-ratings-skepticism/ : 86
1547 : https://www.scientificamerican.com/article/dark-matter-did-not-dominate-early-galaxies/ : 42
1548 : https://www.statnews.com/2016/02/01/communication-failures-malpractice-study/ : 130
1549 : http://neurosciencenews.com/personality-trait-intellectual-humility-6257/ : 98
1550 : http://neurosciencenews.com/personality-trait-intellectual-humility-6257/ : 98
1551 : http://neurosciencenews.com/personality-trait-intellectual-humility-6257/ : 98
1552 : http://neurosciencenews.com/camkii-learning-memory-6258/ : 106
1553 : http://neurosciencenews.com/camkii-learning-memory-6258/ : 106
1554 : http://neurosciencenews.com/camkii-learning-memory-6258/ : 106
1555 : http://neurosciencenews.com/camkii-learning-memory-6258/ : 106
1556 : 

1649 : https://www.statnews.com/2017/03/21/craig-venter-sequence-genome/ : 22
1650 : https://www.statnews.com/2017/03/21/syrian-hearing-loss/ : 75
1651 : https://www.statnews.com/2017/03/21/trump-medical-research-budget/ : 71
1652 : http://www.news-medical.net/news/20170321/Treatment-with-interferon-may-ease-symptoms-of-Ebola-patients.aspx : 66
1653 : https://www.statnews.com/2017/03/21/epipen-recall-mylan/ : 30
1654 : http://www.news-medical.net/news/20170321/Efficacy-of-innovative-gene-therapy-can-be-tested-quickly-and-cost-effectively-using-new-cellular-model.aspx : 56
1655 : http://neurosciencenews.com/older-mothers-psychology-6271/ : 73
1656 : https://www.statnews.com/2017/03/21/prostate-cancer-treatment-side-effects/ : 62
1657 : https://www.statnews.com/pharmalot/2017/03/21/chile-patents-hepatitis-cancer/ : 24
1658 : http://neurosciencenews.com/alzheimers-vascular-system-6272/ : 112
1659 : https://www.statnews.com/pharmalot/2017/03/21/maryland-generic-drug-price-gouging/ : 13
166

1739 : https://www.statnews.com/2017/03/27/soon-shiong-cancer-video-questions/ : 188
1740 : http://www.news-medical.net/news/20170324/Researchers-develop-new-lab-on-a-chip-platform-that-aims-to-improve-pathogen-detection.aspx : 34
1741 : http://www.news-medical.net/news/20170329/Study-explores-new-immunotherapy-combinations-to-treat-prostate-cancer.aspx : 93
1742 : https://www.scientificamerican.com/article/hospitals-halt-hiring-projects-amid-uncertain-fate-of-obamacare/ : 99
1743 : https://www.scientificamerican.com/podcast/episode/whats-driving-the-self-driving-cars-rush/ : 3
1744 : https://www.statnews.com/2017/03/29/manchin-mylan-drug-pricing/ : 13
1745 : http://www.news-medical.net/news/20170325/UVA-researchers-discover-unexpected-interaction-between-mens-testes-and-immune-system.aspx : 52
1746 : https://www.statnews.com/2017/03/29/waivers-health-care-republicans-medicaid/ : 114
1747 : https://www.statnews.com/2017/03/29/mumps-outbreaks/ : 163
1748 : http://www.the-scientist.com//

1820 : https://www.statnews.com/2017/03/30/drug-prices-new-approved/ : 28
1821 : https://www.statnews.com/pharmalot/2017/03/30/glaxo-trump-fda-price/ : 23
1822 : https://www.statnews.com/pharmalot/2017/03/29/lawmakers-bill-drug-prices/ : 19
1823 : https://www.statnews.com/pharmalot/2017/03/29/uk-doctors-drug-makers/ : 15
1824 : https://www.statnews.com/pharmalot/2017/03/29/arizona-law-off-label-free-speech/ : 18
1825 : https://www.statnews.com/pharmalot/2017/03/29/fda-gottlieb-glaxo-mylan/ : 23
1826 : https://www.statnews.com/pharmalot/2017/03/28/mylan-epipen-medicaid-taxpayers/ : 16
1827 : http://www.the-scientist.com//?articles.view/articleNo/49000/title/Neoantigens-Enable-Personalized-Cancer-Immunotherapy/ : 212
1828 : http://www.the-scientist.com//?articles.view/articleNo/49005/title/Infographic--Targeting-Cancer-Antigens/ : 37
1829 : https://www.statnews.com/2017/03/31/moonshot-medicine-cancer-aging/ : 64
1830 : https://www.statnews.com/2017/04/04/preclinical-cellular-therapy-ovar

1905 : https://www.scientificamerican.com/article/epa-proposal-cuts-hundreds-of-climate-change-employees/ : 68
1906 : https://www.scientificamerican.com/article/epa-proposal-cuts-hundreds-of-climate-change-employees/ : 68
1907 : https://www.scientificamerican.com/article/states-challenge-trump-over-clean-power-plan/ : 41
1908 : https://www.scientificamerican.com/report/chemical-and-biological-weapons/ : 70
1909 : http://www.the-scientist.com//?articles.view/articleNo/49151/title/Viral-Trigger-for-Celiac-Disease-/ : 85
1910 : https://www.statnews.com/2016/05/06/science-janet-jacksons-pregnancy/ : 61
1911 : https://www.statnews.com/2017/04/06/crispr-pig-organs-transplant-luhan-yang/ : 259
1912 : https://www.statnews.com/2016/02/23/zika-olympics-first-opinion/ : 44
1913 : http://neurosciencenews.com/deep-sleep-aging-6349/ : 92
1914 : http://neurosciencenews.com/brain-navigation-6364/ : 109
1915 : http://neurosciencenews.com/ptsd-doxycycline-6365/ : 104
1916 : http://neurosciencenews.com/p

1984 : https://www.statnews.com/pharmalot/2017/04/05/fda-drugs-ema-approvals/ : 24
1985 : https://www.statnews.com/2017/04/11/where-do-former-officials-go-after-fda-look-down-k-street/ : 13
1986 : https://www.statnews.com/pharmalot/2017/04/10/stada-bain-cinven-teva-novo-diabetes/ : 29
1987 : https://www.statnews.com/pharmalot/2017/04/06/mallinckrodt-phrma-drug-prices/ : 21
1988 : https://www.statnews.com/pharmalot/2017/04/07/actelion-kfc-antibiotics-merck/ : 23
1989 : https://www.statnews.com/pharmalot/2017/04/05/democrats-trump-patents-drug-prices/ : 16
1990 : https://www.statnews.com/pharmalot/2017/04/07/winners-losers-gottlieb-fda-mallinckrodt/ : 32
1991 : https://www.statnews.com/2017/04/06/drug-development-price/ : 12
1992 : https://www.statnews.com/pharmalot/2017/04/05/cancer-doctors-right-to-try-laws/ : 9
1994 : https://www.statnews.com/pharmalot/2017/04/06/gottlieb-fda-allergan-botox/ : 24
1995 : https://www.statnews.com/pharmalot/2017/04/06/access-medicines-evidence-lacking/ :

2080 : https://www.statnews.com/2017/04/27/gottlieb-help-vote/ : 13
2081 : https://www.statnews.com/2017/04/25/oncology-cancer-precision-medicine-gleevec/ : 123
2082 : https://www.statnews.com/2017/04/27/undergrads-drugs-tropical-diseases/ : 13
2083 : https://www.statnews.com/2017/04/24/mylan-west-virginia-lawsuit/ : 12
2084 : https://www.statnews.com/2016/05/02/adam-feuerstein-biotech/ : 336
2085 : https://www.statnews.com/2017/04/27/gottlieb-help-vote/ : 13
2086 : https://www.statnews.com/pharmalot/2017/04/27/novartis-korea-glaxo-fda/ : 25
2087 : https://www.statnews.com/pharmalot/2017/04/26/patent-cliff-biosimilars-generics/ : 20
2088 : https://www.statnews.com/pharmalot/2017/04/26/teva-collins-fda-mallinckrodt/ : 23
2089 : https://www.statnews.com/2017/04/25/gottlieb-fda-right-to-try/ : 20
2090 : https://www.statnews.com/pharmalot/2017/04/25/roche-india-biosimilar-competition/ : 17
2091 : https://www.statnews.com/pharmalot/2017/04/25/supreme-court-biosimilar-amgen-sandoz/ : 21
2092

"readability_score" is not super important for this project, but here are some interesting leads for it: https://datawarrior.wordpress.com/2016/03/29/flesch-kincaid-readability-measure/

In [96]:
# From: http://www.nltk.org/howto/sentiment.html
def sentiment_polarity(s):
    paragraphs = re.split('\n' , s)
    paragraphs = [p for p in paragraphs if p != '']
    sid = SentimentIntensityAnalyzer()
    neu = []
    compound = []
    for sentence in paragraphs:
        ss = sid.polarity_scores(sentence)
        neu.append(ss['neu'])
    return np.var(neu)

for i in range(len(news_df['articleUrl'])):
    try:
        news_df.loc[i, 'sentiment_polarity'] = sentiment_polarity(news_df['article'][i])
        print(i, ':', news_df['articleUrl'][i], ":", sentiment_polarity(news_df['article'][i]))
    except:
        pass # keeps empty articles 0.0

0 : https://www.statnews.com/2017/02/09/dana-farber-mar-lago/ : 0.0704219183673
1 : https://www.statnews.com/2017/02/09/antibiotics-resistance-superbugs/ : 0.0353557191358
2 : https://www.statnews.com/2017/02/07/scientist-march-trump/ : 0.0554019555556
3 : https://www.statnews.com/2017/02/09/twitter-hospitals-medicine/ : 0.04819336
4 : https://www.statnews.com/2017/02/09/obamacare-repeal-preexisting-conditions/ : 0.040396102071
5 : https://www.statnews.com/2017/02/08/drug-names-process/ : 0.0216973525377
6 : https://www.statnews.com/2017/02/09/alzheimers-disease-statins/ : 0.0433910495868
7 : https://www.statnews.com/2016/03/28/google-life-sciences-exodus/ : 0.0387522992879
8 : https://www.statnews.com/2017/02/06/mental-health-college-students/ : 0.0254864799899
9 : https://www.statnews.com/2017/02/09/doctors-training-social-change/ : 0.0498242376543
10 : https://www.statnews.com/2017/02/09/injury-states-unusually-common/ : 0.14703521
11 : https://www.statnews.com/2017/02/08/social-med

82 : https://www.scientificamerican.com/article/can-empirical-entertainment-rescue-science-in-an-alt-fact-world/ : 0.0106801540062
83 : https://www.scientificamerican.com/article/trump-supreme-court-nominee-would-put-agencies-on-short-leash/ : 0.00972099555556
84 : https://www.scientificamerican.com/article/newfound-source-of-mysterious-cosmic-bursts-poses-deeper-enigmas/ : 0.0218558875
85 : https://www.scientificamerican.com/podcast/episode/rapid-response-vaccines-for-epidemic-outbreaks/ : 0.22325625
86 : https://www.scientificamerican.com/article/did-a-changing-climate-wipe-out-the-giant-kangaroo/ : 0.00485616
87 : https://www.scientificamerican.com/article/the-epa-has-started-to-remove-obama-era-information/ : 0.00388298765432
88 : https://www.scientificamerican.com/article/a-tax-on-carbon-pollution-can-benefit-business/ : 0.008463234375
89 : https://www.scientificamerican.com/article/wetlands-can-help-fight-climate-change/ : 0.00480357333333
90 : https://www.scientificamerican.com/

159 : http://www.the-scientist.com//?articles.view/articleNo/48073/title/RNA-Interference-Between-Kingdoms/ : 0.21329758284
160 : http://www.the-scientist.com//?articles.view/articleNo/48391/title/Pardis-Sabeti--An-American-Scientist-Born-in-Iran/ : 0.190500336484
161 : http://www.the-scientist.com//?articles.view/articleNo/47853/title/Researchers-Call-for-Retraction-of-Paper-that-Questions-HPV-Vaccine/ : 0.167078429752
162 : http://www.the-scientist.com//?articles.view/articleNo/48329/title/Regulators-OK-Clinical-Trials-Using-Donor-Stem-Cells/ : 0.207444888889
163 : https://www.statnews.com/2017/01/18/crispr-cas3-locus-biosciences/ : 0.00442040816327
164 : http://www.the-scientist.com//?articles.view/articleNo/47973/title/First-Bumblebee-Species-Declared-Endangered-in-U-S-/ : 0.186359928889
165 : http://www.the-scientist.com//?articles.view/articleNo/47611/title/Low-Social-Status-May-Weaken-Immune-System-in-Monkeys/ : 0.189051
166 : https://www.statnews.com/2017/01/25/vertex-pain-opio

254 : http://neurosciencenews.com/sleep-deprivation-memory-6051/ : 0.0
255 : http://neurosciencenews.com/dbs-heroin-relapse-6039/ : 0.0
256 : http://neurosciencenews.com/social-attraction-neuroscience-6025/ : 0.0
257 : http://neurosciencenews.com/brain-development-autism-6034/ : 0.0
258 : http://neurosciencenews.com/decision-making-network-6050/ : 0.0
259 : http://neurosciencenews.com/insomnia-sexual-function-6038/ : 0.0
260 : https://www.statnews.com/2017/02/09/cancer-drug-benefit-price/ : 0.106826609375
261 : https://www.statnews.com/2017/01/12/nevada-woman-superbug-resistant/ : 0.0429879259259
262 : https://www.statnews.com/pharmalot/2017/02/09/drug-prices-pharmacy-benefits-managers/ : 0.106668
263 : https://www.scientificamerican.com/article/republicans-offer-to-tax-carbon-emissions/ : 0.00320576
264 : https://www.scientificamerican.com/article/house-science-committee-may-soon-try-to-weaken-the-epa/ : 0.00382467346939
265 : https://www.scientificamerican.com/article/house-science-c

350 : https://www.statnews.com/2017/01/03/aging-control-telomere-effect/ : 0.03813032
351 : https://www.statnews.com/2016/09/29/medical-students-learn-empathy/ : 0.0447882006173
352 : https://www.statnews.com/2016/01/05/detox-water-flush-fat/ : 0.0552285555556
353 : https://www.statnews.com/2016/11/08/zika-in-cuba/ : 0.0263529375372
354 : https://www.statnews.com/2016/01/21/crispr-conflicts-of-interest/ : 0.0467907482639
355 : https://www.statnews.com/2016/07/08/immune-therapies-juno-cancer/ : 0.0398118376691
356 : https://www.statnews.com/2017/01/13/tom-price-innate-immunotherapeutics/ : 0.0277520810249
357 : https://www.statnews.com/2017/02/12/ex-drug-company-ceo-shkreli-speak-harvard/ : 0.107741673469
358 : https://www.statnews.com/2017/01/18/anorexia-forced-treatment/ : 0.02879889
359 : https://www.statnews.com/2016/03/17/medical-students-match-day/ : 0.0276505789474
360 : https://www.statnews.com/2016/03/31/dna-shape-double-helix-dekker/ : 0.0738252430556
361 : https://www.statnew

446 : http://www.the-scientist.com//?articles.view/articleNo/48096/title/May-the-Force-Be-with-You/ : 0.220994198771
447 : http://www.the-scientist.com//?articles.view/articleNo/48257/title/Science-Your-Plants-/ : 0.00810688888889
448 : https://www.scientificamerican.com/article/broken-california-dam-is-a-sign-of-emergencies-to-come/ : 0.00821305968779
449 : http://neurosciencenews.com/word-comprehension-motor-cortex-6118/ : 0.0
450 : http://neurosciencenews.com/autophagy-stress-neurology-6115/ : 0.0
451 : http://neurosciencenews.com/new-insight-into-autism/ : 0.0
452 : http://neurosciencenews.com/autism-risk-brain-changes-mri-6116/ : 0.0
453 : http://neurosciencenews.com/neurobiology-cocaine-6119/ : 0.0
454 : http://neurosciencenews.com/autism-risk-brain-changes-mri-6116/ : 0.0
455 : http://neurosciencenews.com/autism-risk-brain-changes-mri-6116/ : 0.0
456 : http://neurosciencenews.com/word-comprehension-motor-cortex-6118/ : 0.0
457 : http://neurosciencenews.com/autophagy-stress-neuro

542 : http://neurosciencenews.com/parkinsons-genetics-neurology-6128/ : 0.0
543 : http://neurosciencenews.com/parkinsons-genetics-neurology-6128/ : 0.0
544 : http://neurosciencenews.com/oxytocin-dad-bonding-6127/ : 0.0
545 : http://neurosciencenews.com/dopamine-bonding-chemistry-6126/ : 0.0
546 : https://www.statnews.com/2017/02/17/could-your-fitbit-data-be-used-to-deny-you-health-insurance/ : 0.0540815316804
547 : https://www.statnews.com/2017/02/01/medicare-emergency-room-deaths-hospitals/ : 0.0456479183673
548 : https://www.statnews.com/2016/04/21/minecraft-health-care/ : 0.0484119445983
549 : https://www.statnews.com/2017/01/13/citation-cartels-science/ : 0.0513154792244
550 : https://www.statnews.com/2016/11/11/kids-post-trump/ : 0.0453967619048
551 : https://www.statnews.com/2015/12/18/jimmy-carter-cancer-drug-keytruda/ : 0.0416612416
552 : https://www.statnews.com/2016/04/01/vaxxed-autism-movie-review/ : 0.0579895608741
553 : https://www.statnews.com/2016/12/05/crispr-patent-ora

629 : http://news.mit.edu/2016/voice-controlled-calorie-counter-0324 : 0.00244736111111
630 : http://news.mit.edu/2016/human-robot-rescue-teams-0217 : 0.00386558333333
631 : http://news.mit.edu/2016/algorithm-automatic-contingency-planning-0215 : 0.0225951875
632 : http://news.mit.edu/2016/federal-safety-chief-driverless-cars-0210 : 0.0115844475
633 : http://news.mit.edu/2016/neural-chip-artificial-intelligence-mobile-devices-0203 : 0.00693686980609
634 : http://news.mit.edu/2016/virtual-guide-dog-wearable-device-0202 : 0.00446553846154
635 : http://news.mit.edu/2016/marvin-minsky-obituary-0125 : 0.0104256210937
636 : http://news.mit.edu/2015/algorithms-recognize-objects-few-examples-1223 : 0.0146244612476
637 : http://news.mit.edu/2015/csail-deep-learning-algorithm-predicts-photo-memorability-near-human-levels-1215 : 0.0047060861678
638 : http://news.mit.edu/2015/csail-shows-demos-150-high-schoolers-hour-code-1214 : 0.0088551875
639 : http://news.mit.edu/2015/computer-system-passes-vi

700 : http://www.news-medical.net/news/20170214/TSRI-scientists-receive-2433-million-NIH-grant-to-create-new-breast-cancer-treatments.aspx : 0.00830642975207
701 : http://www.news-medical.net/news/20170217/Study-shows-potential-of-3D-tracking-system-in-improving-accuracy-of-robot-assisted-surgery.aspx : 0.00332906122449
702 : http://www.news-medical.net/news/20170216/IDSA-guidelines-Team-approach-vital-to-successful-treatment-of-complex-neurological-infections.aspx : 0.00303081632653
703 : http://www.news-medical.net/news/20170214/Ten-steps-for-a-health-pregnancy.aspx : 0.0304175713296
704 : http://neurosciencenews.com/right-left-handedness-6132/ : 0.0
705 : http://neurosciencenews.com/fear-memories-neuroscience-6134/ : 0.0
706 : http://neurosciencenews.com/immunotherapy-ms-neurology-6133/ : 0.0
707 : https://www.statnews.com/2017/02/21/hylands-homeopathic-teething-fda/ : 0.00777729
708 : https://www.statnews.com/2017/02/22/scientists-trump-policies-research/ : 0.0407324475
709 : https

783 : http://www.the-scientist.com//?articles.view/articleNo/48492/title/Henrietta-Lacks-s-Family-Seeks-Compensation/ : 0.196567166667
784 : https://www.statnews.com/2017/02/23/cancer-genome-multiple-myeloma/ : 0.0390688010974
785 : https://www.statnews.com/2017/02/23/vitamin-iv-infusion/ : 0.0450332741021
786 : https://www.statnews.com/2017/02/23/bioethics-harvard-george-church/ : 0.0205873565068
787 : https://www.statnews.com/2017/02/23/drug-czar-office-white-house/ : 0.0447951911357
788 : https://www.statnews.com/2017/02/23/minority-doctors-medical-school/ : 0.0210133838271
789 : https://www.scientificamerican.com/article/absorb-the-shock/ : 0.0085257856
790 : http://www.news-medical.net/news/20170223/Scientists-create-organ-on-a-chip-that-can-mimic-hearts-biomechanical-properties.aspx : 0.00356573333333
791 : https://www.statnews.com/2017/02/23/lab-grown-voice-boxes-cancer/ : 0.1041816875
792 : https://www.statnews.com/pharmalot/2017/02/23/liquid-children-medicines-costs/ : 0.10425

879 : http://neurosciencenews.com/dbs-anorexia-6159/ : 0.0
880 : http://neurosciencenews.com/dbs-anorexia-6159/ : 0.0
881 : http://neurosciencenews.com/dbs-anorexia-6159/ : 0.0
882 : http://neurosciencenews.com/memory-impairment-cocaine-dads-6157/ : 0.0
883 : http://venturebeat.com/2017/02/25/what-salesforce-einstein-teaches-us-about-enterprise-ai/ : 0.00621363905325
884 : https://www.statnews.com/2017/01/21/andrew-wakefield-trump-inaugural-ball/ : 0.04840144
885 : https://www.statnews.com/2016/12/01/21st-century-cures-act-fda-approval/ : 0.053568734375
886 : https://www.statnews.com/2016/11/22/palliative-care-rare-disease-norse/ : 0.0122280645905
887 : https://www.statnews.com/2016/11/21/dementia-rate-decline/ : 0.0463697933884
888 : https://www.statnews.com/2016/09/22/gut-microbiome-obesity/ : 0.0254294421769
889 : http://venturebeat.com/2017/02/25/4-myths-of-the-connected-car-revolution/ : 0.0117916622222
890 : https://www.scientificamerican.com/article/will-democracy-survive-big-da

973 : https://www.scientificamerican.com/article/take-nukes-off-a-short-fuse/ : 0.0135421155556
974 : http://www.the-scientist.com//?articles.view/articleNo/48611/title/Exploring-the-Mechanisms-of-Music-Therapy/ : 0.167068034995
975 : https://www.statnews.com/2017/02/28/trump-fda-reaction/ : 0.104499859375
976 : http://www.the-scientist.com//?articles.view/articleNo/48539/title/Massively-Parallel-Perturbations/ : 0.198787425378
977 : http://www.the-scientist.com//?articles.view/articleNo/48619/title/Infographic--Single-Cell-CRISPR-Screens/ : 0.231404958678
978 : http://www.the-scientist.com//?articles.view/articleNo/48576/title/A-Selection-of-CRISPR-Proof-of-Principle-Studies/ : 0.147017369093
979 : https://www.statnews.com/2015/12/24/freeze-fat-weight-loss/ : 0.0338347741935
980 : https://www.statnews.com/2017/03/01/pap-smear-cervical-cancer-hpv/ : 0.107356984375
981 : https://www.statnews.com/2017/03/01/grief-parents-children-loss/ : 0.0402344474506
982 : http://www.news-medical.net/

1063 : http://neurosciencenews.com/cte-neurology-6188/ : 0.0
1064 : http://neurosciencenews.com/music-speech-neuroscience-6289/ : 0.0
1065 : http://neurosciencenews.com/mirror-movement-brain-abnormality-6186/ : 0.0
1066 : http://www.news-medical.net/news/20170302/Researchers-uncover-mechanism-that-contributes-to-joint-inflammationc2a0in-rheumatoid-arthritis-patients.aspx : 0.002624
1067 : http://neurosciencenews.com/music-speech-neuroscience-6289/ : 0.0
1068 : http://neurosciencenews.com/mirror-movement-brain-abnormality-6186/ : 0.0
1069 : http://www.the-scientist.com//?articles.view/articleNo/48701/title/Cortical-Interneurons-Show-Layer-Specific-Activities/ : 0.201092394768
1070 : https://www.statnews.com/pharmalot/2017/03/02/pharma-securities-fraud-lawsuits/ : 0.107699609375
1071 : https://www.statnews.com/2017/03/02/pharma-republican-health-care-plan/ : 0.14409696
1072 : https://www.statnews.com/pharmalot/2017/03/02/fda-europe-plant-inspections/ : 0.1071811875
1073 : https://www.sci

1155 : http://neurosciencenews.com/tbi-genetics-brain-damage-6204/ : 0.0
1156 : http://neurosciencenews.com/ketamine-no-wonder-drug-depression/ : 0.0
1157 : http://neurosciencenews.com/visual-memory-perception-6207/ : 0.0
1158 : http://neurosciencenews.com/ketamine-no-wonder-drug-depression/ : 0.0
1159 : http://neurosciencenews.com/ketamine-no-wonder-drug-depression/ : 0.0
1160 : https://www.statnews.com/pharmalot/2017/03/06/states-epipen-alternatives/ : 0.107968
1161 : https://www.statnews.com/2016/04/25/scribes-emergency-room/ : 0.00728719555556
1162 : https://www.statnews.com/2017/03/06/medicaid-gop-bill/ : 0.04890175
1163 : https://www.statnews.com/2016/08/05/lung-cancer-cuba-biotech/ : 0.0308385324126
1164 : http://www.news-medical.net/news/20170307/Researchers-develop-skin-tests-decision-support-tool-to-increase-antibiotic-use-in-hospitalized-patients.aspx : 0.013470484375
1165 : http://www.news-medical.net/news/20170306/New-technology-provides-extracorporeal-support-and-enables-

1245 : https://www.statnews.com/2017/03/09/medicaid-overhaul-gop/ : 0.0387675744
1246 : https://www.statnews.com/2017/03/09/colon-virtual-reality/ : 0.022518272534
1247 : https://www.statnews.com/2017/03/09/off-label-cancer-drugs/ : 0.110359
1248 : https://www.statnews.com/2017/03/09/what-does-healthy-mean/ : 0.0448637052154
1249 : http://www.news-medical.net/news/20170309/Flu-can-cause-worse-symptoms-in-people-with-asthma-research-shows.aspx : 0.0158173333333
1250 : https://www.scientificamerican.com/podcast/episode/forensic-science-trials-with-errors/ : 0.00431402469136
1251 : https://www.statnews.com/2017/03/09/colon-cancer-young-adults/ : 0.049593234375
1252 : http://www.the-scientist.com//?articles.view/articleNo/48768/title/Five-More-Synthetic-Yeast-Chromosomes-Completed/ : 0.216495934256
1253 : https://www.statnews.com/2017/03/09/cheese-recall-sargento-indiana/ : 0.0773701570248
1254 : https://www.statnews.com/pharmalot/2017/03/09/regeneron-ceo-drug-prices/ : 0.00565926530612
12

1337 : http://venturebeat.com/2017/03/12/why-parents-might-not-be-ready-for-ai-in-the-classroom/ : 0.0087493530572
1338 : https://www.statnews.com/2017/03/13/boston-university-turkey-science/ : 0.045330884375
1339 : https://www.statnews.com/2017/03/12/joe-biden-cancer-moonshot-sxsw/ : 0.0476550895062
1340 : https://www.statnews.com/2015/12/22/pharmacy-delivery-vans-targeted/ : 0.0261250067904
1341 : http://venturebeat.com/2017/03/12/sorry-but-your-ai-needs-to-go-back-to-school/ : 0.0264379733333
1342 : http://venturebeat.com/2017/03/12/how-car-sensing-will-save-lives/ : 0.00785428571429
1343 : http://www.news-medical.net/news/20170313/Evolution-of-immuno-oncology.aspx : 0.0109008538776
1344 : http://neurosciencenews.com/alzheimers-neuroimaging-6236/ : 0.0
1345 : http://neurosciencenews.com/alzheimers-neuroimaging-6236/ : 0.0
1346 : http://neurosciencenews.com/alzheimers-neuroimaging-6236/ : 0.0
1347 : http://www.the-scientist.com//?articles.view/articleNo/48813/title/House-Bill-Could-H

1425 : https://www.statnews.com/2017/03/15/cancer-patients-drug-prices/ : 0.0256652988889
1426 : https://www.statnews.com/2017/03/15/psychoanalyst-freud-relevant/ : 0.029676199881
1427 : https://www.statnews.com/pharmalot/2017/03/15/pbm-rebates-wyden-bill/ : 0.104536984375
1428 : https://www.statnews.com/2017/03/15/health-care-debate-influential-players/ : 0.035850489893
1429 : https://www.statnews.com/pharmalot/2017/03/15/glaxo-ceo-pay-endo-opioids/ : 0.106162109375
1430 : http://neurosciencenews.com/stem-cell-modification-6248/ : 0.0
1431 : http://neurosciencenews.com/ms-myelin-repair-6249/ : 0.0
1432 : http://neurosciencenews.com/stem-cell-modification-6248/ : 0.0
1433 : https://www.statnews.com/2017/03/15/atyr-rna-recombinant/ : 0.10533275
1434 : http://venturebeat.com/2017/03/15/uber-appoints-zoubin-ghahramani-as-chief-scientist-3-months-after-acquiring-his-startup-geometric-intelligence/ : 0.11825725
1435 : http://neurosciencenews.com/mri-hiv-neuroscience-6239/ : 0.00011025
1436 

1513 : http://neurosciencenews.com/autism-blood-test-6253/ : 0.0
1514 : http://neurosciencenews.com/autism-vitamin-d-6256/ : 0.0
1515 : http://neurosciencenews.com/autism-blood-test-6253/ : 0.0
1516 : http://neurosciencenews.com/sleep-apnea-kids-mood-6255/ : 0.0
1517 : https://www.scientificamerican.com/article/trumps-defense-secretary-cites-climate-change-as-national-security-challenge/ : 0.0304832580349
1518 : http://neurosciencenews.com/autism-vitamin-d-6256/ : 0.0
1519 : http://neurosciencenews.com/neural-networks-genetics-social-autism-6254/ : 0.0
1520 : http://neurosciencenews.com/autism-vitamin-d-6256/ : 0.0
1521 : http://neurosciencenews.com/autism-vitamin-d-6256/ : 0.0
1522 : http://neurosciencenews.com/autism-blood-test-6253/ : 0.0
1523 : http://neurosciencenews.com/sleep-apnea-kids-mood-6255/ : 0.0
1524 : http://neurosciencenews.com/sleep-apnea-kids-mood-6255/ : 0.0
1525 : http://neurosciencenews.com/sleep-apnea-kids-mood-6255/ : 0.0
1526 : http://neurosciencenews.com/neural

1608 : http://neurosciencenews.com/evolution-developing-brain-networks-6261/ : 0.0
1609 : http://neurosciencenews.com/evolution-developing-brain-networks-6261/ : 0.0
1610 : http://neurosciencenews.com/reward-system-alcoholism-6262/ : 0.0
1611 : http://neurosciencenews.com/subcortical-number-processing-6266/ : 0.0
1612 : http://neurosciencenews.com/subcortical-number-processing-6266/ : 0.0
1613 : http://neurosciencenews.com/lithium-therapy-response-6268/ : 0.0
1614 : http://neurosciencenews.com/evolution-developing-brain-networks-6261/ : 0.0
1615 : http://neurosciencenews.com/sperm-swimming-math-6265/ : 0.0
1616 : http://neurosciencenews.com/sperm-swimming-math-6265/ : 0.0
1617 : http://venturebeat.com/2017/03/20/these-are-the-most-important-chatbot-metrics-to-track/ : 0.00284711111111
1618 : http://neurosciencenews.com/sperm-swimming-math-6265/ : 0.0
1619 : http://neurosciencenews.com/lithium-therapy-response-6268/ : 0.0
1620 : http://neurosciencenews.com/sexual-afterglow-bonding-6263/

1700 : http://neurosciencenews.com/satnav-brain-neuroscience-6274/ : 0.0
1701 : http://www.news-medical.net/news/20170323/Scientists-identify-molecule-that-could-be-targeted-to-prevent-vision-loss-in-diabetics-and-preterm-infants.aspx : 0.00624725
1702 : http://neurosciencenews.com/brain-function-parallel-computing-6288/ : 0.0
1703 : http://neurosciencenews.com/sleep-deprivation-facial-expression-6283/ : 0.0
1704 : http://neurosciencenews.com/sleep-deprivation-facial-expression-6283/ : 0.0
1705 : http://neurosciencenews.com/brain-function-parallel-computing-6288/ : 0.0
1706 : http://neurosciencenews.com/pleasure-amygdala-neuroscience-6286/ : 0.0
1707 : http://neurosciencenews.com/brain-function-parallel-computing-6288/ : 0.0
1708 : https://www.scientificamerican.com/slideshow/this-groundbreaking-museum-is-already-underwater/ : 0.173609555556
1709 : https://www.scientificamerican.com/article/south-africa-rsquo-s-san-people-issue-ethics-code-to-scientists/ : 0.00808072888889
1710 : https

1783 : http://www.the-scientist.com//?articles.view/articleNo/48912/title/Gel-Scaffolds-for-Delivery-of-Immunotherapies/ : 0.100309141967
1784 : http://www.the-scientist.com//?articles.view/articleNo/49003/title/Circadian-Rhythms-Influence-Treatment-Effects/ : 0.203546014688
1785 : http://www.the-scientist.com//?articles.view/articleNo/49008/title/Targeting-Tregs-Halts-Cancer-s-Immune-Helpers/ : 0.176295893878
1786 : http://www.the-scientist.com//?articles.view/articleNo/49006/title/Circadian-Clock-Affects-Health-and-Disease/ : 0.14616916
1787 : http://www.the-scientist.com//?articles.view/articleNo/49055/title/CRISPR-Screen-Detects-Functional-Gene-Regulation/ : 0.204584839319
1788 : http://www.the-scientist.com//?articles.view/articleNo/48906/title/Extra-Centrosomes-Can-Drive-Tumor-Formation-in-Mice/ : 0.0809558892734
1789 : http://www.the-scientist.com//?articles.view/articleNo/48863/title/Hitting-It-Out-of-the-Park/ : 0.173256138504
1790 : http://www.the-scientist.com//?articles.vie

1856 : https://www.scientificamerican.com/article/can-the-amazon-save-the-planet/ : 0.0236387241815
1857 : https://www.scientificamerican.com/article/national-corruption-breeds-personal-dishonesty/ : 0.0089299375
1858 : https://www.scientificamerican.com/article/the-world-wants-the-u-s-to-stay-in-the-paris-climate-deal/ : 0.00861119673469
1859 : https://www.scientificamerican.com/article/clean-power-worldwide-has-doubled-in-10-years/ : 0.00312222222222
1860 : http://www.the-scientist.com//?articles.view/articleNo/49066/title/Mutations-Linked-to-Secondary-Cancers/ : 0.166798686391
1861 : http://www.the-scientist.com//?articles.view/articleNo/49004/title/Infographic--Mechanisms-of-Resistance/ : 0.205244112222
1862 : http://neurosciencenews.com/patience-imagination-6340/ : 0.0
1863 : http://neurosciencenews.com/lsd-fear-emotion-6335/ : 0.0
1864 : http://neurosciencenews.com/hand-movement-pathway-6333/ : 0.0
1865 : http://neurosciencenews.com/depression-disability-health-6323/ : 0.0
1866 :

1936 : https://www.scientificamerican.com/article/the-arctic-ocean-is-becoming-more-like-the-atlantic-ocean/ : 0.00380398615917
1937 : https://www.scientificamerican.com/article/legendary-climate-scientist-likes-a-gop-proposal-on-global-warming/ : 0.0395888476331
1938 : https://www.scientificamerican.com/article/politicians-shouldnt-troll-through-scientists-e-mails/ : 0.002302484375
1939 : http://www.news-medical.net/news/20170410/Salk-Institute-receives-additional-243-million-Glenn-Foundation-gift-to-continue-aging-research.aspx : 0.00287565432099
1940 : http://www.news-medical.net/news/20170407/Patients-with-inducible-urticaria-can-benefit-from-treatment-with-asthma-drug.aspx : 0.005698609375
1941 : http://www.news-medical.net/news/20170411/TSRI-scientists-find-new-way-to-make-cell-population-resistant-to-HIV.aspx : 0.00701102469136
1942 : https://www.scientificamerican.com/article/a-last-ditch-attempt-to-save-the-world-rsquo-s-most-endangered-porpoise/ : 0.021038033241
1943 : http:/

2013 : http://neurosciencenews.com/artificial-intelligence-human-prejudice-6411/ : 0.0
2014 : http://neurosciencenews.com/immunotherapy-glioblastoma-6416/ : 0.0
2015 : http://neurosciencenews.com/alcohol-dependence-amygdala-6399/ : 0.0
2016 : http://neurosciencenews.com/artificial-intelligence-human-prejudice-6411/ : 0.0
2017 : http://neurosciencenews.com/oxytocin-ptsd-addiction-6412/ : 0.0
2018 : http://neurosciencenews.com/cognitive-map-scaling-6403/ : 0.0
2019 : http://neurosciencenews.com/sensory-cell-regeneration-hearing-6391/ : 0.0
2020 : http://neurosciencenews.com/oxytocin-ptsd-addiction-6412/ : 0.0
2021 : http://neurosciencenews.com/gambling-addiction-neuroimaging-6413/ : 0.0
2022 : http://neurosciencenews.com/plaand-genetics-6417/ : 0.0
2023 : https://www.washingtonpost.com/opinions/the-march-for-science-could-save-lives/2017/04/19/d042f956-2476-11e7-a1b3-faff0034e2de_story.html?utm_term=.62457cad7103 : 0.0
2024 : https://www.nytimes.com/2017/04/17/science/march-for-science-v

2100 : https://www.statnews.com/2017/04/25/oncology-cancer-precision-medicine-gleevec/ : 0.0277298302469
2101 : http://www.news-medical.net/news/20170421/Mice-study-yields-important-clues-about-protective-role-of-Clostridia-strains.aspx : 0.00555376559546
2102 : http://www.news-medical.net/news/20170419/Tips-to-help-prevent-allergies-during-spring-season.aspx : 0.0135517806122
2103 : http://www.news-medical.net/news/20170426/Unique-microscope-reveals-clues-to-destructive-autoimmune-disease.aspx : 0.00621790532544
2104 : http://www.news-medical.net/news/20170418/Cytokine-plays-critical-role-in-driving-immune-cells-to-promote-or-suppress-inflammatory-bowel-disease.aspx : 0.01037864
2105 : http://www.news-medical.net/news/20170424/UTSWc2a0researchers-develop-new-nanoparticle-vaccine-immunotherapy-that-targets-multiple-tumor-types.aspx : 0.0128298571429
2106 : http://www.news-medical.net/news/20170419/LIH-scientists-discover-unknown-immune-activation-mechanism-to-ward-off-pathogens.aspx : 

Let's create a checkpoint file to save our progress!

In [124]:
# news_df.to_json('data/news_checkpoint1.json')
# checkpoint_df = pd.read_json('data/news_checkpoint.json')
# checkpoint_df = checkpoint_df.sort_index(ascending=True)
# checkpoint_df = checkpoint_df[['articleUrl', 'categories', 'flagged', 'source', 'sourceUrl', 'title', 'article',
#                    'num_words_not_in_dictionary', 'stopwords2words', 'readability_score', 'sentiment_polarity']]
# news_df = checkpoint_df

# Explore TFIDF
### So far we have 3 important variables: news_df for all the news, articles for list/corpus of all articles, and vocab_df for the vocabularies of the articles
Now, I will explore TFIDF to extract keywords and try to match them to a categorizing model. This mapping can perhaps compare the results of the two algorithms. Many of the columns I populated above are not used, but they are interesting to look at and can be features in future models!<br><br>
Quoted directly from http://brandonrose.org/clustering#Stopwords,-stemming,-and-tokenizingHere.<br>
"I define term frequency-inverse document frequency (tf-idf) vectorizer parameters and then convert the synopses list into a tf-idf matrix. To get a Tf-idf matrix, first count word occurrences by document. This is transformed into a document-term matrix (dtm). This is also just called a term frequency matrix. An example of a dtm is here at right. Then apply the term frequency-inverse document frequency weighting: words that occur frequently within a document but not frequently within the corpus receive a higher weighting as these words are assumed to contain more meaning in relation to the document. A couple things to note about the parameters I define below: <br>-- max_df: this is the maximum frequency within the documents a given feature can have to be used in the tfi-idf matrix. If the term is in greater than 80% of the documents it probably cares little meanining (in the context of film synopses) <br>-- min_idf: this could be an integer (e.g. 5) and the term would have to be in at least 5 of the documents to be considered. Here I pass 0.2; the term must be in at least 20% of the document. I found that if I allowed a lower min_df I ended up basing clustering on names--for example "Michael" or "Tom" are names found in several of the movies and the synopses use these names frequently, but the names carry no real meaning. <br>-- ngram_range: this just means I'll look at unigrams, bigrams and trigrams."

In [261]:
# Define vectorizer parameters using Gridsearch (later in this notebook)
tfidf_vectorizer = TfidfVectorizer(stop_words='english', min_df=0.01, max_df = 0.8, use_idf=True, ngram_range=(1,1),
                                   tokenizer=tokenize_and_stem)

tfidf_matrix = tfidf_vectorizer.fit_transform(articles) #fit the vectorizer to descriptions
terms = tfidf_vectorizer.get_feature_names() # terms is just a list of the features used in the tf-idf matrix.
print(tfidf_matrix.shape)
terms

(2150, 3235)


['abandon',
 'abil',
 'abl',
 'abnorm',
 'abolish',
 'abov',
 'abroad',
 'absenc',
 'absolut',
 'absorb',
 'abstract',
 'abund',
 'abus',
 'aca',
 'academ',
 'academi',
 'acceler',
 'accept',
 'access',
 'accid',
 'accommod',
 'accompani',
 'accomplish',
 'accord',
 'account',
 'accumul',
 'accur',
 'accuraci',
 'accus',
 'achiev',
 'acid',
 'acknowledg',
 'acquir',
 'acquisit',
 'act',
 'action',
 'activ',
 'activist',
 'actual',
 'acut',
 'ad',
 'adam',
 'adapt',
 'add',
 'addict',
 'addit',
 'address',
 'adequ',
 'adher',
 'adjust',
 'administ',
 'administr',
 'admit',
 'adolesc',
 'adopt',
 'adult',
 'advanc',
 'advantag',
 'advers',
 'advertis',
 'advic',
 'advis',
 'advisor',
 'advisori',
 'advoc',
 'advocaci',
 'affair',
 'affect',
 'affili',
 'afford',
 'afraid',
 'africa',
 'african',
 'afternoon',
 'afterward',
 'age',
 'agenc',
 'agenda',
 'agent',
 'aggreg',
 'aggress',
 'ago',
 'agre',
 'agreement',
 'agricultur',
 'ahead',
 'ai',
 'aid',
 'ailment',
 'aim',
 'air',
 'airp

### Clustering attempt to extract common keywords
Using the tf-idf matrix, I can now cluster the articles with k-means clustering. Since this a ML model that often converges to a "local optimum", sometimes these cells need to be run many times.

In [284]:
from sklearn.cluster import KMeans
num_clusters = 5 # played around with this number, and decided on 6 clusters / 6 main categories of the articles
km = KMeans(n_clusters=num_clusters)
km.fit(tfidf_matrix)
clusters = km.labels_.tolist()

In [288]:
from sklearn.externals import joblib
# joblib.dump(km, 'data/km.pkl') 
km = joblib.load('data/km.pkl') 

In [291]:
urls = [str(t) for t in news_df['articleUrl']]
categories = [c for c in news_df['categories']]# just to compare the cluster and the actual category
news_articles = { 'url': urls, 'article': articles, 'cluster': clusters, 'categories': categories}
cluster_df = pd.DataFrame(news_articles, index = [clusters] , columns = ['url', 'article', 'cluster', 'categories'])
print(cluster_df['cluster'].value_counts()) #number of films per cluster (clusters from 0 to 5)
cluster_df.head()

0    573
1    508
2    435
3    377
4    257
Name: cluster, dtype: int64


Unnamed: 0,url,article,cluster,categories
0,https://www.statnews.com/2017/02/09/dana-farbe...,"\nT\nhe Dana-Farber Cancer Institute, which ha...",0,[science policy]
0,https://www.statnews.com/2017/02/09/antibiotic...,\nY\nou’ve heard it many times before from you...,0,[science policy]
2,https://www.statnews.com/2017/02/07/scientist-...,\nI\n’ve recently been thinking about this: Th...,2,[science policy]
0,https://www.statnews.com/2017/02/09/twitter-ho...,\nC\nlinical medicine is rapidly changing in t...,0,[science policy]
0,https://www.statnews.com/2017/02/09/obamacare-...,\nW\nASHINGTON — As Republicans confront the t...,0,[science policy]


In [292]:
#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

for i in range(num_clusters):
    print("Cluster", i, "top words:", end='')
    for ind in order_centroids[i, :10]: # 10 words per cluster
#         print(terms[ind].split(' '))
#         print(vocab_df.ix[terms[ind].split(' ')].values.tolist())
        print(vocab_df.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=', ')
    print() #add whitespace
    print() #add whitespace
    
    # USE STEM
    
#     print("Cluster", i, "titles:", end='')
#     for title in cluster_df.ix[i]['title'].values.tolist():
#         print(title, end='')
#     print() #add whitespace
#     print() #add whitespace

Cluster 0 top words:b'said', b'health', b'patients', b'care', b'trump', b'medical', b'climate', b'hospital', b'drugs', b'states', 

Cluster 1 top words:b'brain', b'research', b'studies', b'imaging', b'neurons', b'http', b'neurosciencenews', b'sleeping', b'active', b'february', 

Cluster 2 top words:b'ai', b'science', b'research', b'machine', b'data', b'use', b'learn', b'say', b'computers', b'like', 

Cluster 3 top words:b'plus', b'stat', b'subscribe', b'logging', b'biotechs', b'drugs', b'reads', b'story', b'article', b'available', 

Cluster 4 top words:b'cell', b'cancer', b'gene', b'immune', b'research', b'tumors', b'protein', b'patients', b'diseases', b'neurons', 



Looks like Cluster 0 represents science policy/environmental science, Cluster 1 represents neuroscience, Cluster 2 represents artificial intelligence, Cluster 3 sort of represents pharmaceuticals, and Cluster 4 represnts immunology. This clustering will be the first D3 visualization!

# Visualization 
"dist is defined as 1 - the cosine similarity of each document. Cosine similarity is measured against the tf-idf matrix and can be used to generate a measure of similarity between each document and the other documents in the corpus (each synopsis among the synopses). Subtracting it from 1 provides cosine distance which I will use for plotting on a euclidean (2-dimensional) plane.
Note that with dist it is possible to evaluate the similarity of any two or more synopses."

In [299]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf_matrix)
dist

array([[ -2.22044605e-16,   9.36636486e-01,   9.37527487e-01, ...,
          9.49005120e-01,   9.48263112e-01,   9.71655691e-01],
       [  9.36636486e-01,   0.00000000e+00,   8.74903476e-01, ...,
          9.16889500e-01,   9.39226185e-01,   9.49783936e-01],
       [  9.37527487e-01,   8.74903476e-01,   2.22044605e-16, ...,
          8.53410505e-01,   8.87167371e-01,   9.26460679e-01],
       ..., 
       [  9.49005120e-01,   9.16889500e-01,   8.53410505e-01, ...,
          1.11022302e-16,   8.55425870e-01,   8.89534648e-01],
       [  9.48263112e-01,   9.39226185e-01,   8.87167371e-01, ...,
          8.55425870e-01,  -4.44089210e-16,   9.12466078e-01],
       [  9.71655691e-01,   9.49783936e-01,   9.26460679e-01, ...,
          8.89534648e-01,   9.12466078e-01,   0.00000000e+00]])

In [300]:
from sklearn.manifold import MDS
MDS()
# convert two components as we're plotting points in a two-dimensional plane
# "precomputed" because we provide a distance matrix
# we will also specify `random_state` so the plot is reproducible.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
pos = mds.fit_transform(dist)  # shape (n_components, n_samples)
xs, ys = pos[:, 0], pos[:, 1]
xs, ys

(array([-0.43890913, -0.15147367,  0.2856028 , ...,  0.47134885,
         0.58932929,  0.59853611]),
 array([-0.47929189, -0.16887924,  0.08910257, ...,  0.06806458,
         0.06912784,  0.27755768]))

In [326]:
#set up colors per clusters using a dict
cluster_colors = {0: '#1b9e77', 1: '#d95f02', 2: '#7570b3', 3: '#e7298a', 4: '#66a61e'}

#set up cluster names using a dict
cluster_names = {0: 'Sci Policy/Env Sci: trump, climate, states', 
                 1: 'Neuroscience: brain, imaging, neurons', 
                 2: 'Artificial Intelligence: ai, machine, data', 
                 3: 'Pharmaceuticals: biotechs, dugs', 
                 4: 'Immunology: cell, cancer, gene, immune, tumors, protein, parient, disease'}

#create data frame that has the result of the MDS plus the cluster numbers and titles
mds_df = pd.DataFrame(dict(x=xs, y=ys, cluster=clusters, categories=categories, url=urls)) 
mds_df.to_csv('data/mds_df.csv', index=False)
mds_df.head()

Unnamed: 0,categories,cluster,url,x,y
0,[science policy],0,https://www.statnews.com/2017/02/09/dana-farbe...,-0.438909,-0.479292
1,[science policy],0,https://www.statnews.com/2017/02/09/antibiotic...,-0.151474,-0.168879
2,[science policy],2,https://www.statnews.com/2017/02/07/scientist-...,0.285603,0.089103
3,[science policy],0,https://www.statnews.com/2017/02/09/twitter-ho...,-0.396158,-0.40984
4,[science policy],0,https://www.statnews.com/2017/02/09/obamacare-...,0.317679,-0.304971


To wrap up the visualization, let's also generate a word cloud for all the vocab in vocab_df.

In [335]:
full_text = [w for w in vocab_df['words'] if w not in stopwords and len(w) >= 5]
fdist = nltk.FreqDist(full_text)

In [339]:
df = pd.DataFrame(fdist.most_common(50), columns=["word", "count"])
df.to_csv("data/word_cloud.csv", index=False)
df

Unnamed: 0,word,count
0,research,4983
1,brain,4705
2,study,4108
3,health,3671
4,could,3464
5,university,3366
6,people,3359
7,patients,3278
8,would,3127
9,cells,3082
