In [1]:
## Import Packages and Libraries ##

# Web parcing, scraping, etc.
import bs4 as bs # BeautifulSoup4 
import urllib3
import re
import requests # HTTP parser
import html5lib

# DataFrames and math
import pandas as pd
import numpy as np

# Output related packages 
import pprint as pp
import json

# Progress bar and delaying requests 
from tqdm import tnrange, tqdm_notebook #progress bars
from random import randint
import datetime
import time

ModuleNotFoundError: No module named 'tqdm'

In [2]:
# stretch Jupyter coding blocks to fit screen
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>")) 

# make it run on py2 and py3
from __future__ import division, print_function

# Data Mining I
This  notebook is intended to perform the following processes:

    1.1 Read-in news articles from newsAPI for a given date range, and up to five queries (passed as a list).

    1.2 Extract features native to the articles (e.g. url).

    1.3 Perform data cleanup and preprocessing.

    1.4 Split dataset into n-csv-files for distrubuted computation or batching.

### __Documentation__

newsAPI (https://newsapi.org/) has limited documentation as to how their backend works. However, when developing this notebook the following website was frequently referenced -- as their documentation appears to closely match the behavior of newsAPI. 

https://docs.aylien.com/newsapi/#getting-started

Moreover, here is the user agreement:

https://newsapi.org/terms



___
### **Begin Data Mining I:** Read-in NewsAPI feed for a given date range

In [5]:
### NEWSAPI RELATED ###
# keys: 
#rhkey = '847446b32283474fafd2aec7f95e502b'
#r1key  = '1da951c142304f7bab52ba8e3970495b'
#r2key = '40d53e49ee3543a3b162e6a453e2e373'
#m1key = '211fc2107848473e99c1f235b400a07f'
#m2key = 'c0f99eab932d4cabb61c23239f3f482d'
m3key = '658cd65a714349fdbb7e8dd6ce59e9c4'
#m4key  = '8ba091b7a47b4c9a9162a83ca72eb1ca'
#e1key  = '2bc85776a0c14af6b9937366ad683e2f'
#e2key = '22e5c3a8f0ee4fa59aaf384ba9395a86'
#e3key = 'c554f8fb27ca4be1862192b44ee4425d'


# Install API 
# !pip install newsapi-python

# Import Client
from newsapi import NewsApiClient

# Initialize Client (create object)
news_api = NewsApiClient(api_key = m3key)
print(type(news_api))

<class 'newsapi.newsapi_client.NewsApiClient'>


__1.1 Read-in news articles from newsAPI for a given date range__

#### Function: **get_news**
Function establishes values to be used for control of loop then calls functions used to extract news article data

In [6]:
def get_news(query, start, stop, sort, lang, article_count, page_count, init):
    """
    control function for iterating over 100 pages of newsAPI's content 
    function then controls subordinate fuctions that extract ~100 articles per each page
    """
    
    import math
    import time  
    
    # extract information about response file to ensure proper loop control
    params = get_params(query, start, stop, sort, lang, article_count, page_count)

    # variable referencing
    status = params['status']
    results = params['totalResults']

    # Confirmation of data extraction
    print("\nVerify Read-in Process:", status)
    print("Number of Articles Correctly Read: ", results)
    print(type(params), params.keys())
           
    # per page article extraction stop variable -- if number of articles is greater than number articles per page
    loops = math.ceil(results/article_count)
    
    # batching control
    begin = 0 + init
    terminate = article_count + init
    print("Total number of iterations (pages):",loops)
    
    # check to ensure loop does not extract over 100pages*100endpoints=10000 articles
    if loops < terminate:
        terminate = loops
        
    print("Page range being extracted", begin, terminate)
    
    if page_count == 'all' or article_count <  results:
        print("\n\nExtracting News Data...\n")
        full_df = pd.DataFrame()
    
        # function is called withinin while, is subject to number of pages available as a function of total no. articles
        while begin < terminate:
            begin += 1
            page = begin  # for referencing clarity
            
            ### newsAPI has MAX HIT LIMIT of 60 per minute ### 
            time.sleep(1)   # delay of 1 second
            print("Extracting Page", page)

            # call sequencial pages of articles and appends to df
            df = news_data(query, start, stop, sort, lang, article_count, page)
            full_df = full_df.append(df, ignore_index = True)
            
        print('Batch extraction completed:',begin,'of',terminate)
        return(full_df)            
    else:
        # extracts articles assuming articles >= pages
        print("Possible Invalid Parameters: Check values")
        brief_df = news_api.get_everything(q = query,
                                          from_parameter= start,
                                          to= stop,
                                          sort_by= sort,
                                          language= lang,
                                          page_size= int(article_count)
                                         )
        return(brief_df)

#### Function: **get_params**
Function runs an initial newsAPI call, used to store values for controlling loops

In [14]:
def get_params(query, start, stop, sort, lang, article_count, page_count):
    """
    function accepts similar parameters to master function get_news.
    get_params is used to extract parameters to be used in controlling other functions 
    """
    
    print("\nExtracting Parameters for newsAPI...\n")
    params = news_api.get_everything(q = query,
                                     from_param= start,
                                     to= stop,
                                     sort_by= sort,
                                     language= lang,
                                     page_size= int(article_count)
                                    )
    
    # Confirmation of data extraction
    print("Read-in Status of Given Date Range:", params['status'])
    print("Number of Articles in Given Date Range: ", params['totalResults'])
    
    return(params)

#### Function: **news_data**
Function handles cases, and extracts values within 'articles'. Returns dataframe of contents: 


*Index(['author', 'description', 'publishedAt', 'source', 'title', 'url','urlToImage'],dtype='object')*


In [25]:
def news_data(query, start, stop, sort, lang, article_count, page):
    """
    Principal data extraction function - can handle various relationships between no.pages and no.articles 
    """
    
    if isinstance(page, int):
        params = news_api.get_everything(q = query,
                                         from_param= start,
                                         to= stop,
                                         sort_by= sort,
                                         language= lang,
                                         page_size= int(article_count),
                                         page = int(page)
                                        )
    ########### if params['articles'] throws error ########### 
    # either endpoint limit was met (10000)                  #
    # too many endpoint requests in a month (1000 per month) #
    # change to new api key.                                 #
    ##########################################################
    return(pd.DataFrame(params['articles'])) 

#### User provided parameters and function call.

In [39]:
#01/26/18 to 03/26/18
query = 'Bitcoin'         # can handle a list of up to five search topics
start = '2018-09-17'      # yyyy-mm-dd
stop  = '2018-09-23'
sort  = 'publishedAt'
lang  = 'en'
article_count = 20       # default is 20
page_count = 'all'        # enter 1, 2, ... Notes: 'all' iterates over all articLes

___
---
#### __NOTE__

Since newsAPI has a daily limit of endpoint requests (1000 per day), batching needs to be implemented.  The following cells controls the initialization for subsequent cells -- to ensure appropropriate, and sequential article extraction.

Also, webAPI has a limit of 10,000 articles per key -- unless you want to pay for the monthly access.
___
---

In [40]:
## ---------------------------------------------------------------------------- ##
## KEY: Initialize accroding to batch number (i.e. n*100, where n is the batch) ##
## USE: n > 0 ---- only once we can extract above 10000 articles
n = 0
init = n*100
## ---------------------------------------------------------------------------- ##

In [41]:
# object is the result of the following functions: 'get_params', 'get_news', and 'get_data'
news = get_news(query, start, stop, sort, lang, article_count, page_count, init)


Extracting Parameters for newsAPI...

Read-in Status of Given Date Range: ok
Number of Articles in Given Date Range:  1085

Verify Read-in Process: ok
Number of Articles Correctly Read:  1085
<class 'dict'> dict_keys(['status', 'totalResults', 'articles'])
Total number of iterations (pages): 55
Page range being extracted 0 20


Extracting News Data...

Extracting Page 1
Extracting Page 2
Extracting Page 3
Extracting Page 4
Extracting Page 5
Extracting Page 6
Extracting Page 7
Extracting Page 8
Extracting Page 9
Extracting Page 10
Extracting Page 11
Extracting Page 12
Extracting Page 13
Extracting Page 14
Extracting Page 15
Extracting Page 16
Extracting Page 17
Extracting Page 18
Extracting Page 19
Extracting Page 20
Batch extraction completed: 20 of 20


#### Explore nested key/value pairs from newsAPI data

In [42]:
## ENSURE API WAS ACCESSED ##
# if news.keys includes 'author', 'description', etc., success!

news.keys()

Index(['author', 'content', 'description', 'publishedAt', 'source', 'title',
       'url', 'urlToImage'],
      dtype='object')

In [43]:
print(len(news))
print(news.keys())
news.head(5)
len(news['url'].unique())

400
Index(['author', 'content', 'description', 'publishedAt', 'source', 'title',
       'url', 'urlToImage'],
      dtype='object')


399

__1.2 Extract features native to the articles__

#### Function: **get_info**
Function extracts variables from dataframe and stores each as a list, returning all of them as a single dataframe.

__Note:__ *urlToImage* is not included in this process, as we are uncertain as to the value of the feature

In [44]:
def get_info(df):
    """
    Accepts a dataframe of newsAPI articles, and controls all subbordinate functions that preprocess data
    """
    
    import copy
    
    author = []
    title = []
    publisher = []
    publish_url = []
    timeStamp = []
    description = []
    
    # loop appends rows to respective lists 
    for col_name in df:
        for index in df[col_name]:
            if col_name == 'author':
                author.append(index)
            elif col_name == 'title':
                title.append(index)
            elif col_name == 'source':
                name = index['name']
                publisher.append(name)
            elif col_name == 'url':
                publish_url.append(index)
            elif col_name == 'publishedAt':
                timeStamp.append(index)
            elif col_name == 'description':
                description.append(index)
            else:
                continue
    
    # merge lists and return them as dataframe.
    df = pd.DataFrame({'author' : author,
                       'title' : title,
                       'publisher' : publisher,
                       'source_url' : publish_url,
                       'timeStamp' : timeStamp,
                       'description' : description})
    
    return(df)
        

#### Completed newsAPI Read-in Process: 
##### newsDF contains features extracted from raw newsAPI feed, for a given data range, and query.

In [45]:
# Object creation
newsDF = get_info(news)

In [46]:
# Verifying correct data extraction
print("\nDataFrame Dimensions:", newsDF.shape, "\n")
newsDF.head(3)


DataFrame Dimensions: (400, 6) 



Unnamed: 0,author,title,publisher,source_url,timeStamp,description
0,Yashu Gola,"Content Creators on YouTube, Twitch, and Wikip...",Crypto Coins News,https://www.ccn.com/content-creators-on-youtub...,2018-09-22T23:30:12Z,Content creators on the internet will now be a...
1,BetaList,COINiD – The next generation Bitcoin wallet th...,Betalist.com,https://betalist.com/startups/coinid,2018-09-22T22:31:52Z,The next generation Bitcoin wallet that suppor...
2,"David Cottle, Analyst, David Cottle",Australian Dollar Gains May Be Stymied By Fed ...,Dailyfx.com,https://www.dailyfx.com/forex/fundamental/fore...,2018-09-22T22:00:00Z,The Australian Dollar has had a rare run of ga...


__1.3 Perform data cleanup and preprocessing.__



The following functions perform basic clean up on a dataframe. The purpose is to prepare the file to write-out (csv).  
 

In [47]:
# Replace 'None' values
def findNone(df):
    """
     Receives pandas datraframe, and removes null entries from author feature
    """
    print("Removed 'None' values in author feature...")
    author = df['author']
    publisher = df['publisher']
    
    for i in range(len(df)):
        if pd.isnull(author.loc[i]):
            author.loc[i] = publisher.loc[i]
    return(df)

In [48]:
# Remove gaps 
def gapStrip(df):
    """
    Receives pandas dataframe and leading and traling empty space`
    """
    df.columns = map(str.strip, df.columns) 
    print("Removed leading and trailing spaces and tabs...")
    # element-wise operation
    f = lambda x: x.strip() if (isinstance(x,str)) else x
    df = df.applymap(f)
    return(df)

In [49]:
# Standardize time stamps
def std_timeStamp(df):
    """
    Receives pandas dataframe and standardizes time stamps 
    """
    import datetime
    # Check to see time stamps are in zero timezones
    print("Converted Time Stamps to Desired Standard Formating...")
    for time in df['timeStamp']:
        if time.endswith('Z'):
            df['timeStamp'] = pd.to_datetime(df['timeStamp'],
                                             infer_datetime_format = True,
                                             utc = True)                       # returns a type '.Timestamp'
            return(df)
        else:
            print("Revisit appropriate variable or function to deal with time zones that are not zero")

In [50]:
def feature_clean(df):
    """
    Performs Generic Cleanup and Preprocessing on a given dataframe sourced from newsAPI
    """
    
    temp = findNone(df)           # removes missing values from author column
    temp2 = gapStrip(temp)        # remove leading and trailing white space
    temp3 = std_timeStamp(temp2)  # convert time stamps to 'utc' standard
    return(temp3)

In [51]:
riskEx_df = feature_clean(newsDF)

Removed 'None' values in author feature...
Removed leading and trailing spaces and tabs...
Converted Time Stamps to Desired Standard Formating...


In [52]:
# Ensure data was preprocessed
riskEx_df.head(5), riskEx_df.tail(5)

(                                author  \
 0                           Yashu Gola   
 1                             BetaList   
 2  David Cottle, Analyst, David Cottle   
 3                        Conor Maloney   
 4                        Cole Petersen   
 
                                                title          publisher  \
 0  Content Creators on YouTube, Twitch, and Wikip...  Crypto Coins News   
 1  COINiD – The next generation Bitcoin wallet th...       Betalist.com   
 2  Australian Dollar Gains May Be Stymied By Fed ...        Dailyfx.com   
 3  Bitcoin ATM CEO: Cryptocurrency Needs Regulati...  Crypto Coins News   
 4  Brazil’s Biggest Brokerage is Officially Joini...        Newsbtc.com   
 
                                           source_url  \
 0  https://www.ccn.com/content-creators-on-youtub...   
 1               https://betalist.com/startups/coinid   
 2  https://www.dailyfx.com/forex/fundamental/fore...   
 3  https://www.ccn.com/bitcoin-atm-ceo-cryptocurr... 

In [53]:
## Check file size
riskEx_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 6 columns):
author         400 non-null object
title          400 non-null object
publisher      400 non-null object
source_url     400 non-null object
timeStamp      400 non-null datetime64[ns, UTC]
description    399 non-null object
dtypes: datetime64[ns, UTC](1), object(5)
memory usage: 18.8+ KB


__1.4 Write out to csv.__

In [57]:
# write out n-csv-files each with 100 rows. Process is done to reduce computational load
riskEx_df.to_csv('rawData_test1008.csv', index_label = False)

In [59]:
df = pd.read_csv('rawData_test1008.csv')
#print(df.info())
print(len(df))
len(df['description'].unique())


400


383

In [96]:
from sklearn.feature_extraction.text import CountVectorizer

In [97]:
vectorizer = CountVectorizer()

In [109]:
type(df['description'])

pandas.core.series.Series

In [125]:
df['description'].dropna(axis = 0,inplace = True)
# df[df['description'].isnull()]

In [129]:
# df[df['description'].isnull()]
df.iloc[241,:]

author                                                  alexmark
title          $TLRY is the perfect example of why there is n...
publisher                                Investmentwatchblog.com
source_url     http://www.investmentwatchblog.com/tlry-is-the...
timeStamp                              2018-09-21 00:29:07+00:00
description                                                  NaN
Name: 241, dtype: object

In [123]:
bag_of_words = vectorizer.fit(df['description'].tolist())

In [130]:
bag_of_words = vectorizer.transform(df['description'].tolist())

In [133]:
wordss = pd.DataFrame(bag_of_words.toarray(), columns=vectorizer.get_feature_names())
wordss.head()

Unnamed: 0,000,0027,03,07,09,0rc4,10,100,105,10yr,...,œi,一职,一職,以及在加密市場中的定位,何謂場外交易,作者,担任,擔任,目前任职美国硅谷创投,目前任職美國矽谷創投
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [174]:
ggg = wordss.columns.tolist()

In [145]:
wordss.sum().sort_values(ascending = False)

the               709
of                332
to                299
and               279
in                230
bitcoin           170
on                164
is                162
for               129
that              119
has               116
cryptocurrency    112
it                 92
by                 89
with               84
are                71
this               70
from               70
as                 69
have               68
exchange           65
its                63
new                58
an                 57
at                 55
be                 54
crypto             53
was                49
been               48
blockchain         47
                 ... 
glyph               1
glorious            1
safety              1
saga                1
game                1
gamescom            1
gaming              1
gardner             1
garza               1
gauges              1
gavin               1
gaw                 1
gaze                1
gbp                 1
gdp       

In [168]:
gg = wordss.sum().sort_values(ascending = False).index
gg

Index(['the', 'of', 'to', 'and', 'in', 'bitcoin', 'on', 'is', 'for', 'that',
       ...
       'sacrifice', 'sacramento', 'gi', 'sabot', 'giants', 'giddy', 'gift',
       's9i', 'given', '目前任職美國矽谷創投'],
      dtype='object', length=3253)

In [146]:
from nltk.corpus import stopwords

In [149]:
stop_words = set(stopwords.words("english"))

In [150]:
type(wordss.sum())

pandas.core.series.Series

In [161]:
dtt = wordss.sum().to_frame("freq")

In [164]:
dtt[dtt['freq']>50]

Unnamed: 0,freq
an,57
and,279
are,71
as,69
at,55
be,54
bitcoin,170
by,89
crypto,53
cryptocurrency,112


In [208]:
filtered = [w for w in gg if not w in stop_words]
freqs = wordss[filtered].sum().to_frame("freq")
freqs[freqs['freq']>15]

Unnamed: 0,freq
bitcoin,170
cryptocurrency,112
exchange,65
new,58
crypto,53
blockchain,47
digital,44
week,44
said,41
market,39


__Note:__ if wanting to create batches of raw data files containing n-articles, use the following

In [109]:
#def df_to_csvs(df):
#    articlesPage = int(100)
#    totalArticles = len(df)
#    batchSize=round(totalArticles/articlesPage)          # number of rows in single output file
        
#    for id, df_i in  enumerate(np.array_split(df, batchSize)):
#        df_i.to_csv('rawData_{id}.csv'.format(id=id), index_label = False)                 

### **End Data Mining I:** Read-in NewsAPI feed for a given date range
___