In [1]:
## Import Packages and Libraries ##

# Web parsing, scraping, etc.
import bs4 as bs # BeautifulSoup4 
import urllib3
import re
import requests # HTTP parser
import html5lib

# DataFrames and math
import pandas as pd
import numpy as np

# Output related packages 
import pprint as pp
import json

# Progress bar and delaying requests 
from tqdm import tnrange, tqdm_notebook #progress bars
from random import randint
import datetime
import time

In [2]:
# stretch Jupyter coding blocks to fit screen
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>")) 

# make it run on py2 and py3
from __future__ import division, print_function

# Data Mining I
This  notebook is intended to perform the following processes:

    1.1 Read-in news articles from newsAPI for a given date range, and up to five queries (passed as a list).

    1.2 Extract features native to the articles (e.g. url).

    1.3 Perform data cleanup and preprocessing.

    1.4 Split dataset into n-csv-files for distributed computation or batching.

### __Documentation__

newsAPI (https://newsapi.org/) has limited documentation as to how their backend works. However, when developing this notebook the following website was frequently referenced -- as their documentation appears to closely match the behavior of newsAPI. 

https://docs.aylien.com/newsapi/#getting-started

Moreover, here is the user agreement:

https://newsapi.org/terms



___
### **Begin Data Mining I:** Read-in NewsAPI feed for a given date range

In [3]:
### NEWSAPI RELATED ###
# The API key is read from the NEWSAPI_KEY environment variable so that no
# credential is stored in the notebook. Set it before starting Jupyter, e.g.
#     export NEWSAPI_KEY=<your key>
# SECURITY: any key that was previously hardcoded in this cell should be treated
# as compromised and rotated at https://newsapi.org/.
import os

m3key = os.environ.get('NEWSAPI_KEY')
if not m3key:
    raise RuntimeError("Set the NEWSAPI_KEY environment variable to your newsAPI key "
                       "before running this notebook.")


# Install API 
#!pip install newsapi-python

# Import Client
from newsapi import NewsApiClient

# Initialize Client (create object)
news_api = NewsApiClient(api_key = m3key)
print(type(news_api))

<class 'newsapi.newsapi_client.NewsApiClient'>


__1.1 Read-in news articles from newsAPI for a given date range__

#### Function: **get_news**
Function establishes values to be used for control of loop then calls functions used to extract news article data

In [4]:
def get_news(query, start, stop, sort, lang, article_count, page_count, init):
    """
    Control function: page through newsAPI results and collect them in one dataframe.

    A first request (get_params) reads 'totalResults', from which the number of
    available pages of `article_count` articles is derived. Pages are then fetched
    one at a time with news_data, sleeping one second before each request.

    Parameters
    ----------
    query, start, stop, sort, lang :
        Forwarded unchanged to get_params / news_data (and from there to newsAPI).
    article_count : int
        Articles requested per page. NOTE: the same value is also used as the
        maximum number of pages fetched in one batch (see `terminate` below).
    page_count :
        'all' forces paging; any other value pages only if there are more
        results than `article_count`.
    init : int
        Page offset of the batch; extraction starts at page `init + 1`.

    Returns
    -------
    pandas.DataFrame
        All fetched pages concatenated (empty if no page was fetched).
        In the non-paging branch the raw newsAPI response dict is returned instead.
    """
    
    import math
    import time  
    
    # extract information about response file to ensure proper loop control
    params = get_params(query, start, stop, sort, lang, article_count, page_count)

    # variable referencing
    status = params['status']
    results = params['totalResults']

    # Confirmation of data extraction
    print("\nVerify Read-in Process:", status)
    print("Number of Articles Correctly Read: ", results)
    print(type(params), params.keys())
           
    # number of pages needed to cover every article at `article_count` articles per page
    loops = math.ceil(results/article_count)
    
    # batching control: pages (begin, terminate] are extracted
    begin = 0 + init
    terminate = article_count + init
    print("Total number of iterations (pages):",loops)
    
    # never request pages beyond the last one that actually holds articles
    if loops < terminate:
        terminate = loops
        
    print("Page range being extracted", begin, terminate)
    
    if page_count == 'all' or article_count <  results:
        print("\n\nExtracting News Data...\n")
        # Pages are collected in a list and concatenated once at the end:
        # DataFrame.append was removed in pandas 2.0, and growing a frame inside
        # a loop copies all accumulated rows on every iteration.
        pages = []
    
        while begin < terminate:
            begin += 1
            page = begin  # for referencing clarity
            
            ### newsAPI has MAX HIT LIMIT of 60 per minute ### 
            time.sleep(1)   # delay of 1 second
            print("Extracting Page", page)

            # fetch the next sequential page of articles
            pages.append(news_data(query, start, stop, sort, lang, article_count, page))
            
        print('Batch extraction completed:',begin,'of',terminate)
        if not pages:
            # pd.concat rejects an empty list; keep returning an empty frame instead
            return(pd.DataFrame())
        return(pd.concat(pages, ignore_index = True))
    else:
        # Single request, no paging.
        # NOTE(review): this branch returns the raw response dict (keys 'status',
        # 'totalResults', 'articles'), not a dataframe like the paging branch.
        print("Possible Invalid Parameters: Check values")
        brief_df = news_api.get_everything(q = query,
                                          from_param= start,
                                          to= stop,
                                          sort_by= sort,
                                          language= lang,
                                          page_size= int(article_count)
                                         )
        return(brief_df)

#### Function: **get_params**
Function runs an initial newsAPI call, used to store values for controlling loops

In [5]:
def get_params(query, start, stop, sort, lang, article_count, page_count):
    """
    Issue one newsAPI 'everything' request and return the raw response.

    Takes the same arguments as the master function get_news (page_count is
    accepted but not used here). The response's 'status' and 'totalResults'
    entries are printed, and the whole response is handed back so the caller
    can use it to control its paging loop.
    """

    print("\nExtracting Parameters for newsAPI...\n")

    fetch = news_api.get_everything
    request = dict(q=query,
                   from_param=start,
                   to=stop,
                   sort_by=sort,
                   language=lang,
                   page_size=int(article_count))
    response = fetch(**request)

    # Echo what came back so the user can confirm the read-in worked
    print("Read-in Status of Given Date Range:", response['status'])
    print("Number of Articles in Given Date Range: ", response['totalResults'])

    return response

#### Function: **news_data**
Function requests a single page of results and extracts the values within 'articles'. Returns a dataframe with the following columns: 


*Index(['author', 'content', 'description', 'publishedAt', 'source', 'title', 'url', 'urlToImage'], dtype='object')*


In [6]:
def news_data(query, start, stop, sort, lang, article_count, page):
    """
    Principal data extraction function: fetch one page of newsAPI results.

    Parameters
    ----------
    query, start, stop, sort, lang :
        Forwarded to news_api.get_everything as q, from_param, to, sort_by, language.
    article_count :
        Page size; converted with int() before the request.
    page : int
        Number of the page to request.

    Returns
    -------
    pandas.DataFrame
        Built from the 'articles' entry of the response.

    Raises
    ------
    TypeError
        If `page` is not an int (e.g. the string 'all').
    KeyError
        If the response has no 'articles' entry.
    """
    
    # Fail early with a clear message instead of hitting an unbound `params` below
    if not isinstance(page, int):
        raise TypeError("page must be an int, got {!r}".format(page))

    params = news_api.get_everything(q = query,
                                     from_param= start,
                                     to= stop,
                                     sort_by= sort,
                                     language= lang,
                                     page_size= int(article_count),
                                     page = page
                                    )
    ########### if params['articles'] throws error ########### 
    # either endpoint limit was met (10000)                  #
    # too many endpoint requests in a month (1000 per month) #
    # change to new api key.                                 #
    ##########################################################
    return(pd.DataFrame(params['articles'])) 

#### User provided parameters and function call.

In [7]:
# NOTE(review): an older note here read "01/26/18 to 03/26/18"; the range actually
# set below is 2018-10-01 to 2018-10-05 -- confirm which one is intended.
query = 'Bitcoin'         # can handle a list of up to five search topics
start = '2018-10-01'      # yyyy-mm-dd, first day of the date range
stop  = '2018-10-05'      # yyyy-mm-dd, last day of the date range
sort  = 'publishedAt'     # passed to newsAPI as sort_by
lang  = 'en'              # passed to newsAPI as language
article_count = 100       # articles per page (passed as page_size); default is 20
page_count = 'all'        # enter 1, 2, ... Notes: 'all' iterates over all articles

___
---
#### __NOTE__

Since newsAPI has a daily limit of endpoint requests (1000 per day), batching needs to be implemented.  The following cell controls the initialization for subsequent cells -- to ensure appropriate, and sequential article extraction.

Also, newsAPI has a limit of 10,000 articles per key -- unless you want to pay for the monthly access.
___
---

In [8]:
## ---------------------------------------------------------------------------- ##
## KEY: Initialize according to batch number (i.e. n*100, where n is the batch) ##
## USE: n > 0 ---- only once we can extract above 10000 articles
## `init` is passed to get_news, where it is the page offset the extraction starts from
n = 0
init = n*100
## ---------------------------------------------------------------------------- ##

In [9]:
# object is the result of the following functions: 'get_news', which calls 'get_params' and 'news_data'
news = get_news(query, start, stop, sort, lang, article_count, page_count,init)


Extracting Parameters for newsAPI...

Read-in Status of Given Date Range: ok
Number of Articles in Given Date Range:  981

Verify Read-in Process: ok
Number of Articles Correctly Read:  981
<class 'dict'> dict_keys(['status', 'totalResults', 'articles'])
Total number of iterations (pages): 10
Page range being extracted 0 10


Extracting News Data...

Extracting Page 1
Extracting Page 2
Extracting Page 3
Extracting Page 4
Extracting Page 5
Extracting Page 6
Extracting Page 7
Extracting Page 8
Extracting Page 9
Extracting Page 10
Batch extraction completed: 10 of 10


#### Explore nested key/value pairs from newsAPI data

In [10]:
## ENSURE API WAS ACCESSED ##
# if news.keys includes 'author', 'description', etc., success!

news.keys()

Index(['author', 'content', 'description', 'publishedAt', 'source', 'title',
       'url', 'urlToImage'],
      dtype='object')

In [11]:
type(news['content'])

pandas.core.series.Series

In [35]:
df = pd.DataFrame(news)

In [36]:
df.head(3)

Unnamed: 0,author,content,description,publishedAt,source,title,url,urlToImage
0,David Hundeyin,Open source browser-based cryptocurrency walle...,Open source browser-based cryptocurrency walle...,2018-10-05T23:25:23Z,"{'id': 'crypto-coins-news', 'name': 'Crypto Co...",Crypto Mining Giant Bitmain Acquires Bitcoin C...,https://www.ccn.com/crypto-mining-giant-bitmai...,https://www.ccn.com/wp-content/uploads/2018/07...
1,Chloe Aiello,"Bitcoin is close to bottoming, and once it reb...","Bitcoin is close to bottoming, said Spencer Bo...",2018-10-05T23:03:00Z,"{'id': 'cnbc', 'name': 'CNBC'}","Bitcoin is close to bottoming, cryptocurrency ...",https://www.cnbc.com/2018/10/05/bitcoin-is-clo...,https://fm.cnbc.com/applications/cnbc.com/reso...
2,Lawrence Abrams,Very very quiet week this. Not much new ransom...,Very very quiet week this. Not much new ransom...,2018-10-05T23:02:12Z,"{'id': None, 'name': 'Bleepingcomputer.com'}",The Week in Ransomware - October 5th 2018 - Re...,https://www.bleepingcomputer.com/news/security...,https://www.bleepstatic.com/images/news/column...


#### **Write news contents into a new csv**

In [46]:
import re
# NOTE: str() of a Series gives its printed summary (row labels plus text elided
# with "..."), not the full article text of each row.
content=str(df['content'])
content

'0      Open source browser-based cryptocurrency walle...\n1      Bitcoin is close to bottoming, and once it reb...\n2      Very very quiet week this. Not much new ransom...\n3      A recent survey by Fundstrat Global Advisors h...\n4      Is this the start of a new phase for Bitcoin o...\n5      On October 5, 2018, an article published by Bl...\n6      Track the price of your favourite cryptocurren...\n7      An exchange-traded bitcoin fund (ETF) would si...\n8      "It is just not credible that the United State...\n9      BitGo, a major Palo Alto-based cryptocurrency ...\n10     Earlier this year, the city of Atlanta was str...\n11                                                  None\n12     The Securities and Exchange Commission (SEC) h...\n13     Scammers are taking advantage of Fortnite \'s p...\n14                                                  None\n15     The story of this week: Crude Oil- Brent makes...\n16     &lt;iframe style="border: none" src="//html5-p...\n17     Famou

In [48]:
df['content'].to_csv('newscontent.csv', index_label = False,index=False)
# object is type 'str'
# need to split str into list, of tokenized words

In [16]:
### print(type(news.content))
print(type(news['content'].iloc[1]))

<class 'str'>


In [17]:
print(len(news))
print(news.keys())
news.head()

len(news['url'].unique())

981
Index(['author', 'content', 'description', 'publishedAt', 'source', 'title',
       'url', 'urlToImage'],
      dtype='object')


978

__1.2 Extract features native to the articles__

#### Function: **get_info**
Function extracts variables from dataframe and stores each as a list, returning all of them as a single dataframe.

__Note:__ *urlToImage* is not included in this process, as we are uncertain as to the value of the feature

In [18]:
def get_info(df):
    """
    Extract the article features of interest from a raw newsAPI dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw articles with the columns 'author', 'title', 'source', 'url',
        'publishedAt' and 'description'. Any other column is ignored.

    Returns
    -------
    pandas.DataFrame
        Columns 'author', 'title', 'publisher', 'source_url', 'timeStamp' and
        'description', with a fresh 0..n-1 index. 'publisher' is the 'name'
        entry of each 'source' dict, or None when the source is not a dict or
        has no 'name'.
    """

    def column_values(name):
        # A missing column yields an empty list rather than a KeyError
        return list(df[name]) if name in df else []

    def publisher_name(source):
        # 'source' cells are expected to be dicts like {'id': ..., 'name': ...};
        # anything else (e.g. None) has no publisher name to offer
        return source.get('name') if isinstance(source, dict) else None

    # assemble the selected columns under their output names
    df = pd.DataFrame({'author' : column_values('author'),
                       'title' : column_values('title'),
                       'publisher' : [publisher_name(source) for source in column_values('source')],
                       'source_url' : column_values('url'),
                       'timeStamp' : column_values('publishedAt'),
                       'description' : column_values('description')})

    return(df)
        

#### Completed newsAPI Read-in Process: 
##### newsDF contains features extracted from raw newsAPI feed, for a given date range, and query.

In [19]:
# Object creation

newsDF = get_info(news)

In [20]:
# Verifying correct data extraction
print("\nDataFrame Dimensions:", newsDF.shape, "\n")
newsDF.head(3)


DataFrame Dimensions: (981, 6) 



Unnamed: 0,author,title,publisher,source_url,timeStamp,description
0,David Hundeyin,Crypto Mining Giant Bitmain Acquires Bitcoin C...,Crypto Coins News,https://www.ccn.com/crypto-mining-giant-bitmai...,2018-10-05T23:25:23Z,Open source browser-based cryptocurrency walle...
1,Chloe Aiello,"Bitcoin is close to bottoming, cryptocurrency ...",CNBC,https://www.cnbc.com/2018/10/05/bitcoin-is-clo...,2018-10-05T23:03:00Z,"Bitcoin is close to bottoming, said Spencer Bo..."
2,Lawrence Abrams,The Week in Ransomware - October 5th 2018 - Re...,Bleepingcomputer.com,https://www.bleepingcomputer.com/news/security...,2018-10-05T23:02:12Z,Very very quiet week this. Not much new ransom...


__1.3 Perform data cleanup and preprocessing.__



The following functions perform basic clean up on a dataframe. The purpose is to prepare the file to write-out (csv).  
 

In [21]:
# Replace 'None' values
def findNone(df):
    """
    Receives a pandas dataframe and fills null entries of the author feature.

    Every null 'author' (None or NaN) is replaced by the 'publisher' value of
    the same row. The dataframe is modified in place and also returned.
    Works with any index, not only the default 0..n-1 one.
    """
    print("Removed 'None' values in author feature...")
    author = df['author']
    # Assign the whole column back onto df: writing into a Series pulled out of
    # the frame is chained assignment and is not guaranteed to reach df.
    df['author'] = author.where(author.notnull(), df['publisher'])
    return(df)

In [22]:
# Remove gaps 
def gapStrip(df):
    """
    Receives a pandas dataframe and strips leading and trailing whitespace.

    Whitespace is removed from the column labels and from every string cell;
    non-string labels and non-string cells are left untouched. The column
    labels of the dataframe passed in are renamed in place, while the stripped
    cell values are returned in a new dataframe.
    """
    strip_if_str = lambda value: value.strip() if isinstance(value, str) else value

    # Guard the labels the same way as the cells: a non-string label (e.g. an
    # integer column name) has nothing to strip.
    df.columns = [strip_if_str(label) for label in df.columns]
    print("Removed leading and trailing spaces and tabs...")

    # element-wise operation: DataFrame.applymap was renamed DataFrame.map in
    # pandas 2.1. Look the method up on the class so a column that happens to
    # be called 'map' cannot shadow it.
    elementwise = getattr(type(df), 'map', None)
    if elementwise is None:
        elementwise = type(df).applymap
    df = elementwise(df, strip_if_str)
    return(df)

In [23]:
# Standardize time stamps
def std_timeStamp(df):
    """
    Receives a pandas dataframe and standardizes its time stamps.

    The 'timeStamp' column is parsed with pandas.to_datetime(..., utc=True) and
    written back onto the dataframe, which is then returned. A dataframe is
    always returned, including when it is empty or when no stamp ends in 'Z'.

    If any string stamp does not end in 'Z' (i.e. is not marked as zero-offset),
    a notice is printed once so the time zone handling can be reviewed; the
    column is still converted.
    """
    print("Converted Time Stamps to Desired Standard Formating...")
    stamps = df['timeStamp']

    # Only strings can carry a trailing 'Z'; missing or already-parsed values are skipped
    if any(isinstance(stamp, str) and not stamp.endswith('Z') for stamp in stamps):
        print("Revisit appropriate variable or function to deal with time zones that are not zero")

    df['timeStamp'] = pd.to_datetime(stamps, utc = True)   # elements become type '.Timestamp'
    return(df)

In [24]:
def feature_clean(df):
    """
    Generic cleanup and preprocessing pipeline for a dataframe sourced from newsAPI.

    Applies, in this order:
      1. findNone      -- deals with missing values in the author column
      2. gapStrip      -- removes leading and trailing white space
      3. std_timeStamp -- converts time stamps to the 'utc' standard
    and returns whatever the last stage returns.
    """

    # nested calls evaluate innermost-first, i.e. in the order listed above
    return std_timeStamp(gapStrip(findNone(df)))

In [25]:
riskEx_df = feature_clean(newsDF)

Removed 'None' values in author feature...
Removed leading and trailing spaces and tabs...
Converted Time Stamps to Desired Standard Formating...


In [26]:
# Ensure data was preprocessed
riskEx_df.head(5), riskEx_df.tail(5)

(              author                                              title  \
 0     David Hundeyin  Crypto Mining Giant Bitmain Acquires Bitcoin C...   
 1       Chloe Aiello  Bitcoin is close to bottoming, cryptocurrency ...   
 2    Lawrence Abrams  The Week in Ransomware - October 5th 2018 - Re...   
 3  Osato Avan-Nomayo  Institutional Investors Bullish About Cryptocu...   
 4     Vildana Hajric  With its volatility on the decline, is Bitcoin...   
 
               publisher                                         source_url  \
 0     Crypto Coins News  https://www.ccn.com/crypto-mining-giant-bitmai...   
 1                  CNBC  https://www.cnbc.com/2018/10/05/bitcoin-is-clo...   
 2  Bleepingcomputer.com  https://www.bleepingcomputer.com/news/security...   
 3        Bitcoinist.com  https://bitcoinist.com/institutional-investors...   
 4           Latimes.com  http://www.latimes.com/business/la-fi-bitcoin-...   
 
                   timeStamp                                      

In [27]:
## Check file size
riskEx_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 981 entries, 0 to 980
Data columns (total 6 columns):
author         981 non-null object
title          981 non-null object
publisher      981 non-null object
source_url     981 non-null object
timeStamp      981 non-null datetime64[ns, UTC]
description    979 non-null object
dtypes: datetime64[ns, UTC](1), object(5)
memory usage: 46.1+ KB


__1.4 Write out to csv.__

In [28]:
# write the cleaned dataframe to a single csv file ('rawData.csv'); the row index is not written.
# (splitting into several smaller files is sketched in the commented-out df_to_csvs cell further down)
riskEx_df.to_csv('rawData.csv', index_label = False,index=False)

In [29]:
df = pd.read_csv('rawData.csv')
#print(df.info())
print(len(df))
len(df['description'].unique())


981


917

__Note:__ if wanting to create batches of raw data files containing n-articles, use the following

In [30]:
#def df_to_csvs(df):
#    articlesPage = int(100)
#    totalArticles = len(df)
#    batchSize=round(totalArticles/articlesPage)          # number of rows in single output file
        
#    for id, df_i in  enumerate(np.array_split(df, batchSize)):
#        df_i.to_csv('rawData_{id}.csv'.format(id=id), index_label = False)                 

### **End Data Mining I:** Read-in NewsAPI feed for a given date range
___