In [91]:
import requests

API_ROOT = 'http://api.nytimes.com/svc/search/v2/articlesearch.'

API_SIGNUP_PAGE = 'http://developer.nytimes.com/docs/reference/keys'

class NoAPIKeyException(Exception):
    def __init__(self, value):
        self.value = value
    def __str__(self):
        return repr(self.value)

class articleAPI(object):
    def __init__(self, key = None):
        """
        Initializes the articleAPI class with a developer key. Raises an exception if a key is not given.

        Request a key at http://developer.nytimes.com/docs/reference/keys

        :param key: New York Times Developer Key

        """
        self.key = key
        self.response_format = 'json'

        if self.key is None:
            raise NoAPIKeyException('Warning: Missing API Key. Please visit ' + API_SIGNUP_PAGE + ' to register for a key.')

    def _bool_encode(self, d):
        """
        Converts boolean values to lowercase strings

        """
        for k, v in d.items():
            if isinstance(v, bool):
                d[k] = str(v).lower()

        return d

    def _options(self, **kwargs):
        """
        Formats search parameters/values for use with API

        :param \*\*kwargs: search parameters/values

        """
        def _format_fq(d):
            for k,v in d.items():
                if isinstance(v, list):
                    d[k] = ' '.join(map(lambda x: '"' + x + '"', v))
                else:
                    d[k] = '"' + v + '"'
            values = []
            for k,v in d.items():
                value = '%s:(%s)' % (k,v)
                values.append(value)
            values = ' AND '.join(values)
            return values

        kwargs = self._bool_encode(kwargs)

        values = ''

        for k, v in kwargs.items():
            if k is 'fq' and isinstance(v, dict):
                v = _format_fq(v)
            elif isinstance(v, list):
                v = ','.join(v)
            values += '%s=%s&' % (k, v)

        return values

    def search(self,
                response_format = None,
                key = None,
                **kwargs):
        """
        Calls the API and returns a dictionary of the search results

        :param response_format: the format that the API uses for its response,
                                includes JSON (.json) and JSONP (.jsonp).
                                Defaults to '.json'.

        :param key: a developer key. Defaults to key given when the articleAPI class was initialized.

        """
        if response_format is None:
            response_format = self.response_format
        if key is None:
            key = self.key

        url = '%s%s?%sapi-key=%s' % (
            API_ROOT, response_format, self._options(**kwargs), key
        )

        self.req = requests.get(url)
        return self.req.json()


In [6]:
articles = api.search(q="Obama", 
                          fq={"headline": "Obama", 
                              "source": ["Reuters", 
                                         "AP", 
                                         "The New York Times"]}, 
                          begin_date="20161001", # this can also be an int
                          facet_field=["source", "day_of_week"], 
                          facet_filter=True)

In [5]:
api = articleAPI('vyk8e6eHS3dV0Un4RkS6RSccPkMnrbbo')

In [7]:
articles

{'status': 'OK',
 'copyright': 'Copyright (c) 2019 The New York Times Company. All Rights Reserved.',
 'response': {'docs': [{'web_url': 'https://www.nytimes.com/video/us/100000006800276/obama-activism.html',
    'snippet': 'Former President Barack Obama on Tuesday challenged youth activists on their “purity” and “judgmentalism” during an interview about youth activism at the Obama Foundation summit.',
    'lead_paragraph': 'Former President Barack Obama on Tuesday challenged youth activists on their “purity” and “judgmentalism” during an interview about youth activism at the Obama Foundation summit.',
    'abstract': 'Former President Barack Obama on Tuesday challenged youth activists on their “purity” and “judgmentalism” during an interview about youth activism at the Obama Foundation summit.',
    'source': 'The New York Times',
    'multimedia': [{'rank': 0,
      'subtype': 'xlarge',
      'caption': None,
      'credit': None,
      'type': 'image',
      'url': 'images/2019/10/3

In [29]:
start_year = 2002
end_year = 2019
country = 'Sweden'
dictionary = 'Vote, Ballot, Poll'

In [92]:
import numpy as np
import pandas as pd
import time
import re
import requests
from bs4 import BeautifulSoup

def parse_articles(articles):
    
    news = []
    
    for i in articles['response']['docs']:
        
        dic = {}
        dic['date'] = i['pub_date'][0:10] # cutting time of day.
        dic['atype'] = i['type_of_material']
        dic['url'] = i['web_url']
        dic['word_count'] = int(i['word_count'])
        dic['lead_par'] = i['lead_paragraph']
        news.append(dic)
        
    return news

def get_articles_url(api, country, start_year, end_year):
    
    all_articles = []
    year = start_year
    
    print('Retrieving articles URL...'),
    
    #Loop through all years of interest
    while year <= end_year:
        
        # Some pages might return a 'No JSON object could be decoded'
        # Example: country = Turkey, year = 1998, page 4
        # To keep this error from stopping the loop a try/except was used.
        for i in range(0,100):

            try:
                # Call API method with the parameters discussed on the README file
                articles = api.search(
                    fq = {'source':['The New York Times'], 
                          'glocations':(country), 
                          'news_desk':('Foreign')}, 
                    begin_date = str(year) + '0101', 
                    end_date = str(year) + '0131', 
                    sort = 'oldest', page = str(i))
                
                # Check if page is empty
                if articles['response']['docs'] == []: break
                
                articles = parse_articles(articles)
                all_articles = all_articles + articles
                
            except Exception:

                pass
            
            # Avoid overwhelming the API
            time.sleep(1)
            
        year += 1
    
    # Copy all articles on the list to a Pandas dataframe
    articles_df = pd.DataFrame(all_articles)
    
    # Make sure we filter out non-news articles and remove 'atype' column
    #articles_df = articles_df.drop(articles_df[articles_df.atype != 'News'].index)
    #articles_df.drop('atype', axis = 1, inplace = True)
    
    # Discard non-working links (their number of word_count is 0).
    # Example: http://www.nytimes.com/2001/11/06/world/4-die-during-police-raid-in-istanbul.html
   # articles_df = articles_df[articles_df.word_count != 0]
    articles_df = articles_df.reset_index(drop = True)
    
    print('Done!')
    
    return(articles_df)

def scarp_articles_text(articles_df):
    
    # Unable false positive warning from Pandas dataframe manipulation
    pd.options.mode.chained_assignment = None
    
    articles_df['article_text'] = 'NaN'
    session = requests.Session()
    
    print('Scarping articles body text...'),
    
    for j in range(0, len(articles_df)):
        
        url = articles_df['url'][j]
        req = session.get(url)
        soup = BeautifulSoup(req.text, 'lxml')

        # Get only HTLM tags with article content
        # Articles through 1986 are found under different p tag 
        paragraph_tags = soup.find_all('p', class_= 'story-body-text story-content')
        if paragraph_tags == []:
            paragraph_tags = soup.find_all('p', itemprop = 'articleBody')

        # Put together all text from HTML p tags
        article = ''
        for p in paragraph_tags:
            article = article + ' ' + p.get_text()

        # Clean article replacing unicode characters
        article = article.replace(u'\u2018', u"'").replace(u'\u2019', u"'").replace(u'\u201c', u'"').replace(u'\u201d', u'"')

        # Copy article's content to the dataframe
        articles_df['article_text'][j] = article
    
    print('Done!')
    
    return articles_df

In [45]:
articles_df = get_articles_url(api, country, start_year, end_year)
articles_df.tail()

Retrieving articles URL...
Done!


Unnamed: 0,atype,date,lead_par,url,word_count
15,News,2016-01-31,"HALMSTAD, Sweden — Scores of masked men dresse...",https://www.nytimes.com/2016/02/01/world/europ...,394
16,News,2017-01-19,"LONDON — Julian Assange, the WikiLeaks founder...",https://www.nytimes.com/2017/01/19/world/europ...,526
17,News,2018-01-23,BEIJING — China on Tuesday publicly rebuffed d...,https://www.nytimes.com/2018/01/23/world/asia/...,790
18,News,2019-01-02,"HALMSTAD, Sweden — Maria Hussein, who escaped ...",https://www.nytimes.com/2019/01/02/world/europ...,1288
19,News,2019-01-18,STOCKHOLM — After a deadlock lasting more than...,https://www.nytimes.com/2019/01/18/world/europ...,849


In [11]:
year = 2003

In [12]:
i=1

In [33]:
 articles = api.search(
                    fq = {'source':['The New York Times'], 
                          'glocations':(country), 
                          'news_desk':('Foreign')}, 
                    begin_date = str(year) + '0101', 
                    end_date = str(2019) + '0131', 
                    sort = 'oldest', page = str(i))

In [34]:
articles

{'status': 'OK',
 'copyright': 'Copyright (c) 2019 The New York Times Company. All Rights Reserved.',
 'response': {'docs': [{'web_url': 'https://www.nytimes.com/2008/01/04/obituaries/04bolin.html',
    'snippet': 'Bert Bolin was a pioneering climatologist and the first chairman of the United Nations Intergovernmental Panel on Climate Change. ',
    'abstract': 'Bert Bolin was a pioneering climatologist and the first chairman of the United Nations Intergovernmental Panel on Climate Change. ',
    'print_page': '7',
    'source': 'The New York Times',
    'multimedia': [{'rank': 0,
      'subtype': 'articleInline',
      'caption': None,
      'credit': None,
      'type': 'image',
      'url': 'images/2008/01/04/world/04bolin.190.jpg',
      'height': 241,
      'width': 190,
      'legacy': {},
      'subType': 'articleInline',
      'crop_name': 'articleInline'},
     {'rank': 0,
      'subtype': 'thumbnail',
      'caption': None,
      'credit': None,
      'type': 'image',
      '

In [28]:
articles_df

In [35]:
if articles['response']['docs'] == []: hyj = 1

In [36]:
hyj

NameError: name 'hyj' is not defined

In [37]:
articles = parse_articles(articles)

NameError: name 'parse_articles' is not defined

In [42]:
articles_df = scarp_articles_text(articles_df)
articles_df.tail()

Scarping articles body text...
Done!


Unnamed: 0,atype,date,url,word_count,article_text
9,Obituary (Obit),2015-01-31,https://www.nytimes.com/2015/02/01/world/europ...,918,
10,News,2017-01-19,https://www.nytimes.com/2017/01/19/world/europ...,526,
11,News,2018-01-23,https://www.nytimes.com/2018/01/23/world/asia/...,790,
12,News,2019-01-02,https://www.nytimes.com/2019/01/02/world/europ...,1288,
13,News,2019-01-18,https://www.nytimes.com/2019/01/18/world/europ...,849,


In [46]:
export_csv = articles_df.to_csv (r'C:\Users\anton\Documents\work\workIA\coding\fIncubator\stage02\myProject\data\dumpFroJupyterLead.csv', index = None, header=True) 

In [93]:
import time

In [69]:
time_string = "20180621"
result = time.strptime(time_string, "%Y%m%d")

In [74]:
epochseconds = time.mktime(result)
deltaDay = 60*60*24
nextDayEpoch = epochsec+deltaDay
nexDay = time.localtime(nextDayEpoch)

In [79]:
nextDay= time.strftime('%Y%m%d', nexDay)

In [80]:
nextDay

'20180622'

In [94]:
def get_nextday(today):
    todayStruct = time.strptime(today, '%Y%m%d')
    epochseconds = time.mktime(todayStruct)
    deltaDay = 60*60*24
    nextDayEpoch = epochseconds+deltaDay
    nexDayStruct = time.localtime(nextDayEpoch)
    nextDay= time.strftime('%Y%m%d', nexDayStruct)
    return nextDay
    

In [85]:
get_nextday('20191102')

'20191103'

In [99]:
def get_articles_url(api, start_day, end_day):
    
    all_articles = []
    day = start_day
    
    print('Retrieving articles URL...'),
    
    #Loop through all years of interest
    while day != end_day:
        
        # Some pages might return a 'No JSON object could be decoded'
        # Example: country = Turkey, year = 1998, page 4
        # To keep this error from stopping the loop a try/except was used.
        for i in range(0,2):

            try:
                # Call API method with the parameters discussed on the README file
                articles = api.search(
                    fq = {'source':['The New York Times'],                           
                          'news_desk':('Business')}, 
                    begin_date = day, 
                    end_date = day, 
                    sort = 'oldest', page = str(i))
                
                # Check if page is empty
                if articles['response']['docs'] == []: break
                
                articles = parse_articles(articles)
                all_articles = all_articles + articles
                
            except Exception:

                pass
            
            # Avoid overwhelming the API
            time.sleep(1)
            
        day = get_nextday(day)
    
    # Copy all articles on the list to a Pandas dataframe
    articles_df = pd.DataFrame(all_articles)
    
    # Make sure we filter out non-news articles and remove 'atype' column
    #articles_df = articles_df.drop(articles_df[articles_df.atype != 'News'].index)
    #articles_df.drop('atype', axis = 1, inplace = True)
    
    # Discard non-working links (their number of word_count is 0).
    # Example: http://www.nytimes.com/2001/11/06/world/4-die-during-police-raid-in-istanbul.html
   # articles_df = articles_df[articles_df.word_count != 0]
    articles_df = articles_df.reset_index(drop = True)
    
    print('Done!')
    
    return(articles_df)

In [100]:
start_day = '20180101'
end_day = '20180105'
articles_df = get_articles_url(api, start_day, end_day)
articles_df.tail()

Retrieving articles URL...
Done!


Unnamed: 0,atype,date,lead_par,url,word_count
49,News,2018-01-04,Sales of electric and hybrid cars in Norway ou...,https://www.nytimes.com/2018/01/04/business/en...,785
50,News,2018-01-04,Newsroom employees at The Los Angeles Times be...,https://www.nytimes.com/2018/01/04/business/me...,869
51,News,2018-01-04,In the midst of a long-running bull market tha...,https://www.nytimes.com/2018/01/04/business/ma...,1406
52,News,2018-01-04,"PASADENA, Calif. — A top Fox TV executive said...",https://www.nytimes.com/2018/01/04/business/me...,740
53,News,2018-01-04,"On Wednesday, a group of security experts reve...",https://www.nytimes.com/2018/01/04/technology/...,1188


In [101]:
export_csv = articles_df.to_csv (r'C:\Users\anton\Documents\work\workIA\coding\fIncubator\stage02\myProject\data\dumpFroJupyterBusiness01-10.csv', index = None, header=True) 

In [None]:
articles = api.search(
                    fq = {'source':['The New York Times'],                           
                          'section_name':('Business')}, 
                    begin_date = day, 
                    end_date = day, 
                    sort = 'oldest', page = str(i))