In [9]:
import requests
import datetime
import re

In [10]:
"""This class converts a dict to nested objects"""

class Struct(object):
    """
    Attributes will depend on the structure of object. 
    If we keep calling the 'everything' newsapi, then the attributes will be:
    
                    articles: A list of articles, each with their own objects
                    status: Status of request, should be 'ok'
                    totalResults: The total number of results available for the request, will need
                                  to use the &page= parameter to get these as only 20 articles are
                                  returned per request.
                                  
    Resource: https://stackoverflow.com/questions/1305532/convert-python-dict-to-object
    """
    def __init__(self, data):
        for name, value in data.items():
            setattr(self, name, self._wrap(value))

    def _wrap(self, value):
        if isinstance(value, (tuple, list, set, frozenset)): 
            return type(value)([self._wrap(v) for v in value])
        else:
            return Struct(value) if isinstance(value, dict) else value


"""
General class to aggregate all useful objects. 
Could customize, e.g. change structure of get_raw_data to affect data object
"""

class myclass(object):
    """
    Attributes:
                call: The url sent to newsapi
                raw_data: The dictionary returned when requesting call
    """
    
    def get_raw_data(self,call):
        r = requests.get(call).json()
        
        for i in r['articles']:
            del i['author']
            del i['urlToImage']
            t = datetime.datetime.strptime(i['publishedAt'], "%Y-%m-%dT%H:%M:%S%fZ")
            nt = t.replace(hour=0, minute=0, second=0, microsecond=0)
            i['publishedAt'] = str(nt)
            i['source'] = i['source']['name']
        
        return r
    
    def __init__(self,call):
        self.call = call
        self.data = Struct(self.get_raw_data(call))
        self.data.n_pages = self.data.totalResults/20
        
    """
    Takes call and paginates over user input number of pages to provide a list of 
    lists made up of articles
    """
        
    def paginate(self,n):
        #If page argument already exists in call, remove it
        fp = self.call.find('&page=')
        if fp > 0:
            l = [x for x, v in enumerate(self.call) if v == '&']
            l.append(len(self.call))
            nxt = l[next(x[0] for x in enumerate(l) if x[1] > fp)]
            base_call = self.call[:fp] + self.call[nxt:]
        else:
            base_call = self.call
            
        #print(base_call)
        articles_list = []
        
        for i in range(1,n+1):
            new_call = base_call + "&page=" + str(i)
            #print(new_call)
            d = Struct(self.get_raw_data(new_call))
            #print(d.articles[0].__dict__)
            articles_list.extend(d.articles)
            
        return articles_list


In [11]:
n =  myclass('https://newsapi.org/v2/everything?q=(BTC OR bitcoin)&from=2018-01-11&to=2018-01-18&language=en&sortBy=popularity&apiKey=6d00cdefd3bc4ee38f8a7af69ac5bec4')

In [12]:
n.data.__dict__

{'articles': [<__main__.Struct at 0x7f378c03d1d0>,
  <__main__.Struct at 0x7f378c03d7f0>,
  <__main__.Struct at 0x7f378c03d2b0>,
  <__main__.Struct at 0x7f378c03d2e8>,
  <__main__.Struct at 0x7f378c03d358>,
  <__main__.Struct at 0x7f378c03d550>,
  <__main__.Struct at 0x7f378c03d390>,
  <__main__.Struct at 0x7f378c03dac8>,
  <__main__.Struct at 0x7f378c03d208>,
  <__main__.Struct at 0x7f378c03d6d8>,
  <__main__.Struct at 0x7f378c03d710>,
  <__main__.Struct at 0x7f378c03d5c0>,
  <__main__.Struct at 0x7f378c03d588>,
  <__main__.Struct at 0x7f378c03d668>,
  <__main__.Struct at 0x7f378c03dc50>,
  <__main__.Struct at 0x7f378c03d080>,
  <__main__.Struct at 0x7f378c03d128>,
  <__main__.Struct at 0x7f378c03dcc0>,
  <__main__.Struct at 0x7f378c03d940>,
  <__main__.Struct at 0x7f378c03d908>],
 'n_pages': 256.7,
 'status': 'ok',
 'totalResults': 5134}

In [13]:
l = n.paginate(250)

In [16]:
len(l)  #should be 20 * input of paginate()

5000

In [17]:
import pickle
PIK = "test.dat"

with open(PIK, "wb") as f:
    pickle.dump(l, f)