For questions or bugs, contact Dr. H  

## Import

In [1]:
import requests
import json
import re
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

## Set credentials 

* You will need to get your own API credentials to use this code 
* [https://newsapi.org/docs/get-started](https://newsapi.org/docs/get-started)

In [2]:
baseURL = "https://newsapi.org/v2/everything?"
total_requests=2
verbose=True

# THIS CODE WILL NOT WORK UNLESS YOU INSERT YOUR API KEY IN THE NEXT LINE
API_KEY='INSERT_API_KEY'
TOPIC='coffee'

## Form URL and save result

In [3]:
URLpost = {'apiKey': API_KEY,
            'q': '+'+TOPIC,
            'sortBy': 'relevancy',
            'totalRequests': 1}

print(baseURL)
# print(URLpost)

#GET DATA FROM API
response = requests.get(baseURL, URLpost) #request data from the server
# print(response.url);  
response = response.json() #extract txt data from request into json

# PRETTY PRINT
# https://www.digitalocean.com/community/tutorials/python-pretty-print-json

print(json.dumps(response, indent=2))

# #GET TIMESTAMP FOR PULL REQUEST
from datetime import datetime
timestamp = datetime.now().strftime("%Y-%m-%d-H%H-M%M-S%S")

# SAVE TO FILE 
with open(timestamp+'-newapi-raw-data.json', 'w') as outfile:
    json.dump(response, outfile, indent=4)


https://newsapi.org/v2/everything?
{
  "status": "ok",
  "totalResults": 9635,
  "articles": [
    {
      "source": {
        "id": "engadget",
        "name": "Engadget"
      },
      "author": "Sarah Fielding",
      "title": "Corsair\u2019s first standing desk is a huge and expandable all-in-one",
      "description": "There are desks with just enough room to put a coffee next to your laptop without the risk of it spilling over the edge, and then there are desks that are big (and complex) enough that you could practically open up your own coffee stand. The upcoming Corsair \u2026",
      "url": "https://www.engadget.com/corsairs-first-standing-desk-is-a-huge-and-expandable-all-in-one-113519907.html",
      "urlToImage": "https://s.yimg.com/os/creatr-uploaded-images/2023-08/ac0ec3e0-4197-11ee-9df7-aa3c67cc596d",
      "publishedAt": "2023-08-23T11:35:19Z",
      "content": "There are desks with just enough room to put a coffee next to your laptop without the risk of it spilling ove

## Utility function

* Function to clean strings

In [4]:
def string_cleaner(input_string):
    try: 
        out=re.sub(r"""
                    [,.;@#?!&$-]+  # Accept one or more copies of punctuation
                    \ *           # plus zero or more copies of a space,
                    """,
                    " ",          # and replace it with a single space
                    input_string, flags=re.VERBOSE)

        #REPLACE SELECT CHARACTERS WITH NOTHING
        out = re.sub('[’.]+', '', input_string)

        #ELIMINATE DUPLICATE WHITESPACES USING WILDCARDS
        out = re.sub(r'\s+', ' ', out)

        #CONVERT TO LOWER CASE
        out=out.lower()
    except:
        print("ERROR")
        out=''
    return out

## Clean JSON

* clean data and make a list of lists

In [5]:
article_list=response['articles']   #list of dictionaries for each article
article_keys=article_list[0].keys()
print("AVAILABLE KEYS:")
print(article_keys)
index=0
cleaned_data=[];  
for article in article_list:
    tmp=[]
    if(verbose):
        print("#------------------------------------------")
        print("#",index)
        print("#------------------------------------------")

    for key in article_keys:
        if(verbose):
            print("----------------")
            print(key)
            print(article[key])
            print("----------------")

        if(key=='source'):
            src=string_cleaner(article[key]['name'])
            tmp.append(src) 

        if(key=='author'):
            author=string_cleaner(article[key])
            #ERROR CHECK (SOMETIMES AUTHOR IS SAME AS PUBLICATION)
            if(src in author): 
                print(" AUTHOR ERROR:",author);author='NA'
            tmp.append(author)

        if(key=='title'):
            tmp.append(string_cleaner(article[key]))

        # if(key=='description'):
        #     tmp.append(string_cleaner(article[key]))

        # if(key=='content'):
        #     tmp.append(string_cleaner(article[key]))

        if(key=='publishedAt'):
            #DEFINE DATA PATERN FOR RE TO CHECK  .* --> wildcard
            ref = re.compile('.*-.*-.*T.*:.*:.*Z')
            date=article[key]
            if(not ref.match(date)):
                print(" DATE ERROR:",date); date="NA"
            tmp.append(date)

    cleaned_data.append(tmp)
    index+=1


AVAILABLE KEYS:
dict_keys(['source', 'author', 'title', 'description', 'url', 'urlToImage', 'publishedAt', 'content'])
#------------------------------------------
# 0
#------------------------------------------
----------------
source
{'id': 'engadget', 'name': 'Engadget'}
----------------
----------------
author
Sarah Fielding
----------------
----------------
title
Corsair’s first standing desk is a huge and expandable all-in-one
----------------
----------------
description
There are desks with just enough room to put a coffee next to your laptop without the risk of it spilling over the edge, and then there are desks that are big (and complex) enough that you could practically open up your own coffee stand. The upcoming Corsair …
----------------
----------------
url
https://www.engadget.com/corsairs-first-standing-desk-is-a-huge-and-expandable-all-in-one-113519907.html
----------------
----------------
urlToImage
https://s.yimg.com/os/creatr-uploaded-images/2023-08/ac0ec3e0-4197-11

## Convert to Dataframe

* Convert to pandas data frame and write to csv



In [6]:
df = pd.DataFrame(cleaned_data)
print(df)
df.to_csv('cleaned.csv', index=False) #,index_label=['title','src','author','date','description'])


                   0                                1  \
0           engadget                   sarah fielding   
1   business insider                     eliza relman   
2   business insider                      alex bitter   
3   business insider            morgan mcfall-johnsen   
4        boing boing                 jason weisberger   
..               ...                              ...   
95          huffpost                     sara boboltz   
96  business insider                       grace dean   
97             /film  staff@slashfilmcom (mike shutt)   
98  business insider                    taylor berman   
99  business insider                       grace dean   

                                                    2                     3  
0   corsairs first standing desk is a huge and exp...  2023-08-23T11:35:19Z  
1                         loneliness is a design flaw  2023-09-06T09:30:01Z  
2   i shopped at amazon fresh to see why it's stru...  2023-09-04T09:00:06Z  
3  