In [1]:
import pandas as pd
import numpy as np
import requests
import cnfg
import json
from pymongo import MongoClient
import pprint
import time
import random
#import newspaper
from datetime import datetime
from bs4 import BeautifulSoup
from IPython.display import clear_output

In [2]:
import logging
logging.basicConfig(filename='nyt_error_log.log',filemode='w', level=logging.INFO)

In [3]:
config = cnfg.load(".katie_mongo.cnfg")
mongouser=config['user']
mongopwd=config['pwd']

In [4]:
###MONGO initialization
client = MongoClient("mongodb://"+mongouser+":"+mongopwd+"@localhost/test") #host='localhost:27017'
db = client.test #database
table = db.nyttest #"table" also called collection

In [5]:
def get_date(request_page):
    '''get date from final page in allowed page requests
    and return it in order to build new API query 
    & access older pages'''
    
    last_date=datetime.strptime(request_page[0]['pub_date'][:10], '%Y-%m-%d')
    prevdate_str = last_date.date().strftime('%Y%m%d')
    return prevdate_str

In [6]:
def get_items(request_page, page_num):
    ''' 
    arguments:
    request_page - the 10 item long json file of query results
    page_num - the page number of the larger api query
    
    get_items scrapes the date, url, desk, section name, type of material,  and keywords
    for one each query result and will store it in mongo db
    '''
    for article in request_page:
        entry={}
        keywords_list=[]
        try:
            entry['date']=article['pub_date']
        except:
            pass
        try:
            entry['url']=article['web_url']
        except:
            pass
        try:
            entry['source']=article['source']
        except:
            pass    
        try:
            entry['news_desk']=article['news_desk']
        except:
            pass    
        try:
            entry['section_name']=article['section_name']
        except:
            pass    
        try:
            entry['type_of_material']=article['type_of_material']
        except:
            pass    
        try:
            for i in range(0, len(article['keywords'])):
                keywords_list.append(article['keywords'][i]['value'])
            entry['keywords_list']=keywords_list
        except:
            pass

        # insert into db
        try:
            table.insert_one(entry)
        except:
            msg = 'Failed to insert item on page %i into mongodb' % page_num
            logging.error(msg)

In [7]:
nyt_api_config = cnfg.load("notebooks/.nyt_api_config")
nyt_api_key = nyt_api_config['api_key']

In [7]:
# ALREADY QUERIED: NYT & type_of_material= article,letter,blog
#                  NYT & type_of_material= column,editorial

startpage=0
endpage=150

base='https://api.nytimes.com/svc/search/v2/articlesearch.json?'
q="q=abortion"
fq="&fq=source:(\"The New York Times\") AND type_of_material:(\"column\" \"editorial\")"
fl="&fl=pub_date,headline,news_desk,source,section_name,web_url,keywords,type_of_material"
begin_date="&begin_date=19800101"
sort="&sort=newest"
facet_field="&facet-field=section_name"
facet_filter="&facet-filter=true"
a="&api-key="+nyt_api_key

#make server requests for content & url's page by page (0-200), currently 0-150
for i in range(startpage,endpage):
    page="&page="+str(i)
    querystring= base+q+fq+fl+facet_field+facet_filter+begin_date+page+sort+a
    response = requests.get(querystring)
    articles=response.json()
    
    if articles['status']=='OK':
        msg="Status: Round 0-100, OK Page %i" % i
        print(msg)
        get_items(articles['response']['docs'], i)
        clear_output(wait=True)
    else:
        msg = 'Blocked from server at date %d, page %i' % (begin_date,i)
        logging.error(msg)
        break

    #wait 1 seconds before submitting request for next page of results
    time.sleep(1)


clear_output()

# #make server requests for url's page by page - with updated end date (200-400)
# end_date="&end_date="+get_date(articles['response']['docs'])
# for i in range(startpage,endpage):
#     page="&page="+str(i)
#     querystring= base+q+fq+fl+facet_field+facet_filter+begin_date+end_date+page+sort+a
#     response = requests.get(querystring)
#     articles=response.json()
    
#     if articles['status']=='OK':
#         msg="Status: Round 200-400, OK Page %i" % i
#         print(msg)
#         get_items(articles['response']['docs'], i)
#         clear_output(wait=True)
#     else:
#         msg = 'Blocked from server at date %d, page %i' % (begin_date,i)
#         logging.error(msg)
#         break

#     #wait 1 seconds before submitting request for next page of results
#     time.sleep(1)
    

# #make server requests for url's page by page - with updated end date (400-600)
# end_date="&end_date="+get_date(articles['response']['docs'])
# for i in range(startpage,endpage):
#     page="&page="+str(i)
#     querystring= base+q+fq+fl+facet_field+facet_filter+begin_date+end_date+page+sort+a
#     response = requests.get(querystring)
#     articles=response.json()
    
#     if articles['status']=='OK':
#         msg="Status: Round 400-600, OK Page %i" % i
#         print(msg)
#         get_items(articles['response']['docs'], i)
#         clear_output(wait=True)
#     else:
#         msg = 'Blocked from server at date %d, page %i' % (begin_date,i)
#         logging.error(msg)
#         break

#     #wait 1 seconds before submitting request for next page of results
#     time.sleep(1)
    

# #make server requests for url's page by page - with updated end date (600-700)
# end_date="&end_date="+get_date(articles['response']['docs'])
# endpage=100
# for i in range(startpage,endpage):
#     page="&page="+str(i)
#     querystring= base+q+fq+fl+facet_field+facet_filter+begin_date+end_date+page+sort+a
#     response = requests.get(querystring)
#     articles=response.json()
    
#     if articles['status']=='OK':
#         msg="Status: Round 600-700, OK Page %i" % i
#         print(msg)
#         get_items(articles['response']['docs'], i)
#         clear_output(wait=True)
#     else:
#         msg = 'Blocked from server at date %d, page %i' % (begin_date,i)
#         logging.error(msg)
#         break

#     #wait 1 seconds before submitting request for next page of results
#     time.sleep(1)

In [9]:
# ALREADY QUERIED: NYT & type_of_material= article,letter,blog
#                  NYT & type_of_material= column,editorial
# NOW QUERING: NYT & type_of_materials= News

startpage=0
endpage=200

base='https://api.nytimes.com/svc/search/v2/articlesearch.json?'
q="q=abortion"
fq="&fq=source:(\"The New York Times\") AND type_of_material:(\"News\")"
fl="&fl=pub_date,headline,news_desk,source,section_name,web_url,keywords,type_of_material"
begin_date="&begin_date=20040924"
sort="&sort=oldest"
facet_field="&facet-field=section_name"
facet_filter="&facet-filter=true"
a="&api-key="+nyt_api_key

#make server requests for content & url's page by page (0-200)
for i in range(startpage,endpage):
    page="&page="+str(i)
    querystring= base+q+fq+fl+facet_field+facet_filter+begin_date+page+sort+a
    response = requests.get(querystring)
    articles=response.json()
    
    if articles['status']=='OK':
        msg="Status: Round 0-200, OK Page %i" % i
        print(msg)
        get_items(articles['response']['docs'], i)
        clear_output(wait=True)
    else:
        msg = 'Blocked from server at date %d, page %i' % (begin_date,i)
        print(msg)
        break

    #wait 1 seconds before submitting request for next page of results
    time.sleep(1)


clear_output()

    
while(True):
    begin_date="&begin_date="+get_date(articles['response']['docs'])
    for i in range(0,199):
        page="&page="+str(i)
        querystring= base+q+fq+fl+facet_field+facet_filter+begin_date+page+sort+a
        response = requests.get(querystring)
        articles=response.json()

        if articles['status']=='OK':
            msg="Status: begin date "+begin_date+", OK Page %i" % i
            print(msg)
            get_items(articles['response']['docs'], i)
            clear_output(wait=True)
        else:
            msg = 'Blocked from server at date %d, page %i' % (begin_date,i)
            print(msg)
            break

        #wait 1 seconds before submitting request for next page of results
        time.sleep(1)

IndexError: list index out of range

### PLANNING
Date
url
Keywords
Text
news_desk
section_name
type_of_material=article,blog,column,letter
type_of_material!=video,
source:("The New York Times") AND type_of_material:("article" "blog" "letter" "column" "editorial")


keywords_list=[]
date=a['pub_date']
url=a['web_url']
news_desk=a['news_desk']
section_name=a['section_name']
type_of_material=a['type_of_material']
for i in range(0, len(a['keywords'])):
    keywords_list.append(a['keywords'][i]['value'])


entry['original_text']=get_text(article['web_url'])

### Scrape text from articles in mongo db

In [26]:
def get_text(url):
    ''' 
    scrape the significant text from the url specified
    '''
    text=''
    page = requests.get(url).text
    soup = BeautifulSoup(page, "lxml")
    
    try:
        for element in soup.find_all(class_='css-1i0edl6 e2kc3sl0'):
            text=text+' '+element.text
    except:
        pass
    try:
        for element in soup.find_all(class_='story-body-text story-content'):
            text=text+' '+element.text
    except:
        pass
    try:
        bod=soup.find(class_='articleBody')
        for element in bod.find_all('p'):
            text=text+''+element.text
    except:
        pass
    try:
        for element in soup.find_all(class_='story-body-text'):
            text=text+' '+element.text
    except:
        msg = 'Failed to get scrape text from page '+url
        print(msg)
        return('')
    return(text)

In [9]:
def text_scrape():
    ''' 
    iterate through mongodb documents that do NOT have original text yet, insert scraped text into document
    '''
    cursor=table.find({'original_text':{'$exists': False}})
    for item in cursor:
        text=get_text(item['url'])
        table.update_one({'_id':item['_id']}, {'$set': {'original_text': text} })
        #time.sleep(randint(1,3))

In [None]:
text_scrape()

In [10]:
table.find({'original_text':{'$exists': False}}).count()

  """Entry point for launching an IPython kernel.


0

In [27]:
def text_scrape2():
    ''' 
    iterate through mongodb documents that failed first round
    '''
    cursor=table.find({'original_text':''})
    
    for item in cursor:
        if requests.get('http://www.nytimes.com').status_code == 200:
            text=get_text(item['url'])
            table.update_one({'_id':item['_id']}, {'$set': {'original_text': text} }, upsert=False)
            time.sleep(random.randint(1,3))
        else:
            break

In [28]:
%time text_scrape2()

CPU times: user 12.6 s, sys: 248 ms, total: 12.9 s
Wall time: 7min 1s


In [29]:
table.find({'original_text':''}).count()

  """Entry point for launching an IPython kernel.


18