In [2]:
#Imports
import time
#Record the wall-clock start time so the run time of this cell can be reported at the end.
start = time.time()
import pandas as pd
import numpy as np
from newspaper import Article
#ArticleException has to be imported explicitly: without it the except clause below
#raises NameError the first time an article fails, instead of recording null values.
from newspaper.article import ArticleException

#Number of links (taken from the front of the pickled collection) to scrape
N_LINKS = 50
#Keys filled from the Article object; all of them are set to NaN when an article fails
ARTICLE_FIELDS = ("text", "title", "date", "keywords", "summary")

#Load the pickle file with links and keep the first N_LINKS of them.
#NOTE(review): unpickling can execute arbitrary code - only load pickle files you trust.
links = pd.read_pickle("nytimes_links.pkl")[:N_LINKS]

#Initialize the list that collects one dictionary of article info per link
articles_info = []
for link in links:
    #Start the record with the link itself so failed articles stay identifiable
    article_dict = {"link": link}
    #Pass the link into Article()
    art = Article(link)

    #Try/except is included because not all articles can be downloaded and parsed
    try:
        #The download sits inside the try so a failure reported at download time
        #is handled the same way as a failure reported at parse time
        art.download()
        art.parse()
        article_dict["text"] = art.text
        article_dict["title"] = art.title
        article_dict["date"] = art.publish_date
        #nlp() has to run before keywords and summary are read
        art.nlp()
        article_dict["keywords"] = art.keywords
        article_dict["summary"] = art.summary
    except ArticleException:
        #If the article cannot be processed, insert null values for every field
        #(this also overwrites fields that were filled in before the failure)
        article_dict.update(dict.fromkeys(ARTICLE_FIELDS, np.nan))

    #Insert the dictionary of article info into the articles_info list
    articles_info.append(article_dict)
#Pass the list of dictionaries into a pandas data frame
corpus = pd.DataFrame(articles_info)
#Print how long the process took
print("Script took {:.2f} seconds to complete".format(time.time() - start))

Script took 38.52 seconds to complete


In [3]:
#Preview the first five rows of the scraped corpus
corpus.head(n=5)

Unnamed: 0,date,keywords,link,summary,text,title
0,2017-09-08,"[intelligence, researchers, sexism, female, la...",https://www.nytimes.com/2017/09/08/upshot/sexi...,Across all the categories analyzed and all the...,"First, here’s what they reported finding last ...",Sexism and Shopping: Female Players Get Most o...
1,2017-09-08,"[latest, versus, intelligence, tegmark, patter...",https://www.nytimes.com/2017/09/08/books/revie...,PhotoGODZILLA VERSUS MOTHRA: James Patterson’s...,Photo\n\nGODZILLA VERSUS MOTHRA: James Patters...,James Patterson’s Latest Villain Looks a Lot L...
2,2017-09-01,"[intelligence, human, harm, systems, regulate,...",https://www.nytimes.com/2017/09/01/opinion/art...,PhotoThe technology entrepreneur Elon Musk rec...,Photo\n\nThe technology entrepreneur Elon Musk...,How to Regulate Artificial Intelligence
3,2017-09-14,"[intelligence, teaching, human, experiments, e...",https://www.nytimes.com/2017/09/14/opinion/art...,PhotoTo the Editor:Re “How to Regulate Artific...,Photo\n\nTo the Editor:\n\nRe “How to Regulate...,Ethics and Artificial Intelligence
4,2017-08-30,"[assistants, mr, working, way, cortana, bezos,...",https://www.nytimes.com/2017/08/30/technology/...,But Mr. Bezos and Mr. Nadella are concerned th...,But Mr. Bezos and Mr. Nadella are concerned th...,"‘Cortana, Open Alexa,’ Amazon Says. And Micros..."
