In [1]:
import requests
#A timeout stops the cell from hanging forever on a stalled connection, and
#raise_for_status() fails fast on an HTTP error (4xx/5xx) instead of letting
#later cells parse an error page.
r = requests.get('https://www.nytimes.com/interactive/2017/06/23/opinion/trumps-lies.html', timeout=30)
r.raise_for_status()
#print the first 500 characters of html to see what we're dealing with.
print(r.text[0:500])

<!DOCTYPE html>
<!--[if (gt IE 9)|!(IE)]> <!--><html lang="en" class="no-js page-interactive section-opinion page-theme-standard tone-opinion page-interactive-default limit-small layout-xlarge app-interactive" itemid="https://www.nytimes.com/interactive/2017/06/23/opinion/trumps-lies.html" itemtype="http://schema.org/NewsArticle" itemscope xmlns:og="http://opengraphprotocol.org/schema/"><!--<![endif]-->
<!--[if IE 9]> <html lang="en" class="no-js ie9 lt-ie10 page-interactive section-opinion page


In [2]:
from bs4 import BeautifulSoup
#Parse the downloaded html with Python's built-in 'html.parser'.
#Fetching the page (cell 1) and building the soup (this cell) is the standard start to every webscraping project.
soup = BeautifulSoup(r.text, features='html.parser')

In [5]:
#This article wraps each particular lie in a <span> tag whose class is 'short-desc',
#so collect every such tag from the soup object.
results = soup.find_all('span', class_='short-desc')
#NOTE: the count below differs from the number quoted in the article.
#TODO: find out why; the reason has not been investigated here.
len(results)

180

In [10]:
#results can be sliced like a list, so look at the first three entries to check that the search made sense.
results[:3]

[<span class="short-desc"><strong>Jan. 21 </strong>“I wasn't a fan of Iraq. I didn't want to go into Iraq.” <span class="short-truth"><a href="https://www.buzzfeed.com/andrewkaczynski/in-2002-donald-trump-said-he-supported-invading-iraq-on-the" target="_blank">(He was for an invasion before he was against it.)</a></span></span>,
 <span class="short-desc"><strong>Jan. 21 </strong>“A reporter for Time magazine — and I have been on their cover 14 or 15 times. I think we have the all-time record in the history of Time magazine.” <span class="short-truth"><a href="http://nation.time.com/2013/11/06/10-things-you-didnt-know-about-time/" target="_blank">(Trump was on the cover 11 times and Nixon appeared 55 times.)</a></span></span>,
 <span class="short-desc"><strong>Jan. 23 </strong>“Between 3 million and 5 million illegal votes caused me to lose the popular vote.” <span class="short-truth"><a href="https://www.nytimes.com/2017/01/23/us/politics/donald-trump-congress-democrats.html" target="_

In [9]:
#and once more near the end. len(results) is 180, so this slice shows indices 177 and 178
#and stops just before the final element (index 179).
results[177:179]

[<span class="short-desc"><strong>Nov. 1 </strong>“Again, we're the highest-taxed nation, just about, in the world.” <span class="short-truth"><a href="http://www.politifact.com/truth-o-meter/statements/2016/may/08/donald-trump/donald-trump-us-not-highest-taxed-nation-in-world/" target="_blank">(We're not.)</a></span></span>,
 <span class="short-desc"><strong>Nov. 7 </strong>“When you look at the city with the strongest gun laws in our nation, it's Chicago.” <span class="short-truth"><a href="http://www.politifact.com/truth-o-meter/statements/2017/nov/07/donald-trump/trump-wrongly-repeats-chicago-has-strongest-gun-la/" target="_blank">(Several other cities, including New York and Los Angeles, have stronger gun laws.)</a></span></span>]

In [12]:
#now we start experimenting on a single result to work out how to scrape each field correctly.
first_result = results[0]
print(first_result)

<span class="short-desc"><strong>Jan. 21 </strong>“I wasn't a fan of Iraq. I didn't want to go into Iraq.” <span class="short-truth"><a href="https://www.buzzfeed.com/andrewkaczynski/in-2002-donald-trump-said-he-supported-invading-iraq-on-the" target="_blank">(He was for an invasion before he was against it.)</a></span></span>


In [13]:
#the date is wrapped in a <strong> tag, so we use the soup .find() method to get that tag out of this result.
first_result.find('strong')

<strong>Jan. 21 </strong>

In [14]:
#find() returned a tag object, not a string; its .text attribute gives the text inside the tag.
first_result.find('strong').text

'Jan. 21\xa0'

In [15]:
#the trailing \xa0 is a non-breaking space (a single character). It counts as whitespace,
#so .strip() removes it without relying on it being exactly one character at the end.
first_result.find('strong').text.strip()

'Jan. 21'

In [16]:
#Now let's add the year, which the date on the page leaves out.
#.strip() (rather than a fixed [0:-1] slice) removes the trailing non-breaking space.
first_result.find('strong').text.strip() + ', 2017'

'Jan. 21, 2017'

In [20]:
#Great, now that we've done that, let's see if we can extract the text of the lie itself.  Looking at the html we see
#that the lie is not wrapped in a tag of its own, so we can't use the find() method here.
#Instead we use the .contents attribute, which returns the tag's children (the tags and strings nested in it) as a list.
first_result.contents

[<strong>Jan. 21 </strong>,
 "“I wasn't a fan of Iraq. I didn't want to go into Iraq.” ",
 <span class="short-truth"><a href="https://www.buzzfeed.com/andrewkaczynski/in-2002-donald-trump-said-he-supported-invading-iraq-on-the" target="_blank">(He was for an invasion before he was against it.)</a></span>]

In [21]:
#Since this is returned as a list, we can index into it; element 1 is the string between the date and the explanation.
first_result.contents[1]

"“I wasn't a fan of Iraq. I didn't want to go into Iraq.” "

In [22]:
#We see this gives us the lie we wanted, but it is wrapped in curly quotes and has a trailing space.
#Strip the whitespace first and then the quote characters, instead of slicing off a fixed number of characters.
first_result.contents[1].strip().strip('“”')

"I wasn't a fan of Iraq. I didn't want to go into Iraq."

In [23]:
#Now for the second piece: element 2 is the <span class="short-truth"> tag that holds the explanation.
first_result.contents[2]

<span class="short-truth"><a href="https://www.buzzfeed.com/andrewkaczynski/in-2002-donald-trump-said-he-supported-invading-iraq-on-the" target="_blank">(He was for an invasion before he was against it.)</a></span>

In [25]:
#.text drops the html tags; the [1:-1] slice then removes the first and last characters (the parentheses).
first_result.contents[2].text[1:-1]

'He was for an invasion before he was against it.'

In [26]:
#Why does this work?  Let's look at the text before the slice is applied.
first_result.contents[2].text

'(He was for an invasion before he was against it.)'

In [None]:
#The .text attribute strips away the html, leaving just the text we want to scrape.
#All that remained was to slice off the surrounding parentheses.

In [27]:
#Finally we extract the url.  The link is an <a> tag, so we use the .find() method again. Beautiful Soup lets us
#read a tag's attributes like the keys of a dictionary, so the url is the value stored under 'href':
first_result.find('a')['href']

'https://www.buzzfeed.com/andrewkaczynski/in-2002-donald-trump-said-he-supported-invading-iraq-on-the'

Alright, we've experimented and figured out how to build the data set.  Now we simply loop over the list of results returned by BeautifulSoup and build a list of records, one tuple per lie.

In [29]:
records = []
for result in results:
    #.strip() removes the trailing non-breaking space; the page leaves out the year, so append it.
    date = result.find('strong').text.strip() + ', 2017'
    #strip surrounding whitespace first, then the curly quotes around the statement.
    lie = result.contents[1].strip().strip('“”')
    #drop the parentheses around the explanation.
    explanation = result.contents[2].text[1:-1]
    url = result.find('a')['href']
    records.append((date, lie, explanation, url))

#one record per result, so this should equal len(results).
len(records)

180

In [30]:
#look at the first five (date, lie, explanation, url) tuples.
records[:5]

[('Jan. 21, 2017',
  "I wasn't a fan of Iraq. I didn't want to go into Iraq.",
  'He was for an invasion before he was against it.',
  'https://www.buzzfeed.com/andrewkaczynski/in-2002-donald-trump-said-he-supported-invading-iraq-on-the'),
 ('Jan. 21, 2017',
  'A reporter for Time magazine — and I have been on their cover 14 or 15 times. I think we have the all-time record in the history of Time magazine.',
  'Trump was on the cover 11 times and Nixon appeared 55 times.',
  'http://nation.time.com/2013/11/06/10-things-you-didnt-know-about-time/'),
 ('Jan. 23, 2017',
  'Between 3 million and 5 million illegal votes caused me to lose the popular vote.',
  "There's no evidence of illegal voting.",
  'https://www.nytimes.com/2017/01/23/us/politics/donald-trump-congress-democrats.html'),
 ('Jan. 25, 2017',
  'Now, the audience was the biggest ever. But this crowd was massive. Look how far back it goes. This crowd was massive.',
  "Official aerial photos show Obama's 2009 inauguration was mu

In [43]:
#Finally then, we want to make this into a pandas dataframe, with one named column per field of each record.
import pandas as pd
df = pd.DataFrame(records, columns = ['date','lie','explanation', 'url'])
df.head()
#df.head() is the last expression in the cell, so the notebook displays the first five rows.

Unnamed: 0,date,lie,explanation,url
0,"Jan. 21, 2017",I wasn't a fan of Iraq. I didn't want to go in...,He was for an invasion before he was against it.,https://www.buzzfeed.com/andrewkaczynski/in-20...
1,"Jan. 21, 2017",A reporter for Time magazine — and I have been...,Trump was on the cover 11 times and Nixon appe...,http://nation.time.com/2013/11/06/10-things-yo...
2,"Jan. 23, 2017",Between 3 million and 5 million illegal votes ...,There's no evidence of illegal voting.,https://www.nytimes.com/2017/01/23/us/politics...
3,"Jan. 25, 2017","Now, the audience was the biggest ever. But th...",Official aerial photos show Obama's 2009 inaug...,https://www.nytimes.com/2017/01/21/us/politics...
4,"Jan. 25, 2017",Take a look at the Pew reports (which show vot...,The report never mentioned voter fraud.,https://www.nytimes.com/2017/01/24/us/politics...


In [35]:
#and the last five rows.
df.tail()

Unnamed: 0,data,lie,explanation,url
175,"Oct. 25, 2017",We have trade deficits with almost everybody.,We have trade surpluses with more than 100 cou...,https://www.bea.gov/newsreleases/international...
176,"Oct. 27, 2017","Wacky & totally unhinged Tom Steyer, who has b...",Steyer has financially supported many winning ...,https://www.opensecrets.org/donor-lookup/resul...
177,"Nov. 1, 2017","Again, we're the highest-taxed nation, just ab...",We're not.,http://www.politifact.com/truth-o-meter/statem...
178,"Nov. 7, 2017",When you look at the city with the strongest g...,"Several other cities, including New York and L...",http://www.politifact.com/truth-o-meter/statem...
179,"Nov. 11, 2017","I'd rather have him – you know, work with him...","There is no evidence that Democrats ""set up"" R...",https://www.nytimes.com/interactive/2017/12/10...


In [36]:
#show only the first ten rows rather than dumping the whole dataframe into the notebook.
df.head(10)

Unnamed: 0,data,lie,explanation,url
0,"Jan. 21, 2017",I wasn't a fan of Iraq. I didn't want to go in...,He was for an invasion before he was against it.,https://www.buzzfeed.com/andrewkaczynski/in-20...
1,"Jan. 21, 2017",A reporter for Time magazine — and I have been...,Trump was on the cover 11 times and Nixon appe...,http://nation.time.com/2013/11/06/10-things-yo...
2,"Jan. 23, 2017",Between 3 million and 5 million illegal votes ...,There's no evidence of illegal voting.,https://www.nytimes.com/2017/01/23/us/politics...
3,"Jan. 25, 2017","Now, the audience was the biggest ever. But th...",Official aerial photos show Obama's 2009 inaug...,https://www.nytimes.com/2017/01/21/us/politics...
4,"Jan. 25, 2017",Take a look at the Pew reports (which show vot...,The report never mentioned voter fraud.,https://www.nytimes.com/2017/01/24/us/politics...
5,"Jan. 25, 2017",You had millions of people that now aren't ins...,"The real number is less than 1 million, accord...",https://www.nytimes.com/2017/03/13/us/politics...
6,"Jan. 25, 2017","So, look, when President Obama was there two w...",There were no gun homicide victims in Chicago ...,https://www.dnainfo.com/chicago/2017-chicago-m...
7,"Jan. 26, 2017",We've taken in tens of thousands of people. We...,Vetting lasts up to two years.,https://www.nytimes.com/interactive/2017/01/29...
8,"Jan. 26, 2017",I cut off hundreds of millions of dollars off ...,Most of the cuts were already planned.,https://www.washingtonpost.com/news/fact-check...
9,"Jan. 28, 2017",The coverage about me in the @nytimes and the ...,It never apologized.,https://www.nytimes.com/2016/11/13/us/election...


In [37]:
#look at a block of rows from the middle of the dataframe, selected by position.
df.iloc[60:80]

Unnamed: 0,data,lie,explanation,url
60,"April 11, 2017","I like Steve, but you have to remember he was ...",He knew Steve Bannon since 2011.,https://www.nytimes.com/2017/04/12/us/politics...
61,"April 12, 2017","You can't do it faster, because they're obstru...","At this point, he had not nominated anyone for...",https://www.nytimes.com/2017/04/12/us/politics...
62,"April 12, 2017",The New York Times said the word wiretapped in...,There were separate headlines for print and we...,https://www.nytimes.com/2017/03/23/us/politics...
63,"April 12, 2017",The secretary general and I had a productive d...,NATO has been engaged in counterterrorism effo...,https://www.nytimes.com/2017/04/12/us/politics...
64,"April 12, 2017",Mosul was supposed to last for a week and now ...,The campaign was expected to take months.,https://www.washingtonpost.com/news/the-fix/wp...
65,"April 16, 2017",Someone should look into who paid for the smal...,There's no evidence of paid protesters.,https://www.nytimes.com/2017/04/15/us/politics...
66,"April 18, 2017","The fake media goes, ‘Donald Trump changed his...",He did.,https://www.nytimes.com/2017/04/14/business/ch...
67,"April 21, 2017",On 90 planes I saved $725 million. It's actual...,Much of the price cuts were already projected.,https://www.washingtonpost.com/news/fact-check...
68,"April 21, 2017",When WikiLeaks came out ... never heard of Wik...,He criticized it as early as 2010.,https://www.washingtonpost.com/news/fact-check...
69,"April 27, 2017",I want to help our miners while the Democrats ...,The bill to extend health benefits for certain...,http://www.pbs.org/newshour/rundown/retired-co...


In [44]:
#Browsing the dataset shows that some month names are spelled out ("April") while others are
#abbreviated ("Jan."), so convert the column to real datetimes to put every date in one format.
#(Listing the distinct month prefixes would be one way to spot this without reading every row.)
df['date'] = pd.to_datetime(df['date'])
df.iloc[60:80]

Unnamed: 0,date,lie,explanation,url
60,2017-04-11,"I like Steve, but you have to remember he was ...",He knew Steve Bannon since 2011.,https://www.nytimes.com/2017/04/12/us/politics...
61,2017-04-12,"You can't do it faster, because they're obstru...","At this point, he had not nominated anyone for...",https://www.nytimes.com/2017/04/12/us/politics...
62,2017-04-12,The New York Times said the word wiretapped in...,There were separate headlines for print and we...,https://www.nytimes.com/2017/03/23/us/politics...
63,2017-04-12,The secretary general and I had a productive d...,NATO has been engaged in counterterrorism effo...,https://www.nytimes.com/2017/04/12/us/politics...
64,2017-04-12,Mosul was supposed to last for a week and now ...,The campaign was expected to take months.,https://www.washingtonpost.com/news/the-fix/wp...
65,2017-04-16,Someone should look into who paid for the smal...,There's no evidence of paid protesters.,https://www.nytimes.com/2017/04/15/us/politics...
66,2017-04-18,"The fake media goes, ‘Donald Trump changed his...",He did.,https://www.nytimes.com/2017/04/14/business/ch...
67,2017-04-21,On 90 planes I saved $725 million. It's actual...,Much of the price cuts were already projected.,https://www.washingtonpost.com/news/fact-check...
68,2017-04-21,When WikiLeaks came out ... never heard of Wik...,He criticized it as early as 2010.,https://www.washingtonpost.com/news/fact-check...
69,2017-04-27,I want to help our miners while the Democrats ...,The bill to extend health benefits for certain...,http://www.pbs.org/newshour/rundown/retired-co...


In [45]:
#check the last five rows after the date conversion.
df.tail()

Unnamed: 0,date,lie,explanation,url
175,2017-10-25,We have trade deficits with almost everybody.,We have trade surpluses with more than 100 cou...,https://www.bea.gov/newsreleases/international...
176,2017-10-27,"Wacky & totally unhinged Tom Steyer, who has b...",Steyer has financially supported many winning ...,https://www.opensecrets.org/donor-lookup/resul...
177,2017-11-01,"Again, we're the highest-taxed nation, just ab...",We're not.,http://www.politifact.com/truth-o-meter/statem...
178,2017-11-07,When you look at the city with the strongest g...,"Several other cities, including New York and L...",http://www.politifact.com/truth-o-meter/statem...
179,2017-11-11,"I'd rather have him – you know, work with him...","There is no evidence that Democrats ""set up"" R...",https://www.nytimes.com/interactive/2017/12/10...


In [48]:
#index=False leaves the row index (the numbers on the left) out of the file.
df.to_csv('trump_lies.csv', encoding='utf-8', index=False)

And that's it!  The complete code is collected below as a quick reference to follow when attempting other webscraping projects.

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

#Download the article; the timeout and status check fail fast on a stalled connection or an HTTP error.
r = requests.get('https://www.nytimes.com/interactive/2017/06/23/opinion/trumps-lies.html', timeout=30)
r.raise_for_status()

#Every lie sits in a <span class="short-desc"> tag.
soup = BeautifulSoup(r.text, 'html.parser')
results = soup.find_all('span', attrs={'class':'short-desc'})

records = []
for result in results:
    #<strong> holds the date followed by a non-breaking space; the page leaves out the year.
    date = result.find('strong').text.strip() + ', 2017'
    #the statement is the bare string after the date, wrapped in curly quotes.
    lie = result.contents[1].strip().strip('“”')
    #look the link up once: its text is the parenthesised explanation, its href the source url.
    link = result.find('a')
    explanation = link.text[1:-1]
    url = link['href']
    records.append((date, lie, explanation, url))

df = pd.DataFrame(records, columns=['date', 'lie', 'explanation', 'url'])
#NOTE(review): the date strings mix abbreviated and spelled-out month names; confirm the installed
#pandas version still parses mixed formats here without extra arguments.
df['date'] = pd.to_datetime(df['date'])
df.to_csv('trump_lies.csv', index=False, encoding='utf-8')

In [None]:
#sanity check: show the last five rows of the rebuilt dataframe.
df.tail()