<a href="https://colab.research.google.com/github/jiahui989/Projects/blob/main/Web_Scraping_Basics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**1: CSS SCRAPY**

In [1]:
# install scrapy package
!pip install scrapy

Collecting scrapy
  Downloading Scrapy-2.5.1-py2.py3-none-any.whl (254 kB)
[K     |████████████████████████████████| 254 kB 10.4 MB/s 
[?25hCollecting cssselect>=0.9.1
  Downloading cssselect-1.1.0-py2.py3-none-any.whl (16 kB)
Collecting cryptography>=2.0
  Downloading cryptography-36.0.1-cp36-abi3-manylinux_2_24_x86_64.whl (3.6 MB)
[K     |████████████████████████████████| 3.6 MB 60.3 MB/s 
[?25hCollecting zope.interface>=4.1.3
  Downloading zope.interface-5.4.0-cp37-cp37m-manylinux2010_x86_64.whl (251 kB)
[K     |████████████████████████████████| 251 kB 61.5 MB/s 
[?25hCollecting itemloaders>=1.0.1
  Downloading itemloaders-1.0.4-py3-none-any.whl (11 kB)
Collecting parsel>=1.5.0
  Downloading parsel-1.6.0-py2.py3-none-any.whl (13 kB)
Collecting pyOpenSSL>=16.2.0
  Downloading pyOpenSSL-22.0.0-py2.py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 2.9 MB/s 
[?25hCollecting queuelib>=1.4.2
  Downloading queuelib-1.6.2-py2.py3-none-any.whl (13 kB)
Collectin

In [2]:
# import lib
from scrapy import Selector

In [29]:
html = '''
<html>
  <body>
    <div class = "class-intro">
      <p>"Hello CSS"</p>
      <p>"Hello Python"</p>
    </div>
    <div class = "class-intro">
      <p>"Hello scrapy"</p>
      <p>"Hello spider"</p>
    </div>
    <div class = "class-intro2">
      <p>"Hello TTC3213"</p>
    </div>
    <p id="p-example">
      Hello world!
      Try <a href="http://www.datacamp.com">DataCamp</a> today!
    </p>
  </body>
</html>
'''

# initialize selector obj
sel = Selector(text = html)

# Look forward one generation - all p of first div
print(sel.css('html>body>div:nth-of-type(1)').extract())
print(sel.css('div:nth-of-type(1)').extract()) # <-- same as the above line

# Display specific element
print(sel.css('html>body>div:nth-of-type(2)>p:nth-of-type(1)').extract())

# all children
print(sel.css('div').extract())

# element with specific class and id
print(sel.css('.class-intro, #p-example').extract())

# get innerHTML of a HTML element
print(sel.css('[href]::text').extract())
print(sel.css('[href]::text')) # <-- must have function extract

# get text of a HTML element
print(sel.css('p#p-example::text').extract())

# get all text in all HTML elements
print(sel.css('p::text').extract())

# get everything after the body element
print(sel.css('div ').extract())

['<div class="class-intro">\n      <p>"Hello CSS"</p>\n      <p>"Hello Python"</p>\n    </div>']
['<div class="class-intro">\n      <p>"Hello CSS"</p>\n      <p>"Hello Python"</p>\n    </div>']
['<p>"Hello scrapy"</p>']
['<div class="class-intro">\n      <p>"Hello CSS"</p>\n      <p>"Hello Python"</p>\n    </div>', '<div class="class-intro">\n      <p>"Hello scrapy"</p>\n      <p>"Hello spider"</p>\n    </div>', '<div class="class-intro2">\n      <p>"Hello TTC3213"</p>\n    </div>']
['<div class="class-intro">\n      <p>"Hello CSS"</p>\n      <p>"Hello Python"</p>\n    </div>', '<div class="class-intro">\n      <p>"Hello scrapy"</p>\n      <p>"Hello spider"</p>\n    </div>', '<p id="p-example">\n      Hello world!\n      Try <a href="http://www.datacamp.com">DataCamp</a> today!\n    </p>']
['DataCamp']
[<Selector xpath='descendant-or-self::*[@href]/text()' data='DataCamp'>]
['\n      Hello world!\n      Try ', ' today!\n    ']
['"Hello CSS"', '"Hello Python"', '"Hello scrapy"', '"Hello

#**2: Web Scraping with Spider Lib**

In [30]:
import scrapy
import scrapy.crawler as crawler
from multiprocessing import Process, Queue
from twisted.internet import reactor

##**Example 1:**

In [49]:
# create spider
class quoteScraper(scrapy.Spider):
    name = 'quotes'
    start_urls = ['http://quotes.toscrape.com/tag/inspirational/page/1', 'http://quotes.toscrape.com/tag/inspirational/page/2']

    def parse(self, response):
        for quote in response.css('div.quote'):
            print(quote.css('span.text::text').get())
            print(quote.css('.author::text').get())

In [50]:
# Wrapper function to run spider created
def run_spider(spider):
    def f(q):
        try:
            runner = crawler.CrawlerRunner()
            deferred = runner.crawl(spider)
            deferred.addBoth(lambda _: reactor.stop())
            reactor.run()
            q.put(None)
        except Exception as e:
            q.put(e)
    
    q = Queue()
    p = Process(target=f, args=(q,))
    p.start()
    result = q.get()
    p.join()

    if result is not None:
        raise result

In [51]:
run_spider(quoteScraper)

“Life isn't about finding yourself. Life is about creating yourself.”
George Bernard Shaw
“You may say I'm a dreamer, but I'm not the only one. I hope someday you'll join us. And the world will live as one.”
John Lennon
“A person's a person, no matter how small.”
Dr. Seuss
“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”
Albert Einstein
“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”
Marilyn Monroe
“I have not failed. I've just found 10,000 ways that won't work.”
Thomas A. Edison
“This life is what you make it. No matter what, you're going to mess up sometimes, it's a universal truth. But the good part is you get to decide how you're going to mess it up. Girls will be your friends - they'll act like it anyway. But just remember, some come, some go. The ones that stay with you through everything - they're your true best friends. Don't let go 

##**Example 2:**

In [53]:
# Marilyn Manson Quotes
class manson(scrapy.Spider):
    name = 'manson_spider'
    start_urls = ['https://en.wikiquote.org/wiki/Marilyn_Manson']

    def parse(self, response):
        for manson_quote in response.css('div.mw-parser-output>ul'):
            print(manson_quote.css('li::text').get())

run_spider(manson)

If people really stopped and realized how much art and creative people move the world versus politics and religion, I mean it’s not even up for debate. An artist at least creates things, puts things into the world. Where as these other people are destroying things, taking things out of the world.

I'm fucking sick of people who always try to blame movies, bands, songs, or talk shows for whatever the fuck hits them today – teen suicides, drug overdoses or everything else. If someone is stupid enough to kill himself because of a song, then that's exactly what they deserve – they weren't contributing anything to the society – it's one less idiot in the world. There's too many people – if more people kill themselves over music, it wouldn't disappoint me. What would disappoint me is that people are that stupid.

… I don't expect everyone to get something deep out of it. Some people can just listen to the music, or get their aggressions out, but I think with any great painting or movie, albu

##**Example 3:**

In [60]:
# Scrape bood synopsis
class Book_Synopsis(scrapy.Spider):
    name = "book"
    start_urls = ['https://thegreatestbooks.org/']
    
    def parse(self, response):
        for sinopsis in response.css('div.pb-3> div > p::text'):
            print(sinopsis.get().lstrip()) # this code will extract the text and the element tag 
            # write the code will only extract/print the text without the element

run_spider(Book_Synopsis)

Swann's Way, the first part of A la recherche de temps perdu, Marcel Proust's seven-part cycle, was published in 1913. In it, Proust introduces the themes that run through the entire work. The narr...
            
Ulysses chronicles the passage of Leopold Bloom through Dublin during an ordinary day, June 16, 1904. The title parallels and alludes to Odysseus (Latinised into Ulysses), the hero of Homer's Odyss...
            
Alonso Quixano, a retired country gentleman in his fifties, lives in an unnamed section of La Mancha with his niece and a housekeeper. He has become obsessed with books of chivalry, and believes th...
            
One of the 20th century's enduring works, One Hundred Years of Solitude is a widely beloved and acclaimed novel known throughout the world, and the ultimate achievement in a Nobel Prize–winning car...
            
The novel chronicles an era that Fitzgerald himself dubbed the "Jazz Age". Following the shock and chaos of World War I, American society enjoye

In [74]:
# Scrape book author
class Book_Title_Author(scrapy.Spider):
    name = "book_title_author"
    start_urls = ['https://thegreatestbooks.org/']
    #main > div > div > div > div.list-body > ol > li:nth-child(2) > div > div > div > h4 > a:nth-child(2)
    def parse(self, response):
        for book_ttl_aut in response.css('div.list-body>ol>li>div>div>div.col>h4'):
            print('Book Title: {}'.format(book_ttl_aut.css('a:nth-child(1)::text').get()))
            print('Book Author: {}'.format(book_ttl_aut.css('a:nth-child(2)::text').get()))
          
run_spider(Book_Title_Author)

Book Title: In Search of Lost Time 
Book Author: Marcel Proust
Book Title: Ulysses
Book Author: James Joyce
Book Title: Don Quixote
Book Author: Miguel de Cervantes
Book Title: One Hundred Years of Solitude 
Book Author: Gabriel Garcia Marquez
Book Title: The Great Gatsby 
Book Author: F. Scott Fitzgerald
Book Title: Moby Dick
Book Author: Herman Melville
Book Title: War and Peace
Book Author: Leo Tolstoy
Book Title: Hamlet
Book Author: William Shakespeare
Book Title: The Odyssey
Book Author: Homer
Book Title: Madame Bovary
Book Author: Gustave Flaubert
Book Title: The Divine Comedy 
Book Author: Dante Alighieri
Book Title: Lolita 
Book Author: Vladimir Nabokov
Book Title: The Brothers Karamazov 
Book Author: Fyodor Dostoyevsky
Book Title: Crime and Punishment 
Book Author: Fyodor Dostoyevsky
Book Title: Wuthering Heights
Book Author: Emily Brontë
Book Title: The Catcher in the Rye
Book Author: J. D. Salinger
Book Title: Pride and Prejudice
Book Author: Jane Austen
Book Title: The Adve

#**3: BeautifulSoup**
ref: https://www.kaggle.com/brianckeegan/web-data-scraping-class-01

##**XML:**

In [108]:
# import lib
import requests
from bs4 import BeautifulSoup

In [109]:
house_raw= requests.get('http://clerk.house.gov/xml/lists/MemberData.xml').text
senate_raw= requests.get('https://www.senate.gov/legislative/LIS_MEMBER/cvc_member_data.xml').text

house_soup = BeautifulSoup(house_raw, 'lxml')

In [110]:
children_lst = list(set([tag.name for tag in house_soup.findChildren()]))
children_lst

['pred-title',
 'formal-name',
 'minority',
 'prior-congress',
 'firstname',
 'caucus',
 'ratio',
 'congress-num',
 'subcommittee',
 'state-fullname',
 'body',
 'committee',
 'clerk',
 'memberdata',
 'phone',
 'session',
 'suffix',
 'lastname',
 'sort-name',
 'member',
 'sworn-date',
 'pred-memindex',
 'pred-party',
 'pred-vacate-date',
 'members',
 'footnote-ref',
 'committee-fullname',
 'office-zip',
 'office-room',
 'predecessor-info',
 'title-info',
 'pred-firstname',
 'elected-date',
 'pred-middlename',
 'html',
 'courtesy',
 'official-name',
 'pred-footnote',
 'pred-sort-name',
 'party',
 'bioguideid',
 'pred-footnote-ref',
 'office-building',
 'district',
 'member-info',
 'congress-text',
 'townname',
 'committee-assignments',
 'weburl',
 'namelist',
 'pred-official-name',
 'pred-formal-name',
 'pred-lastname',
 'middlename',
 'state',
 'subcommittee-fullname',
 'footnote',
 'committees',
 'majority',
 'office-zip-suffix',
 'statedistrict']

In [111]:
print('Total number of childrens: {}'.format(len(children_lst)))

Total number of childrens: 61


In [112]:
print('Total number of members: {}'.format(len(house_soup.members)))

Total number of members: 883


In [113]:
print('Members content: {}'.format(house_soup.members.contents))

Members content: ['\n', <member>
<statedistrict>AK00</statedistrict>
<member-info>
<namelist>Young, Don</namelist>
<bioguideid>Y000033</bioguideid>
<lastname>Young</lastname>
<firstname>Don</firstname>
<middlename></middlename>
<sort-name>YOUNG,DON</sort-name>
<suffix></suffix>
<courtesy>Mr.</courtesy>
<prior-congress>116</prior-congress>
<official-name>Don Young</official-name>
<formal-name>Mr. Young</formal-name>
<party>R</party>
<caucus>R</caucus>
<state postal-code="AK">
<state-fullname>Alaska</state-fullname>
</state>
<district>At Large</district>
<townname>Fort Yukon</townname>
<office-building>RHOB</office-building>
<office-room>2314</office-room>
<office-zip>20515</office-zip>
<office-zip-suffix>0200</office-zip-suffix>
<phone>(202) 225-5765</phone>
<elected-date date="20201103">November  3, 2020</elected-date>
<sworn-date date="20210103">January  3, 2021</sworn-date>
</member-info>
<committee-assignments>
<committee comcode="II00" rank="2"></committee>
<committee comcode="PW00

In [157]:
!pip install iteration_utilities
from iteration_utilities import flatten

names = house_soup.memberdata.findAll('namelist')
namelist = [name.contents for name in names]
list(flatten(namelist))



['Young, Don',
 'Carl, Jerry',
 'Moore, Barry',
 'Rogers, Mike',
 'Aderholt, Robert',
 'Brooks, Mo',
 'Palmer, Gary',
 'Sewell, Terri',
 'Crawford, Eric',
 'Hill, J.',
 'Womack, Steve',
 'Westerman, Bruce',
 'Radewagen, Aumua Amata',
 "O'Halleran, Tom",
 'Kirkpatrick, Ann',
 'Grijalva, RaÃºl',
 'Gosar, Paul',
 'Biggs, Andy',
 'Schweikert, David',
 'Gallego, Ruben',
 'Lesko, Debbie',
 'Stanton, Greg',
 'LaMalfa, Doug',
 'Huffman, Jared',
 'Garamendi, John',
 'McClintock, Tom',
 'Thompson, Mike',
 'Matsui, Doris',
 'Bera, Ami',
 'Obernolte, Jay',
 'McNerney, Jerry',
 'Harder, Josh',
 'DeSaulnier, Mark',
 'Pelosi, Nancy',
 'Lee, Barbara',
 'Speier, Jackie',
 'Swalwell, Eric',
 'Costa, Jim',
 'Khanna, Ro',
 'Eshoo, Anna',
 'Lofgren, Zoe',
 'Panetta, Jimmy',
 'Valadao, David',
 'McCarthy, Kevin',
 'Carbajal, Salud',
 'Garcia, Mike',
 'Brownley, Julia',
 'Chu, Judy',
 'Schiff, Adam',
 'CÃ¡rdenas, Tony',
 'Sherman, Brad',
 'Aguilar, Pete',
 'Napolitano, Grace',
 'Lieu, Ted',
 'Gomez, Jimmy',


In [159]:
!pip install iteration_utilities
from iteration_utilities import flatten

states = house_soup.memberdata.findAll('state-fullname')
statelist = [state.contents for state in states]
list(set(flatten(statelist))) # unique states (remove set to reveal state for the above members)



['Kansas',
 'South Dakota',
 'Idaho',
 'District of Columbia',
 'Arizona',
 'Maine',
 'North Carolina',
 'Pennsylvania',
 'Montana',
 'Puerto Rico',
 'New Hampshire',
 'Virgin Islands',
 'Alabama',
 'Iowa',
 'Guam',
 'Minnesota',
 'Vermont',
 'Colorado',
 'Delaware',
 'Utah',
 'Florida',
 'Massachusetts',
 'Indiana',
 'California',
 'Louisiana',
 'Wyoming',
 'Tennessee',
 'Wisconsin',
 'South Carolina',
 'Nebraska',
 'Washington',
 'Oklahoma',
 'Kentucky',
 'North Dakota',
 'Maryland',
 'New Mexico',
 'Ohio',
 'Georgia',
 'Rhode Island',
 'West Virginia',
 'New Jersey',
 'Northern Mariana Islands',
 'Hawaii',
 'American Samoa',
 'Michigan',
 'Mississippi',
 'Connecticut',
 'Illinois',
 'Virginia',
 'Nevada',
 'New York',
 'Texas',
 'Alaska',
 'Arkansas',
 'Oregon',
 'Missouri']

In [162]:
party = house_soup.memberdata.findAll('party')
partylist = [p.contents for p in party]
num_republicans = list(flatten(partylist)).count('R')
num_democrats = list(flatten(partylist)).count('D')
print(num_republicans)
print(num_democrats)

214
226


In [165]:
import pandas as pd

districts = house_soup.memberdata.findAll('district')
districtlist = [d.contents for d in districts]
print(list(set(flatten(districtlist))))
pd.Series(list(flatten(districtlist))).value_counts()

['12th', '52nd', '6th', '29th', 'At Large', '51st', '24th', '16th', '7th', '41st', '23rd', '39th', '14th', '2nd', '35th', '46th', '26th', '10th', '47th', '32nd', '11th', '42nd', '48th', '18th', 'Resident Commissioner', '53rd', '31st', '21st', '28th', '38th', '36th', '44th', '25th', '30th', '27th', '34th', '40th', 'Delegate', '15th', '43rd', '37th', '33rd', '19th', '50th', '8th', '49th', '45th', '17th', '4th', '1st', '20th', '3rd', '13th', '22nd', '5th', '9th']


2nd                      43
1st                      43
3rd                      38
4th                      35
5th                      29
6th                      26
7th                      24
8th                      21
9th                      17
10th                     13
11th                     12
12th                     11
13th                     10
14th                      9
15th                      7
16th                      7
At Large                  7
17th                      6
18th                      6
Delegate                  5
23rd                      4
26th                      4
25th                      4
24th                      4
27th                      4
22nd                      4
20th                      4
19th                      4
21st                      4
32nd                      2
36th                      2
35th                      2
33rd                      2
34th                      2
31st                      2
30th                

In [168]:
gender = house_soup.memberdata.findAll('courtesy')
genderlist = [g.contents for g in gender]
print(list(set(flatten(genderlist))))
num_female = list(flatten(genderlist)).count('Miss') + list(flatten(genderlist)).count('Ms.') + + list(flatten(genderlist)).count('Mrs.')
num_male = list(flatten(genderlist)).count('Mr.')
print(num_female)
print(num_male)

['Miss', 'Mrs.', 'Mr.', 'Ms.']
125
315


In [171]:
bioguideid = house_soup.memberdata.find_all('bioguideid')
bioguideidlist = [bio.contents for bio in bioguideid]
print(list(flatten(bioguideidlist)))

['Y000033', 'C001054', 'M001212', 'R000575', 'A000055', 'B001274', 'P000609', 'S001185', 'C001087', 'H001072', 'W000809', 'W000821', 'R000600', 'O000171', 'K000368', 'G000551', 'G000565', 'B001302', 'S001183', 'G000574', 'L000589', 'S001211', 'L000578', 'H001068', 'G000559', 'M001177', 'T000460', 'M001163', 'B001287', 'O000019', 'M001166', 'H001090', 'D000623', 'P000197', 'L000551', 'S001175', 'S001193', 'C001059', 'K000389', 'E000215', 'L000397', 'P000613', 'V000129', 'M001165', 'C001112', 'G000061', 'B001285', 'C001080', 'S001150', 'C001097', 'S000344', 'A000371', 'N000179', 'L000582', 'G000585', 'T000474', 'R000599', 'B001270', 'S001156', 'K000397', 'R000486', 'T000472', 'C000059', 'W000187', 'B001300', 'P000618', 'C001110', 'L000579', 'S001135', 'L000593', 'I000056', 'V000130', 'P000608', 'J000305', 'D000197', 'N000191', 'B000825', 'B001297', 'L000564', 'C001121', 'P000593', 'L000557', 'C001069', 'D000216', 'H001047', 'H001081', 'N000147', 'B001303', 'G000578', 'D000628', 'C001039'

##**JSON:**

###**Example 1:**

In [172]:
mountain_west = {'Colorado': {'Abbreviation': 'CO',
                              'Area': 269601,
                              'Capital': 'Denver',
                              'Established': '1876-08-01',
                              'Largest city': 'Denver',
                              'Population': 5540545,
                              'Representatives': 7},
                 'Idaho': {'Abbreviation': 'ID',
                              'Area': 216443,
                              'Capital': 'Boise',
                              'Established': '1890-07-03',
                              'Largest city': 'Boise',
                              'Population': 1683140,
                              'Representatives': 2},
                 'Montana': {'Abbreviation': 'MT',
                              'Area': 380831,
                              'Capital': 'Helena',
                              'Established': '1889-11-08',
                              'Largest city': 'Billings',
                              'Population': 1042520,
                              'Representatives': 1},
                 'Utah': {'Abbreviation': 'UT',
                              'Area': 219882,
                              'Capital': 'Salt Lake City',
                              'Established': '1896-01-04',
                              'Largest city': 'Salt Lake City',
                              'Population': 3051217,
                              'Representatives': 4},
                 'Wyoming': {'Abbreviation': 'WY',
                              'Area': 253335,
                              'Capital': 'Cheyenne',
                              'Established': '1890-07-10',
                              'Largest city': 'Cheyenne',
                              'Population': 585501,
                              'Representatives': 1}
                 }

In [173]:
mountain_west['Wyoming']['Capital']

'Cheyenne'

###**Example 2:**

In [174]:
state_population = [ {'Population': 4338785, 'State': 'Colorado', 'Year': 2000},
                     {'Population': 4444513, 'State': 'Colorado', 'Year': 2001},
                     {'Population': 4504709, 'State': 'Colorado', 'Year': 2002},
                     {'Population': 4555084, 'State': 'Colorado', 'Year': 2003},
                     {'Population': 4608811, 'State': 'Colorado', 'Year': 2004},
                     {'Population': 4662534, 'State': 'Colorado', 'Year': 2005},
                     {'Population': 4745660, 'State': 'Colorado', 'Year': 2006},
                     {'Population': 4821784, 'State': 'Colorado', 'Year': 2007},
                     {'Population': 4901938, 'State': 'Colorado', 'Year': 2008},
                     {'Population': 4976853, 'State': 'Colorado', 'Year': 2009},
                     {'Population': 5049935, 'State': 'Colorado', 'Year': 2010},
                     {'Population': 5119538, 'State': 'Colorado', 'Year': 2011},
                     {'Population': 5191086, 'State': 'Colorado', 'Year': 2012},
                     {'Population': 5268413, 'State': 'Colorado', 'Year': 2013},
                     {'Population': 5350118, 'State': 'Colorado', 'Year': 2014},
                     {'Population': 5448055, 'State': 'Colorado', 'Year': 2015},
                     {'Population': 5538180, 'State': 'Colorado', 'Year': 2016}
                   ]

In [178]:
for state_pop in state_population:
    if state_pop['Year'] == 2016:
        print(state_pop['Population'])

5538180


###**Example 3:**

In [179]:
obama_tweet = {'created_at': 'Tue Feb 14 15:34:47 +0000 2017',
               'favorite_count': 1023379,
               'hashtags': [],
               'id': 831527113211645959,
               'id_str': '831527113211645959',
               'lang': 'en',
               'media': [{'display_url': 'pic.twitter.com/O0UhJWoqGN',
                          'expanded_url': 'https://twitter.com/BarackObama/status/831527113211645959/photo/1',
                          'id': 831526916398149634,
                          'media_url': 'http://pbs.twimg.com/media/C4otUykWcAIbSy1.jpg',
                          'media_url_https': 'https://pbs.twimg.com/media/C4otUykWcAIbSy1.jpg',
                          'sizes': {'large': {'h': 800, 'resize': 'fit', 'w': 1200},
                                    'medium': {'h': 800, 'resize': 'fit', 'w': 1200},
                                    'small': {'h': 453, 'resize': 'fit', 'w': 680},
                                    'thumb': {'h': 150, 'resize': 'crop', 'w': 150}},
                          'type': 'photo',
                          'url': 'https://t.co/O0UhJWoqGN'}],
               'retweet_count': 252266,
               'source': '<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>',
               'text': 'Happy Valentine’s Day, @michelleobama! Almost 28 years with you, but it always feels new. https://t.co/O0UhJWoqGN',
               'urls': [],
               'user': {'created_at': 'Mon Mar 05 22:08:25 +0000 2007',
                        'description': 'Dad, husband, President, citizen.',
                        'favourites_count': 10,
                        'followers_count': 84814791,
                        'following': True,
                        'friends_count': 631357,
                        'id': 813286,
                        'lang': 'en',
                        'listed_count': 221906,
                        'location': 'Washington, DC',
                        'name': 'Barack Obama',
                        'profile_background_color': '77B0DC',
                        'profile_background_image_url': 'http://pbs.twimg.com/profile_background_images/451819093436268544/kLbRvwBg.png',
                        'profile_banner_url': 'https://pbs.twimg.com/profile_banners/813286/1484945688',
                        'profile_image_url': 'http://pbs.twimg.com/profile_images/822547732376207360/5g0FC8XX_normal.jpg',
                        'profile_link_color': '2574AD',
                        'profile_sidebar_fill_color': 'C2E0F6',
                        'profile_text_color': '333333',
                        'screen_name': 'BarackObama',
                        'statuses_count': 15436,
                        'time_zone': 'Eastern Time (US & Canada)',
                        'url': 'https://t.co/93Y27HEnnX',
                        'utc_offset': -18000,
                        'verified': True},
               'user_mentions': [{'id': 409486555,
                                  'name': 'Michelle Obama',
                                  'screen_name': 'MichelleObama'}]}

In [192]:
# What are the top-most keys in the obama_tweet object?
print('key:', list(obama_tweet.keys())[0])

# When was this tweet sent?
print('value:', obama_tweet[list(obama_tweet.keys())[0]])

# Does this tweet mention anyone?
print('mentioned: ', obama_tweet['user_mentions'][0]['name'])

# How many retweets did this tweet receive (at the time I collected it)?
print('retweet count: ', obama_tweet['retweet_count'])

# How many followers does the "user" who wrote this tweet have?
print('number of followers: ', obama_tweet['user']['followers_count'])

# What's the "media_url" for the image in this tweet?
print('media url: ', obama_tweet['media'][0]['media_url'])

key: created_at
value: Tue Feb 14 15:34:47 +0000 2017
mentioned:  Michelle Obama
retweet count:  252266
number of followers:  84814791
media url:  http://pbs.twimg.com/media/C4otUykWcAIbSy1.jpg
