# Working with RSS Feeds Lab

Complete the following set of exercises to solidify your knowledge of parsing RSS feeds and extracting information from them.

In [1]:
import feedparser

### 1. Use feedparser to parse the following RSS feed URL.

In [2]:
url='http://feeds.feedburner.com/oreilly/radar/atom'

In [3]:
res=feedparser.parse(url)
print (type(res))

<class 'feedparser.FeedParserDict'>


### 2. Obtain a list of components (keys) that are available for this feed.

In [4]:
list_parse=list(res.keys())
print (list_parse)

['feed', 'entries', 'bozo', 'headers', 'etag', 'updated', 'updated_parsed', 'href', 'status', 'encoding', 'version', 'namespaces']


### 3. Obtain a list of components (keys) that are available for the *feed* component of this RSS feed.

In [5]:
list_feed=[res['feed'].keys()]
print (list_feed)

[dict_keys(['title', 'title_detail', 'id', 'guidislink', 'link', 'updated', 'updated_parsed', 'subtitle', 'subtitle_detail', 'links', 'authors', 'author_detail', 'author', 'feedburner_info', 'geo_lat', 'geo_long', 'feedburner_emailserviceid', 'feedburner_feedburnerhostname'])]


### 4. Extract and print the feed title, subtitle, author, and link.

In [6]:
print (res['feed']['title'])
print (res['feed']['subtitle'])
print (res['feed']['author'])
print (res['feed']['link'])

All - O'Reilly Media
All of our Ideas and Learning material from all of our topics.
O'Reilly Media
https://www.oreilly.com


### 5. Count the number of entries that are contained in this RSS feed.

In [7]:
print (len(res['entries']))

60


### 6. Obtain a list of components (keys) available for an entry.

*Hint: Remember to index first before requesting the keys*

In [8]:
list_components=[res['entries'][0].keys()]
print (list_components)

[dict_keys(['title', 'title_detail', 'updated', 'updated_parsed', 'id', 'guidislink', 'link', 'content', 'summary', 'links', 'authors', 'author_detail', 'author', 'feedburner_origlink'])]


### 7. Extract a list of entry titles.

In [9]:
list_titles=[res['entries'][i]['title'] for i in range(len(res['entries']))]
print (list_titles)

['Why companies are in need of data lineage solutions', 'Stablecoins: Solving the cryptocurrency volatility crisis', 'Four short links: 25 April 2019', 'Four short links: 24 April 2019', 'Four short links: 23 April 2019', 'Four short links: 22 April 2019', 'Four short links: 19 April 2019', 'Computational propaganda', 'Decoding the human genome with deep learning', 'Automation of AI: Accelerating the AI revolution', 'Simple, scalable, and sustainable: A methodical approach to AI adoption', 'Software 2.0 and Snorkel', 'Applied machine learning at Facebook', 'Artificial intelligence: The “refinery” for data', 'Making real-world distributed deep learning easy with Nauta', 'Four short links: 18 April 2019', 'Toward ethical AI: Inclusivity as a messy, difficult, but promising answer', 'Fast, flexible, and functional: 4 real-world AI deployments at enterprise scale', 'Machine learning for personalization', 'Automated ML: A journey from CRISPR.ML to Azure ML', 'Checking in on AI tools', 'How 

### 8. Calculate the percentage of "Four short links" entry titles.

In [10]:
count=len([i for i in range(len(list_titles)) if 'Four short links' in list_titles[i]])
print (count*100/(len(res['entries'])), '%')

35.0 %


### 9. Create a Pandas data frame from the feed's entries.

In [11]:
import pandas as pd

In [12]:
entry=pd.DataFrame.from_dict(res['entries'])
display (entry.head())

Unnamed: 0,author,author_detail,authors,content,feedburner_origlink,guidislink,id,link,links,summary,title,title_detail,updated,updated_parsed
0,Ben Lorica,{'name': 'Ben Lorica'},[{'name': 'Ben Lorica'}],"[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/ideas/why-companies-ar...,True,"tag:www.oreilly.com,2019-04-25:/ideas/why-comp...",http://feedproxy.google.com/~r/oreilly/radar/a...,[{'href': 'http://feedproxy.google.com/~r/orei...,<p><img src='https://d3ucjech6zwjp8.cloudfront...,Why companies are in need of data lineage solu...,"{'type': 'text/plain', 'language': None, 'base...",2019-04-25T11:15:00Z,"(2019, 4, 25, 11, 15, 0, 3, 115, 0)"
1,"Wayne Chang, Gregory Rocco, Jacob Blish","{'name': 'Wayne Chang, Gregory Rocco, Jacob Bl...","[{'name': 'Wayne Chang, Gregory Rocco, Jacob B...","[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/ideas/stablecoins-solv...,True,"tag:www.oreilly.com,2019-04-25:/ideas/stableco...",http://feedproxy.google.com/~r/oreilly/radar/a...,[{'href': 'http://feedproxy.google.com/~r/orei...,<p><img src='https://d3ucjech6zwjp8.cloudfront...,Stablecoins: Solving the cryptocurrency volati...,"{'type': 'text/plain', 'language': None, 'base...",2019-04-25T11:00:00Z,"(2019, 4, 25, 11, 0, 0, 3, 115, 0)"
2,Nat Torkington,{'name': 'Nat Torkington'},[{'name': 'Nat Torkington'}],"[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/ideas/four-short-links...,True,"tag:www.oreilly.com,2019-04-25:/ideas/four-sho...",http://feedproxy.google.com/~r/oreilly/radar/a...,[{'href': 'http://feedproxy.google.com/~r/orei...,"<p><em>Values Risk, Brain Interface, Hacking S...",Four short links: 25 April 2019,"{'type': 'text/plain', 'language': None, 'base...",2019-04-25T10:50:00Z,"(2019, 4, 25, 10, 50, 0, 3, 115, 0)"
3,Nat Torkington,{'name': 'Nat Torkington'},[{'name': 'Nat Torkington'}],"[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/ideas/four-short-links...,True,"tag:www.oreilly.com,2019-04-24:/ideas/four-sho...",http://feedproxy.google.com/~r/oreilly/radar/a...,[{'href': 'http://feedproxy.google.com/~r/orei...,"<p><em>Control is a Shrug, Glitch Languages, S...",Four short links: 24 April 2019,"{'type': 'text/plain', 'language': None, 'base...",2019-04-24T10:55:00Z,"(2019, 4, 24, 10, 55, 0, 2, 114, 0)"
4,Nat Torkington,{'name': 'Nat Torkington'},[{'name': 'Nat Torkington'}],"[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/ideas/four-short-links...,True,"tag:www.oreilly.com,2019-04-23:/ideas/four-sho...",http://feedproxy.google.com/~r/oreilly/radar/a...,[{'href': 'http://feedproxy.google.com/~r/orei...,"<p><em>Worker-run Gig Factories, Persistence o...",Four short links: 23 April 2019,"{'type': 'text/plain', 'language': None, 'base...",2019-04-23T12:25:00Z,"(2019, 4, 23, 12, 25, 0, 1, 113, 0)"


### 10. Count the number of entries per author and sort them in descending order.

In [13]:
author=entry['author'].value_counts()
display (author.head())

Nat Torkington    21
Ben Lorica         3
Mike Loukides      2
Mac Slocum         2
Chris Ré           1
Name: author, dtype: int64

### 11. Add a new column to the data frame that contains the length (number of characters) of each entry title. Return a data frame that contains the title, author, and title length of each entry in descending order (longest title length at the top).

In [14]:
len_title=[len(entry['title'][i]) for i in range(len(entry))]
entry['len_title']=len_title
display (entry.head())

Unnamed: 0,author,author_detail,authors,content,feedburner_origlink,guidislink,id,link,links,summary,title,title_detail,updated,updated_parsed,len_title
0,Ben Lorica,{'name': 'Ben Lorica'},[{'name': 'Ben Lorica'}],"[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/ideas/why-companies-ar...,True,"tag:www.oreilly.com,2019-04-25:/ideas/why-comp...",http://feedproxy.google.com/~r/oreilly/radar/a...,[{'href': 'http://feedproxy.google.com/~r/orei...,<p><img src='https://d3ucjech6zwjp8.cloudfront...,Why companies are in need of data lineage solu...,"{'type': 'text/plain', 'language': None, 'base...",2019-04-25T11:15:00Z,"(2019, 4, 25, 11, 15, 0, 3, 115, 0)",51
1,"Wayne Chang, Gregory Rocco, Jacob Blish","{'name': 'Wayne Chang, Gregory Rocco, Jacob Bl...","[{'name': 'Wayne Chang, Gregory Rocco, Jacob B...","[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/ideas/stablecoins-solv...,True,"tag:www.oreilly.com,2019-04-25:/ideas/stableco...",http://feedproxy.google.com/~r/oreilly/radar/a...,[{'href': 'http://feedproxy.google.com/~r/orei...,<p><img src='https://d3ucjech6zwjp8.cloudfront...,Stablecoins: Solving the cryptocurrency volati...,"{'type': 'text/plain', 'language': None, 'base...",2019-04-25T11:00:00Z,"(2019, 4, 25, 11, 0, 0, 3, 115, 0)",57
2,Nat Torkington,{'name': 'Nat Torkington'},[{'name': 'Nat Torkington'}],"[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/ideas/four-short-links...,True,"tag:www.oreilly.com,2019-04-25:/ideas/four-sho...",http://feedproxy.google.com/~r/oreilly/radar/a...,[{'href': 'http://feedproxy.google.com/~r/orei...,"<p><em>Values Risk, Brain Interface, Hacking S...",Four short links: 25 April 2019,"{'type': 'text/plain', 'language': None, 'base...",2019-04-25T10:50:00Z,"(2019, 4, 25, 10, 50, 0, 3, 115, 0)",31
3,Nat Torkington,{'name': 'Nat Torkington'},[{'name': 'Nat Torkington'}],"[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/ideas/four-short-links...,True,"tag:www.oreilly.com,2019-04-24:/ideas/four-sho...",http://feedproxy.google.com/~r/oreilly/radar/a...,[{'href': 'http://feedproxy.google.com/~r/orei...,"<p><em>Control is a Shrug, Glitch Languages, S...",Four short links: 24 April 2019,"{'type': 'text/plain', 'language': None, 'base...",2019-04-24T10:55:00Z,"(2019, 4, 24, 10, 55, 0, 2, 114, 0)",31
4,Nat Torkington,{'name': 'Nat Torkington'},[{'name': 'Nat Torkington'}],"[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/ideas/four-short-links...,True,"tag:www.oreilly.com,2019-04-23:/ideas/four-sho...",http://feedproxy.google.com/~r/oreilly/radar/a...,[{'href': 'http://feedproxy.google.com/~r/orei...,"<p><em>Worker-run Gig Factories, Persistence o...",Four short links: 23 April 2019,"{'type': 'text/plain', 'language': None, 'base...",2019-04-23T12:25:00Z,"(2019, 4, 23, 12, 25, 0, 1, 113, 0)",31


### 12. Create a list of entry titles whose summary includes the phrase "machine learning."

In [15]:
ML=[entry['title'][i] for i in range(len(entry)) if 'machine learning' in entry['summary'][i]]
print (ML)

['Why companies are in need of data lineage solutions', 'Decoding the human genome with deep learning', 'Applied machine learning at Facebook', "Highlights from the O'Reilly Artificial Intelligence Conference in New York 2019", 'Strata San Francisco, 2019: Opportunities and Risks', 'Why a data scientist is not a data engineer', '150+ live online training courses opened for April and May', 'De-biasing language', 'Specialized tools for machine learning development and model governance are becoming essential', 'Highlights from the Strata Data Conference in San Francisco 2019', 'It’s time for data scientists to collaborate with researchers in other disciplines', 'Four short links: 28 March 2019', 'AI and cryptography: Challenges and opportunities']
