# Working with RSS Feeds Lab

Complete the following set of exercises to solidify your knowledge of parsing RSS feeds and extracting information from them.

In [1]:
#!pip install feedparser
import feedparser

### 1. Use feedparser to parse the following RSS feed URL.

In [2]:
url = 'http://feeds.feedburner.com/oreilly/radar/atom'

In [3]:
feedburner = feedparser.parse(url)

In [4]:
feedburner['feed']

{'title': 'Radar',
 'title_detail': {'type': 'text/plain',
  'language': None,
  'base': 'http://feeds.feedburner.com/oreilly/radar/atom',
  'value': 'Radar'},
 'links': [{'rel': 'alternate',
   'type': 'text/html',
   'href': 'https://www.oreilly.com/radar'},
  {'rel': 'self',
   'type': 'application/rss+xml',
   'href': 'http://feeds.feedburner.com/oreilly/radar/atom'},
  {'rel': 'hub',
   'href': 'http://pubsubhubbub.appspot.com/',
   'type': 'text/html'}],
 'link': 'https://www.oreilly.com/radar',
 'subtitle': 'Now, next, and beyond: Tracking need-to-know trends at the intersection of business and technology',
 'subtitle_detail': {'type': 'text/html',
  'language': None,
  'base': 'http://feeds.feedburner.com/oreilly/radar/atom',
  'value': 'Now, next, and beyond: Tracking need-to-know trends at the intersection of business and technology'},
 'updated': 'Wed, 08 Apr 2020 11:48:28 +0000',
 'updated_parsed': time.struct_time(tm_year=2020, tm_mon=4, tm_mday=8, tm_hour=11, tm_min=48, t

### 2. Obtain a list of components (keys) that are available for this feed.

In [5]:
components = list(feedburner.keys())
components

['feed',
 'entries',
 'bozo',
 'headers',
 'etag',
 'updated',
 'updated_parsed',
 'href',
 'status',
 'encoding',
 'version',
 'namespaces']

### 3. Obtain a list of components (keys) that are available for the *feed* component of this RSS feed.

In [6]:
feed_keys = list(feedburner.feed.keys())
feed_keys

['title',
 'title_detail',
 'links',
 'link',
 'subtitle',
 'subtitle_detail',
 'updated',
 'updated_parsed',
 'language',
 'sy_updateperiod',
 'sy_updatefrequency',
 'generator_detail',
 'generator',
 'feedburner_info',
 'geo_lat',
 'geo_long',
 'feedburner_emailserviceid',
 'feedburner_feedburnerhostname']

### 4. Extract and print the feed title, subtitle, author, and link.

In [7]:
f_title = feedburner.feed.title
f_sub_title = feedburner.feed.subtitle
f_link = feedburner.feed.link
display(f_title, f_sub_title,f_link)

'Radar'

'Now, next, and beyond: Tracking need-to-know trends at the intersection of business and technology'

'https://www.oreilly.com/radar'

In [8]:
authors = set([a.author for a in feedburner.entries])
display(authors)

{'Cynthia Owens',
 'George Fairbanks',
 'Hugo Bowne-Anderson',
 'Jenn Webb',
 'Kai Holnes',
 'Mac Slocum',
 'Mark Richards',
 'Martin Fowler',
 'Mary Poppendieck',
 'Mike Loukides',
 'Nat Torkington',
 'Pamela Rucker',
 'Peter Skomoroch and Mike Loukides',
 'Rachel Laycock and Neal Ford',
 'Rita J. King',
 'Roger Magoulas and Steve Swoyer',
 'Tim O’Reilly'}

### 5. Count the number of entries that are contained in this RSS feed.

In [9]:
len(feedburner.entries[0].keys())

19

### 6. Obtain a list of components (keys) available for an entry.

*Hint: Remember to index first before requesting the keys*

In [10]:
list(feedburner.entries[0].keys())

['title',
 'title_detail',
 'links',
 'link',
 'comments',
 'published',
 'published_parsed',
 'authors',
 'author',
 'author_detail',
 'tags',
 'id',
 'guidislink',
 'summary',
 'summary_detail',
 'content',
 'wfw_commentrss',
 'slash_comments',
 'feedburner_origlink']

### 7. Extract a list of entry titles.

In [11]:
set([a.title for a in feedburner.entries])

{'10 ways to get untapped talent in your organization to contribute',
 '3 ways to confront modern business challenges',
 '5 key areas for tech leaders to watch in 2020',
 '6 trends framing the state of AI and ML',
 'AI adoption in the enterprise 2020',
 'An enterprise vision is your company’s North Star',
 'Architecture.Next: Invalidating old axioms',
 'Four short links: 1 April 2020',
 'Four short links: 10 March 2020',
 'Four short links: 11 March 2020',
 'Four short links: 12 March 2020',
 'Four short links: 13 February 2020',
 'Four short links: 13 March 2020',
 'Four short links: 14 February 2020',
 'Four short links: 16 March 2020',
 'Four short links: 17 February 2020',
 'Four short links: 17 March 2020',
 'Four short links: 18 February 2020',
 'Four short links: 18 March 2020',
 'Four short links: 19 February 2020',
 'Four short links: 19 March 2020',
 'Four short links: 2 April 2020',
 'Four short links: 2 March 2020',
 'Four short links: 20 February 2020',
 'Four short links:

### 8. Calculate the percentage of "Four short links" entry titles.

In [12]:
import re
titles = list([a.title for a in feedburner.entries])
Four_sl = []
for a in titles:
    if len(re.findall('(Four)',a))>0:
        Four_sl.append(re.findall('(Four)',a))
percentage = (len(Four_sl)/len(titles))*100
percentage

60.0

### 9. Create a Pandas data frame from the feed's entries.

In [13]:
import pandas as pd

In [14]:
#list(feedburner.entries[0].keys())
df = pd.DataFrame(feedburner.entries)
df.head()

Unnamed: 0,title,title_detail,links,link,comments,published,published_parsed,authors,author,author_detail,tags,id,guidislink,summary,summary_detail,content,wfw_commentrss,slash_comments,feedburner_origlink
0,Four short links: 8 April 2020,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/four-short-links...,"Wed, 08 Apr 2020 11:48:28 +0000","(2020, 4, 8, 11, 48, 28, 2, 99, 0)",[{'name': 'Nat Torkington'}],Nat Torkington,{'name': 'Nat Torkington'},"[{'term': 'Four Short Links', 'scheme': None, ...",https://www.oreilly.com/radar/?p=12611,False,System Design for Advanced Beginners &#8212; a...,"{'type': 'text/html', 'language': None, 'base'...","[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/four-short-links...,0,https://www.oreilly.com/radar/four-short-links...
1,Four short links: 7 April 2020,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/four-short-links...,"Tue, 07 Apr 2020 11:45:13 +0000","(2020, 4, 7, 11, 45, 13, 1, 98, 0)",[{'name': 'Nat Torkington'}],Nat Torkington,{'name': 'Nat Torkington'},"[{'term': 'Four Short Links', 'scheme': None, ...",https://www.oreilly.com/radar/?p=12606,False,locust &#8212; open source load testing tool: ...,"{'type': 'text/html', 'language': None, 'base'...","[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/four-short-links...,0,https://www.oreilly.com/radar/four-short-links...
2,Governance and Discovery,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/governance-and-d...,"Mon, 06 Apr 2020 19:09:29 +0000","(2020, 4, 6, 19, 9, 29, 0, 97, 0)",[{'name': 'Mike Loukides'}],Mike Loukides,{'name': 'Mike Loukides'},"[{'term': 'Radar Column', 'scheme': None, 'lab...",https://www.oreilly.com/radar/?p=12594,False,Data Governance sounds like a candidate for th...,"{'type': 'text/html', 'language': None, 'base'...","[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/governance-and-d...,0,https://www.oreilly.com/radar/governance-and-d...
3,Four short links: 6 April 2020,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/four-short-links...,"Mon, 06 Apr 2020 11:53:01 +0000","(2020, 4, 6, 11, 53, 1, 0, 97, 0)",[{'name': 'Nat Torkington'}],Nat Torkington,{'name': 'Nat Torkington'},"[{'term': 'Four Short Links', 'scheme': None, ...",https://www.oreilly.com/radar/?p=12590,False,Rufus &#8212; Create bootable USB drives the e...,"{'type': 'text/html', 'language': None, 'base'...","[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/four-short-links...,0,https://www.oreilly.com/radar/four-short-links...
4,Four short links: 3 April 2020,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/four-short-links...,"Fri, 03 Apr 2020 11:59:08 +0000","(2020, 4, 3, 11, 59, 8, 4, 94, 0)",[{'name': 'Nat Torkington'}],Nat Torkington,{'name': 'Nat Torkington'},"[{'term': 'Four Short Links', 'scheme': None, ...",https://www.oreilly.com/radar/?p=12585,False,The Zero Trust Learning Curve (Palo Alto Netwo...,"{'type': 'text/html', 'language': None, 'base'...","[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/four-short-links...,0,https://www.oreilly.com/radar/four-short-links...


### 10. Count the number of entries per author and sort them in descending order.

In [15]:
authors = df.groupby('author', as_index=False).agg({'title':'count'})
authors.columns = ['author', 'entries']
authors.sort_values('entries', ascending=False)

Unnamed: 0,author,entries
10,Nat Torkington,36
15,Roger Magoulas and Steve Swoyer,4
3,Jenn Webb,4
9,Mike Loukides,3
0,Cynthia Owens,1
14,Rita J. King,1
13,Rachel Laycock and Neal Ford,1
12,Peter Skomoroch and Mike Loukides,1
11,Pamela Rucker,1
8,Mary Poppendieck,1


### 11. Add a new column to the data frame that contains the length (number of characters) of each entry title. Return a data frame that contains the title, author, and title length of each entry in descending order (longest title length at the top).

In [16]:
df['title_length'] = df['title'].apply(len)
df[['title', 'author', 'title_length']].sort_values('title_length', ascending=False).head()

Unnamed: 0,title,author,title_length
40,Highlights from the O’Reilly Software Architec...,Mac Slocum,78
14,Great leaders inspire innovation and creativit...,Jenn Webb,76
52,10 ways to get untapped talent in your organiz...,Pamela Rucker,65
15,Strong leaders forge an intersection of knowle...,Jenn Webb,64
20,It’s an unprecedented crisis: 8 things to do r...,Cynthia Owens,54


### 12. Create a list of entry titles whose summary includes the phrase "machine learning."

In [19]:
def search(x):
    match = re.search('machine learning',x)
    if match:
        return 'Has'
    else:
        return 'No'
df['Has text'] = df.summary.apply(search)

In [20]:
list_sum1 = df[['title','Has text','summary']]
list_sum2 = list_sum1[df['Has text'] == 'Has']

display(list_sum2)

Unnamed: 0,title,Has text,summary
8,What you need to know about product management...,Has,If you’re already a software product manager (...
58,Four short links: 13 February 2020,Has,Ofcom To Regulate UK Internet &#8212; The regu...
