# Working with RSS Feeds Lab

Complete the following set of exercises to solidify your knowledge of parsing RSS feeds and extracting information from them.

In [1]:
import feedparser

### 1. Use feedparser to parse the following RSS feed URL.

In [2]:
url = 'http://feeds.feedburner.com/oreilly/radar/atom'

In [3]:
parsed_radar = feedparser.parse(url)
print(parsed_radar)

{'feed': {'title': 'Radar', 'title_detail': {'type': 'text/plain', 'language': None, 'base': 'http://feeds.feedburner.com/oreilly/radar/atom', 'value': 'Radar'}, 'links': [{'rel': 'alternate', 'type': 'text/html', 'href': 'https://www.oreilly.com/radar'}, {'rel': 'self', 'type': 'application/rss+xml', 'href': 'http://feeds.feedburner.com/oreilly/radar/atom'}, {'rel': 'hub', 'href': 'http://pubsubhubbub.appspot.com/', 'type': 'text/html'}], 'link': 'https://www.oreilly.com/radar', 'subtitle': 'Now, next, and beyond: Tracking need-to-know trends at the intersection of business and technology', 'subtitle_detail': {'type': 'text/html', 'language': None, 'base': 'http://feeds.feedburner.com/oreilly/radar/atom', 'value': 'Now, next, and beyond: Tracking need-to-know trends at the intersection of business and technology'}, 'updated': 'Fri, 04 Oct 2019 12:14:43 +0000', 'updated_parsed': time.struct_time(tm_year=2019, tm_mon=10, tm_mday=4, tm_hour=12, tm_min=14, tm_sec=43, tm_wday=4, tm_yday=27

### 2. Obtain a list of components (keys) that are available for this feed.

In [5]:
radar_keys = parsed_radar.keys()
print(radar_keys)

dict_keys(['feed', 'entries', 'bozo', 'headers', 'etag', 'updated', 'updated_parsed', 'href', 'status', 'encoding', 'version', 'namespaces'])


### 3. Obtain a list of components (keys) that are available for the *feed* component of this RSS feed.

In [6]:
feed_keys = parsed_radar['feed'].keys()
print(feed_keys)

dict_keys(['title', 'title_detail', 'links', 'link', 'subtitle', 'subtitle_detail', 'updated', 'updated_parsed', 'language', 'sy_updateperiod', 'sy_updatefrequency', 'generator_detail', 'generator', 'feedburner_info', 'geo_lat', 'geo_long', 'feedburner_emailserviceid', 'feedburner_feedburnerhostname'])


### 4. Extract and print the feed title, subtitle, and link. (removed author, it does not exist within Feed's keys)

In [19]:
feedtitle = parsed_radar['feed']['title']
feedsubtitle = parsed_radar['feed']['subtitle']
feedlink = parsed_radar['feed']['link']

print(feedtitle, feedsubtitle, feedlink)

Radar Now, next, and beyond: Tracking need-to-know trends at the intersection of business and technology https://www.oreilly.com/radar


### 5. Count the number of entries that are contained in this RSS feed.

In [20]:
print(len(parsed_radar['entries']))

18


### 6. Obtain a list of components (keys) available for an entry.

*Hint: Remember to index first before requesting the keys*

In [25]:
entries_keys = parsed_radar['entries'][0].keys() # entries is a list, so get the zero index
print(entries_keys)


dict_keys(['title', 'title_detail', 'links', 'link', 'comments', 'published', 'published_parsed', 'authors', 'author', 'author_detail', 'tags', 'id', 'guidislink', 'summary', 'summary_detail', 'content', 'wfw_commentrss', 'slash_comments', 'feedburner_origlink'])


### 7. Extract a list of entry titles.

In [29]:
entry_titles = []
for entry in parsed_radar['entries'][1:]:
    entry_titles.append(entry['title'])
print(entry_titles)

['Four short links: 3 October 2019', 'Four short links: 2 October 2019', 'Four short links: 1 October 2019', 'TinyML: The challenges and opportunities of low-power ML applications', 'Four short links: 30 September 2019', 'Highlights from the Strata Data Conference in New York 2019', 'Four short links: 27 September 2019', 'Delivering the enterprise data cloud', 'Data Science Pioneers: Conquering the next frontier, a documentary investigating the future of data science', 'Postrevolutionary big data: Promoting the general welfare', 'Say what? The ethical challenges of designing for humanlike interaction', 'RL in real life: Bringing reinforcement learning to the enterprise', 'Strata Data Awards winners 2019', 'Staying safe in the AI era', 'Unlocking the value of your data', 'Data sonification: Making music from the yield curve', 'Four Short Links: 26 September 2019']


### 8. Calculate the percentage of "Four short links" entry titles.

In [38]:
title_count = [title for title in entry_titles if "Four short links" in title]
print(round((len(title_count) / len(entry_titles)), 2))


0.29


### 9. Create a Pandas data frame from the feed's entries.

In [51]:
import pandas as pd
pd.set_option('display.max_columns', 25)

In [56]:
entry_df = pd.DataFrame(parsed_radar['entries'])
print(entry_df)

               author                   author_detail  \
0   jwebb@oreilly.com  {'email': 'jwebb@oreilly.com'}   
1   jwebb@oreilly.com  {'email': 'jwebb@oreilly.com'}   
2   jwebb@oreilly.com  {'email': 'jwebb@oreilly.com'}   
3   jwebb@oreilly.com  {'email': 'jwebb@oreilly.com'}   
4          Mac Slocum          {'name': 'Mac Slocum'}   
5          Mac Slocum          {'name': 'Mac Slocum'}   
6          Mac Slocum          {'name': 'Mac Slocum'}   
7          Mac Slocum          {'name': 'Mac Slocum'}   
8          Mac Slocum          {'name': 'Mac Slocum'}   
9          Mac Slocum          {'name': 'Mac Slocum'}   
10         Mac Slocum          {'name': 'Mac Slocum'}   
11         Mac Slocum          {'name': 'Mac Slocum'}   
12         Mac Slocum          {'name': 'Mac Slocum'}   
13         Mac Slocum          {'name': 'Mac Slocum'}   
14         Mac Slocum          {'name': 'Mac Slocum'}   
15         Mac Slocum          {'name': 'Mac Slocum'}   
16         Mac Slocum          

### 10. Count the number of entries per author and sort them in descending order.

In [55]:
'''
from the lesson
authors = df.groupby('author', as_index=False).agg({'title':'count'})
authors.columns = ['author', 'entries']
authors.sort_values('entries', ascending=False)
'''
entry_authors = entry_df.groupby('author', as_index=False).agg({'title':'count'})
print(entry_authors.sort_values('title', ascending=False))

              author  title
0         Mac Slocum     14
1  jwebb@oreilly.com      4


### 11. Add a new column to the data frame that contains the length (number of characters) of each entry title. Return a data frame that contains the title, author, and title length of each entry in descending order (longest title length at the top).

In [60]:
entry_df['title_length'] = [len(str(title)) for title in entry_df['title']]

print(entry_df[['title', 'author', 'title_length']].sort_values('title_length', ascending=False))


                                                title             author  \
9   Data Science Pioneers: Conquering the next fro...         Mac Slocum   
11  Say what? The ethical challenges of designing ...         Mac Slocum   
4   TinyML: The challenges and opportunities of lo...         Mac Slocum   
12  RL in real life: Bringing reinforcement learni...         Mac Slocum   
6   Highlights from the Strata Data Conference in ...         Mac Slocum   
10  Postrevolutionary big data: Promoting the gene...         Mac Slocum   
16  Data sonification: Making music from the yield...         Mac Slocum   
8                Delivering the enterprise data cloud         Mac Slocum   
17                Four Short Links: 26 September 2019         Mac Slocum   
7                 Four short links: 27 September 2019         Mac Slocum   
5                 Four short links: 30 September 2019         Mac Slocum   
1                    Four short links: 3 October 2019  jwebb@oreilly.com   
3           

### 12. Create a list of entry titles whose summary includes the phrase "machine learning."

In [65]:
ml_title_list = [entrytitle for entrytitle in entry_df['title'] if "machine learning" in entrytitle]
print(ml_title_list) # its an empty list
print(entry_df['title']) # printed to confirm

[]
0                      Four short links: 4 October 2019
1                      Four short links: 3 October 2019
2                      Four short links: 2 October 2019
3                      Four short links: 1 October 2019
4     TinyML: The challenges and opportunities of lo...
5                   Four short links: 30 September 2019
6     Highlights from the Strata Data Conference in ...
7                   Four short links: 27 September 2019
8                  Delivering the enterprise data cloud
9     Data Science Pioneers: Conquering the next fro...
10    Postrevolutionary big data: Promoting the gene...
11    Say what? The ethical challenges of designing ...
12    RL in real life: Bringing reinforcement learni...
13                      Strata Data Awards winners 2019
14                           Staying safe in the AI era
15                     Unlocking the value of your data
16    Data sonification: Making music from the yield...
17                  Four Short Links: 26 Sept

In [67]:
# fourlist = [entrytitle for entrytitle in entry_df['title'] if "Four short links" in entrytitle] # to make sure my code works, it does
# print(fourlist)

['Four short links: 4 October 2019', 'Four short links: 3 October 2019', 'Four short links: 2 October 2019', 'Four short links: 1 October 2019', 'Four short links: 30 September 2019', 'Four short links: 27 September 2019']
