# Working with RSS Feeds Lab

Complete the following set of exercises to solidify your knowledge of parsing RSS feeds and extracting information from them.

In [1]:
import feedparser

### 1. Use feedparser to parse the following RSS feed URL.

In [2]:
url = 'http://feeds.feedburner.com/oreilly/radar/atom'

In [3]:
data = feedparser.parse(url)

### 2. Obtain a list of components (keys) that are available for this feed.

In [4]:
data.keys()

dict_keys(['feed', 'entries', 'bozo', 'headers', 'etag', 'updated', 'updated_parsed', 'href', 'status', 'encoding', 'version', 'namespaces'])

### 3. Obtain a list of components (keys) that are available for the *feed* component of this RSS feed.

In [5]:
data["feed"].keys()

dict_keys(['title', 'title_detail', 'links', 'link', 'subtitle', 'subtitle_detail', 'updated', 'updated_parsed', 'language', 'sy_updateperiod', 'sy_updatefrequency', 'generator_detail', 'generator', 'feedburner_info', 'geo_lat', 'geo_long', 'feedburner_emailserviceid', 'feedburner_feedburnerhostname'])

### 4. Extract and print the feed title, subtitle, author, and link.

In [8]:
feed_title = data["feed"]["title"]
feed_subtitle = data["feed"]["subtitle"]
feed_link = data["feed"]["link"]
#No existe el dato de autor
print("Titulo:",feed_title)
print("Subtítulo:",feed_subtitle)
print("Link:",feed_link)

Titulo: Radar
Subtítulo: Now, next, and beyond: Tracking need-to-know trends at the intersection of business and technology
Link: https://www.oreilly.com/radar


### 5. Count the number of entries that are contained in this RSS feed.

In [9]:
entries = data["entries"]
len(entries)

60

### 6. Obtain a list of components (keys) available for an entry.

*Hint: Remember to index first before requesting the keys*

In [10]:
data["entries"][0].keys()

dict_keys(['title', 'title_detail', 'links', 'link', 'comments', 'published', 'published_parsed', 'authors', 'author', 'author_detail', 'tags', 'id', 'guidislink', 'summary', 'summary_detail', 'content', 'wfw_commentrss', 'slash_comments', 'feedburner_origlink'])

### 7. Extract a list of entry titles.

In [11]:
titles = []
for i in data["entries"]:
    titles.append(i["title"])
titles

['Four short links: 21 August 2020',
 'Four Short Links: 19 August 2020',
 'Why Best-of-Breed is a Better Choice than All-in-One Platforms for Data Science',
 'Four short links: 14 August 2020',
 'The Least Liked Programming Languages',
 'Four short links: 11 Aug 2020',
 'Four short links: 7 Aug 2020',
 'Four short links: 5 August 2020',
 'Radar trends to watch: August 2020',
 'Four short links: 31 July 2020',
 'Four short links: 30 July 2020',
 'Four short links: 29 July 2020',
 'Bringing an AI Product to Market',
 'Power, Harms, and Data',
 'Four short links: 27 July 2020',
 'Four short links: 24 July 2020',
 'Four short links: 26 July 2020',
 'Four short links: 22 July 2020',
 'AI, Protests, and Justice',
 'Four short links: 21 July 2020',
 'Four short links: 20 July 2020',
 'Four short links: 17 July 2020',
 'Four short links: 16 July 2020',
 'Microservices Adoption in 2020',
 'Four short links: 15 July 2020',
 'Society-Centered Design',
 'Four short links: 14 July 2020',
 'Four sh

### 8. Calculate the percentage of "Four short links" entry titles.

In [13]:
num = 0
for i in titles:
    if i.startswith("Four short links"):
        num += 1      
percentage = (num/len(titles))*100
print(f"The percentage of -Four short links- entry titles is {percentage}%")

The percentage of -Four short links- entry titles is 75.0%


### 9. Create a Pandas data frame from the feed's entries.

In [14]:
import pandas as pd
import re

In [18]:
entries =[]

for i in data["entries"]:
    title = i["title"]
    date = i["published"]
    authors = i["author"]
    summary = i['summary']
    summary = re.sub("\[?&#\d+;]?","",summary)
    entry={"Title":title,"Date":date,"Authors":authors,"Summary":summary}
    entries.append(entry)
    entries_pd = pd.DataFrame(entries)
entries_pd.head(3)

Unnamed: 0,Title,Date,Authors,Summary
0,Four short links: 21 August 2020,"Fri, 21 Aug 2020 11:56:32 +0000",Nat Torkington,The 212 Story Tower That Isnt in Suburban Melb...
1,Four Short Links: 19 August 2020,"Wed, 19 Aug 2020 11:44:06 +0000",Nat Torkington,The Design Space of Computational Notebooks L...
2,Why Best-of-Breed is a Better Choice than All-...,"Tue, 18 Aug 2020 11:30:42 +0000",Matthew Rocklin and Hugo Bowne-Anderson,So you need to redesign your company’s data in...


### 10. Count the number of entries per author and sort them in descending order.

In [19]:
entries_pd["Authors"].value_counts()

Nat Torkington                                      46
Mike Loukides                                        9
Sarah Gold                                           1
Matthew Rocklin and Hugo Bowne-Anderson              1
Mike Loukides and Steve Swoyer                       1
Justin Norman, Peter Skomoroch and Mike Loukides     1
Hugo Bowne-Anderson                                  1
Name: Authors, dtype: int64

### 11. Add a new column to the data frame that contains the length (number of characters) of each entry title. Return a data frame that contains the title, author, and title length of each entry in descending order (longest title length at the top).

In [25]:
entries_pd["Length"] = entries_pd["Title"].apply(lambda x: len(x))
entries_pd_2 = entries_pd[["Title","Authors","Length"]]

entries_pd_2.sort_values(by=['Length'], ascending=False).head(3)

Unnamed: 0,Title,Authors,Length
2,Why Best-of-Breed is a Better Choice than All-...,Matthew Rocklin and Hugo Bowne-Anderson,79
29,Automated Coding and the Future of Programming,Mike Loukides,46
54,Machine Learning and the Production Gap,Mike Loukides,39


### 12. Create a list of entry titles whose summary includes the phrase "machine learning."

In [30]:
entry_list = []
workaround = entries_pd.apply(lambda x: entry_list.append(x["Title"]) if "machine learning" in x["Summary"] else False, axis=1)
entry_list #OK

['Four short links: 8 July 2020', 'Machine Learning and the Production Gap']