# Working with RSS Feeds Lab

Complete the following set of exercises to solidify your knowledge of parsing RSS feeds and extracting information from them.

In [65]:
import feedparser

### 1. Use feedparser to parse the following RSS feed URL.

In [71]:
url = 'http://feeds.feedburner.com/oreilly/radar/atom'

In [75]:
radar= feedparser.parse(url)

### 2. Obtain a list of components (keys) that are available for this feed.

In [76]:
radar.keys()

dict_keys(['feed', 'entries', 'bozo', 'headers', 'etag', 'updated', 'updated_parsed', 'href', 'status', 'encoding', 'version', 'namespaces'])

### 3. Obtain a list of components (keys) that are available for the *feed* component of this RSS feed.

In [77]:
radar.feed.keys()

dict_keys(['title', 'title_detail', 'id', 'guidislink', 'link', 'updated', 'updated_parsed', 'subtitle', 'subtitle_detail', 'links', 'authors', 'author_detail', 'author', 'feedburner_info', 'geo_lat', 'geo_long', 'feedburner_emailserviceid', 'feedburner_feedburnerhostname'])

### 4. Extract and print the feed title, subtitle, author, and link.

In [78]:
radar.feed.title
radar.feed.subtitle
radar.feed.author
radar.feed.link

'https://www.oreilly.com'

### 5. Count the number of entries that are contained in this RSS feed.

In [79]:
len(radar.entries)

60

### 6. Obtain a list of components (keys) available for an entry.

*Hint: Remember to index first before requesting the keys*

In [81]:
radar.entries[0].keys()

dict_keys(['title', 'title_detail', 'updated', 'updated_parsed', 'id', 'guidislink', 'link', 'content', 'summary', 'links', 'authors', 'author_detail', 'author', 'feedburner_origlink'])

### 7. Extract a list of entry titles.

In [83]:
titles = [radar.entries[i].title for i in range(len(radar.entries))]
print(titles)

['Four short links: 22 July 2019', 'Four short links: 19 July 2019', 'The war for the soul of open source', "O'Reilly Open Source and Frank Willison Awards", 'O’Reilly Radar: Open source technology trends—What our users tell us', 'Ask not what Brands™ can do for you', 'Managing machines', 'Acquiring and sharing high-quality data', 'Four short links: 18 July 2019', 'The role of open source in mitigating natural disasters', "Highlights from the O'Reilly Open Source Software Conference in Portland 2019", 'Better living through software', 'Why Amazon cares about open source', 'Built to last: Building and growing open source communities', 'The next age of open innovation', 'Four short links: 17 July 2019', 'Four short links: 16 July 2019', 'Managing machine learning in the enterprise: Lessons from banking and health care', 'Four short links: 15 July 2019', 'Four short links: 12 July 2019', 'Four short links: 11 July 2019', 'Four short links: 10 July 2019', 'Four short links: 9 July 2019', '

### 8. Calculate the percentage of "Four short links" entry titles.

In [89]:
count_total=len(titles)
count_four = 0

for i in titles:
    if 'Four short links' in i:
        count_four += 1

pourcentage = count_four/count_total*100
print(pourcentage)

43.333333333333336


### 9. Create a Pandas data frame from the feed's entries.

In [90]:
import pandas as pd

In [91]:
df = pd.DataFrame(radar.entries)

### 10. Count the number of entries per author and sort them in descending order.

In [92]:
authors = df.groupby('author', as_index=False).agg({'title':'count'})
authors.columns = ['author', 'entries']
authors.sort_values('entries', ascending=False)

Unnamed: 0,author,entries
18,Nat Torkington,26
5,Ben Lorica,5
6,"Ben Lorica, Harish Doddi, David Talby",2
10,Jenn Webb,2
0,Abigail Hing Wen,1
15,Michael James,1
25,Tim Kraska,1
24,Tiffani Bell,1
23,Roger Magoulas,1
22,"Rebecca Parsons, Neal Ford",1


### 11. Add a new column to the data frame that contains the length (number of characters) of each entry title. Return a data frame that contains the title, author, and title length of each entry in descending order (longest title length at the top).

In [None]:
df['title_length'] = df['title'].apply(len)
df[['title', 'author', 'title_length']].sort_values('title_length', ascending=False)

### 12. Create a list of entry titles whose summary includes the phrase "machine learning."

In [93]:
liste = []

for (a,b) in zip(df.summary, df.title):
    if 'machine learning' in a:
        liste.append(b)
        
liste

['Acquiring and sharing high-quality data',
 "Highlights from the O'Reilly Open Source Software Conference in Portland 2019",
 'Managing machine learning in the enterprise: Lessons from banking and health care',
 "Highlights from the O'Reilly Artificial Intelligence Conference in Beijing 2019",
 'The future of machine learning is tiny',
 'Tools for machine learning development',
 'New live online training courses',
 'RISELab’s AutoPandas hints at automation tech that will change the nature of software development',
 'AI and machine learning will require retraining your entire organization',
 'Enabling end-to-end machine learning pipelines in real-world applications',
 'What are model governance and model operations?',
 'The quest for high-quality data']