# Working with RSS Feeds Lab

Complete the following set of exercises to solidify your knowledge of parsing RSS feeds and extracting information from them.

In [1]:
import re

import feedparser
import pandas as pd

### 1. Use feedparser to parse the following RSS feed URL.

In [2]:
# Address of the feed to parse (a FeedBurner-hosted O'Reilly Radar feed, per the URL).
url = 'http://feeds.feedburner.com/oreilly/radar/atom'

In [3]:
# Fetch and parse the feed; the result is dict-like (its keys are listed in the next cell).
# NOTE(review): the result exposes a 'bozo' key; it is not inspected here — confirm
# whether a malformed feed should stop the notebook.
radar = feedparser.parse(url)

In [4]:
# Exploratory look at the top-level keys of the parsed result.
radar.keys()

dict_keys(['feed', 'entries', 'bozo', 'headers', 'etag', 'updated', 'updated_parsed', 'href', 'status', 'encoding', 'version', 'namespaces'])

In [5]:
# Exploratory look at the keys of the feed title's detail record.
radar.feed.title_detail.keys()

dict_keys(['type', 'language', 'base', 'value'])

### 2. Obtain a list of components (keys) that are available for this feed.

In [6]:
# Components (keys) available on the parsed feed result.
radar.keys()

dict_keys(['feed', 'entries', 'bozo', 'headers', 'etag', 'updated', 'updated_parsed', 'href', 'status', 'encoding', 'version', 'namespaces'])

### 3. Obtain a list of components (keys) that are available for the *feed* component of this RSS feed.

In [7]:
# A keys view converts straight to a list; no explicit append loop is needed.
feed_keys = list(radar.feed.keys())

feed_keys

['title',
 'title_detail',
 'links',
 'link',
 'subtitle',
 'subtitle_detail',
 'updated',
 'updated_parsed',
 'language',
 'sy_updateperiod',
 'sy_updatefrequency',
 'generator_detail',
 'generator',
 'feedburner_info',
 'geo_lat',
 'geo_long',
 'feedburner_emailserviceid',
 'feedburner_feedburnerhostname']

### 4. Extract and print the feed title, subtitle, author, and link.

In [8]:
# Feed-level title, displayed through the cell's last expression.
title = radar.feed.title
title

'Radar'

In [9]:
# Feed-level subtitle, displayed through the cell's last expression.
subtitle = radar.feed.subtitle
subtitle

'Now, next, and beyond: Tracking need-to-know trends at the intersection of business and technology'

In [10]:
# Feed-level link, displayed through the cell's last expression.
link = radar.feed.link
link

'https://www.oreilly.com/radar'

In [11]:
# Check what kind of container holds the entries before iterating over it.
type(radar.entries)

list

In [12]:
# The feed section lists no 'author' key (see the feed keys printed earlier), so
# authors are gathered per entry. Iterate the entries directly instead of by index.
authors = [entry.author for entry in radar.entries]
# Show each distinct author once.
set(authors)

{'Cynthia Owens',
 'George Fairbanks',
 'Hugo Bowne-Anderson',
 'Jenn Webb',
 'Kai Holnes',
 'Mac Slocum',
 'Mark Richards',
 'Martin Fowler',
 'Mary Poppendieck',
 'Mike Loukides',
 'Nat Torkington',
 'Pamela Rucker',
 'Peter Skomoroch and Mike Loukides',
 'Rachel Laycock and Neal Ford',
 'Rita J. King',
 'Roger Magoulas and Steve Swoyer',
 'Tim O’Reilly'}

### 5. Count the number of entries that are contained in this RSS feed.

In [13]:
# Report how many entries the parsed feed contains.
print('There are',len(radar.entries),'entries.')

There are 60 entries.


In [14]:
# Exploratory: top-level keys again (same call as in exercise 2).
radar.keys()

dict_keys(['feed', 'entries', 'bozo', 'headers', 'etag', 'updated', 'updated_parsed', 'href', 'status', 'encoding', 'version', 'namespaces'])

In [15]:
# Exploratory: title of the first entry in the list.
radar.entries[0].title

'Four short links: 8 April 2020'

In [16]:
# Exploratory: keys of one arbitrary entry.
# NOTE(review): index 50 is hard-coded and raises IndexError if the feed returns
# 50 or fewer entries.
radar.entries[50].keys()

dict_keys(['title', 'title_detail', 'links', 'link', 'comments', 'published', 'published_parsed', 'authors', 'author', 'author_detail', 'tags', 'id', 'guidislink', 'summary', 'summary_detail', 'content', 'wfw_commentrss', 'slash_comments', 'feedburner_origlink'])

### 6. Obtain a list of components (keys) available for an entry.

*Hint: Remember to index first before requesting the keys*

In [17]:
# Index a single entry first, then turn its keys view straight into a list.
entry_keys = list(radar.entries[0].keys())

entry_keys

['title',
 'title_detail',
 'links',
 'link',
 'comments',
 'published',
 'published_parsed',
 'authors',
 'author',
 'author_detail',
 'tags',
 'id',
 'guidislink',
 'summary',
 'summary_detail',
 'content',
 'wfw_commentrss',
 'slash_comments',
 'feedburner_origlink']

### 7. Extract a list of entry titles.

In [18]:
# Collect every entry's title by iterating the entries directly: no manual
# counter to maintain and no leftover index variable in the notebook namespace.
entry_titles = [entry.title for entry in radar.entries]

entry_titles

['Four short links: 8 April 2020',
 'Four short links: 7 April 2020',
 'Governance and Discovery',
 'Four short links: 6 April 2020',
 'Four short links: 3 April 2020',
 'Four short links: 2 April 2020',
 'Four short links: 1 April 2020',
 'Four short links: 31 March 2020',
 'What you need to know about product management for AI',
 'The unreasonable importance of data preparation',
 'Four short links: 24 March 2020',
 '3 ways to confront modern business challenges',
 'An enterprise vision is your company’s North Star',
 'Leaders need to mobilize change-ready workforces',
 'Great leaders inspire innovation and creativity from within their workforces',
 'Strong leaders forge an intersection of knowledge and experience',
 'Four short links: 23 March 2020',
 'Four short links: 20 March 2020',
 '6 trends framing the state of AI and ML',
 'Four short links: 19 March 2020',
 'It’s an unprecedented crisis: 8 things to do right now',
 'AI adoption in the enterprise 2020',
 'Four short links: 18

### 8. Calculate the percentage of "Four short links" entry titles.

In [19]:
# Share of entries whose title contains the phrase "Four short links".
# A plain substring test is enough for a literal phrase. The loop variable is
# deliberately not named `title`, which would overwrite the feed title stored
# under that name in an earlier cell.
contador = sum('Four short links' in entry_title for entry_title in entry_titles)

# Guard the division so an empty list of titles reports 0.0 instead of raising
# ZeroDivisionError.
porcentaje = contador / len(entry_titles) * 100 if entry_titles else 0.0
print(porcentaje,"% of the entry titles are from \"Four short links\" titles")

60.0 % of the entry titles are from "Four short links" titles


### 9. Create a Pandas data frame from the feed's entries.

In [20]:
import pandas as pd

In [21]:
# Build a frame from the list of entries, then preview the first two rows.
df = pd.DataFrame(radar.entries)
df.head(2)

Unnamed: 0,title,title_detail,links,link,comments,published,published_parsed,authors,author,author_detail,tags,id,guidislink,summary,summary_detail,content,wfw_commentrss,slash_comments,feedburner_origlink
0,Four short links: 8 April 2020,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/four-short-links...,"Wed, 08 Apr 2020 11:48:28 +0000","(2020, 4, 8, 11, 48, 28, 2, 99, 0)",[{'name': 'Nat Torkington'}],Nat Torkington,{'name': 'Nat Torkington'},"[{'term': 'Four Short Links', 'scheme': None, ...",https://www.oreilly.com/radar/?p=12611,False,System Design for Advanced Beginners &#8212; a...,"{'type': 'text/html', 'language': None, 'base'...","[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/four-short-links...,0,https://www.oreilly.com/radar/four-short-links...
1,Four short links: 7 April 2020,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/four-short-links...,"Tue, 07 Apr 2020 11:45:13 +0000","(2020, 4, 7, 11, 45, 13, 1, 98, 0)",[{'name': 'Nat Torkington'}],Nat Torkington,{'name': 'Nat Torkington'},"[{'term': 'Four Short Links', 'scheme': None, ...",https://www.oreilly.com/radar/?p=12606,False,locust &#8212; open source load testing tool: ...,"{'type': 'text/html', 'language': None, 'base'...","[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/four-short-links...,0,https://www.oreilly.com/radar/four-short-links...


### 10. Count the number of entries per author and sort them in descending order.

In [22]:
# Entries per author, largest count first. value_counts already sorts by count;
# the direction is spelled out because the exercise asks for descending order.
df['author'].value_counts(ascending=False)

Nat Torkington                       36
Roger Magoulas and Steve Swoyer       4
Jenn Webb                             4
Mike Loukides                         3
Mary Poppendieck                      1
Peter Skomoroch and Mike Loukides     1
Pamela Rucker                         1
Cynthia Owens                         1
Tim O’Reilly                          1
Martin Fowler                         1
Kai Holnes                            1
Hugo Bowne-Anderson                   1
Mark Richards                         1
Rita J. King                          1
George Fairbanks                      1
Rachel Laycock and Neal Ford          1
Mac Slocum                            1
Name: author, dtype: int64

### 11. Add a new column to the data frame that contains the length (number of characters) of each entry title. Return a data frame that contains the title, author, and title length of each entry in descending order (longest title length at the top).

In [23]:
# Length of each title in characters, stored as a new column on the entries frame.
df['tamaño'] = df['title'].str.len()

# Keep only the requested columns, then list the longest titles first.
final = df[['title', 'author', 'tamaño']]
final.sort_values(by='tamaño', ascending=False)

Unnamed: 0,title,author,tamaño
40,Highlights from the O’Reilly Software Architec...,Mac Slocum,78
14,Great leaders inspire innovation and creativit...,Jenn Webb,76
52,10 ways to get untapped talent in your organiz...,Pamela Rucker,65
15,Strong leaders forge an intersection of knowle...,Jenn Webb,64
20,It’s an unprecedented crisis: 8 things to do r...,Cynthia Owens,54
8,What you need to know about product management...,Peter Skomoroch and Mike Loukides,53
12,An enterprise vision is your company’s North Star,Jenn Webb,49
13,Leaders need to mobilize change-ready workforces,Jenn Webb,48
9,The unreasonable importance of data preparation,Hugo Bowne-Anderson,47
11,3 ways to confront modern business challenges,Rita J. King,45


### 12. Create a list of entry titles whose summary includes the phrase "machine learning."

In [24]:
def has(x):
    """Tell whether a text mentions the phrase "machine learning".

    The search uses a regular expression and ignores letter case, so
    "Machine Learning" at the start of a sentence is matched as well.

    Args:
        x: Text to inspect. Any non-string value (for example None or NaN)
            is treated as not containing the phrase.

    Returns:
        'Yes' if the phrase occurs anywhere in ``x``, otherwise 'No'.
    """
    # re.search raises TypeError on non-strings, so reject them up front.
    if not isinstance(x, str):
        return 'No'
    return 'Yes' if re.search(r'machine learning', x, flags=re.IGNORECASE) else 'No'


# Flag every entry with 'Yes'/'No' depending on what `has` reports for its summary.
df['Has Machine'] = df['summary'].map(has)

In [25]:
# Narrow the frame to the columns needed to review the matches.
doce = df[['title','Has Machine','summary']]

# Build the mask from the frame being filtered rather than from `df`, so the
# selection does not depend on the two frames sharing the same index.
doce_fil = doce[doce['Has Machine'] == 'Yes']

doce_fil

Unnamed: 0,title,Has Machine,summary
8,What you need to know about product management...,Yes,If you’re already a software product manager (...
58,Four short links: 13 February 2020,Yes,Ofcom To Regulate UK Internet &#8212; The regu...


In [26]:
# Full summary text for row label 8, one of the rows flagged 'Yes' in the table above.
# NOTE(review): the label is hard-coded from that output; the flagged rows may
# differ when the feed is fetched again.
df.summary.loc[8]

'If you’re already a software product manager (PM), you have a head start on becoming a PM for artificial intelligence (AI) or machine learning (ML). You already know the game and how it is played: you’re the coordinator who ties everything together, from the developers and designers to the executives. You’re responsible for the design, [&#8230;]'

In [27]:
# Full summary text for row label 58, the other row flagged 'Yes' in the table above.
# NOTE(review): the label is hard-coded from that output; the flagged rows may
# differ when the feed is fetched again.
df.summary.loc[58]

'Ofcom To Regulate UK Internet &#8212; The regulator will play a key role in enforcing a statutory duty of care to protect users from harmful and illegal terrorist and child abuse content. Turing &#8212; Julia library for fast machine learning. The Effects of Prize Structures on Innovative Performance &#8212; We find that a winner-takes-all compensation [&#8230;]'

In [28]:
# Titles of the rows kept in doce_fil, as a plain Python list.
listadoce = doce_fil.title.tolist()

In [29]:
listadoce  # titles of the entries whose summary (not title) contains "machine learning"

['What you need to know about product management for AI',
 'Four short links: 13 February 2020']