# Working with RSS Feeds Lab

Complete the following set of exercises to solidify your knowledge of parsing RSS feeds and extracting information from them.

In [1]:
pip install feedparser

Note: you may need to restart the kernel to use updated packages.


In [2]:
import feedparser
import requests
import xmltodict
import re
import pandas as pd

### 1. Use feedparser to parse the following RSS feed URL.

In [3]:
url = 'http://feeds.feedburner.com/oreilly/radar/atom'

In [4]:
reddit = feedparser.parse(url)


### 2. Obtain a list of components (keys) that are available for this feed.

In [5]:
reddit.keys()

dict_keys(['bozo', 'entries', 'feed', 'headers', 'etag', 'updated', 'updated_parsed', 'href', 'status', 'encoding', 'version', 'namespaces'])

### 3. Obtain a list of components (keys) that are available for the *feed* component of this RSS feed.

In [6]:
reddit.feed.keys()

dict_keys(['title', 'title_detail', 'links', 'link', 'subtitle', 'subtitle_detail', 'updated', 'updated_parsed', 'language', 'sy_updateperiod', 'sy_updatefrequency', 'generator_detail', 'generator', 'feedburner_info', 'geo_lat', 'geo_long', 'feedburner_emailserviceid', 'feedburner_feedburnerhostname'])

### 4. Extract and print the feed title, subtitle, author, and link.

In [7]:
title = reddit.feed.title
subtitle = reddit.feed.subtitle
link = reddit.feed.link
author = reddit.entries[0].author
print (f'{title}\n{subtitle}\n{link}\n{author}')

Radar
Now, next, and beyond: Tracking need-to-know trends at the intersection of business and technology
https://www.oreilly.com/radar
Nat Torkington


### 5. Count the number of entries that are contained in this RSS feed.

In [8]:
len(reddit.entries)

60

In [9]:
reddit.entries[0].keys()

dict_keys(['title', 'title_detail', 'links', 'link', 'comments', 'published', 'published_parsed', 'authors', 'author', 'author_detail', 'tags', 'id', 'guidislink', 'summary', 'summary_detail', 'content', 'wfw_commentrss', 'slash_comments', 'feedburner_origlink'])

### 6. Obtain a list of components (keys) available for an entry.

*Hint: Remember to index first before requesting the keys*

In [10]:
keys = [reddit.entries[i].keys() for i in range(len(reddit.entries))]
print(keys[0])

dict_keys(['title', 'title_detail', 'links', 'link', 'comments', 'published', 'published_parsed', 'authors', 'author', 'author_detail', 'tags', 'id', 'guidislink', 'summary', 'summary_detail', 'content', 'wfw_commentrss', 'slash_comments', 'feedburner_origlink'])


### 7. Extract a list of entry titles.

In [11]:
ti = [reddit.entries[i].title for i in range(len(reddit.entries))]
print(ti[:5])

['Four short links: 28 Oct 2020', 'Our Favorite Questions', 'Four short links: 21 Oct 2020', 'Four Short Links: 16 October 2020', 'Four short links: 14 Oct 2020']


### 8. Calculate the percentage of "Four short links" entry titles.

In [43]:
def len_link(lista):
    count = 0
    for i in lista: 
        if 'Four short links' in i:
            count +=1
    return f' {count/len(lista)*100} %'

In [44]:
len_link(ti)

' 68.33333333333333 %'

### 9. Create a Pandas data frame from the feed's entries.

In [16]:
import pandas as pd
import requests
import xmltodict

In [51]:
reddit.entries[0].keys()

dict_keys(['title', 'title_detail', 'links', 'link', 'comments', 'published', 'published_parsed', 'authors', 'author', 'author_detail', 'tags', 'id', 'guidislink', 'summary', 'summary_detail', 'content', 'wfw_commentrss', 'slash_comments', 'feedburner_origlink'])

In [53]:
import pandas as pd
 
df = pd.DataFrame(reddit.entries)
df.head()

Unnamed: 0,title,title_detail,links,link,comments,published,published_parsed,authors,author,author_detail,tags,id,guidislink,summary,summary_detail,content,wfw_commentrss,slash_comments,feedburner_origlink
0,Four short links: 28 Oct 2020,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/four-short-links...,"Wed, 28 Oct 2020 11:39:13 +0000","(2020, 10, 28, 11, 39, 13, 2, 302, 0)",[{'name': 'Nat Torkington'}],Nat Torkington,{'name': 'Nat Torkington'},"[{'term': 'Four Short Links', 'scheme': None, ...",https://www.oreilly.com/radar/?p=13382,False,"Phantom of the ADAS &#8212; In this paper, we ...","{'type': 'text/html', 'language': None, 'base'...","[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/four-short-links...,0,https://www.oreilly.com/radar/four-short-links...
1,Our Favorite Questions,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/our-favorite-que...,"Thu, 22 Oct 2020 14:33:17 +0000","(2020, 10, 22, 14, 33, 17, 3, 296, 0)","[{'name': 'Q Ethan McCallum, Chris Butler and ...","Q Ethan McCallum, Chris Butler and Shane Glynn","{'name': 'Q Ethan McCallum, Chris Butler and S...","[{'term': 'AI & ML', 'scheme': None, 'label': ...",https://www.oreilly.com/radar/?p=13374,False,"&#8220;On peut interroger n&#8217;importe qui,...","{'type': 'text/html', 'language': None, 'base'...","[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/our-favorite-que...,0,https://www.oreilly.com/radar/our-favorite-que...
2,Four short links: 21 Oct 2020,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/four-short-links...,"Wed, 21 Oct 2020 11:34:42 +0000","(2020, 10, 21, 11, 34, 42, 2, 295, 0)",[{'name': 'Nat Torkington'}],Nat Torkington,{'name': 'Nat Torkington'},"[{'term': 'Four Short Links', 'scheme': None, ...",https://www.oreilly.com/radar/?p=13376,False,Justice Department Antitrust Filing Against Go...,"{'type': 'text/html', 'language': None, 'base'...","[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/four-short-links...,0,https://www.oreilly.com/radar/four-short-links...
3,Four Short Links: 16 October 2020,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/four-short-links...,"Fri, 16 Oct 2020 11:21:43 +0000","(2020, 10, 16, 11, 21, 43, 4, 290, 0)",[{}],,,,https://www.oreilly.com/radar/?p=13371,False,Automerge &#8212; (Github) Data structure libr...,"{'type': 'text/html', 'language': None, 'base'...","[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/four-short-links...,0,https://www.oreilly.com/radar/four-short-links...
4,Four short links: 14 Oct 2020,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/four-short-links...,"Wed, 14 Oct 2020 11:46:08 +0000","(2020, 10, 14, 11, 46, 8, 2, 288, 0)",[{'name': 'Nat Torkington'}],Nat Torkington,{'name': 'Nat Torkington'},"[{'term': 'Four Short Links', 'scheme': None, ...",https://www.oreilly.com/radar/?p=13367,False,Data Organization in Spreadsheets &#8212; Focu...,"{'type': 'text/html', 'language': None, 'base'...","[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/four-short-links...,0,https://www.oreilly.com/radar/four-short-links...


### 10. Count the number of entries per author and sort them in descending order.

In [115]:
authors = df.groupby('author', as_index = False).agg({'title':'count'})
authors.columns = ['author', 'entries']
authors.sort_values('entries', ascending=False)
#DataFrame.groupby(by=None, axis=0, level=None, as_index=True, sort=True, 
#group_keys=True, squeeze=<object object>, observed=False, dropna=True)
#DataFrame.agg(func=None, axis=0, *args, **kwargs)

Unnamed: 0,author,entries
7,Nat Torkington,42
5,Mike Loukides,9
0,,1
1,Alex Castrounis,1
2,Justin Norman and Mike Loukides,1
3,"Justin Norman, Peter Skomoroch and Mike Loukides",1
4,Matthew Rocklin and Hugo Bowne-Anderson,1
6,Mike Loukides and Steve Swoyer,1
8,Q Ethan McCallum and Mike Loukides,1
9,"Q Ethan McCallum, Chris Butler and Shane Glynn",1


### 11. Add a new column to the data frame that contains the length (number of characters) of each entry title. Return a data frame that contains the title, author, and title length of each entry in descending order (longest title length at the top).

In [59]:
df['title_length'] = df['title'].apply(lambda x: len(x))
df[['title', 'author', 'title_length']].sort_values('title_length', ascending=False)


Unnamed: 0,title,author,title_length
27,Why Best-of-Breed is a Better Choice than All-...,Matthew Rocklin and Hugo Bowne-Anderson,79
54,Automated Coding and the Future of Programming,Mike Loukides,46
5,AI Product Management After Deployment,Justin Norman and Mike Loukides,38
21,Radar trends to watch: September 2020,Mike Loukides,37
29,The Least Liked Programming Languages,Mike Loukides,37
10,Radar trends to watch: October 2020,Mike Loukides,35
12,Four short links: 25 September 2020,Nat Torkington,35
33,Radar trends to watch: August 2020,Mike Loukides,34
20,Four short links: 2 September 2020,Nat Torkington,34
19,Four short links: 4 September 2020,Nat Torkington,34


### 12. Create a list of entry titles whose summary includes the phrase "machine learning."

In [105]:
#df.head(5)

In [100]:
lista = []

In [87]:
df['summary'] = df['summary'].apply(lambda x: x.lower())

In [97]:
df['summary']

'data organization in spreadsheets &#8212; focusing on the data entry and storage aspects, this article offers practical recommendations for organizing spreadsheet data to reduce errors and ease later analyses. the basic principles are: be consistent, write dates like yyyy-mm-dd, do not leave any cells empty, put just one thing in a cell, organize the data [&#8230;]'

In [101]:
for i,e in enumerate(df['summary']):
    if 'machine learning' in e:
        lista.append(df['title'][i])
        print(i,e)
lista
      


25 the 212 story tower that isn&#8217;t in suburban melbourne &#8212; a typo in a open street map submission becomes a surprising monolith in microsoft flight simulator. fairness in machine learning &#8212; draft text for a book on the subject. the social architecture of impactful communities &#8212; a really good set of models for communities. individuals [&#8230;]
56 when data is messy &#8212; i love stories that illustrate the ways machine learning can draw the wrong conclusions. researchers at the university of tuebingen trained a neural net to recognize images, and then had it point out which parts of the images were the most important for its decision. when they asked it to [&#8230;]


['Four short links: 21 August 2020', 'Four short links: 8 July 2020']

In [103]:
'machine learning'  in df['summary'][25] 

True

In [104]:
'machine learning'  in df['summary'][56] 

True