# Scraping RSS feed for podcast Stuff You Missed in History Class

In [36]:
import requests
from bs4 import BeautifulSoup 

In [37]:
url = "https://feeds.megaphone.fm/stuffyoumissedinhistoryclass"

In [38]:
r = requests.get(url)
soup = BeautifulSoup(r.content, features='xml')

In [39]:
episodes = soup.findAll('item') 

In [40]:
episodes[0]

<item>
<title>Behind the Scenes Minis: William Apess</title>
<itunes:title>Behind the Scenes Minis: William Apess</itunes:title>
<description>Tracy and Holly talk about Tracy’s research process for the William Apess episodes, and how much of his writing she wanted to include in the episode. And that’s because his work is still deeply relevant.

See omnystudio.com/listener for privacy information.</description>
<content:encoded>&lt;p&gt;Tracy and Holly talk about Tracy&amp;rsquo;s research process for the William Apess episodes, and how much of his writing she wanted to include in the episode. And that&amp;rsquo;s because his work is still deeply relevant.&lt;/p&gt;&lt;p&gt;See &lt;a href="https://omnystudio.com/listener"&gt;omnystudio.com/listener&lt;/a&gt; for privacy information.&lt;/p&gt;</content:encoded>
<itunes:summary>Tracy and Holly talk about Tracy’s research process for the William Apess episodes, and how much of his writing she wanted to include in the episode. And that’s be

In [6]:
len(episodes)

1818

# Cleaning the Data 
1. This is the function for labeling each episode as Classic (repeat), Behind the Scenes, Interviews, Unearthed or Regular episodes
2. Using the concept of base case to write the for loop in the function 

In [51]:
search_classics = 'SYMHC Classics:'
search_interviews = 'Interview'
search_bts = 'Behind the Scenes Minis:'
search_unearthed = 'Unearthed!'


special_types = [search_classics, search_interviews, search_bts, search_unearthed]

#This is code to set the defaut to "Regular" 
#Anything that is not regular will be sorted into one of the "special_types" categories
#This is an example of using base case

def get_special_type(title, types):
  special_type = "Regular" 
  for t in types:
    if title.startswith(t):
      special_type = t
    return special_type 


# Looping through each episode and cleaning it.
1. Remove information from descriptions that is impertinent
2. Putting all the episode details into a list and it's a list of dictionaries 

In [52]:
podcast_episodes = []

for episode in episodes:
    podcast_episode = {}
    podcast_episode['title'] = episode.title.text 
    podcast_episode['description'] = episode.description.text
    podcast_episode['description_clean'] = episode.description.text.strip('\n').replace("Learn more about your ad-choices at https://www.iheartpodcastnetwork.com","").replace("See omnystudio.com/listener for privacy information.","").strip()
    podcast_episode['pubDate'] = episode.pubDate.text
    podcast_episode['duration'] = episode.duration.text
    podcast_episode['type'] = get_special_type(podcast_episode['title'],special_types)
    #creating a list of types of episodes appended to the greater list. 
        
    podcast_episodes.append(podcast_episode)
    
#how to get rid of extra parts of the description? 
#how to scrape parts of the date away? 
#split() text on certain deliminators. Use pop() to get rid of last item. 
#find index position of substring, slice description from the beginning --. 
#find the first instance of "learn more about...
#replace() put the string in there. ,empty string. 

#for for the \xa0\n and split there. Do it in a for loop. 
#if any of them have don't have "learn more about...", print it outthen explore more


In [53]:
podcast_episodes[:10]

[{'title': 'Behind the Scenes Minis: William Apess',
  'description': 'Tracy and Holly talk about Tracy’s research process for the William Apess episodes, and how much of his writing she wanted to include in the episode. And that’s because his work is still deeply relevant.\n\nSee omnystudio.com/listener for privacy information.',
  'description_clean': 'Tracy and Holly talk about Tracy’s research process for the William Apess episodes, and how much of his writing she wanted to include in the episode. And that’s because his work is still deeply relevant.',
  'pubDate': 'Fri, 04 Mar 2022 14:00:00 +0000',
  'duration': '508',
  'type': 'Regular'},
 {'title': 'William Apess and the Mashpee Revolt (Pt. 2)',
  'description': 'Apess’s religious work and writing consistently stressed the inherent humanity and worth of Indigenous people, but in the later years of his career he also became involved in more direct activism in Mashpee, Massachusetts.\n\nSee omnystudio.com/listener for privacy inf

# Export data to json file 

In [30]:
import json

with open("rssdump.json", 'w') as outfile:
    json.dump(podcast_episodes, outfile, indent=4)
    #indent adds white space to make it easier to read 

In [34]:
import pandas as pd
df = pd.DataFrame(podcast_episodes, columns=['title','description_clean','pubDate','duration','type'])

In [35]:
df.head()

Unnamed: 0,title,description_clean,pubDate,duration,type
0,William Apess and the Mashpee Revolt (Pt. 2),Apess’s religious work and writing consistentl...,"Wed, 02 Mar 2022 17:35:00 +0000",2172,Regular
1,The Autobiography of William Apess (Pt. 1),Minister William Apess is often described as t...,"Mon, 28 Feb 2022 11:00:00 +0000",1954,Regular
2,Cyrano the Movie,Holly and Tracy share interviews with some of ...,"Sun, 27 Feb 2022 14:00:00 +0000",1234,Regular
3,Introducing: Big Brother,"Hi, Stuff You Missed In History Class fans! Th...","Sun, 27 Feb 2022 09:00:00 +0000",173,Regular
4,SYMHC Classics: Edmonia Lewis,This 2017 episode covers an American sculptor ...,"Sat, 26 Feb 2022 14:00:00 +0000",1703,SYMHC Classics:


In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1818 entries, 0 to 1817
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   title              1818 non-null   object
 1   description_clean  1818 non-null   object
 2   pubDate            1818 non-null   object
 3   duration           1818 non-null   object
 4   type               1818 non-null   object
dtypes: object(5)
memory usage: 71.1+ KB
