# Data extraction, cleaning, and preparation for analysis

## Scraping RSS feed for podcast Stuff You Missed in History Class

In [1]:
# Import the libraries and modules used throughout this notebook.
import requests  # HTTP client used to download the RSS feed
from bs4 import BeautifulSoup  # parses the feed's XML

from datetime import datetime  # parses each episode's publication date

import spacy
# Load spaCy's "en_core_web_sm" English pipeline once; it is reused below for named-entity recognition.
nlp = spacy.load("en_core_web_sm")

In [2]:
# URL of the podcast's RSS feed, from which the episode information is extracted.
# RSS feeds are written in XML; fields such as titles, descriptions, and publication dates can be read from them.
url = "https://feeds.megaphone.fm/stuffyoumissedinhistoryclass"

In [5]:
# Download the RSS feed and parse it as XML.
# The timeout keeps the cell from hanging indefinitely if the server never answers, and
# raise_for_status() stops on an HTTP error instead of silently parsing an error page.
r = requests.get(url, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.content, features='xml')

# Each <item> element in the feed is one episode.
episodes = soup.find_all('item')

# The feed is updated every time a new episode is added, so this cell can be re-run
# to pick up new episodes and the count below will grow.
len(episodes)

1836

# Data Cleaning and Preparation

## Labeling Each Episode
1. The function below labels each episode as Classic (repeat), Behind the Scenes, Interview, Unearthed, or Regular.
2. The label starts from a default of "Regular" (the base case) and is overwritten inside the for loop when the title matches one of the special types.

In [6]:
# Title markers that identify each special episode type, spelled the way they usually appear in the title.
search_classics = 'SYMHC Classics:'
search_interviews = 'Interview'
search_bts = 'Behind the Scenes Minis:'
search_unearthed = 'Unearthed!'

# All special-type markers gathered in one list so they can be looped over and searched.
special_types = [
    search_classics,
    search_interviews,
    search_bts,
    search_unearthed,
]

# The function below sets the default label to "Regular".
# Any episode that is not regular is sorted into one of the "special_types" categories.
# The default acts as the base case: it is kept unless a special type matches the title.
def get_special_type(title, types):
    """Return the episode-type label for ``title``.

    Each entry of ``types`` is tested as a prefix of ``title``. If several
    entries match, the one that comes last in ``types`` is returned; if none
    match, the label is "Regular".
    """
    matching_prefixes = [prefix for prefix in types if title.startswith(prefix)]
    if not matching_prefixes:
        return "Regular"
    return matching_prefixes[-1]


## Calculating the time of each episode in minutes
1. Episode durations are given in seconds in the feed.
2. They are converted to minutes and rounded to the nearest minute.

In [7]:
def calculate_duration(time_string):
    """Convert a duration given in seconds (as a string) to whole minutes.

    The result is rounded to the nearest minute with Python's built-in
    round(), so exact half minutes go to the nearest even minute.
    """
    seconds = int(time_string)
    return round(seconds / 60)

## Convert publication date to a more analyzable format
1. Create one column with the publication date converted to a datetime string (YYYY-MM-DD HH:MM:SS plus the UTC offset).
2. Create two new columns: one for just the year and one for just the month.

In [8]:
def convert_date(date_string):
    """Parse a publication date such as "Wed, 30 Mar 2022 13:00:00 +0000".

    Returns a timezone-aware ``datetime`` carrying the UTC offset given in
    the string.
    """
    rss_date_format = "%a, %d %b %Y %H:%M:%S %z"
    return datetime.strptime(date_string, rss_date_format)

## Function for looping through each episode to find geopolitical locations 

In [9]:
# See the "Spacy_Practice" notebook to see process for extracting named entities. 
# Looping through each episode description to extract countries, states, and cities. 
def spacy_gpe(texts):
    """Return the geopolitical entities (label ``GPE``) found in ``texts``.

    The text is run through the module-level ``nlp`` pipeline. The result is
    a list of ``[entity_text, entity_label]`` pairs in the order the entities
    appear in ``doc.ents``.
    """
    # 'Deblina' is a host's name that gets recognized as a place, so it is left out.
    return [
        [entity.text, entity.label_]
        for entity in nlp(texts).ents
        if entity.label_ == 'GPE' and entity.text != 'Deblina'
    ]

## Loop through each episode and clean it
1. Put all the episode details into a list of dictionaries (one dictionary per episode):
    *  Remove boilerplate that is irrelevant to the analysis from each description and store the result in a new column called 'description_clean'
    *  Append the 'type' label of the episode
    *  Find geopolitical named entities and append them
    

In [63]:
# Create a list that will hold one dictionary per podcast episode.
podcast_episodes = []

# Boilerplate sentences in the feed descriptions that are not pertinent to the analysis.
description_boilerplate = (
    "Learn more about your ad-choices at https://www.iheartpodcastnetwork.com",
    "See omnystudio.com/listener for privacy information.",
)

# Loop through each episode of the RSS feed to extract the following information.
for episode in episodes:

    # Dictionary holding every field extracted for this episode.
    podcast_episode = {}
    podcast_episode['title'] = episode.title.text
    podcast_episode['description'] = episode.description.text

    # Clean the description: drop surrounding newlines, remove the boilerplate
    # sentences, then trim whatever whitespace is left at either end.
    description_clean = podcast_episode['description'].strip('\n')
    for boilerplate in description_boilerplate:
        description_clean = description_clean.replace(boilerplate, "")
    podcast_episode['description_clean'] = description_clean.strip()

    # Parse the publication date once and derive the string, year, and month from it.
    podcast_episode['pubDate'] = episode.pubDate.text
    pub_date_parsed = convert_date(podcast_episode['pubDate'])
    podcast_episode['pubDate_converted'] = str(pub_date_parsed)
    podcast_episode['pubDate_year'] = str(pub_date_parsed.year)
    podcast_episode['pubDate_month'] = str(pub_date_parsed.month)

    podcast_episode['duration'] = episode.duration.text
    podcast_episode['duration_minutes'] = calculate_duration(podcast_episode['duration'])

    # The list of title markers is named `special_types` (plural); passing any other
    # name fails with a NameError when the notebook is run on a fresh kernel.
    podcast_episode['type'] = get_special_type(podcast_episode['title'], special_types)

    # Named-entity recognition: extract countries, states, and cities.
    podcast_episode['title_gpe_mentioned'] = spacy_gpe(podcast_episode['title'])
    podcast_episode['description_gpe_mentioned'] = spacy_gpe(podcast_episode['description_clean'])

    # Append this episode's information to the list created before the loop.
    podcast_episodes.append(podcast_episode)

In [64]:
# Preview the first three episode records to check the extracted fields.
podcast_episodes[:3]

[{'title': 'Women’s March on Pretoria, 1956',
  'description': 'This 1956 march was a protest against pass laws that were part of South Africa’s system of apartheid – and specifically the requirement that women carry passes. The protest was simultaneously part of the anti-apartheid movement in South Africa, and the movement for women’s rights.\xa0\n\nResearch:\n"Apartheid." Gale World History Online Collection, Gale, 2021. Gale In Context: World History, link.gale.com/apps/doc/NUBLZL400705235/WHIC?u=mlin_n_melpub&sid=bookmark-WHIC&xid=a66fcd94\nInternational Women’s Day. “National Women\'s Day in South Africa is a powerful day for equal rights.” https://www.internationalwomensday.com/Missions/15556/National-Women-s-Day-in-South-Africa-is-a-powerful-day-for-equal-rights\nAkpan, Idara . “The 1956 Women’s March in Pretoria.” South African History Online. https://www.sahistory.org.za/article/1956-womens-march-pretoria\nSouth African History Online. “The 1956 Women’s March, Pretoria, 9 Augu

## Export data to json file 

In [65]:
import json

# Dump every episode record to rssdump.json.
# indent=4 adds white space so the file is easier to read.
with open("rssdump.json", 'w') as json_file:
    json_file.write(json.dumps(podcast_episodes, indent=4))

In [66]:
import pandas as pd

# Build the analysis table from the episode records, keeping only the columns listed
# below (in this order); the remaining fields are not carried into the DataFrame.
df = pd.DataFrame(
    podcast_episodes,
    columns=[
        'title',
        'description_clean',
        'pubDate_year',
        'pubDate_month',
        'duration_minutes',
        'type',
        'title_gpe_mentioned',
        'description_gpe_mentioned',
    ],
)

In [67]:
# Show the first rows of the DataFrame as a quick sanity check.
df.head()

Unnamed: 0,title,description_clean,pubDate_year,pubDate_month,duration_minutes,type,title_gpe_mentioned,description_gpe_mentioned
0,"Women’s March on Pretoria, 1956",This 1956 march was a protest against pass law...,2022,3,34,Regular,"[[Pretoria, GPE]]","[[South Africa’s, GPE], [South Africa, GPE], [..."
1,"Thomas Hardy, Emma Gifford and Florence Dugdale","Once you know about Hardy’s life, it's natural...",2022,3,39,Regular,[],"[[Angelique, GPE]]"
2,SYMHC Classics: 6888th Central Postal Director...,This 2019 episode covers the 6888th Central Po...,2022,3,34,SYMHC Classics:,[],"[[U.S., GPE]]"
3,Behind the Scenes Minis: Maria and Bruno,Tracy shares how she learned about Maria Gertr...,2022,3,19,Behind the Scenes Minis:,[],[]
4,Giordano Bruno,"Bruno was a mathematician, philosopher, astron...",2022,3,40,Regular,[],"[[Italy, GPE], [Martinez, GPE]]"


In [68]:
# Summarize the columns: dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1833 entries, 0 to 1832
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   title                      1833 non-null   object
 1   description_clean          1833 non-null   object
 2   pubDate_year               1833 non-null   object
 3   pubDate_month              1833 non-null   object
 4   duration_minutes           1833 non-null   int64 
 5   type                       1833 non-null   object
 6   title_gpe_mentioned        1833 non-null   object
 7   description_gpe_mentioned  1833 non-null   object
dtypes: int64(1), object(7)
memory usage: 114.7+ KB


## Export the data to an Excel file for easy hand-coding of Western vs. non-Western countries.

In [70]:
# Write the DataFrame to an Excel workbook (relative path, so it lands in the working directory).
df.to_excel('SYMIHC_country_data.xlsx')

## Additional cleaning/prepping tasks for the future:
1. Clean the descriptions to remove the research portion (not present on all episodes)
2. Include NORP entities (nationalities, religious or political groups) in the analysis to further improve country and region accuracy
3. Include pronouns and gender-referential inference
4. Include time/era