In [None]:
#class for getting & parsing RSS feeds
import feedparser as fp
import pandas as pd
import html
from nltk.tokenize import TreebankWordTokenizer,WhitespaceTokenizer
import string
from nltk.corpus import stopwords
import re
import time
import time
import numpy as np

In [None]:
#The FeedCrawler class interfaces w/ feedparser to comb RSS feeds and store them in
#the desired location/format.


class NoMoreFeedError(Exception):
    """Raised by FeedCrawler.getFeed once every row of the podcast table has been requested."""


class FeedCrawler():
    """Request podcast RSS feeds in batches, parse them and pickle the results.

    Typical cycle: ``getFeed(n)`` -> ``parseFeeds()`` -> ``save()``; repeat until
    ``getFeed`` raises ``NoMoreFeedError``.

    Attributes:
        podcast_df: table with one row per podcast; must have a 'feedUrl' column.
        foutloc: output location prefix. It is concatenated directly with the file
            names, so it must end with a path separator.
        counter: positional index (into ``podcast_df``) of the next feed to request.
        feeds: current batch. ``(url, parsed_feed)`` tuples after ``getFeed``,
            lists of ``[content, title, author]`` after ``parseFeeds``.
        state: 'raw' until ``parseFeeds`` runs, then 'parsed'; reset to 'raw' by ``save``.
        filecounter: number of batch files written so far (used in the file name).
        identifier: string that tags every output file of one crawl.
    """

    def __init__(self,podcast_df,foutloc,identifier=None):
        """Set up a crawler over ``podcast_df`` that writes under ``foutloc``.

        Args:
            podcast_df: DataFrame with a 'feedUrl' column.
            foutloc: output prefix (should end with a path separator).
            identifier: tag for the output files. When omitted, the current Unix
                time (whole seconds, as a string) is used.
        """
        self.foutloc = foutloc
        self.podcast_df = podcast_df
        self.counter = 0
        self.feeds = []
        self.feedctr = []
        # Nothing has been parsed yet; matches the value _resetFeeds assigns.
        self.state = 'raw'
        self.filecounter = 0
        # Computed per instance: a default written in the signature would be
        # evaluated only once, when the class is defined.
        if identifier is None:
            identifier = str(int(time.time()))
        self.identifier = identifier

    def getNcasts(self):
        """Return the number of podcasts (rows) in the podcast table."""
        return self.podcast_df.shape[0]

    def getFeed(self,n):
        """Request the next ``n`` feeds and append them to ``self.feeds``.

        Raises:
            NoMoreFeedError: as soon as the table is exhausted. Feeds fetched
                earlier in the same call stay in ``self.feeds`` so the caller
                can still parse and save them.
        """
        for _ in range(n):
            # >= (not >): valid positions are 0 .. nrows-1.
            if self.counter >= self.getNcasts():
                raise NoMoreFeedError
            url = self.podcast_df.iloc[self.counter]['feedUrl']
            self.feeds.append(self._feed_request(url))
            self.counter += 1

    def parseFeeds(self):
        """Replace each raw feed in ``self.feeds`` by its list of episodes.

        Every episode becomes ``[content value, title, author]``. A feed is
        skipped entirely when its request failed (parsed part is ``None``) or
        when any of its entries lacks one of those fields.
        """
        parsed_feeds = []
        skipped = 0
        for f in self.feeds:
            parsed = f[1]
            if parsed is None:
                # _feed_request returns (url, None) when the request failed.
                skipped += 1
                continue
            try:
                episodes = [[entry['content'][0]['value'],
                             entry['title'],
                             entry['author']]
                            for entry in parsed['entries']]
            except (KeyError, IndexError):
                # Missing field (or empty 'content' list) in at least one entry.
                skipped += 1
                continue
            parsed_feeds.append(episodes)
        self.feeds = parsed_feeds
        self.state = 'parsed'
        print('Parsed!')
        if skipped:
            print('Skipped %d feeds (failed request or missing fields)' % skipped)

    def _saveFeeds(self):
        """Pickle the current batch to feeds_<filecounter>_<identifier>.pkl."""
        print('Saving %d feeds...' % len(self.feeds))
        fname = 'feeds_'+str(self.filecounter)+'_'+str(self.identifier)+'.pkl'
        with open(self.foutloc+fname,'wb') as fid:
            pickle.dump(self.feeds,fid)
            self.filecounter += 1

    def _saveProgress(self):
        """Pickle [identifier, filecounter, counter] to progress.pkl."""
        with open(self.foutloc+'progress.pkl','wb') as fid:
            pickle.dump([self.identifier,self.filecounter,self.counter],fid)

    def save(self):
        """Write the current batch and the progress file, then clear the batch."""
        self._saveFeeds()
        self._saveProgress()
        self._resetFeeds()

    def loadcounters(self):
        """Restore identifier, filecounter and counter from progress.pkl.

        The file is expected to be one written by ``_saveProgress``; unpickling
        a file from an untrusted source can execute arbitrary code.
        """
        with open(self.foutloc+'progress.pkl','rb') as fid:
            progress = pickle.load(fid)
            self.identifier = progress[0]
            self.filecounter = progress[1]
            self.counter = progress[2]

    def _resetFeeds(self):
        """Drop the current batch and mark the crawler as holding raw feeds."""
        self.feeds = []
        self.state = 'raw'

    def resetCounter(self):
        """Restart crawling from the first row of the podcast table."""
        self.counter = 0

    @classmethod
    def _feed_request(cls,url):
        """Return ``(url, parsed_feed)``; ``parsed_feed`` is None if parsing raised."""
        try:
            return (url,fp.parse(url))
        except Exception:
            # Best effort: one bad feed must not stop the crawl. Unlike a bare
            # except, KeyboardInterrupt/SystemExit still propagate.
            print('Error on ' + str(url))
            return (url,None)

In [None]:
#Load in putative podcasts, turn it into a dataframe. 
import pickle
import pandas as pd


#load in itunes_request_db
# NOTE: this unpickles a local file; only do so with files you created yourself.
floc = '//Users/Jay/AnacondaProjects/plutarch/get_episodes/'
with open(floc+'raw_itunes_requests.pkl','rb') as fid:
    raw_itunes_requests = pickle.load(fid)

#turn everything into a pandas dataframe
bads = []       # not used in the visible cells
cnames = ['']   # not used in the visible cells
# Keep only records whose 'kind' is 'podcast'. .get() is used so that a record
# without a 'kind' key is skipped rather than raising KeyError.
formatted_results = []
for rir in raw_itunes_requests:
    for p in rir.json()['results']:
        if p.get('kind') == 'podcast':
            formatted_results.append(p)

podcast_df = pd.DataFrame(formatted_results)
# Drop podcasts that have no RSS feed URL; the original row labels are kept.
podcast_df = podcast_df.loc[podcast_df['feedUrl'].notnull()]

In [None]:
#Crawl & scrape RSS feeds
#We will visit every RSS feed and scrape all the episode data.
import numpy as np
import time
import socket
import feedparser as fp
import requests

MAX_REQUEST_DURATION = 10 #seconds
# Default timeout for every socket created from here on, so that a stalled
# connection cannot block the crawl indefinitely.
socket.setdefaulttimeout(MAX_REQUEST_DURATION)

step_size = 10#number of feeds to get/save at once
foutloc = '/Users/Jay/AnacondaProjects/plutarch/get_episodes/preprocessed/'
crawler = FeedCrawler(podcast_df,foutloc)

# Resume a previous crawl if a progress file exists; otherwise start from zero.
try:
    crawler.loadcounters()
    print('initialized! counter = %d,filecounter = %d,id = %s' %
          (crawler.counter,crawler.filecounter,crawler.identifier))
except Exception as err:
    # Exception (not a bare except) so Ctrl-C still interrupts the cell, and
    # the reason is reported instead of being hidden.
    print("could not initialize")
    print('  reason: %s: %s' % (type(err).__name__, err))
    init = 0  # not read anywhere in the visible cells

flag = 1
while(flag):
    start_time = time.time()
    try:
        crawler.getFeed(step_size)
        crawler.parseFeeds()
        crawler.save()
    except NoMoreFeedError:
        # Only getFeed raises this, so anything in crawler.feeds is a final,
        # partially filled batch that has not been parsed or saved yet.
        if crawler.feeds:
            crawler.parseFeeds()
            crawler.save()
        flag = 0
        print("Job's done!")
    stop_time = time.time()
    duration = stop_time - start_time
    print('Counter=%d/%d (duration=%.2f, step=%d)' % (crawler.counter,crawler.getNcasts(),duration,step_size))

In [1]:
#Load one pickled file of scraped RSS feeds back in so it can be inspected.
import pickle

floc = '/Users/Jay/AnacondaProjects/plutarch/get_episodes/angie/'
# Directory prefix and file name are joined by plain string formatting.
with open('%s%s' % (floc, 'feeds_0_1538436207.pkl'), 'rb') as fid:
    prog = pickle.load(fid)

In [3]:
# Show element 0 of element 1 of the first item in the loaded file.
prog[0][1][0]

{'title': 'Peg Entwistle, Ghost of Hollywood',
 'title_detail': {'type': 'text/plain',
  'language': None,
  'base': 'https://feeds.megaphone.fm/stuffyoumissedinhistoryclass',
  'value': 'Peg Entwistle, Ghost of Hollywood'},
 'summary': "Her story is often told in a sort of sloppy shorthand: She went to Los Angeles to become an actress, failed, and then became desperate. But that isn’t a really accurate picture of Peg Entwistle at all.&nbsp;<br><br><br><br><br><br><br><br>Learn more about advertising on the HowStuffWorks podcasts at <a href='http://www.howstuffworks.com/advertisers.htm'>www.howstuffworks.com/advertisers.htm</a><br><br>And to learn about your ad choices when listening to podcasts, visit <a href='https://www.howstuffworks.com/privacy.htm#ad-choices'>https://www.howstuffworks.com/privacy.htm#ad-choices</a>",
 'summary_detail': {'type': 'text/html',
  'language': None,
  'base': 'https://feeds.megaphone.fm/stuffyoumissedinhistoryclass',
  'value': "Her story is often told 

In [None]:
import pickle

# Collect one record per episode; the dataframe is built once at the end.
# (Growing a DataFrame row by row copies it on every step, and
# DataFrame.append no longer exists in pandas >= 2.0.)
episode_records = []

# Load in each feeds file holding the episode descriptions (feeds_0 to feeds_22)
floc = '/Users/Jay/AnacondaProjects/plutarch/get_episodes/preprocessed/'
for i in range(23):
    feeds_fname = 'feeds_'+str(i)+'_1538437135.pkl'
    with open(floc+feeds_fname,'rb') as fid:
        prog = pickle.load(fid)
    print('Parsing file: '+feeds_fname+' of 22')
    # Each file holds a list of feeds; each feed is a list of 3-item episodes.
    # NOTE(review): item 2 is stored as 'pod_title', but FeedCrawler.parseFeeds
    # puts the entry's 'author' there - confirm that this is the intended meaning.
    for feed in prog:
        for episode in feed:
            episode_records.append({'description':episode[0],
                                    'ep_title':episode[1],
                                    'pod_title':episode[2]})

# Explicit columns keep the frame well-formed even when no episode was found.
episode_df = pd.DataFrame(episode_records,
                          columns=['description','ep_title','pod_title'])

In [None]:
# Number of distinct values in the 'pod_title' column.
episode_df['pod_title'].nunique()

In [None]:
# Write the combined episode table to a CSV file under the preprocessed folder.
# to_csv is called with its defaults, so the row index is written as well.
floc = '/Users/Jay/AnacondaProjects/plutarch/get_episodes/preprocessed/'
fname = 'episode_df.csv'
episode_df.to_csv(''.join((floc, fname)))

In [None]:
# Reload the saved episode table and drop episodes with a repeated description.
floc = '/Users/Jay/AnacondaProjects/plutarch/get_episodes/preprocessed/'
fname = 'episode_df.csv'
# index_col=0: the file was written by to_csv with its default index=True, so
# the first column is the saved row index; without this it would come back as
# an extra 'Unnamed: 0' data column.
episode_df = pd.read_csv(floc+fname, encoding='utf-8', index_col=0)
# Reassign instead of inplace=True; the first occurrence of each description is kept.
episode_df = episode_df.drop_duplicates(['description'])


In [None]:
# Display the episode dataframe.
episode_df