In [None]:
#Grab the names and links for each podcast!
import pandas as pd
import requests
import bs4 #BeautifulSoup
import time
import re
import numpy as np

podcast_leading_character = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ*'#Here are possible leading characters for podcast names


#gets the titles of all podcasts on an iTunes podcast category page
def get_podcast_titles_iTunes(soup,target_url = 'https://itunes.apple.com/us/podcast/'):
    
    links = []
    titles = []
    ids = []
    for link in soup.find_all('a'):
        if target_url in link.get('href'):
            links.append(link.get('href'))
            titles.append(link.contents[0])
            ids.append(link.get('href').split("/id",1)[1])
    return links, titles, ids

#count pages on an iTunes podcast category page
def count_pages_iTunes(soup):
    flag = 0
    prev_link = None
    for link in soup.find_all('a'):
        #print(link.contents)
        if link.contents[0]=='#':
            flag = 1
        if(flag):
            if(link.contents[0].isdigit()):
                prev_link = link
            else:
                if(link.contents[0]!='#'):
                    if(prev_link is None):
                        return 1
                    else:
                        return int(prev_link.contents[0])
    raise ValueError()
    
#safe request that keeps on trying if it gets a 403.
def persistent_request(url,retry_time = 60):
    flag = 1
    while(flag):
        page = requests.get(url)
        print(page)
        if(page.status_code!=200):
            flag = 1
            print('Error '+str(page.status_code)+'. Retrying in '+str(retry_time)+' sec')
            time.sleep(retry_time)
        else:
            flag = 0
    return page

inter_query_interval = 0 #seconds
inter_query_variance = 0


#store the podcast name, id, id url, time accessed, and url from where it was obtained. 
url_data = []

In [None]:
#Grab the names of all of the podcasts from iTunes
#IMPORTANT: these titles are truncated. When searching for id, grab both feed and title

#iterate through leading characters
for c in podcast_leading_character:
    print('######Accessing character '+c)
    #base_url = ('https://itunes.apple.com/us/genre/podcasts-society-culture-history/id1462?mt=2&letter='+c)
    base_url = re.sub('letter=.','letter='+c,'https://itunes.apple.com/us/genre/podcasts-society-culture-history/id1462')
    print(base_url)
    #request first & count pages
    request_time = time.time()
    query_time = np.random.randn()*inter_query_variance+inter_query_interval
    page = persistent_request(base_url)
    soup = bs4.BeautifulSoup(page.text,"html5lib")
    npages = count_pages_iTunes(soup)

    #iterate through page number
    for k in range(1,npages+1):
        print('###Accessing page #'+str(k)+'/'+str(npages))
        if(k!=1):
            #generate url
            url = re.sub('page=.','page='+str(k),hlink)
            request_time = time.time()
            page = persistent_request(url)
            soup = bs4.BeautifulSoup(page.text,"html5lib")
        else:
            url = base_url
        #get podcast titles
        links,titles,ids = get_podcast_titles_iTunes(soup)

        #store the podcast name, id, id url, time accessed, and url from where it was obtained. 
        for x in list(zip(titles,ids,links)):
            url_data.append([x[0],x[1],x[2],request_time,url])

        #file them away
        iter_end_time = time.time()
        duration = iter_end_time - request_time
        time.sleep(max(0,query_time-(duration)))

url_df = pd.DataFrame(url_data,columns=['name','id','url','time_accessed','query_url'])


In [None]:
url_df['id']=url_df['id'].str.replace('\?mt=2','') 

In [None]:
url_df.head()

In [None]:
#save the url dataframe
import pickle
floc = '/Users/Jay/AnacondaProjects/plutarch/'
fname = 'podcast_urls.csv'
url_df.to_csv(floc+fname)

In [None]:
#load in URL dataframe#load i 
import pandas as pd

#load it in
floc = '/Users/Jay/AnacondaProjects/plutarch/'
fname = 'podcast_urls.csv'
url_df = pd.read_csv(floc+fname)
url_df.drop_duplicates(['id'],inplace=True)

In [None]:
url_df=url_df.drop(columns=['Unnamed: 0'])
url_df

In [None]:
#Clean up IDs
for rep in range(0,2):
    ids = url_df['id'].get_values()
    for i in range(0,len(ids)):
        #print(i)
        if(isinstance(ids[i],str)):
            if(not ids[i].isdigit()): 
                ids[i] = ids[i].split("/id",1)[1]
    url_df['id'] = ids
    print(url_df['id'])

In [None]:
#Setup the components of querying the iTunes API for podcast RSS feeds.
import os
import random
#safe request that keeps on trying if it gets a 403.
def persistent_request(url,retry_time = 60):
    flag = 1
    while(flag):
        page = requests.get(url)
        if(page.status_code!=200):
            flag = 1
            print('Error '+str(page.status_code)+'. Retrying in '+str(retry_time)+' sec')
            time.sleep(retry_time)
        else:
            flag = 0
    return page

#wrapper for looking up a set of IDs on iTunes
def lookup_id_iTunes(id):
    return persistent_request('https://itunes.apple.com/lookup?id=' + str(id))

In [None]:
#Queries the iTunes API to get all of the RSS feeds
import numpy as np
import time
import requests

#setup ID 
def lookup_id_iTunes(id):
    return persistent_request('https://itunes.apple.com/lookup?id=' + str(id))


chunk_size = 200

pages = []
duration = 0
for i in range(0, len(url_df), chunk_size):
    print('###### ' + str(i) + ' to ' + str(i+chunk_size) + 
          ' (last iteration '+ str(np.floor(duration)) + ')' )
    start_time = time.time()
    df_subset = url_df.iloc[i:np.min([len(url_df),(i+chunk_size)])]
    search_url = ','.join([str(x) for x in df_subset['id'].get_values()])
    pages.append(lookup_id_iTunes(search_url))
    print(pages)
    stop_time = time.time()
    duration = stop_time - start_time

In [None]:
#plot the length of page vectors to make sure that scraping approximately worked
#Each page vector should be of length 200.
import matplotlib.pyplot as plt
from datetime import datetime

page_lengths = [len(x.json()['results']) for x in pages]

plt.figure(figsize=[12,9])
plt.hist(page_lengths)
plt.xlabel('# results')
plt.ylabel('count')
#print(len(url_df))

In [None]:
#save pages for later processing
import pickle
floc=('/Users/Jay/AnacondaProjects/plutarch/')
pickle.dump(pages,open(floc+'raw_itunes_requests.pkl','wb'))

In [2]:
#load in raw iTunes requests
import pickle
floc=('/Users/Jay/AnacondaProjects/plutarch/')
raw_itunes_requests = pickle.load(open(floc+'raw_itunes_requests.pkl','rb'))

In [5]:
#turn everything into a pandas dataframe#turn e 
import pandas as pd

formatted_results = []
bads = []
cnames = ['']
for rir in raw_itunes_requests:
    for p in rir.json()['results']:
        if(p['kind']=='podcast'):
            formatted_results.append(p)

podcast_df = pd.DataFrame(formatted_results)

In [6]:
podcast_df

Unnamed: 0,artistId,artistName,artistViewUrl,artworkUrl100,artworkUrl30,artworkUrl60,artworkUrl600,collectionCensoredName,collectionExplicitness,collectionHdPrice,...,trackCount,trackExplicitness,trackHdPrice,trackHdRentalPrice,trackId,trackName,trackPrice,trackRentalPrice,trackViewUrl,wrapperType
0,2.843410e+08,HowStuffWorks,https://itunes.apple.com/us/artist/howstuffwor...,https://is2-ssl.mzstatic.com/image/thumb/Music...,https://is2-ssl.mzstatic.com/image/thumb/Music...,https://is2-ssl.mzstatic.com/image/thumb/Music...,https://is2-ssl.mzstatic.com/image/thumb/Music...,Stuff You Missed in History Class,cleaned,0,...,300,cleaned,0,0,283605519,Stuff You Missed in History Class,0.0,0,https://itunes.apple.com/us/podcast/stuff-you-...,track
1,2.562010e+08,Dan Carlin,https://itunes.apple.com/us/artist/wizzard-med...,https://is4-ssl.mzstatic.com/image/thumb/Music...,https://is4-ssl.mzstatic.com/image/thumb/Music...,https://is4-ssl.mzstatic.com/image/thumb/Music...,https://is4-ssl.mzstatic.com/image/thumb/Music...,Dan Carlin's Hardcore History,cleaned,0,...,13,cleaned,0,0,173001861,Dan Carlin's Hardcore History,0.0,0,https://itunes.apple.com/us/podcast/dan-carlin...,track
2,1.009390e+09,Aaron Mahnke,https://itunes.apple.com/us/artist/aaron-mahnk...,https://is3-ssl.mzstatic.com/image/thumb/Music...,https://is3-ssl.mzstatic.com/image/thumb/Music...,https://is3-ssl.mzstatic.com/image/thumb/Music...,https://is3-ssl.mzstatic.com/image/thumb/Music...,Lore,cleaned,0,...,99,cleaned,0,0,978052928,Lore,0.0,0,https://itunes.apple.com/us/podcast/lore/id978...,track
3,1.557539e+08,Slate,https://itunes.apple.com/us/artist/slate-magaz...,https://is5-ssl.mzstatic.com/image/thumb/Music...,https://is5-ssl.mzstatic.com/image/thumb/Music...,https://is5-ssl.mzstatic.com/image/thumb/Music...,https://is5-ssl.mzstatic.com/image/thumb/Music...,Slow Burn,notExplicit,0,...,19,notExplicit,0,0,1315040130,Slow Burn,0.0,0,https://itunes.apple.com/us/podcast/slow-burn/...,track
4,,Mike Rowe,,https://is3-ssl.mzstatic.com/image/thumb/Music...,https://is3-ssl.mzstatic.com/image/thumb/Music...,https://is3-ssl.mzstatic.com/image/thumb/Music...,https://is3-ssl.mzstatic.com/image/thumb/Music...,The Way I Heard It with Mike Rowe,cleaned,0,...,112,cleaned,0,0,1087110764,The Way I Heard It with Mike Rowe,0.0,0,https://itunes.apple.com/us/podcast/the-way-i-...,track
5,,Casefile True Crime,,https://is1-ssl.mzstatic.com/image/thumb/Music...,https://is1-ssl.mzstatic.com/image/thumb/Music...,https://is1-ssl.mzstatic.com/image/thumb/Music...,https://is1-ssl.mzstatic.com/image/thumb/Music...,Casefile True Crime,explicit,0,...,120,explicit,0,0,998568017,Casefile True Crime,0.0,0,https://itunes.apple.com/us/podcast/casefile-t...,track
6,1.216766e+08,BBC Radio 4,https://itunes.apple.com/us/artist/bbc/1216766...,https://is5-ssl.mzstatic.com/image/thumb/Music...,https://is5-ssl.mzstatic.com/image/thumb/Music...,https://is5-ssl.mzstatic.com/image/thumb/Music...,https://is5-ssl.mzstatic.com/image/thumb/Music...,In Our Time,cleaned,0,...,300,cleaned,0,0,73330895,In Our Time,0.0,0,https://itunes.apple.com/us/podcast/in-our-tim...,track
7,1.216766e+08,BBC World Service,https://itunes.apple.com/us/artist/bbc/1216766...,https://is2-ssl.mzstatic.com/image/thumb/Music...,https://is2-ssl.mzstatic.com/image/thumb/Music...,https://is2-ssl.mzstatic.com/image/thumb/Music...,https://is2-ssl.mzstatic.com/image/thumb/Music...,The Documentary Podcast,cleaned,0,...,300,cleaned,0,0,73802620,The Documentary Podcast,0.0,0,https://itunes.apple.com/us/podcast/the-docume...,track
8,7.123799e+08,Mike Duncan,https://itunes.apple.com/us/artist/mike-duncan...,https://is4-ssl.mzstatic.com/image/thumb/Music...,https://is4-ssl.mzstatic.com/image/thumb/Music...,https://is4-ssl.mzstatic.com/image/thumb/Music...,https://is4-ssl.mzstatic.com/image/thumb/Music...,The History of Rome,cleaned,0,...,192,cleaned,0,0,261654474,The History of Rome,0.0,0,https://itunes.apple.com/us/podcast/the-histor...,track
9,8.501391e+08,Nate DiMeo,https://itunes.apple.com/us/artist/radiotopia/...,https://is5-ssl.mzstatic.com/image/thumb/Music...,https://is5-ssl.mzstatic.com/image/thumb/Music...,https://is5-ssl.mzstatic.com/image/thumb/Music...,https://is5-ssl.mzstatic.com/image/thumb/Music...,the memory palace,cleaned,0,...,154,cleaned,0,0,299436963,the memory palace,0.0,0,https://itunes.apple.com/us/podcast/the-memory...,track


In [7]:
#take out rows that don't have RSS feeds
import numpy as np
podcast_df = podcast_df.loc[podcast_df['feedUrl'].isnull()==False]

#take out rows that don't have well-defined release dates
podcast_df['releaseDate'] = pd.to_datetime(podcast_df['releaseDate'])
podcast_df = podcast_df.loc[podcast_df['releaseDate'].isnull()==False]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [8]:
#plot included podcasts as a function of update frequency
from datetime import datetime,timedelta
from matplotlib import pyplot as plt

upper_limit = 100
comparator_day = datetime(2018,9,18,11,5,23,424906)#This was "today" on Monday 6-11-18
npodcast = []
cutoffs = [i for i in range(1,upper_limit+1)]
for c in cutoffs:
    npodcast.append(np.sum(podcast_df['releaseDate'] > (comparator_day - timedelta(days=c))))

plt.plot(cutoffs,npodcast)
plt.xlabel('Time since last update threshold')
plt.ylabel('count')

Text(0,0.5,'count')

In [11]:
#multithreaded RSS scrape
from multiprocessing import Pool
import socket
import time
import feedparser as fp
import requests

MAX_REQUEST_DURATION = 10 #seconds
socket.setdefaulttimeout(MAX_REQUEST_DURATION)

class ResponseCodeError(BaseException):
    pass

def safe_request(url):
    flag = 1
    while(flag):
        page = requests.get(url).json()
        if(page.status_code!=200):
            flag = 1
            print('Error '+str(page.status_code)+'. Retrying in '+str(retry_time)+' sec')
            time.sleep(retry_time)
        else:
            raise ResponseCodeError()
    return page

def feed_request(url):
    try:
        return (url,fp.parse(url))
    except:
        print('Error on ' + url)
        return (url,None)

    
floc = ('/Users/Jay/AnacondaProjects/plutarch/')
if __name__ == '__main__':
    p = Pool(8)
    chunk_size = 8*3
    save_every = 1000; #save and clear rss_feeds after this many feeds have been scraped.
    rss_feeds = []
    bads = []
    duration = 0
    j = 0
    filecounter = 0
    associator = str(int(time.time()))
    for i in range(0,len(podcast_df),chunk_size):
        print('###### ' + str(i) + ' to ' + str(i+chunk_size) + 
          ' (length= ' + str(len(podcast_df))+ '; last iteration '+ str(np.floor(duration)) + ')' )
        start_time = time.time()
        podcast_df.iloc[i:np.min([len(podcast_df),(i+chunk_size)])]
        try:
            rss_feeds.append(p.map(feed_request, podcast_df['feedUrl'].iloc[i:np.min([len(podcast_df),(i+chunk_size)])].get_values()))
        except:
            print('bad on '+ str(i) + ' to ' + str(i+chunk_size))
            for k in range(i,i+chunk_size):
                bads.append(k)
        stop_time = time.time()
        duration = stop_time - start_time
        if(j >= (save_every/chunk_size)):
            print('###Saving...')
            pickle.dump([rss_feeds,bads],open(floc+'raw_rss_and_bads_file'+
                              str(filecounter)+'_'+associator+'.pkl','wb'))
            print('###Saved!')
            #bads = []
            rss_feeds = []
            filecounter+=1
            j = -1
        j += 1
    p.close()
    pickle.dump([rss_feeds,bads],open(floc+'raw_rss_and_bads_file'+
                str(filecounter)+'_'+associator+'.pkl','wb'))
    
print('done!')

###### 0 to 24 (length= 232; last iteration 0.0)
###### 24 to 48 (length= 232; last iteration 6.0)
###### 48 to 72 (length= 232; last iteration 5.0)
###### 72 to 96 (length= 232; last iteration 3.0)
###### 96 to 120 (length= 232; last iteration 7.0)
###### 120 to 144 (length= 232; last iteration 6.0)
bad on 120 to 144
###### 144 to 168 (length= 232; last iteration 3.0)
###### 168 to 192 (length= 232; last iteration 6.0)
bad on 168 to 192
###### 192 to 216 (length= 232; last iteration 7.0)
###### 216 to 240 (length= 232; last iteration 9.0)
done!


In [12]:
fname = 'raw_rss_and_bads_file0_1537306068.pkl'
floc=('/Users/Jay/AnacondaProjects/plutarch/')
bads = pickle.load(open(floc+fname,'rb'))[1]

In [13]:
#iterate through bads
import numpy as np
import time
import feedparser as fp
import requests


class ResponseCodeError(BaseException):
    pass

def safe_request(url):
    flag = 1
    while(flag):
        page = requests.get(url).json()
        if(page.status_code!=200):
            flag = 1
            print('Error '+str(page.status_code)+'. Retrying in '+str(retry_time)+' sec')
            time.sleep(retry_time)
        else:
            raise ResponseCodeError()
    return page

def feed_request(url):
    try:
        return (url,fp.parse(url))
    except:
        print('Error on ' + url)
        return (url,None)

#load in raw_rss_and_bads_file48_1528758749.pkl for a list of bads

bad_list = bads
rss_feeds = []
floc=('/Users/Jay/AnacondaProjects/plutarch/')
save_every = 1000; #save and clear rss_feeds after this many feeds have been scraped.
bad_feeds = []
badbads = []
duration = 0
j = 0
filecounter = 56
associator = 1528772780
for i in range(0,len(bad_list)):
    print('###### ' + str(i) + ' to ' + str(len(bad_list)) + 
      ' (last iteration '+ str(np.floor(duration)) + ')' )
    start_time = time.time()
    bad_feeds.append(feed_request(podcast_df['feedUrl'].iloc[bad_list[i]]))
    stop_time = time.time()
    duration = stop_time - start_time
    if(j >= save_every):
        print('###Saving...')
        pickle.dump([bad_feeds,bads],open(floc+'raw_rss_and_bads_file'+
                              str(filecounter)+'_'+str(associator)+'.pkl','wb'))
        print('###Saved!')
        bad_feeds = []
        filecounter+=1
        j = -1
    j += 1

pickle.dump([bad_feeds,bads],open(floc+'raw_rss_and_bads_file'+
                              str(filecounter)+'_'+str(associator)+'.pkl','wb'))

###### 0 to 48 (last iteration 0.0)
###### 1 to 48 (last iteration 0.0)
###### 2 to 48 (last iteration 0.0)
###### 3 to 48 (last iteration 0.0)
###### 4 to 48 (last iteration 0.0)
###### 5 to 48 (last iteration 0.0)
###### 6 to 48 (last iteration 2.0)
###### 7 to 48 (last iteration 0.0)
###### 8 to 48 (last iteration 0.0)
###### 9 to 48 (last iteration 0.0)
###### 10 to 48 (last iteration 0.0)
###### 11 to 48 (last iteration 1.0)
###### 12 to 48 (last iteration 0.0)
###### 13 to 48 (last iteration 1.0)
###### 14 to 48 (last iteration 1.0)
###### 15 to 48 (last iteration 0.0)
###### 16 to 48 (last iteration 0.0)
###### 17 to 48 (last iteration 1.0)
###### 18 to 48 (last iteration 0.0)
###### 19 to 48 (last iteration 0.0)
###### 20 to 48 (last iteration 1.0)
###### 21 to 48 (last iteration 0.0)
###### 22 to 48 (last iteration 0.0)
###### 23 to 48 (last iteration 0.0)
###### 24 to 48 (last iteration 0.0)
###### 25 to 48 (last iteration 0.0)
###### 26 to 48 (last iteration 0.0)
###### 27 t

ValueError: I/O operation on closed file.