In [47]:
import sys

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from mpl_toolkits.axes_grid.anchored_artists import AnchoredText
%matplotlib inline

In [2]:
# Global plot typography: a 16pt base font, with the relative sizes
# ('large' / 'medium' / 'small') resolved by matplotlib against that base.
mpl.rc('font', size=16.0)
mpl.rc('axes', titlesize='large', labelsize='medium')
mpl.rc('xtick', labelsize='small')
mpl.rc('ytick', labelsize='small')
mpl.rc('legend', fontsize='small')

In [3]:
# Force pandas & numpy to display all data
# None removes the pandas column/row display limits entirely.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# threshold is the element count above which numpy summarises an array with '...'.
# NaN is rejected by current numpy (ValueError: threshold must be non-NAN);
# sys.maxsize is the documented value for "never summarise".
np.set_printoptions(threshold=sys.maxsize)

In [21]:
# Load the tab-separated data file.
# header=None: the first line is read as data, so columns get integer labels here.
df = pd.read_csv(
    '../data/a.tsv',
    sep='\t',
    header=None,
)

In [23]:
# Check the overall shape of the data: (number of rows, number of columns)
df.shape

(9793, 12)

In [29]:
# Name the 12 positional columns of the loaded frame.
# The order must match the column order of the file.
df.columns = [
    'permalink',
    'name',
    'image_url',
    'feed_url',
    'website_url',
    'itunes_owner_name',
    'itunes_owner_email',
    'managing_editor_name',
    'managing_editor_email',
    'explicit',
    'description',
    'itunes_summary',
]

In [31]:
# Preview the first rows (head() returns 5 by default) to sanity-check the column labels
df.head()

Unnamed: 0,permalink,name,image_url,feed_url,website_url,itunes_owner_name,itunes_owner_email,managing_editor_name,managing_editor_email,explicit,description,itunes_summary
0,a,A플러스처치,http://cdn4.iblug.com/contents/profile/agdarms...,http://www.iblug.com/xml/itunes/agdarmstadt.xml,http://agdarmstadt.iblug.com,Leechangbae,chblee58@me.com,,,False,A플러스처치는 독일 다름슈타트에 있는 아름다운교회입니다.&#xD;\r\n청년유학생 ...,[Pod+]사연보내기/캐스터와SNS/통계
1,a-and-a-podcast-www-poderato-com-aandapodcast,A and A (Podcast) - www.poderato.com/aandapod...,http://www.poderato.com/files/images/34365l198...,http://www.poderato.com/aandapodcast/_feed/1,http://www.poderato.com/aandapodcast,www.podErato.com,support@poderato.com,,,False,A new project dedicated to all lovers of elect...,A and A (Podcast) - www.poderato.com/aandapod...
2,a-and-e-chat-podcasts,A and E Chat » Podcasts,http://69.195.124.89/~aandecha/wp-content/uplo...,http://feeds.feedburner.com/aandechatpodcasts,http://69.195.124.89/~aandecha,A and E,aandechat@gmail.com,,,False,"We chat about the movies, TV and pop-culturey ...","We chat about the movies, TV and pop-culturey ..."
3,a-b48,A基B48,http://www.pooopup.com/pod/images/agayb.jpg,http://www.pooopup.com/pod/media/agb48.xml,http://www.pooopup.com,Pooopup.com,info@pooopup.com,,,True,<br>Pooopup.com 節目［A基B48 (AGAYB48)]</br>\n<br>...,Pooopup.com
4,a-birding-on-a-bronco-by-merriam-florence-a,"A-Birding on a Bronco by MERRIAM, Florence A.",,https://librivox.org/rss/4734,http://librivox.org/a-birding-on-a-bronco-by-f...,LibriVox,info@librivox.org,,,False,Florence Augusta Merriam Bailey was an America...,Florence Augusta Merriam Bailey was an America...


In [30]:
# Summarise each column's dtype and non-null count, plus the frame's memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9793 entries, 0 to 9792
Data columns (total 12 columns):
permalink                9793 non-null object
name                     9793 non-null object
image_url                8771 non-null object
feed_url                 9793 non-null object
website_url              9724 non-null object
itunes_owner_name        7508 non-null object
itunes_owner_email       7792 non-null object
managing_editor_name     1957 non-null object
managing_editor_email    2364 non-null object
explicit                 7500 non-null object
description              8829 non-null object
itunes_summary           8514 non-null object
dtypes: object(12)
memory usage: 918.2+ KB


In [32]:
# Share of missing values per column, as a fraction in [0, 1] (not a percentage).
# The mean of the boolean null-mask equals null_count / row_count.
nulls = df.isnull().mean()
# Most incomplete columns first
nulls.sort_values(ascending=False)

managing_editor_name     0.800163
managing_editor_email    0.758603
explicit                 0.234147
itunes_owner_name        0.233330
itunes_owner_email       0.204330
itunes_summary           0.130603
image_url                0.104360
description              0.098438
website_url              0.007046
feed_url                 0.000000
name                     0.000000
permalink                0.000000
dtype: float64

In [44]:
# Check for duplicates by columns
# Merge podcasts that are the same on different mediums? (iPhone vs mp3 in the name)
#
# Keep every row whose description also appears on at least one other row.
# - keep=False flags all members of a duplicated group, not just the repeats.
# - Null descriptions are excluded explicitly: duplicated() would otherwise treat
#   all missing values as equal to each other.
# - A boolean mask yields an empty frame when nothing repeats, whereas pd.concat
#   over an empty sequence raises ValueError.
# - mergesort is stable, so rows sharing a description keep their original order.
dup_ids = (
    df[df['description'].notnull() & df.duplicated(subset='description', keep=False)]
    .sort_values('description', kind='mergesort')
)
dup_ids

Unnamed: 0,permalink,name,image_url,feed_url,website_url,itunes_owner_name,itunes_owner_email,managing_editor_name,managing_editor_email,explicit,description,itunes_summary
2115,ahmed-hulusi-kur-an-i-kerim-cozumu-arapca,AHMED HULUSİ - KUR'ÂN-I KERÎM ÇÖZÜMÜ - ARAPÇA,http://www.ahmedhulusi.org/images/podcast/kura...,http://download.ahmedhulusi.org/download/video...,http://www.ahmedhulusi.org/kuran/kuran-anlamak...,Ahmed Hulusi,cem@ahmedhulusi.org,,,,"""Yenilen"" hükmü geldi! Kur'ân-ı Kerîm'e farklı...",Allâh İlminden Yansımalarla KUR'ÂN-I KERÎM ÇÖZ...
2116,ahmed-hulusi-kur-an-i-kerim-cozumu-turkce,AHMED HULUSİ - KUR'ÂN-I KERÎM ÇÖZÜMÜ - TÜRKÇE,http://www.ahmedhulusi.org/images/podcast/kura...,http://download.ahmedhulusi.org/download/video...,http://www.ahmedhulusi.org/kuran/kuran-anlamak...,Ahmed Hulusi,cem@ahmedhulusi.org,,,,"""Yenilen"" hükmü geldi! Kur'ân-ı Kerîm'e farklı...",Allâh İlminden Yansımalarla KUR'ÂN-I KERÎM ÇÖZ...
2185,aint-nobody-heard-of-us-aac,Aint Nobody Heard Of Us (aac),http://podcastmachine-attachements.s3.amazonaw...,http://feed.podcastmachine.com/podcasts/4513/a...,http://www.aintnobodyheardofus.com,Psydome,admin%etchadoodle.com@gtempaccount.com,,,True,'Aint nobody heard of us' is a comedy podcast ...,
2186,aint-nobody-heard-of-us-appletv,Aint Nobody Heard Of Us (AppleTV),http://podcastmachine-attachements.s3.amazonaw...,http://feed.podcastmachine.com/podcasts/4513/a...,http://www.aintnobodyheardofus.com,Psydome,admin%etchadoodle.com@gtempaccount.com,,,True,'Aint nobody heard of us' is a comedy podcast ...,
2187,aint-nobody-heard-of-us-iphone,Aint Nobody Heard Of Us (iPhone),http://podcastmachine-attachements.s3.amazonaw...,http://feed.podcastmachine.com/podcasts/4513/i...,http://www.aintnobodyheardofus.com,Psydome,admin%etchadoodle.com@gtempaccount.com,,,True,'Aint nobody heard of us' is a comedy podcast ...,
2188,aint-nobody-heard-of-us-ipod,Aint Nobody Heard Of Us (iPod),http://podcastmachine-attachements.s3.amazonaw...,http://feed.podcastmachine.com/podcasts/4513/i...,http://www.aintnobodyheardofus.com,Psydome,admin%etchadoodle.com@gtempaccount.com,,,True,'Aint nobody heard of us' is a comedy podcast ...,
2189,aint-nobody-heard-of-us-mp3,Aint Nobody Heard Of Us (mp3),http://podcastmachine-attachements.s3.amazonaw...,http://feed.podcastmachine.com/podcasts/4513/m...,http://www.aintnobodyheardofus.com,Psydome,admin%etchadoodle.com@gtempaccount.com,,,True,'Aint nobody heard of us' is a comedy podcast ...,
670,accentofwomen,: AccentofWomen,http://podcast.3cr.org.au/audio/itunescover.jpg,http://podcast.3cr.org.au/podcast.php?cat=Acce...,http://podcast.3cr.org.au?cat=AccentofWomen,3CR 855AM community radio,programming@3cr.org.au,3CR 855AM community radio,programming@3cr.org.au,False,3CR 855AM is a community radio station in Melb...,"Melbourne Community Radio 3CR, 855AM"
3762,alternativenews,: AlternativeNews,http://podcast.3cr.org.au/audio/itunescover.jpg,http://podcast.3cr.org.au/podcast.php?cat=Alte...,http://podcast.3cr.org.au?cat=AlternativeNews,3CR 855AM community radio,programming@3cr.org.au,3CR 855AM community radio,programming@3cr.org.au,False,3CR 855AM is a community radio station in Melb...,"Melbourne Community Radio 3CR, 855AM"
4562,anarchistworldthisweek,: AnarchistWorldThisWeek,http://podcast.3cr.org.au/audio/itunescover.jpg,http://podcast.3cr.org.au/podcast.php?cat=Anar...,http://podcast.3cr.org.au?cat=AnarchistWorldTh...,3CR 855AM community radio,programming@3cr.org.au,3CR 855AM community radio,programming@3cr.org.au,False,3CR 855AM is a community radio station in Melb...,"Melbourne Community Radio 3CR, 855AM"


In [45]:
# Check for duplicates by columns
# Check for meaningless URLs
#
# Keep every row whose website_url also appears on at least one other row.
# - keep=False flags all members of a duplicated group, not just the repeats.
# - Null URLs are excluded explicitly: duplicated() would otherwise treat all
#   missing values as equal to each other.
# - A boolean mask yields an empty frame when nothing repeats, whereas pd.concat
#   over an empty sequence raises ValueError.
# - mergesort is stable, so rows sharing a URL keep their original order.
dup_ids = (
    df[df['website_url'].notnull() & df.duplicated(subset='website_url', keep=False)]
    .sort_values('website_url', kind='mergesort')
)
dup_ids

Unnamed: 0,permalink,name,image_url,feed_url,website_url,itunes_owner_name,itunes_owner_email,managing_editor_name,managing_editor_email,explicit,description,itunes_summary
3429,alleluia-church-music-conference-video,Alleluia! Church Music Conference Video,,http://www.baylor.edu/rss/rss.php/94.wvideo.xml,/,,,,,,This podcast contains recorded sessions taken ...,This podcast contains recorded sessions taken ...
3430,alleluia-conference-2009-audio,Alleluia Conference 2009 Audio,,http://www.baylor.edu/rss/rss.php/85.audio.xml,/,,,,,,This podcast contains recorded sessions taken ...,This podcast contains recorded sessions taken ...
399,about-apple,About Apple,,http://www.podcastrevolution.com/viewpodcast.p...,http://,Owen Piercey,owen.piercey@me.com,,,False,It is about apple,
808,acids-house-adventures-ep1,ACIDs House adventures EP1,http://www.podcastrevolution.com/file/acidhous...,http://www.podcastrevolution.com/viewpodcast.p...,http://,calvin ball,iamcalvin11@yahoo.com,,,False,HOUSE MUSIC? YOU GOT IT LISTEN UP.\r\rACID,
1386,advance-student-ministry-test,Advance Student Ministry Test,http://www.podcastrevolution.com/file/advance/...,http://www.podcastrevolution.com/viewpodcast.p...,http://,Advance Student Ministry Test,,,,False,Advance Student Ministry Test,
1938,agcn-weekly-sermons,AGCN weekly sermons,http://files.stablerack.com/webfiles/60633/Mik...,http://media.stablerack.com/web/clientid/60633...,http://,,,,,,Join us weekly and listen to Pastor Mike Schutz,
4584,anatomy-physiology-i-podcast-at-the-university...,Anatomy/Physiology I - podcast at The Univers...,https://podcasts.memphis.edu/images/icon2.png,http://podcasts.memphis.edu/courses/BIOL2010-0...,http://,,,,,,"Introduction (lecture 1, 08/30/2011)",
6413,apwth,APWTH,http://www.podcastrevolution.com/file/daniel_a...,http://www.podcastrevolution.com/viewpodcast.p...,http://,"Mattia Coccia, Keegan Barbosa and Dan",ihatesearchingfornames@hotmail.com,,,True,Stories and theroies by awkward comedy nerds ...,
3910,amateur-traveler-podcast-travel-for-the-love-o...,Amateur Traveler Podcast | travel for the love...,http://travelphotos.amateurtraveler.com/Other/...,http://feeds.feedburner.com/AmateurTravelerPod...,http://AmateurTraveler.com/,Chris Christensen,rss@amateurtraveler.com,host@amateurtraveler.com,host@amateurtraveler.com,False,The Amateur Traveler is an audio travel show t...,travel for the love of it
3909,amateur-traveler-podcast-itunes-enhanced-trave...,Amateur Traveler Podcast (iTunes enhanced) | t...,http://assets.libsyn.com/content/4737859.jpg,http://feeds.feedburner.com/AmateurTravelerPod...,http://AmateurTraveler.com/,Chris Christensen,rss@amateurtraveler.com,host@amateurtraveler.com,host@amateurtraveler.com,False,The Amateur Traveler is an audio travel show t...,travel for the love of it
