# Android Weekly analysis

Investigating the most popular blogs and authors over time.

In [6]:
from bs4 import BeautifulSoup
import urllib
import csv
from os import listdir
from os.path import isfile, join
import pandas as pd
import numpy as np

import plotly.plotly as py
import pandas as pd
import numpy as np
import plotly
from plotly.graph_objs import Scatter, Layout

%matplotlib inline

plotly.offline.init_notebook_mode(connected=True)

BASE_URL = "http://androidweekly.net/issues/issue-{}"
DATA_DIR = "../data"

## Data collection

Dowload and parse each Android weekly post. 
1. `contentForIssue(num)` downloads the data for an issue
2. `dataForIssue(content, issue_num)`, `dataForIssueBefore103(content, issue_num)`, and `dataForIssueBefore60(content, issue_num)` parse the html from different issues into an array of objects
3. `writeData(data, fname)` writes the data for an issue to a csv for retrieval later

In [6]:
def contentForIssue(num):
    url = BASE_URL.format(num)
    print(url)
    request = urllib.Request(url)
    request.add_header('Accept-Encoding', 'utf-8')
    response = urllib.urlopen(url)
    soup = BeautifulSoup(response.read().decode('utf-8'), 'lxml')
    content = soup.find('div', { 'class':'issue' })
    return content

In [8]:
def dataForIssue(content, issue_num):
    sections = content.find_all('td')
    last_section_header = ""
    data = []
    section_count = 0
    for section in sections:
        if section.h2 or section.h3:
            section_count += 1
            if section.h2:
                last_section_header = section.h2.text
            else:
                last_section_header = section.h3.text
        if section.a and section.a.string:
            item = {}
            item['issue_num'] = issue_num
            item['section'] = last_section_header.encode("utf-8").strip()
            item['title'] = section.a.text.encode("utf-8").replace('\n','').strip()
            item['link'] = section.a.get('href').encode("utf-8").strip()
            item['site'] = section.span.text.replace('(','').replace(')', '').replace('\n','').encode("utf-8").strip()
            item['description'] = section.p.text.replace('\n','').encode("utf-8").strip()
            data.append(item)
    print('Found %d items in %s sections' % (len(data), section_count))
    return data

In [10]:
def dataForIssueBefore103(content, issue_num):
    sections = content.find_all(['h2', 'div', 'p'])
    last_section_header = ""
    data = []
    section_count = 0
    for section in sections:
        if section.name == 'h2':
            section_count += 1
            last_section_header = section.text
        elif section.name == 'div' and len(section.find_all('a')) > 1:
            links = section.find_all('a');
            item = {}
            item['issue_num'] = issue_num
            item['section'] = last_section_header.encode("utf-8").strip()
            item['title'] = links[1].string.encode("utf-8").replace('\n','').strip()
            item['link'] = links[1].get('href').encode("utf-8").strip()
            item['site'] = section.span.text.replace('(','').replace(')', '').replace('\n','').encode("utf-8").strip()
            item['description'] = section.p.text.replace('\n','').encode("utf-8").strip()
            data.append(item)
        elif section.name == 'p' and section.a and section.a.text:
            item = {}
            item['issue_num'] = issue_num
            item['section'] = last_section_header.encode("utf-8").strip()
            item['title'] = section.a.string.encode("utf-8").replace('\n','').strip()
            item['link'] = section.a.get('href').encode("utf-8").strip()
            if section.span:
                item['site'] = section.span.text.replace('(','').replace(')', '').replace('\n','').encode("utf-8").strip()
            else:
                item['site'] = ""
            if section.br and section.br.next_sibling:
                item['description'] = section.br.next_sibling.replace('\n','').encode("utf-8").strip()
            else:
                item['description'] = ""
            data.append(item)
    print('Found %d items in %s sections' % (len(data), section_count))
    return data

In [11]:
def dataForIssueBefore60(content, issue_num):
    return dataForIssueBefore103(content.div, issue_num)

In [12]:
def writeData(data, fname):
    with open(fname, 'wb') as f:
        w = csv.DictWriter(f, data[0].keys())
        w.writeheader()
        for datum in data:
            w.writerow(datum)

## Get all the data!

With the above functions available, we can download all the Android Weekly posts

In [13]:
def getAllData(issueNums):
    for i in issueNums:
        content = contentForIssue(i)
        data = None
        if (i < 60):
            data = dataForIssueBefore60(content, i) 
        elif (i < 103):
            data = dataForIssueBefore103(content, i) 
        else:
            data = dataForIssue(content, i)
        writeData(data, 'issue_{}.csv'.format(i))

In [33]:
# 500 errors: 138, 108, 43
# Skipped b/c unusual: 30
issueNums = range(1, 243)
issueNums.remove(138)
issueNums.remove(108)
issueNums.remove(43)
issueNums.remove(30)
getAllData(reversed(issueNums))

http://androidweekly.net/issues/issue-216
Found 26 items in 6 sections
http://androidweekly.net/issues/issue-215
Found 28 items in 7 sections
http://androidweekly.net/issues/issue-214
Found 21 items in 8 sections
http://androidweekly.net/issues/issue-213
Found 30 items in 7 sections
http://androidweekly.net/issues/issue-212
Found 23 items in 7 sections
http://androidweekly.net/issues/issue-211
Found 29 items in 9 sections
http://androidweekly.net/issues/issue-210
Found 28 items in 9 sections
http://androidweekly.net/issues/issue-209
Found 27 items in 10 sections
http://androidweekly.net/issues/issue-208
Found 25 items in 7 sections
http://androidweekly.net/issues/issue-207
Found 25 items in 8 sections
http://androidweekly.net/issues/issue-206
Found 25 items in 8 sections
http://androidweekly.net/issues/issue-205
Found 34 items in 8 sections
http://androidweekly.net/issues/issue-204
Found 34 items in 10 sections
http://androidweekly.net/issues/issue-203
Found 29 items in 7 sections
http

## Load Data

Loading the data from the csv files into memory makes it easy to resume work.

In [5]:
def loadDataFiles():
    files = [f for f in listdir(DATA_DIR) if isfile(join(DATA_DIR, f)) and f.endswith('.csv')]
    df = None
    for fname in files:
        if df is None:
            df = pd.read_csv('../data/' + fname)
        else:
            df = df.append(pd.read_csv('../data/' + fname, dtype={'site': object}), ignore_index=True)
    return df
    
df = loadDataFiles()
df.head()

Unnamed: 0,description,title,section,site,issue_num,link
0,An Android Tutorial aimed for iOS Developers.,http://clayallsopp.posterous.com/building-an-a...,Articles and Tutorials,,1,http://clayallsopp.posterous.com/building-an-a...
1,Selenium Webdriver is now available for your A...,http://android-developers.blogspot.com/2011/10...,Articles and Tutorials,,1,http://android-developers.blogspot.com/2011/10...
2,For our dev rookies we got an Java tutorial wh...,http://mobile.tutsplus.com/tutorials/android/j...,Articles and Tutorials,,1,http://mobile.tutsplus.com/tutorials/android/j...
3,The brandnew nexus with Android 4.0 aka Ice Cr...,http://www.google.com/nexus/,Headlines,,1,http://www.google.com/nexus/
4,A very cool info graphic about the Rise of And...,http://bbgeeks.com/images/AndroidRise.png,Headlines,,1,http://bbgeeks.com/images/AndroidRise.png


In [15]:
df.loc[df.issue_num >= 238, 'year'] = 2017
df.loc[(df.issue_num < 238) & (df.issue_num >= 191), 'year'] = 2016
df.loc[(df.issue_num < 191) & (df.issue_num >= 134), 'year'] = 2015
df.loc[(df.issue_num < 134) & (df.issue_num >= 83), 'year'] = 2014
df.loc[(df.issue_num < 83) & (df.issue_num >= 49), 'year'] = 2013
df.loc[(df.issue_num < 49) & (df.issue_num >= 9), 'year'] = 2012
df.loc[df.issue_num < 9, 'year'] = 2011
df.year = df.year.astype(int)

In [16]:
df.loc[df.site == 'www.grokkingandroid.com', 'site'] = 'grokkingandroid.com'

# Analysis

#### These are the primary questions I want to answer:
1. What are the most prolific blogs? Have those changed over time?
2. Who are the most prolific bloggers? Have they changed over time?
3. What are the most common topics of discussion? Have those changed over time?

#### These are questions of secondary interest:
1. What are the most popular library topics? Have those changed over time?
2. Who are the most prolific library contributors? Have they changed over time?

### The sections

In [17]:
df.groupby('section').count().sort_values(by='issue_num', ascending=False).issue_num[:15]

section
Articles & Tutorials      2105
Libraries & Code           892
Jobs                       563
Videos & Podcasts          338
Sponsored                  314
News                       281
Tools                      182
Design                     170
Videos                     113
Events                      67
Specials                    60
ARTICLES AND TUTORIALS      32
App of the Week             28
LIBRARIES AND CODE          27
Screencasts                 22
Name: issue_num, dtype: int64

## What are the most prolific blogs? Have those changed over time?

To answer this, I will look at the most common sites in general. Then I will also split segment the data by year.

In [18]:
adf = df[(df.section == 'Articles & Tutorials') | (df.section == 'ARTICLES AND TUTORIALS')]
adf.issue_num.count()

2137

In [19]:
adf.site.value_counts()[:35]

medium.com                         377
blog.stylingandroid.com            197
plus.google.com                     92
android-developers.blogspot.com     66
blog.danlew.net                     30
speakerdeck.com                     29
www.bignerdranch.com                25
antonioleiva.com                    24
www.philosophicalhacker.com         22
hannesdorfmann.com                  22
code.tutsplus.com                   21
riggaroo.co.za                      18
commonsware.com                     17
www.novoda.com                      17
blog.sqisland.com                   14
frogermcs.github.io                 14
developer.android.com               13
grokkingandroid.com                 13
hackernoon.com                      13
ptrprograms.blogspot.com            12
saulmm.github.io                    11
ryanharter.com                      11
android.jlelse.eu                   11
www.thedroidsonroids.com            11
www.doubleencore.com                11
www.androiddesignpatterns

#### Most popular blogs of 2016

In [20]:
adf[adf.year == 2017].site.value_counts()[:10]

medium.com                     93
blog.stylingandroid.com        19
www.philosophicalhacker.com    12
android.jlelse.eu              11
hackernoon.com                 10
commonsware.com                 6
www.thedroidsonroids.com        6
blog.danlew.net                 6
hannesdorfmann.com              6
www.novoda.com                  5
Name: site, dtype: int64

#### Most popular blogs of 2016

In [21]:
adf[adf.year == 2016].site.value_counts()[:10]

medium.com                         191
blog.stylingandroid.com             36
riggaroo.co.za                      13
android-developers.blogspot.com     10
www.novoda.com                      10
blog.nimbledroid.com                 9
code.tutsplus.com                    9
hannesdorfmann.com                   8
jeroenmols.com                       7
www.bignerdranch.com                 6
Name: site, dtype: int64

#### Most popular blogs of 2015

In [22]:
adf[adf.year == 2015].site.value_counts()[:10]

medium.com                         83
blog.stylingandroid.com            50
plus.google.com                    34
android-developers.blogspot.com    19
www.bignerdranch.com               13
antonioleiva.com                   12
speakerdeck.com                    12
blog.danlew.net                    12
code.tutsplus.com                  11
frogermcs.github.io                10
Name: site, dtype: int64

#### Most popular blogs of 2014

In [23]:
adf[adf.year == 2014].site.value_counts()[:10]

blog.stylingandroid.com            41
plus.google.com                    35
android-developers.blogspot.com    21
ptrprograms.blogspot.com           11
blog.danlew.net                    10
speakerdeck.com                     9
medium.com                          9
antonioleiva.com                    6
github.com                          5
corner.squareup.com                 5
Name: site, dtype: int64

#### Most popular blogs of 2013

In [24]:
adf[adf.year == 2013].site.value_counts()[:10]

blog.stylingandroid.com            29
plus.google.com                    15
android-developers.blogspot.com     8
www.doubleencore.com                7
www.kpbird.com                      4
udinic.wordpress.com                4
gmariotti.blogspot.de               4
android-developers.blogspot.de      4
kpbird.com                          3
developer.android.com               3
Name: site, dtype: int64

#### Most popular blogs before 2013

In [25]:
adf[adf.year < 2013].site.value_counts()[:10]

blog.stylingandroid.com            22
vogella.de                          8
android-developers.blogspot.com     8
chrisrisner.com                     7
grokkingandroid.com                 5
plus.google.com                     4
developer.android.com               4
mobile.tutsplus.com                 3
vogella.com                         3
androiduipatterns.com               3
Name: site, dtype: int64

In [69]:
blogs_by_year = pd.pivot_table(adf, index='year', columns='site', aggfunc=lambda x: len(x.unique()), fill_value=0).issue_num
blogs_by_year.head()

site,Working with XML on Android,abhan.github.io,acadgild.com,adavis.info,adelnizamutdinov.github.io,afterecho.uk,agiliq.com,air.mozilla.org,aiyprojects.withgoogle.com,akarnokd.blogspot.com,...,www.zendesk.com,www.zoftino.com,xrigau.wordpress.com,yakivmospan.wordpress.com,yalantis.com,yarikx.github.io,yvonne.idescout.com,zdominguez.com,zeroturnaround.com,zserge.com
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2012,0,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2013,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2014,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
2015,0,0,1,0,0,0,0,0,0,1,...,0,0,0,0,2,1,1,0,0,0


In [27]:
blogs_by_year['blog.stylingandroid.com']

year
2011     1
2012    14
2013    24
2014    39
2015    49
2016    34
2017    19
Name: blog.stylingandroid.com, dtype: int64

In [66]:
blogs_by_year['total'] = blogs_by_year.sum(0)
blogs_by_year['blog.stylingandroid.com']

year
2011     1
2012    14
2013    24
2014    39
2015    49
2016    34
2017    19
Name: blog.stylingandroid.com, dtype: int64

In [75]:
top_blogs_by_year = blogs_by_year.loc[:, blogs_by_year.sum(0) >= 20]
top_blogs_by_year.head()

site,android-developers.blogspot.com,antonioleiva.com,blog.danlew.net,blog.stylingandroid.com,code.tutsplus.com,hannesdorfmann.com,medium.com,plus.google.com,www.bignerdranch.com,www.philosophicalhacker.com
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2011,1,0,0,1,0,0,0,0,0,0
2012,6,0,0,14,0,0,0,3,0,0
2013,6,2,0,24,0,0,1,11,0,0
2014,18,6,10,39,0,1,7,21,5,0
2015,16,12,12,49,10,7,43,24,11,4


In [58]:
plotly.offline.iplot({
    "data": [{
        'x': top_blogs_by_year.index,
        'y': top_blogs_by_year[col],
        'name': col
        }  for col in top_blogs_by_year.columns],
    "layout": Layout(yaxis=dict(title='Frequency'), xaxis=dict(title='Year'),
        title='Top 10 Android blogs of all time')
    })

## Who are the most prolific bloggers? Have they changed over time?

In [76]:
mask_authors = (adf.site == 'medium.com') | (adf.site == 'plus.google.com') | (adf.site == 'speakerdeck.com')

In [81]:
adf.loc[mask_authors, 'site_with_author'] = adf.link.str.split('/').str.get(3).str.replace('@','').str.replace('+','')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [78]:
adf.groupby('site_with_author').count().sort_values(by='issue_num', ascending=False).issue_num.count()

829

In [79]:
adf[adf.site == 'speakerdeck.com'].link

87      https://speakerdeck.com/pareshmayani/lazy-andr...
193     https://speakerdeck.com/stephanenicolas/blende...
340     https://speakerdeck.com/malmstein/streaming-th...
472     https://speakerdeck.com/cyrilmottier/deep-dive...
474     https://speakerdeck.com/mathieu_calba/the-deat...
498     https://speakerdeck.com/taylorling/ingredients...
538     https://speakerdeck.com/mttkay/reactive-soundc...
565     https://speakerdeck.com/abdyer/babbq5-automate...
607     https://speakerdeck.com/dorvaryn/rxfy-all-the-...
1120    https://speakerdeck.com/guardiola31337/elegant...
1165    https://speakerdeck.com/hugovisser/connecting-...
1508    https://speakerdeck.com/rock3r/tools-of-the-tr...
1509    https://speakerdeck.com/udinic/speed-up-your-a...
1510    https://speakerdeck.com/randomlytyping/android...
1513    https://speakerdeck.com/jakewharton/simple-htt...
1514    https://speakerdeck.com/devunwired/mastering-r...
1516    https://speakerdeck.com/jacobtabak/data-bindin...
1523    https:

In [34]:
adf.site_with_author.value_counts()[:20]

google-developers           31
AndroidDevelopers           23
google-developer-experts    11
ribot-labs                   9
sebs-top-tips                8
sergii                       7
hitherejoe                   7
bherbst                      6
duhroach                     5
square-corner-blog           4
azimolabs                    4
shelajev                     4
exploring-android            4
amitshekhar                  4
manuelvicnt                  4
p.tournaris                  4
crunching-rxandroid          4
rocknnull                    3
keepsafe-engineering         3
building-for-android-tv      3
Name: site_with_author, dtype: int64

In [35]:
adf.loc[mask_authors, ('site_with_author')] = adf.site + '/' + adf.site_with_author
adf.loc[~mask_authors, ('site_with_author')] = adf.site



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [36]:
adf[adf.site == 'www.bignerdranch.com'].iloc[3].link

'http://www.bignerdranch.com/blog/implementing-swipe-to-refresh/'

In [37]:
adf.site_with_author.value_counts()[:40]

blog.stylingandroid.com                197
android-developers.blogspot.com         66
medium.com/google-developers            31
blog.danlew.net                         30
www.bignerdranch.com                    25
antonioleiva.com                        24
plus.google.com/AndroidDevelopers       23
hannesdorfmann.com                      22
www.philosophicalhacker.com             22
code.tutsplus.com                       21
riggaroo.co.za                          18
commonsware.com                         17
www.novoda.com                          17
frogermcs.github.io                     14
blog.sqisland.com                       14
hackernoon.com                          13
developer.android.com                   13
grokkingandroid.com                     13
ptrprograms.blogspot.com                12
ryanharter.com                          11
medium.com/google-developer-experts     11
android.jlelse.eu                       11
www.doubleencore.com                    11
www.android

In [38]:
adf.issue_num.max()

263

In [39]:
adf[adf.issue_num > (adf.issue_num.max() - 52)].site_with_author.value_counts()[:25]

blog.stylingandroid.com                37
www.philosophicalhacker.com            15
medium.com/google-developers           13
hackernoon.com                         13
riggaroo.co.za                         13
www.novoda.com                         12
android.jlelse.eu                      11
hannesdorfmann.com                      8
blog.danlew.net                         7
www.thedroidsonroids.com                7
jeroenmols.com                          7
commonsware.com                         7
medium.com/google-developer-experts     6
blog.mindorks.com                       6
m.signalvnoise.com                      6
www.raywenderlich.com                   5
upday.github.io                         5
code.tutsplus.com                       5
blog.egorand.me                         5
yalantis.com                            5
tech.trello.com                         5
www.andevcon.com                        4
medium.com/manuelvicnt                  4
medium.com/azimolabs              

In [40]:
adf[adf.year == 2017].site_with_author.value_counts()[:25]

blog.stylingandroid.com              19
www.philosophicalhacker.com          12
android.jlelse.eu                    11
hackernoon.com                       10
commonsware.com                       6
www.thedroidsonroids.com              6
hannesdorfmann.com                    6
blog.danlew.net                       6
blog.mindorks.com                     5
www.novoda.com                        5
www.raywenderlich.com                 5
medium.com/google-developers          4
riggaroo.co.za                        4
tech.trello.com                       4
jeroenmols.com                        3
medium.com/square-corner-blog         3
yalantis.com                          3
android-developers.googleblog.com     3
medium.com/rafael_toledo              3
medium.com/proandroiddev              3
collectiveidea.com                    3
medium.com/quiro91                    3
m.signalvnoise.com                    3
eng.uber.com                          3
medium.com/JorgeCastilloPr            3


In [41]:
adf[adf.year == 2016].site_with_author.value_counts()[:25]

blog.stylingandroid.com                36
medium.com/google-developers           19
riggaroo.co.za                         13
android-developers.blogspot.com        10
www.novoda.com                         10
code.tutsplus.com                       9
blog.nimbledroid.com                    9
hannesdorfmann.com                      8
jeroenmols.com                          7
medium.com/sergii                       6
www.bignerdranch.com                    6
medium.com/google-developer-experts     6
realm.io                                5
medium.com/sebs-top-tips                5
tomstechnicalblog.blogspot.com          5
medium.com/duhroach                     5
www.philosophicalhacker.com             5
upday.github.io                         5
wiresareobsolete.com                    4
robots.thoughtbot.com                   4
www.thedroidsonroids.com                4
artemzin.com                            4
medium.com/p.tournaris                  4
commonsware.com                   

In [42]:
# blog-author map
bam = {}
bam['blog.stylingandroid.com'] = 'Mark Allison'
bam['blog.danlew.net'] = 'Dan Lew'
bam['antonioleiva.com'] = 'Antonio Leiva'
bam['hannesdorfmann.com'] = 'Hannes Dorfmann'
bam['frogermcs.github.io'] = 'Miroslaw Stanek'
bam['ptrprograms.blogspot.com'] = 'Paul Trebilcox-Ruiz'
#doubleencore?
bam['commonsware.com'] = 'Mark Murphy'
bam['ryanharter.com'] = 'Ryan Harter'
bam['blog.sqisland.com'] = 'Chiu-Ki Chan'
bam['saulmm.github.io'] = 'Saúl Molinero'
bam['www.philosophicalhacker.com'] = 'Matt Dupree'
bam['www.androiddesignpatterns.com'] = 'Alex Lockwood'
bam['wiresareobsolete.com'] = 'Dave Smith'
bam['medium.com/sebs-top-tips'] = 'Sebastiano Poggi'
bam['udinic.wordpress.com'] = 'Udi Cohen'
bam['riggaroo.co.za'] = 'Rebecca Franks'
bam['chrisrisner.com'] = 'Chris Risner'
bam['artemzin.com'] = 'Artem Zinnatullin'
bam['trickyandroid.com'] = 'Pavel Dudka'
bam['grokkingandroid.com'] = 'Wolfram Rittmeyer'
bam['fernandocejas.com'] = 'Fernando Cejas'
bam['tomstechnicalblog.blogspot.com'] = 'Thomas Nield'

In [43]:
for key in bam:
    adf.loc[adf.site == key, 'author'] = bam[key]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [44]:
adf.author.value_counts()[:10]

Mark Allison         197
Dan Lew               30
Antonio Leiva         24
Matt Dupree           22
Hannes Dorfmann       22
Rebecca Franks        18
Mark Murphy           17
Miroslaw Stanek       14
Chiu-Ki Chan          14
Wolfram Rittmeyer     13
Name: author, dtype: int64

In [45]:
adf[adf.year == 2016].author.value_counts()[:10]

Mark Allison         36
Rebecca Franks       13
Hannes Dorfmann       8
Thomas Nield          5
Matt Dupree           5
Dave Smith            4
Artem Zinnatullin     4
Mark Murphy           4
Ryan Harter           3
Chiu-Ki Chan          3
Name: author, dtype: int64

In [46]:
adf[adf.year == 2015].author.value_counts()[:10]

Mark Allison         50
Dan Lew              12
Antonio Leiva        12
Miroslaw Stanek      10
Chiu-Ki Chan          8
Saúl Molinero         7
Hannes Dorfmann       7
Matt Dupree           5
Fernando Cejas        4
Artem Zinnatullin     4
Name: author, dtype: int64

In [47]:
adf[adf.year == 2014].author.value_counts()[:10]

Mark Allison           41
Paul Trebilcox-Ruiz    11
Dan Lew                10
Antonio Leiva           6
Ryan Harter             5
Pavel Dudka             3
Wolfram Rittmeyer       3
Alex Lockwood           3
Udi Cohen               3
Mark Murphy             2
Name: author, dtype: int64

In [48]:
adf[adf.year == 2013].author.value_counts()[:10]

Mark Allison         29
Udi Cohen             4
Alex Lockwood         3
Antonio Leiva         2
Wolfram Rittmeyer     2
Ryan Harter           1
Name: author, dtype: int64

In [49]:
adf[adf.year < 2013].author.value_counts()[:10]

Mark Allison         22
Chris Risner          7
Wolfram Rittmeyer     5
Dave Smith            1
Udi Cohen             1
Mark Murphy           1
Name: author, dtype: int64

In [50]:
authors_by_year = pd.pivot_table(adf, index='year', columns='author', aggfunc=lambda x: len(x.unique()), fill_value=0).issue_num
authors_by_year.head()

author,Alex Lockwood,Antonio Leiva,Artem Zinnatullin,Chiu-Ki Chan,Chris Risner,Dan Lew,Dave Smith,Fernando Cejas,Hannes Dorfmann,Mark Allison,...,Matt Dupree,Miroslaw Stanek,Paul Trebilcox-Ruiz,Pavel Dudka,Rebecca Franks,Ryan Harter,Saúl Molinero,Thomas Nield,Udi Cohen,Wolfram Rittmeyer
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2012,0,0,0,0,7,0,1,0,0,14,...,0,0,0,0,0,0,0,0,1,4
2013,3,2,0,0,0,0,0,0,0,24,...,0,0,0,0,0,1,0,0,4,2
2014,3,6,0,2,0,10,2,2,1,39,...,0,2,11,2,0,5,0,0,3,3
2015,3,12,4,8,0,12,1,4,7,49,...,4,10,1,3,1,2,7,1,0,2


In [51]:
top_authors_by_year = authors_by_year.loc[:, authors_by_year.sum(0) >= 10]
plotly.offline.iplot({
    "data": [{
        'x': top_authors_by_year.index,
        'y': top_authors_by_year[col],
        'name': col
        }  for col in top_authors_by_year.columns],
    "layout": Layout(yaxis=dict(title='Frequency'), xaxis=dict(title='Year'),
        title='Author Frequency by Year')
    })