# Android Weekly analysis

Investigating the most popular blogs and authors over time.

In [3]:
from bs4 import BeautifulSoup
import urllib2
import csv
from os import listdir
from os.path import isfile, join
import pandas as pd

# URL template for a single issue page; '{}' is filled with the issue number
# via str.format (see contentForIssue).
BASE_URL = "http://androidweekly.net/issues/issue-{}"

## Data collection

Download and parse each Android Weekly issue.
1. `contentForIssue(num)` downloads the data for an issue
2. `dataForIssue(content, issue_num)`, `dataForIssueBefore103(content, issue_num)`, and `dataForIssueBefore60(content, issue_num)` parse the html from different issues into an array of objects
3. `writeData(data, fname)` writes the data for an issue to a csv for retrieval later

In [25]:
def contentForIssue(num):
    """Download one Android Weekly issue page and return its main content node.

    Args:
        num: Issue number; substituted into ``BASE_URL``.

    Returns:
        The first ``<div class="issue">`` element found by BeautifulSoup, or
        ``None`` when the page has no such element.

    Raises:
        urllib2.URLError: Propagated from ``urllib2.urlopen`` when the page
            cannot be fetched (``HTTPError``, e.g. a 500 response, is a
            subclass).
    """
    url = BASE_URL.format(num)
    print(url)
    # No custom request headers are sent. An 'Accept-Encoding' header
    # negotiates compression, not the character set, so it cannot be used to
    # ask for UTF-8; the body is simply decoded as UTF-8 below.
    response = urllib2.urlopen(url)
    try:
        html = response.read().decode('utf-8')
    finally:
        # Release the connection even if reading or decoding fails.
        response.close()
    soup = BeautifulSoup(html, 'lxml')
    return soup.find('div', {'class': 'issue'})

In [26]:
def dataForIssue(content, issue_num):
    """Parse an issue page that lays its items out in table cells.

    ``getAllData`` uses this parser for issue 103 and later.

    Every ``<td>`` under `content` is visited in document order. A cell that
    contains an ``<h2>`` or ``<h3>`` starts a new section; a cell whose first
    link has a non-empty ``.string`` becomes one item.

    Args:
        content: BeautifulSoup node for the issue (as returned by
            ``contentForIssue``).
        issue_num: Issue number stored on every item.

    Returns:
        A list of dicts with keys ``issue_num``, ``section``, ``title``,
        ``link``, ``site`` and ``description``. All text values are UTF-8
        encoded byte strings. ``link``, ``site`` and ``description`` are empty
        when the cell lacks an ``href``, ``<span>`` or ``<p>`` respectively.
    """
    def _text(tag):
        # A missing child tag is reported by BeautifulSoup as None; treat it
        # as empty text instead of failing with AttributeError.
        return tag.text if tag is not None else u""

    data = []
    last_section_header = ""
    section_count = 0
    for section in content.find_all('td'):
        # Prefer the <h2> heading, fall back to <h3>.
        header = section.h2 or section.h3
        if header:
            section_count += 1
            last_section_header = header.text
        if section.a and section.a.string:
            anchor = section.a
            item = {}
            item['issue_num'] = issue_num
            item['section'] = last_section_header.encode("utf-8").strip()
            item['title'] = anchor.text.encode("utf-8").replace('\n', '').strip()
            item['link'] = (anchor.get('href') or u"").encode("utf-8").strip()
            # The site is rendered as "(example.com)"; drop the parentheses.
            item['site'] = (_text(section.span)
                            .replace('(', '').replace(')', '').replace('\n', '')
                            .encode("utf-8").strip())
            item['description'] = _text(section.p).replace('\n', '').encode("utf-8").strip()
            data.append(item)
    print('Found %d items in %s sections' % (len(data), section_count))
    return data

In [27]:
def dataForIssueBefore103(content, issue_num):
    """Parse an issue page that uses the older ``<h2>`` / ``<div>`` / ``<p>`` layout.

    ``getAllData`` uses this parser for issues 60-102, and (through
    ``dataForIssueBefore60``) for earlier ones.

    Elements are visited in document order:

    * ``<h2>`` starts a new section.
    * ``<div>`` with more than one link yields an item taken from its SECOND
      link, its ``<span>`` (site) and its ``<p>`` (description).
    * ``<p>`` whose first link has text yields an item taken from that link,
      an optional ``<span>`` (site) and the node following the first ``<br>``
      (description).

    Args:
        content: BeautifulSoup node to search.
        issue_num: Issue number stored on every item.

    Returns:
        A list of dicts with keys ``issue_num``, ``section``, ``title``,
        ``link``, ``site`` and ``description``. Text values are UTF-8 encoded
        byte strings; parts missing from the markup become empty strings.
    """
    def _text(tag):
        # BeautifulSoup returns None for a child tag that does not exist.
        return tag.text if tag is not None else u""

    def _site(span):
        # The site is rendered as "(example.com)"; drop the parentheses.
        return (_text(span)
                .replace('(', '').replace(')', '').replace('\n', '')
                .encode("utf-8").strip())

    def _item(anchor, site, description):
        # Build one record for the current section from a link plus its
        # already-cleaned site and description.
        item = {}
        item['issue_num'] = issue_num
        item['section'] = last_section_header.encode("utf-8").strip()
        # .text (not .string): .string is None when the link wraps nested
        # markup, while .text yields the same value whenever .string is set.
        item['title'] = anchor.text.encode("utf-8").replace('\n', '').strip()
        item['link'] = (anchor.get('href') or u"").encode("utf-8").strip()
        item['site'] = site
        item['description'] = description
        return item

    data = []
    last_section_header = ""
    section_count = 0
    for section in content.find_all(['h2', 'div', 'p']):
        if section.name == 'h2':
            section_count += 1
            last_section_header = section.text
        elif section.name == 'div':
            links = section.find_all('a')
            if len(links) > 1:
                description = _text(section.p).replace('\n', '').encode("utf-8").strip()
                data.append(_item(links[1], _site(section.span), description))
        elif section.name == 'p' and section.a and section.a.text:
            if section.br and section.br.next_sibling:
                description = section.br.next_sibling.replace('\n', '').encode("utf-8").strip()
            else:
                description = ""
            data.append(_item(section.a, _site(section.span), description))
    print('Found %d items in %s sections' % (len(data), section_count))
    return data

In [28]:
def dataForIssueBefore60(content, issue_num):
    """Parse an issue older than #60 (per the dispatch in ``getAllData``).

    Descends into the first ``<div>`` child of `content` and hands that node
    to ``dataForIssueBefore103``, whose result is returned unchanged.
    """
    inner = content.div
    return dataForIssueBefore103(inner, issue_num)

In [29]:
def writeData(data, fname):
    """Write the items of one issue to a CSV file with a header row.

    Args:
        data: List of item dicts sharing the same keys. The keys of the first
            item define the CSV columns.
        fname: Path of the CSV file to create (overwritten if it exists).

    When `data` is empty there is no first item to take the column names
    from, so no file is written and a notice is printed instead of raising
    ``IndexError``.
    """
    if not data:
        print('No items to write to %s; skipping' % fname)
        return
    # 'wb' is the mode the Python 2 csv module expects.
    with open(fname, 'wb') as f:
        w = csv.DictWriter(f, data[0].keys())
        w.writeheader()
        w.writerows(data)

## Get all the data!

With the above functions available, we can download all the Android Weekly posts

In [30]:
def getAllData(issueNums):
    """Download, parse and save every issue in `issueNums`.

    Each issue is fetched with ``contentForIssue``, parsed with the parser
    matching its number, and written to ``issue_<num>.csv`` in the current
    working directory via ``writeData``.

    Args:
        issueNums: Iterable of issue numbers, processed in iteration order.
    """
    for issue_num in issueNums:
        content = contentForIssue(issue_num)
        # The page markup changed at issues 60 and 103; pick the matching parser.
        if issue_num < 60:
            parse = dataForIssueBefore60
        elif issue_num < 103:
            parse = dataForIssueBefore103
        else:
            parse = dataForIssue
        items = parse(content, issue_num)
        writeData(items, 'issue_{}.csv'.format(issue_num))

In [33]:
# 500 errors: 138, 108, 43
# Skipped b/c unusual: 30
# Issues 1..216 minus the ones listed above, scraped newest first.
issueNums = list(n for n in range(1, 217) if n not in (138, 108, 43, 30))
getAllData(reversed(issueNums))

http://androidweekly.net/issues/issue-216
Found 26 items in 6 sections
http://androidweekly.net/issues/issue-215
Found 28 items in 7 sections
http://androidweekly.net/issues/issue-214
Found 21 items in 8 sections
http://androidweekly.net/issues/issue-213
Found 30 items in 7 sections
http://androidweekly.net/issues/issue-212
Found 23 items in 7 sections
http://androidweekly.net/issues/issue-211
Found 29 items in 9 sections
http://androidweekly.net/issues/issue-210
Found 28 items in 9 sections
http://androidweekly.net/issues/issue-209
Found 27 items in 10 sections
http://androidweekly.net/issues/issue-208
Found 25 items in 7 sections
http://androidweekly.net/issues/issue-207
Found 25 items in 8 sections
http://androidweekly.net/issues/issue-206
Found 25 items in 8 sections
http://androidweekly.net/issues/issue-205
Found 34 items in 8 sections
http://androidweekly.net/issues/issue-204
Found 34 items in 10 sections
http://androidweekly.net/issues/issue-203
Found 29 items in 7 sections
http

## Load Data

Loading the data from the csv files into memory makes it easy to resume work.

In [27]:
def loadDataFiles(data_dir='data'):
    """Load every ``*.csv`` file in `data_dir` into a single DataFrame.

    Args:
        data_dir: Directory holding the per-issue CSV files. Defaults to
            ``'data'``.

    Returns:
        One DataFrame with the rows of all files stacked under a fresh
        0..n-1 index, or ``None`` when the directory holds no CSV files.
        Files are read in sorted filename order so the row order is
        reproducible (``listdir`` order is arbitrary).
    """
    csv_names = sorted(
        f for f in listdir(data_dir)
        if isfile(join(data_dir, f)) and f.endswith('.csv')
    )
    if not csv_names:
        return None
    # Read everything first and concatenate once: appending frame by frame
    # copies all previously loaded rows on every iteration.
    frames = [pd.read_csv(join(data_dir, name)) for name in csv_names]
    return pd.concat(frames, ignore_index=True)
    
# Load all scraped CSV files into one DataFrame and preview the first rows.
df = loadDataFiles()
df.head()

Unnamed: 0,description,title,section,site,issue_num,link
0,An Android Tutorial aimed for iOS Developers.,http://clayallsopp.posterous.com/building-an-a...,Articles and Tutorials,,1,http://clayallsopp.posterous.com/building-an-a...
1,Selenium Webdriver is now available for your A...,http://android-developers.blogspot.com/2011/10...,Articles and Tutorials,,1,http://android-developers.blogspot.com/2011/10...
2,For our dev rookies we got an Java tutorial wh...,http://mobile.tutsplus.com/tutorials/android/j...,Articles and Tutorials,,1,http://mobile.tutsplus.com/tutorials/android/j...
3,The brandnew nexus with Android 4.0 aka Ice Cr...,http://www.google.com/nexus/,Headlines,,1,http://www.google.com/nexus/
4,A very cool info graphic about the Rise of And...,http://bbgeeks.com/images/AndroidRise.png,Headlines,,1,http://bbgeeks.com/images/AndroidRise.png


# Analysis

#### These are the primary questions I want to answer:
1. What are the most prolific blogs? Have those changed over time?
2. Who are the most prolific bloggers? Have they changed over time?
3. What are the most common topics of discussion? Have those changed over time?

#### These are questions of secondary interest:
1. What are the most popular library topics? Have those changed over time?
2. Who are the most prolific library contributors? Have they changed over time?

### The sections

In [28]:
# Rows (with a non-null issue_num) per section name, largest first; top 15.
df.groupby('section').count().sort_values(by='issue_num', ascending=False).issue_num[:15]

section
Articles & Tutorials      1558
Libraries & Code           709
Jobs                       459
Sponsored                  219
News                       193
Videos & Podcasts          192
Tools                      168
Design                     147
Videos                     113
Events                      49
Specials                    44
ARTICLES AND TUTORIALS      32
App of the Week             28
LIBRARIES AND CODE          27
Screencasts                 22
Name: issue_num, dtype: int64

## What are the most prolific blogs? Have those changed over time?

To answer this, I will look at the most common sites overall. Then I will segment the data by year.

In [72]:
# Keep only the article rows; the section name occurs in two spellings.
# .copy() makes adf an independent frame, so adding a column to it later does
# not trigger SettingWithCopyWarning or depend on whether df[...] gave a view.
# NOTE(review): df.head() also shows the spelling 'Articles and Tutorials'
# (issue 1), which this filter does not match - confirm whether those rows
# should be included.
adf = df[df.section.isin(['Articles & Tutorials', 'ARTICLES AND TUTORIALS'])].copy()
adf.issue_num.count()

1590

In [38]:
# Article rows per site over all issues, largest first; top 30.
adf.groupby('site').count().sort_values(by='issue_num', ascending=False).issue_num[:30]

site
medium.com                         176
blog.stylingandroid.com            164
plus.google.com                     90
android-developers.blogspot.com     65
speakerdeck.com                     29
blog.danlew.net                     24
www.bignerdranch.com                23
antonioleiva.com                    22
code.tutsplus.com                   17
hannesdorfmann.com                  15
frogermcs.github.io                 13
ptrprograms.blogspot.com            12
www.doubleencore.com                11
commonsware.com                     11
developer.android.com               11
ryanharter.com                      11
corner.squareup.com                 10
blog.nimbledroid.com                10
blog.sqisland.com                   10
www.androiddesignpatterns.com        9
www.slideshare.net                   9
saulmm.github.io                     9
www.philosophicalhacker.com          9
udinic.wordpress.com                 8
android-developers.blogspot.de       8
vogella.de          

#### Most popular blogs of 2016

In [98]:
# Boolean row masks over adf, one per period, defined by issue-number ranges.
# NOTE(review): the issue numbers are presumably the first issue of each
# calendar year; the issue-to-date mapping is not shown here - confirm.
mask_2016 = adf.issue_num.ge(191)
mask_2015 = adf.issue_num.ge(134) & adf.issue_num.lt(191)
mask_2014 = adf.issue_num.ge(83) & adf.issue_num.lt(134)
mask_2013 = adf.issue_num.ge(49) & adf.issue_num.lt(83)
mask_before_2013 = adf.issue_num.lt(49)

In [93]:
# Top 10 sites among article rows with issue_num >= 191 (mask_2016).
adf[mask_2016].groupby('site').count().sort_values(by='issue_num', ascending=False).issue_num[:10]

site
medium.com                         83
blog.stylingandroid.com            22
android-developers.blogspot.com     9
blog.nimbledroid.com                8
riggaroo.co.za                      7
hannesdorfmann.com                  7
code.tutsplus.com                   6
realm.io                            5
speakerdeck.com                     5
tomstechnicalblog.blogspot.com      5
Name: issue_num, dtype: int64

#### Most popular blogs of 2015

In [94]:
# Top 10 sites among article rows with 134 <= issue_num < 191 (mask_2015).
adf[mask_2015].groupby('site').count().sort_values(by='issue_num', ascending=False).issue_num[:10]

site
medium.com                         83
blog.stylingandroid.com            50
plus.google.com                    34
android-developers.blogspot.com    19
www.bignerdranch.com               13
blog.danlew.net                    12
speakerdeck.com                    12
antonioleiva.com                   12
code.tutsplus.com                  11
frogermcs.github.io                10
Name: issue_num, dtype: int64

#### Most popular blogs of 2014

In [95]:
# Top 10 sites among article rows with 83 <= issue_num < 134 (mask_2014).
adf[mask_2014].groupby('site').count().sort_values(by='issue_num', ascending=False).issue_num[:10]

site
blog.stylingandroid.com            41
plus.google.com                    35
android-developers.blogspot.com    21
ptrprograms.blogspot.com           11
blog.danlew.net                    10
speakerdeck.com                     9
medium.com                          9
antonioleiva.com                    6
github.com                          5
www.bignerdranch.com                5
Name: issue_num, dtype: int64

#### Most popular blogs of 2013

In [96]:
# Top 10 sites among article rows with 49 <= issue_num < 83 (mask_2013).
adf[mask_2013].groupby('site').count().sort_values(by='issue_num', ascending=False).issue_num[:10]

site
blog.stylingandroid.com            29
plus.google.com                    15
android-developers.blogspot.com     8
www.doubleencore.com                7
www.kpbird.com                      4
android-developers.blogspot.de      4
udinic.wordpress.com                4
gmariotti.blogspot.de               4
speakerdeck.com                     3
www.androiddesignpatterns.com       3
Name: issue_num, dtype: int64

#### Most popular blogs before 2013

In [97]:
# Top 10 sites among article rows with issue_num < 49 (mask_before_2013).
adf[mask_before_2013].groupby('site').count().sort_values(by='issue_num', ascending=False).issue_num[:10]

site
blog.stylingandroid.com            22
vogella.de                          8
android-developers.blogspot.com     8
chrisrisner.com                     7
grokkingandroid.com                 5
plus.google.com                     4
developer.android.com               4
mobile.tutsplus.com                 3
vogella.com                         3
androiduipatterns.com               3
Name: issue_num, dtype: int64

## Who are the most prolific bloggers? Have they changed over time?

In [73]:
# Rows hosted on medium.com or plus.google.com; for these the author is
# derived from the link path rather than from the site.
mask = adf.site.isin(['medium.com', 'plus.google.com'])

In [89]:
# Author handle = first path segment of the link: 'http://host/<handle>/...'
# split on '/' puts it at index 3. The '@' and '+' characters are removed.
# NOTE(review): the author 'u' in the pre-2013 output suggests some links
# carry an extra path prefix (presumably plus.google.com/u/<n>/...) - confirm.
adf.loc[mask, 'author'] = (
    adf.link.str.split('/')
    .str.get(3)
    .str.replace('@', '')
    .str.replace('+', '')
)

In [90]:
# Number of distinct author handles (rows without an author are ignored).
# Equivalent to counting the groups of groupby('author'), without building
# and sorting the per-author count table first.
adf.author.nunique()

137

In [91]:
# Article rows per author handle over all issues, largest first; top 20.
adf.groupby('author').count().sort_values(by='issue_num', ascending=False).issue_num[:20]

author
AndroidDevelopers           23
google-developers           22
ribot-labs                   9
sebs-top-tips                8
hitherejoe                   6
google-developer-experts     5
sergii                       4
duhroach                     4
shelajev                     4
crunching-rxandroid          3
_tiwiz                       3
rotxed                       3
CyrilMottier                 3
CodingDoug                   3
ChrisBanes                   3
juanchosaravia               3
KirillGrouchnikov            3
bherbst                      3
113735310430199015092        3
RomanNurik                   3
Name: issue_num, dtype: int64

In [102]:
# Top 10 author handles among article rows with issue_num >= 191 (mask_2016).
adf[mask_2016].groupby('author').count().sort_values(by='issue_num', ascending=False).issue_num[:10]

author
google-developers           14
sebs-top-tips                5
duhroach                     4
sergii                       4
shelajev                     3
CodingDoug                   3
juanchosaravia               3
andrzejchm                   2
google-developer-experts     2
_tiwiz                       2
Name: issue_num, dtype: int64

In [101]:
# Top 10 author handles among article rows with 134 <= issue_num < 191 (mask_2015).
adf[mask_2015].groupby('author').count().sort_values(by='issue_num', ascending=False).issue_num[:10]

author
AndroidDevelopers           14
ribot-labs                   9
google-developers            8
hitherejoe                   4
google-developer-experts     3
sebs-top-tips                3
etiennelawlor                2
ChrisBanes                   2
ipaulpro                     2
sotti                        2
Name: issue_num, dtype: int64

In [103]:
# Top 10 author handles among article rows with 83 <= issue_num < 134 (mask_2014).
adf[mask_2014].groupby('author').count().sort_values(by='issue_num', ascending=False).issue_num[:10]

author
AndroidDevelopers          7
CyrilMottier               3
KirillGrouchnikov          3
IanLake                    2
ArneStockmans              2
JakeWharton                2
RomanNurik                 2
building-for-android-tv    2
StevePomeroy               1
ThomasDevauxPlus           1
Name: issue_num, dtype: int64

In [104]:
# Top 10 author handles among article rows with 49 <= issue_num < 83 (mask_2013).
adf[mask_2013].groupby('author').count().sort_values(by='issue_num', ascending=False).issue_num[:10]

author
MatthiasSchaff           2
RomainGuy                2
104992153032750621070    1
106183159594179737258    1
107708120842840792570    1
108284392618554783657    1
108612553581259107752    1
118417777153109946393    1
AndroidDevelopers        1
ChristopherBroadfoot     1
Name: issue_num, dtype: int64

In [105]:
# Top 10 author handles among article rows with issue_num < 49 (mask_before_2013).
adf[mask_before_2013].groupby('author').count().sort_values(by='issue_num', ascending=False).issue_num[:10]

author
113735310430199015092    3
u                        1
Name: issue_num, dtype: int64