In [15]:
import bigbang.mailman as mailman
import bigbang.process as process
from bigbang.archive import Archive


import pandas as pd
import datetime

from commonregex import CommonRegex

import matplotlib.pyplot as plt
%matplotlib inline

In [16]:
def filter_messages(df, column, keywords):
    """Return the rows of ``df`` whose ``column`` contains any of ``keywords``.

    Matching is case-insensitive. Each keyword is passed to
    ``Series.str.contains`` unchanged, so it is interpreted with that
    method's default pattern semantics.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    column : str
        Name of the text column to search.
    keywords : iterable of str
        Patterns to look for; a row is kept if at least one of them matches.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df``; an empty frame with the same columns
        when ``keywords`` is empty.
    """
    # ``reduce`` is only a builtin on Python 2; functools provides it on both 2 and 3.
    from functools import reduce

    # na=False: rows with a missing value in ``column`` count as "no match"
    # instead of putting NaN into the boolean mask.
    filters = [df[column].str.contains(keyword, case=False, na=False)
               for keyword in keywords]

    if not filters:
        # No keywords means nothing can match; reduce() would raise on an empty list.
        return df.iloc[0:0]

    return df[reduce(lambda p, q: p | q, filters)]


In [17]:
# Get the archives
pd.options.display.mpl_style = 'default'  # pandas has a set of preferred graph formatting options

# Local directory passed to mailman.open_list_archives as archive_dir.
# NOTE(review): this is a machine-specific absolute path; adjust it when running elsewhere.
ARCHIVE_DIR = "/home/hargup/code/bigbang/archives"

# NOTE(review): the first two entries are identical, so public-html is opened twice.
# The duplicate is kept so that the positions in `mlists` stay unchanged.
urls = ["https://lists.w3.org/Archives/Public/public-html/",
        "https://lists.w3.org/Archives/Public/public-html/",
        "https://lists.w3.org/Archives/Public/public-html-media/"]
mlists = [mailman.open_list_archives(url, archive_dir=ARCHIVE_DIR) for url in urls]

# The spaces around EME are **very** important, otherwise it can catch things like "emerging", "implement" etc.
# Only the first list (public-html) is searched here.
EME_KEYWORDS = [' EME ', 'Encrypted Media', 'Digital Rights Management']
eme_messages = filter_messages(mlists[0], 'Subject', EME_KEYWORDS)

# Activity table for the EME-related messages only.
eme_activites = Archive.get_activity(Archive(eme_messages))


Opening 69 archive files
Opening 69 archive files
Opening 48 archive files


In [18]:
# Sum down each column first, then across the column totals, to get the grand total
eme_activites.sum(axis=0).sum()

474.0

In [19]:
# XXX: Bugzilla might also contain discussions
# XXX: We only consider one mailing list
BUGZILLA_SENDER = "bugzilla@jessica.w3.org"

# Drop the bugzilla column from the activity table.
# The membership check makes the cell safe to re-run (an unconditional drop raises
# KeyError once the column is gone), and rebinding instead of inplace=True avoids
# mutating the frame produced by the earlier cell.
if BUGZILLA_SENDER in eme_activites.columns:
    eme_activites = eme_activites.drop(BUGZILLA_SENDER, axis=1)


In [20]:
# Remove duplicate senders
levdf = process.sorted_matrix(eme_activites)

# Gather pairs of names whose entry in levdf (a distance) is less than 10
consolidates = []
for col in levdf.columns:
    close_matches = levdf.loc[levdf[col] < 10, col]
    for index, value in close_matches.iteritems():
        if index == col:
            # a name shouldn't be paired with itself
            continue
        consolidates.append((col, index))

# Hand-picked pairs appended on top of the automatically detected ones
consolidates += [
    (u'Kornel Lesi\u0144ski <kornel@geekhood.net>',
     u'wrong string <kornel@geekhood.net>'),
    (u'Charles McCathie Nevile <chaals@yandex-team.ru>',
     u'Charles McCathieNevile <chaals@opera.com>'),
]

eme_activites = process.consolidate_senders_activity(eme_activites, consolidates)



In [21]:
# Load the tag file and index it by the lower-cased e-mail address
# extracted from each row's 'name_email' value.
sender_categories = pd.read_csv('people_tag.csv', delimiter=',', encoding="utf-8-sig")
sender_categories['email'] = sender_categories['name_email'].apply(
    lambda name_email: CommonRegex(name_email).emails[0].lower()
)
sender_categories.index = sender_categories['email']

# Display label for each integer code, keyed by category name.
cat_dicts = {
    "region": {
        1: "asia",
        2: "Australia and New Zealand",
        3: "europe",
        4: "africa",
        5: "north america",
        6: "south america",
    },
    "work": {
        1: "foss browswer developer",
        2: "content provider",
        3: "drm platform provider",
        4: "accessibility",
        5: "security researcher",
        6: "other w3c empoyee",
        7: "None of the above",
        8: "Privacy",
    },
}


In [22]:
def get_cat_val_func(cat):
    """Build a function that maps a sender string to its label for ``cat``.

    Parameters
    ----------
    cat : str
        Category name; used both as a key of ``cat_dicts`` and as a column
        of ``sender_categories``.

    Returns
    -------
    callable
        ``f(sender) -> str``. It extracts the first e-mail address found in
        ``sender``, lower-cases it, looks that address up in
        ``sender_categories`` and translates the stored code through
        ``cat_dicts[cat]``. It returns ``"Unknow"`` when the sender cannot
        be categorised.
    """
    def _get_cat_val(sender):
        try:
            sender_email = CommonRegex(sender).emails[0].lower()
            return cat_dicts[cat][sender_categories.loc[sender_email][cat]]
        except (KeyError, IndexError):
            # KeyError: the address is not in sender_categories, or its code
            # has no entry in cat_dicts[cat].
            # IndexError: no e-mail address was found in the sender string
            # (emails[0] on an empty result); without this a single odd
            # sender would abort the whole groupby.
            return "Unknow"
    return _get_cat_val

In [23]:
# Group the sender columns by region label and total the messages in each group
grouped = eme_activites.groupby(get_cat_val_func("region"), axis=1)
emails_per_region = grouped.sum().sum()

print("Emails sent per region")
print(emails_per_region)
print("Total emails: %s" % emails_per_region.sum())



Emails sent per region
Australia and New Zealand     16
europe                       146
north america                310
dtype: float64
Total emails: 472.0


In [24]:
# Number of sender columns that fall into each region group
print("Participants per region")
for group in grouped.groups:
    # Parenthesised form, consistent with the other print() calls in this cell
    print("%s: %s" % (group, len(grouped.get_group(group).sum())))
print("Total participants: %s" % len(eme_activites.columns))


Participants per region
europe: 13
north america: 30
Australia and New Zealand: 5
Total participants: 48


In [25]:
# Group the sender columns by work-category label and total the messages in each group
grouped = eme_activites.groupby(get_cat_val_func("work"), axis=1)
# The header names the actual grouping key ("work"); it previously said "region".
print("Emails sent per work category")
print(grouped.sum().sum())

Emails sent per region
None of the above           79
Privacy                      2
accessibility               47
content provider           186
drm platform provider       92
foss browswer developer     56
other w3c empoyee           10
dtype: float64


In [26]:
# Number of sender columns that fall into each group of the current `grouped` object
print("Participants per employer")
for group in grouped.groups:
    # Parenthesised form, consistent with the print() calls used elsewhere in the notebook
    print("%s: %s" % (group, len(grouped.get_group(group).sum())))

Participants per region
foss browswer developer: 5
Privacy: 2
accessibility: 4
other w3c empoyee: 3
drm platform provider: 14
content provider: 9
None of the above: 11


In [27]:
# Why is diversity important?
# Cultural attitudes in Asia and Europe are very different.
# The IP regimes are very different; for example, India does not allow software patents,
# and DRM laws are also very different. They are developing standards for the world
# while thinking about the US.