In [1]:
import bigbang.mailman as mailman
import bigbang.process as process
from bigbang.archive import Archive


import pandas as pd
import datetime

from commonregex import CommonRegex

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
def filter_messages(df, column, keywords):
    """
    Return the rows of ``df`` whose ``column`` matches at least one keyword.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    column : str
        Name of the text column to search.
    keywords : iterable of str
        Patterns to look for. Each one is passed to ``Series.str.contains``
        with ``case=False``, so it is matched case-insensitively and is
        interpreted as a regular expression (the pandas default).

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df``, in their original order. An empty
        ``keywords`` selects no rows.
    """
    # Start from an all-False mask so that an empty keyword list simply
    # selects nothing instead of failing on an empty reduction.
    mask = pd.Series(False, index=df.index)
    for keyword in keywords:
        # na=False: rows whose value is missing count as "no match"; without
        # it the mask contains NaN and cannot be used for boolean indexing.
        mask = mask | df[column].str.contains(keyword, case=False, na=False)

    return df[mask]


In [3]:
# Get the archives
pd.options.display.mpl_style = 'default'  # pandas has a set of preferred graph formatting options

# XXX: this archive_dir is a machine-specific absolute path and will not work
# for other people; point it at your own local archive directory.
mlist = mailman.open_list_archives(
    "https://lists.w3.org/Archives/Public/public-html/",
    archive_dir="/home/hargup/code/bigbang/archives")

# The spaces around EME are **very** important, otherwise the filter also
# catches words such as "emerging", "implement", etc.
# NOTE(review): because of those spaces a subject that starts or ends with
# "EME" is presumably not matched either -- confirm whether that is intended.
eme_messages = filter_messages(
    mlist, 'Subject',
    [' EME ', 'Encrypted Media', 'Digital Rights Management'])
eme_activites = Archive.get_activity(Archive(eme_messages))


Opening 69 archive files


  self.data.sort(columns='Date', inplace=True)


In [4]:
# Grand total over every cell of the activity matrix:
# sum down each column first, then add the column totals together.
eme_activites.sum(axis=0).sum()

474.0

In [None]:
# XXX: Bugzilla might also contain discussions
# Remove the Bugzilla sender's column. Rebinding the name (instead of using
# inplace=True) and checking that the column is still present keeps this
# cell safe to run more than once.
BUGZILLA_SENDER = "bugzilla@jessica.w3.org"
if BUGZILLA_SENDER in eme_activites.columns:
    eme_activites = eme_activites.drop(BUGZILLA_SENDER, axis=1)


In [None]:
# Remove duplicate senders
# levdf holds a pairwise distance between sender names
# (presumably an edit distance -- confirm against process.sorted_matrix).
levdf = process.sorted_matrix(eme_activites)

# Gather every pair of distinct names whose distance is less than 10.
# A name is never paired with itself.
consolidates = [
    (sender, other)
    for sender in levdf.columns
    for other, _distance in levdf.loc[levdf[sender] < 10, sender].iteritems()
    if other != sender
]

# Hand-picked special cases which aren't covered by string matching
consolidates += [
    (u'Kornel Lesi\u0144ski <kornel@geekhood.net>',
     u'wrong string <kornel@geekhood.net>'),
    (u'Charles McCathie Nevile <chaals@yandex-team.ru>',
     u'Charles McCathieNevile <chaals@opera.com>'),
]

# Merge the activity of each paired sender
eme_activites = process.consolidate_senders_activity(eme_activites, consolidates)

In [None]:
# Load the per-sender tags; the `name_email` column is read below and the
# category columns are looked up later by sender email.
sender_categories = pd.read_csv('people_tag.csv', delimiter=',', encoding="utf-8-sig")

# match sender using email only
# A list comprehension (rather than map) produces a real list on both
# Python 2 and Python 3, which is what the column assignment needs.
sender_categories['email'] = [
    CommonRegex(name_email).emails[0].lower()
    for name_email in sender_categories['name_email']
]

# Index by the lower-cased email so rows can be fetched with .loc[email]
sender_categories.index = sender_categories['email']
# Human-readable labels for the integer category codes, keyed first by
# category type ("region" / "work") and then by code.
cat_dicts = {
    "region": {
        1: "Asia",
        2: "Australia and New Zealand",
        3: "Europe",
        4: "Africa",
        5: "North America",
        6: "South America",
    },
    "work": {
        1: "Foss Browser Developer",
        2: "Content Provider",
        3: "DRM platform provider",
        4: "Accessibility",
        5: "Security Researcher",
        # NOTE(review): label spelling kept as-is because it is emitted at runtime
        6: "Other W3C Empoyee",
        7: "Privacy",
        8: "None of the above",
    },
}

In [None]:
def get_cat_val_func(cat):
    """
    Given category type, returns a function which gives the category value for a sender.

    Parameters
    ----------
    cat : str
        Category type; used both as a key of ``cat_dicts`` and as a column
        name of ``sender_categories`` (e.g. "region" or "work").

    Returns
    -------
    callable
        A function mapping a sender string to its category label, or to
        "Unknow" when the sender cannot be categorised.
    """
    def _get_cat_val(sender):
        try:
            # Senders are matched on their lower-cased email address only
            sender_email = CommonRegex(sender).emails[0].lower()
            return cat_dicts[cat][sender_categories.loc[sender_email][cat]]
        except (KeyError, IndexError):
            # KeyError: the email is not in sender_categories, or its code
            # has no label in cat_dicts.
            # IndexError: no email address could be extracted from `sender`.
            # NOTE(review): the "Unknow" spelling is kept because callers may
            # rely on this exact group label.
            return "Unknow"
    return _get_cat_val

In [None]:
# Group the sender columns by region label and total the activity per group
grouped = eme_activites.groupby(get_cat_val_func("region"), axis=1)
emails_per_region = grouped.sum().sum()
print("Emails sent per region\n")
print(emails_per_region)
print("Total emails: %s" % emails_per_region.sum())

In [None]:
# Count the sender columns that fall into each group of `grouped`
print("Participants per region")
for group in grouped.groups:
    # print() with a single pre-formatted string behaves the same on
    # Python 2 and Python 3, matching the other print calls in this notebook.
    print("%s: %s" % (group, len(grouped.get_group(group).sum())))
print("Total participants: %s" % len(eme_activites.columns))


In [None]:
# Regroup the sender columns, this time by work-category label
grouped = eme_activites.groupby(get_cat_val_func("work"), axis=1)
emails_per_work = grouped.sum().sum()
print("Emails sent per work category")
print(emails_per_work)

In [None]:
# Count the sender columns that fall into each group of `grouped`
print("Participants per work category")
for group in grouped.groups:
    # print() with a single pre-formatted string behaves the same on
    # Python 2 and Python 3, matching the other print calls in this notebook.
    print("%s: %s" % (group, len(grouped.get_group(group).sum())))

In [None]:
# Why is diversity important?
# Cultural attitudes in Asia and Europe are very different.
# The IP regimes are very different; for example, India does not allow software patents,
# and DRM laws are also very different. They are developing standards for the world
# while thinking about the US.