This notebook seeks to explore the gender diversity of the different apache projects & the process

In [1]:
!pwd
!pip install meetup.api
!ls /hadoop/spark/tmp/
!cd /hadoop/spark/tmp/spark-*/userFiles-*/
!pwd
!rm lazy_helpers.py
!wget https://raw.githubusercontent.com/holdenk/diversity-analytics/master/lazy_helpers.py

/
Collecting meetup.api
  Downloading https://files.pythonhosted.org/packages/b7/44/a545b860f19cac088cb7f3e39beae8334b63e025729348ddeffb3342122a/meetup_api-0.1.1-py2.py3-none-any.whl (229kB)
[K    100% |████████████████████████████████| 235kB 8.2MB/s ta 0:00:01
Installing collected packages: meetup.api
Successfully installed meetup.api
blockmgr-e352b968-7f0a-4623-9eac-b6b0b599e202
spark-399e062b-2ef6-45d2-a612-c3640a2f853a
/
--2018-04-12 17:38:26--  https://raw.githubusercontent.com/holdenk/diversity-analytics/master/lazy_helpers.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1265 (1.2K) [text/plain]
Saving to: ‘lazy_helpers.py’


2018-04-12 17:38:26 (37.1 MB/s) - ‘lazy_helpers.py’ saved [1265/1265]



In [2]:
import os
os.environ['PATH'] = os.environ['PATH'] + ":/usr/lib/chromium/"

In [62]:
from pyspark import *
from pyspark.sql import *
from pyspark.sql.functions import concat, collect_set, explode, from_json, format_string
from pyspark.sql import functions as F
from pyspark.sql.types import *

import json
import os
import meetup.api
from copy import copy
import time
import logging

API key configuration

In [8]:
meetup_key = os.getenv("MEETUP_APIKEY")
gh_api_token = os.getenv("GITHUB_TOKEN")
fs_prefix = "gs://boo-stuff/"

Less secret configuration

In [9]:
max_meetup_events = 800

In [10]:
session = SparkSession.builder.appName("whatCanWeLearnFromTheSixties").getOrCreate()
sc = session.sparkContext

The first thing we want to get is the committers and PMC members, this information is stored in LDAP but also available in JSON. Eventually we will want to enrich this with mailing list information

In [11]:
def loadFlatJsonFile(path, explodeKey, schema=None):
    """Load a flat multi-line json file and convert into Spark & explode"""
    rdd = sc.wholeTextFiles(path).values().setName("Input file {}".format(path))
    df = (session.read.schema(schema)
            .json(rdd))
    return df.select(explode(explodeKey))

In [12]:
apache_people_schema = StructType([StructField("lastCreateTimestamp", StringType()),
                     StructField("people",
                                 MapType(StringType(), 
                                         StructType([StructField('name', StringType()),
                                                     StructField('key_fingerprints', ArrayType(StringType())),
                                                     StructField('urls', ArrayType(StringType())),
                                                    ]))
                                )])
apache_poeple_df_file = "{0}{1}".format(fs_prefix, "http_data_sources/public_ldap_people.json") # http://people.apache.org/public/public_ldap_people.json
apache_people_df = loadFlatJsonFile(path=apache_poeple_df_file, 
                                 explodeKey="people", schema=apache_people_schema)
apache_people_df = apache_people_df.select(apache_people_df.key.alias("username"), apache_people_df.value.alias("extra")).repartition(100).persist().alias("apache_people")
apache_people_df.alias("Apache Committers")

DataFrame[username: string, extra: struct<name:string,key_fingerprints:array<string>,urls:array<string>>]

In [13]:
sc.addFile("lazy_helpers.py")

In [14]:
# Construct a lazy urllib3 pool
from lazy_helpers import *
    
bcast_pool = sc.broadcast(LazyPool)
bcast_pool.value

lazy_helpers.LazyPool

In [15]:
def project_on_github(project):
    """Returns if a project is on github"""
    import urllib3
    http = bcast_pool.value.get()
    r = http.request('GET', "https://github.com/apache/{0}".format(project))
    return r.status == 200
session.catalog.registerFunction("on_github", project_on_github, BooleanType())
# Except I'm a bad person so....
from pyspark.sql.catalog import UserDefinedFunction
project_on_github_udf = UserDefinedFunction(project_on_github, BooleanType(), "on_github")
session.catalog._jsparkSession.udf().registerPython("on_github", project_on_github_udf._judf)

In [16]:
apache_committees_schema = StructType([StructField("lastCreateTimestamp", StringType()),
                     StructField("committees",
                                 MapType(StringType(), StructType([StructField('roster', ArrayType(StringType())),
                                                                  StructField('modifyTimestamp', StringType()),
                                                                  StructField('createTimestamp', StringType())
                                                                  ])))])
apache_committees_df_file = "{0}{1}".format(fs_prefix, "http_data_sources/public_ldap_committees.json") # http://people.apache.org/public/public_ldap_committees.json
apache_committees_df = loadFlatJsonFile(path=apache_committees_df_file,
                                 explodeKey="committees", schema=apache_committees_schema)
apache_committees_on_github_df = apache_committees_df.filter(project_on_github_udf(apache_committees_df.key))
apache_committees_on_github_df.persist(StorageLevel.MEMORY_AND_DISK)
committee_names_df = apache_committees_on_github_df.select(apache_committees_df.key.alias("project")).alias("apache_committees").repartition(200)
committee_names_df.persist(StorageLevel.MEMORY_AND_DISK)
committee_names_df.alias("Apache Committee Names")
committee_names_df.count()

163

In [17]:
project_to_user_df = apache_committees_on_github_df.select(
    apache_committees_on_github_df.key.alias("project"),
    explode(apache_committees_on_github_df.value.roster).alias("username"))


user_to_project_df = project_to_user_df.groupBy(project_to_user_df.username).agg(
    collect_set(project_to_user_df.project).alias("projects"))
apache_people_df = apache_people_df.join(user_to_project_df, on="username")
apache_people_df.alias("Apache People joined with projects")

DataFrame[username: string, extra: struct<name:string,key_fingerprints:array<string>,urls:array<string>>, projects: array<string>]

In [18]:
apache_people_df.take(1)

[Row(username='aching', extra=Row(name='Avery Ching', key_fingerprints=['2DC0BC2C'], urls=None), projects=['giraph'])]

Attempt to fetch relevant past & present meetups for each project - idea based on the listing at https://www.apache.org/events/meetups.html but different code

We want to do a non-blocking count to materialize the meetup RDD because this is slow

In [110]:
# Some async helpers, in Scala we would use AsyncRDDActions but its not currently available in Python
# Support is being considered in https://issues.apache.org/jira/browse/SPARK-20347
def non_blocking_rdd_count(rdd):
    import threading
    def count_magic():
        rdd.count()
    thread = threading.Thread(target=count_magic)
    thread.start()

def non_blocking_rdd_save(rdd, target):
    import threading
    def save_panda():
        rdd.saveAsPickleFile(target)
    thread = threading.Thread(target=save_panda)
    thread.start()

def non_blocking_df_save(df, target):
    import threading
    def save_panda():
        df.write.save(target)
    thread = threading.Thread(target=save_panda)
    thread.start()

def non_blocking_df_save_or_load(df, target):
    fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(sc._jvm.java.net.URI(fs_prefix), sc._jsc.hadoopConfiguration())
    success_files = ["{0}/SUCCESS.txt", "{0}/_SUCCESS", "{0}"]
    if any(fs.exists(sc._jvm.org.apache.hadoop.fs.Path(t.format(target))) for t in success_files):
        return session.read.load(target)
    else:
        non_blocking_df_save(df, target)
        return df    

In [92]:
logger = logging.getLogger()
logger.setLevel("WARN")
# For now, this is an avenue of future exploration, AKA holden's doesn't want her meetup API keys banned
def lookup_relevant_meetup(project_name, max_meetup_events=0):
    """Lookup relevant meetups for a specific project."""
    import logging
    import time
    import meetup.api
    logger = logging.getLogger()
    meetup_delay = 30
    meetup_reset_delay = 3600 # 1 hour
    standard_keys = {"text_format": "plain", "trending": "desc=true", "and_text": "true", "city": "san francisco", "country": "usa", "text": "apache " + project_name, "radius": 10000}
    results = {"upcoming": [], "past": []}
    for status in ["upcoming", "past"]:
        keys = copy(standard_keys)
        keys["status"] = status
        count = 200
        base = 0
        while (count == 200 and (max_meetup_events == 0 or base < max_meetup_events)):
            logging.debug("Fetch {0} meetups for {1} on base {2}".format(status, project_name, base))
            project_name = "spark"
            client = client = meetup.api.Client(meetup_key)
            if base > 0:
                keys["page"] = base
            # Manually sleep for meetup_reset_delay on failure, the meetup-api package retry logic sometimes breaks :(
            response = None
            retry_count = 0
            while response is None and retry_count < 10:
                try:
                    response = client.GetOpenEvents(**keys)
                except:
                    response = None
                    retry_count += 1
                    time.sleep(meetup_reset_delay)
                    try:
                        response = client.GetOpenEvents(**keys)
                    except:
                        response = None
            try:
                count = response.meta['count']
                base = base + count
                results[status].append(response.results)
                time.sleep(meetup_delay)
            except:
                count = 0
    return (project_name, results)

In [93]:
#project_meetups_rdd = committee_names_df.repartition(500).rdd.map(lambda x: x.project).map(lambda name: lookup_relevant_meetup(name, max_meetup_events))
#project_meetups_rdd.setName("Meetup Data RDD")

In [94]:
#project_meetups_rdd.persist(StorageLevel.MEMORY_AND_DISK)
#raw_project_meetups_df = project_meetups_rdd.toDF() 
#raw_project_meetups_df.alias("Project -> meetup dataframe")

In [95]:
#project_meetups_df = non_blocking_df_save(raw_project_meetups_df, "mini_meetup_data")

In [96]:
#project_meetups_df.show()

In [97]:
#project_meetups_df.schema

For the provided projects attempt to lookup their GitHub

In [98]:
def lookup_project_git(org, project):
    """Returns the project github for a specific project. Assumes project is git hosted"""
    return "https://github.com/{0}/{1}.git".format(org, project)
    

In [99]:
def fetch_project_github_data(org, project):
    """Fetch the project github data, note this only gets github issues so likely not super useful"""
    from perceval.backends.core.github import GitHub as perceval_github
    gh_backend = perceval_github(owner=org, repository=project, api_token=gh_api_token)
    # The backend return a generator - which is awesome. However since we want to pull this data into Spark 
    def append_project_info(result):
        """Add the project information to the return from perceval"""
        result["project_name"] = project
        return result

    return list(map(append_project_info, gh_backend.fetch()))

In [100]:
def fetch_project_git_data(org, project):
    from perceval.backends.core.git import Git as perceval_git

    git_uri = lookup_project_git(org, project)
    import tempfile
    import shutil
    tempdir = tempfile.mkdtemp()

    def append_project_info(result):
        """Add the project information to the return from perceval"""
        result["project_name"] = project
        return result

    try:
        git_backend = perceval_git(uri=git_uri, gitpath=tempdir + "/repo")
        return list(map(append_project_info, git_backend.fetch()))
    finally:
        shutil.rmtree(tempdir)

Fetch the git history info using perceval

In [101]:
apache_git_project_data_rdd = committee_names_df.repartition(400).rdd.flatMap(lambda row: fetch_project_git_data("apache", row.project))
jupyter_git_project_data_rdd = sc.parallelize([("jupyter", "notebook"), ("nteract", "nteract")]).flatMap(lambda elem: fetch_project_git_data(elem[0], elem[1]))
git_project_data_rdd = apache_git_project_data_rdd.union(jupyter_git_project_data_rdd)
git_project_data_rdd.persist()
git_project_data_rdd.setName("Perceival GIT dat")

Perceival GIT dat UnionRDD[141] at union at NativeMethodAccessorImpl.java:0

In [32]:
mini_list = git_project_data_rdd.take(1)

In [33]:
mini_list

[{'backend_name': 'Git',
  'backend_version': '0.10.2',
  'perceval_version': '0.9.16',
  'timestamp': 1523555212.233742,
  'origin': 'https://github.com/apache/openwebbeans.git',
  'uuid': 'b58c3d4d37e15bccb71a67ebc6a692de023e77e2',
  'updated_on': 1227374941.0,
  'category': 'commit',
  'tag': 'https://github.com/apache/openwebbeans.git',
  'data': {'commit': 'aafbe570875a9d3174f0bb126b86ab0875e39e7d',
   'parents': [],
   'refs': [],
   'Author': 'Gurkan Erdogdu <gerdogdu@apache.org>',
   'AuthorDate': 'Sat Nov 22 17:29:01 2008 +0000',
   'Commit': 'Gurkan Erdogdu <gerdogdu@apache.org>',
   'CommitDate': 'Sat Nov 22 17:29:01 2008 +0000',
   'message': 'creating trunk folder\n\ngit-svn-id: https://svn.apache.org/repos/asf/incubator/openwebbeans/trunk@719871 13f79535-47bb-0310-9956-ffa450edef68',
   'files': []},
  'project_name': 'openwebbeans'}]

In [55]:
git_project_data_df = git_project_data_rdd.map(lambda row: Row(**row)).toDF()

In [56]:
git_project_data_df.schema

StructType(List(StructField(backend_name,StringType,true),StructField(backend_version,StringType,true),StructField(category,StringType,true),StructField(data,MapType(StringType,StringType,true),true),StructField(origin,StringType,true),StructField(perceval_version,StringType,true),StructField(project_name,StringType,true),StructField(tag,StringType,true),StructField(timestamp,DoubleType,true),StructField(updated_on,DoubleType,true),StructField(uuid,StringType,true)))

In [102]:
authors_by_project_and_commit_df = git_project_data_df.select("project_name", "data.Author", "data.CommitDate")
raw_distinct_authors_latest_commit = authors_by_project_and_commit_df.groupBy("project_name", "Author").agg(F.max("CommitDate").alias("latest_commit"))

In [105]:
boop = "gs://boo-stuff/distinct_authors_latest_commit"
fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(sc._jvm.java.net.URI(fs_prefix), sc._jsc.hadoopConfiguration())
fs.exists(sc._jvm.org.apache.hadoop.fs.Path("{0}/_SUCCESS".format(boop)))

True

In [111]:
distinct_authors_latest_commit = non_blocking_df_save_or_load(raw_distinct_authors_latest_commit, "{0}distinct_authors_latest_commit".format(fs_prefix))

In [114]:
distinct_authors_latest_commit

DataFrame[project_name: string, Author: string, latest_commit: string]

20937

Lookup info from crunchbase

In [135]:
os.environ['PATH']

'/opt/conda/bin:/opt/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/lib/chromium/'

In [144]:
from lazy_helpers import *

bcast_driver = sc.broadcast(LazyDriver)

# TBD if we should see this, see comments on robots.txt in function, also consider overhead of firefox req
def lookup_crunchbase_info(people_and_projects):
    """Lookup a person a crunch base and see what the gender & company is.
    Filter for at least one mention of their projects."""
    # Path hack
    if not "chromium" in os.environ['PATH']:
        os.environ['PATH'] = os.environ['PATH'] + ":/usr/lib/chromium/"
    from bs4 import BeautifulSoup
    import re
    bcast_driver.value._driver = None
    driver = bcast_driver.value.get()
    import time
    import random
    for (username, name, projects, urls) in people_and_projects:
        yield (username)
        time.sleep(random.randint(60, 360))
        # robots.txt seems to be ok with person for now as of April 4 2018, double check before re-running this
        url = "https://www.crunchbase.com/person/{0}".format(name.replace(" ", "-"))
        try:
            driver.get(url)
            text = driver.page_source
            lower_text = text.lower()
            yield [driver, lower_text]
            if any(project.lower() in lower_text for project in projects) or any(url.lower in lower_text for url in urls):
                soup = BeautifulSoup(text, "html.parser")
                stats = soup.findAll("div", { "class" : "component--fields-card"})[0]
                # Hacky but I'm lazy
                result = {}
                result["crunchbase-url"] = url
                result["username"] = username
                if "Female" in str(stats):
                    result["gender"] = "Female"
                if "Male" in str(stats):
                    result["gender"] = "Male"
                try:
                    m = re.search("\" title=\"(.+?)\" href=\"\/organization", lower_text)
                    result["company"] = m.group(1)
                except:
                    # No match no foul
                    pass
                yield result
        except Exception as e:
            pass

In [145]:
result = lookup_crunchbase_info([("holden", "holden karau", ["spark"], ["http://www.holdenkarau.com"])])
list(result)

KeyboardInterrupt: 

Augment the committer info

In [49]:
# We do this as an RDD transformation since the cost of the transformation dominates
relevant_info = apache_people_df.select(
    apache_people_df.username,
    apache_people_df.extra.getField("name").alias("name"),
    apache_people_df.projects,
    apache_people_df.extra.getField("urls").alias("urls"))
crunchbase_info_rdd = relevant_info.rdd.map(lambda row: (row.username, row.name, row.projects, row.urls)).mapPartitions(lookup_crunchbase_info)
crunchbase_info_rdd.persist(StorageLevel.MEMORY_AND_DISK)
schema = StructType([
    StructField("username", StringType()),
    StructField("gender", StringType()),
    StructField("company", StringType()),
    StructField("crunchbase-url", StringType())])
crunchbase_info_df = crunchbase_info_rdd.toDF(schema = schema)
crunchbase_info_df.alias("Crunchbase user information")

DataFrame[username: string, gender: string, company: string]

In [112]:
crunchbase_info_df = non_blocking_df_save_or_load(crunchbase_info_df, "{0}crunchbase_out_3".format(fs_prefix))

In [38]:
#crunchbase_info_df.count()

In [39]:
apache_people_df.count()

2565

Export to Mechnical turk format

In [43]:
def mini_concat_udf(array_strs):
    """Concat the array of strs"""
    if array_strs == None:
        return ""
    else:
        return ' '.join(array_strs)

# Except I'm a bad person so....
from pyspark.sql.catalog import UserDefinedFunction
mini_concat_udf = UserDefinedFunction(mini_concat_udf, StringType(), "mini_concat_udf")
session.catalog._jsparkSession.udf().registerPython("mini_concat_udf", mini_concat_udf._judf)

apache_people_df.select(
    apache_people_df.username,
    apache_people_df.extra.getField("name").alias("name"),
    mini_concat_udf(apache_people_df.extra.getField("urls")).alias("personal_websites"),
    mini_concat_udf(apache_people_df.projects).alias("projects")
    ).coalesce(1).write.csv("{0}/apache_people.csv".format(fs_prefix), header=True)

In [44]:
#crunchbase_info_rdd.collect()

One of the things that is interesting is understanding what the tones of the meetup descriptions & mailing list posts are. We can use https://www.ibm.com/watson/developercloud/tone-analyzer/api/v3/?python#introduction

In [None]:
# TODO: take out toneanalyzer3 and use nltk instead because I don't have free access to this API anymore
# TODO: use pandas acceleration maybe? some issues with dataproc "support"
def lookup_tone(document):
    """Looks up the tone for a specific document. Returns a json blob."""
    from watson_developer_cloud import ToneAnalyzerV3
    tone_analyzer = ToneAnalyzerV3(
        username=tone_bluemix_user,
        password=tone_bluemix_password,
        version='2016-05-19 ')
    return tone_analyzer.tone(text=document)

In [None]:
oh_no_you = lookup_tone("oh no you didn't girl")

In [None]:
oh_no_you

Ok its time to find some mailing list info

In [146]:
mbox_failures = sc.accumulator(0)

def fetch_mbox_ids(project_name):
    """Return the mbox ids"""
    import itertools

    def fetch_mbox_ids_apache_site(box_type):
        """Fetches all of the mbox ids from a given apache project and box type (dev or user)"""
        root_url = "http://mail-archives.apache.org/mod_mbox/{0}-{1}".format(project_name, box_type)
        
        # Fetch the page to parse
        pool = bcast_pool.value.get()
        result = pool.request('GET', root_url)
        
        
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(result.data, "html.parser")
        mbox_ids = set(map(lambda tag: tag.get('id'), soup.findAll("span", { "class" : "links"})))
        return map(lambda box_id: (project_name, box_type, box_id), mbox_ids)
    # We have to return a list here because PySpark doesn't handle generators (TODO: holden)
    return list(itertools.chain.from_iterable(map(fetch_mbox_ids_apache_site, ["dev", "user"])))
        
        
def fetch_and_process_mbox_records(project_name, box_type, mbox_id):
        import tempfile
        import shutil
        from perceval.backends.core.mbox import MBox as perceval_mbox

        def process_mbox_directory(base_url, dir_path):
            mbox_backend = perceval_mbox(base_url, dir_path)
            return mbox_backend.fetch()
        
        def append_project_info(result):
            """Add the project information to the return from perceval"""
            result["project_name"] = project_name
            result["box_type"] = box_type
            result["mbox_id"] = mbox_id
            return result

        # Make a temp directory to hold the mbox files
        tempdir = tempfile.mkdtemp()

        try:
            root_url = "http://mail-archives.apache.org/mod_mbox/{0}-{1}".format(project_name, box_type)
            mbox_url = "{0}/{1}.mbox".format(root_url, mbox_id)
            filename = "{0}/{1}.mbox".format(tempdir, mbox_id)
        
            print("fetching {0}".format(mbox_url))

            pool = bcast_pool.value.get()
            with pool.request('GET', mbox_url, preload_content=False) as r, open(filename, 'wb') as out_file:       
                try:
                    shutil.copyfileobj(r, out_file)
                    return list(map(append_project_info, process_mbox_directory(root_url, tempdir)))
                except:
                    mbox_failures.add(1)
                    return []
        finally:
            shutil.rmtree(tempdir)

In [147]:
fetched_mbox_ids = fetch_mbox_ids("spark")
list(fetched_mbox_ids)[0]
fetched_mbox_data = fetch_and_process_mbox_records('spark', 'dev', '201308')

fetching http://mail-archives.apache.org/mod_mbox/spark-dev/201308.mbox


In [148]:
fetched_mbox_data[0]

{'backend_name': 'MBox',
 'backend_version': '0.10.2',
 'perceval_version': '0.9.14',
 'timestamp': 1523588778.76019,
 'origin': 'http://mail-archives.apache.org/mod_mbox/spark-dev',
 'uuid': '82e0f3bdbbce4324ac365bc564d1221947eb5405',
 'updated_on': 1375383652.0,
 'category': 'message',
 'tag': 'http://mail-archives.apache.org/mod_mbox/spark-dev',
 'data': {'unixfrom': 'dev-return-121-apmail-spark-dev-archive=spark.apache.org@spark.incubator.apache.org  Thu Aug  1 19:03:23 2013',
  'Return-Path': '<dev-return-121-apmail-spark-dev-archive=spark.apache.org@spark.incubator.apache.org>',
  'X-Original-To': 'apmail-spark-dev-archive@minotaur.apache.org',
  'Delivered-To': 'moderator for dev@spark.incubator.apache.org',
  'Received': '(qmail 42870 invoked by uid 99); 1 Aug 2013 19:01:19 -0000',
  'Mailing-List': 'contact dev-help@spark.incubator.apache.org; run by ezmlm',
  'Precedence': 'bulk',
  'List-Help': '<mailto:dev-help@spark.incubator.apache.org>',
  'List-Unsubscribe': '<mailto:de

In [149]:
def random_key(x):
    import random
    return (random.randint(0, 40000), x)

def de_key(x):
    return x[1]

mailing_list_posts_mbox_ids = committee_names_df.repartition(400).rdd.flatMap(lambda row: fetch_mbox_ids(row.project))
# mbox's can be big, so break up how many partitions we have
mailing_list_posts_mbox_ids = mailing_list_posts_mbox_ids.map(random_key).repartition(2000).map(de_key)
mailing_list_posts_rdd = mailing_list_posts_mbox_ids.flatMap(lambda args: fetch_and_process_mbox_records(*args))
mailing_list_posts_rdd.persist(StorageLevel.MEMORY_AND_DISK)

PythonRDD[177] at RDD at PythonRDD.scala:48

In [150]:
schema = StructType([
    StructField("project_name",StringType()),
    StructField("box_type",StringType()), # dev or user
    StructField("mbox_id",StringType()),
    StructField("backend_name",StringType()),
    StructField("backend_version",StringType()),
    StructField("category",StringType()),
    StructField("data", MapType(StringType(),StringType())), # The "important" bits
    StructField("origin",StringType()),
    StructField("perceval_version",StringType()),
    StructField("tag",StringType()),
    StructField("timestamp",DoubleType()),
    StructField("updated_on",DoubleType()),
    StructField("uuid",StringType())])
mailing_list_posts_mbox_df_raw = mailing_list_posts_rdd.toDF(schema=schema)
mailing_list_posts_mbox_df_raw.persist(StorageLevel.MEMORY_AND_DISK)
mailing_list_posts_mbox_df_raw.alias("Mailing list perceival information - no post processing")

DataFrame[project_name: string, box_type: string, mbox_id: string, backend_name: string, backend_version: string, category: string, data: map<string,string>, origin: string, perceval_version: string, tag: string, timestamp: double, updated_on: double, uuid: string]

In [151]:
mailing_list_posts_mbox_df_raw = non_blocking_df_save_or_load(mailing_list_posts_mbox_df_raw, "mailing_list_info")

In [152]:
mailing_list_posts_mbox_df = mailing_list_posts_mbox_df_raw.select("*",
                                                               mailing_list_posts_mbox_df_raw.data.getField("From").alias("from"),
                                                               mailing_list_posts_mbox_df_raw.data.getField("body").alias("body")
                                                              )

In [153]:
df.schema

NameError: name 'df' is not defined

Start using some of the lazily created DFs

In [117]:
distinct_authors_latest_commit.count()

20937

In [154]:
distinct_authors_latest_commit.show()

+------------+--------------------+--------------------+
|project_name|              Author|       latest_commit|
+------------+--------------------+--------------------+
|    accumulo|Ed Coleman <dev1@...|Wed Jan 7 23:08:4...|
|    activemq|Martyn Taylor <mt...|Wed Jun 7 11:09:4...|
|         ant|Jesse Stockall <j...|Wed Apr 16 15:20:...|
|      aurora|Steve Salevan <st...|Tue Jul 14 10:50:...|
|        avro|Douglas Adam Crea...|Wed Feb 29 01:33:...|
|        beam|Jean-Baptiste Ono...|Wed Sep 7 06:11:0...|
|      bigtop|Andrew Kuchling <...|Wed Sep 21 12:18:...|
|      bigtop|Chris Huang <chri...|Fri Jun 21 17:01:...|
|      bigtop|roypradeep <roypr...|Thu Mar 23 10:27:...|
|      bigtop|wenwu <pengwenwu2...|Wed Mar 19 22:43:...|
|        bval|Albert Lee <allee...|Wed Feb 1 19:23:4...|
|     calcite|Sree Vaddi <sree_...|Thu Sep 18 01:35:...|
|       camel|Adrian Cole <adri...|Fri Feb 17 09:51:...|
|       camel|Bruno Marco Visio...|Fri Sep 4 10:18:4...|
|  carbondata|sgururajshetty <s

In [159]:
num_authors_by_project = distinct_authors_latest_commit.groupBy("project_name").agg(F.count("Author"))
num_authors_by_project.cache()
num_authors_by_project.show()

+-------------+-------------+
| project_name|count(Author)|
+-------------+-------------+
|         lucy|           21|
|      vxquery|           22|
|    chemistry|           14|
|       roller|           21|
|        geode|          186|
|       falcon|           82|
|          tez|           36|
|trafficserver|          434|
|       pdfbox|           25|
|        httpd|          148|
|   carbondata|          205|
|        celix|           19|
|     accumulo|          135|
|       wicket|          123|
|        twill|           41|
|   servicemix|           24|
|     clerezza|           28|
|      couchdb|          231|
|       bigtop|          205|
|     marmotta|           31|
+-------------+-------------+
only showing top 20 rows



Compute the sample %s for each project so we can get reasonable confidence bounds for sampling

In [None]:
def compute_num_required_sample(pop_size):
    import numpy as np
    import scipy.stats
    target_margin_of_error = 0.05
    Z = 1.96 # 95%
    p = 0.5
    N = pop_size
    # CALC SAMPLE SIZE
    n_0 = ((Z**2) * p * (1-p)) / (e**2)
    # ADJUST SAMPLE SIZE FOR FINITE POPULATION
    n = n_0 / (1 + ((n_0 - 1) / float(N)) )
    return int(math.ceil(n)) # THE SAMPLE SIZE

In [None]:
sample_sizes = num_authors_by_project.withColumn(compute_num_required_sample("count(Author)"))

In [None]:
sample_sizes.show()

Attempt to infer Gender off of name. This should be used as a last-ditch fall back, see https://ironholds.org/names-gender/ for a discussion on why this is problematic. However without doing this it's difficult to get much of a picture (see above where we attempt to gender from other sources, the hit rate leaves something to be desired)