This notebook explores the gender diversity of the different Apache projects, and documents the process used to do so.

In [179]:
import json
import logging
import os
import time
from copy import copy

import meetup.api
from pyspark import *
from pyspark.sql import *
from pyspark.sql.functions import explode
from pyspark.sql.types import *

In [2]:
# getOrCreate() reuses an existing Spark session if there is one, otherwise
# it starts a new one under the given application name.
session = (
    SparkSession.builder
    .appName("whatCanWeLearnFromTheSixties")
    .getOrCreate()
)
# Handle on the session's SparkContext, used below for RDD-level reads.
sc = session.sparkContext

The first thing we want to get is the list of committers and PMC members. This information is stored in LDAP, but it is also available as JSON. Eventually we will want to enrich this with mailing list information.

In [105]:
def loadFlatJsonFile(path, explodeKey, schema=None):
    """Load a flat multi-line json file and convert into Spark & explode.

    Each file matched by ``path`` is read whole (via ``sc.wholeTextFiles``)
    so that a JSON document spanning several lines is parsed as one record.

    Args:
        path: Location passed to ``sc.wholeTextFiles``.
        explodeKey: Name of the column handed to ``explode``.
        schema: Optional schema for the JSON documents. When ``None`` it is
            left to Spark's JSON reader to determine the schema. Note that
            ``explode`` operates on array or map columns, so nested JSON
            objects generally need an explicit ``MapType`` schema.

    Returns:
        A DataFrame containing only the exploded ``explodeKey`` column.
    """
    rdd = sc.wholeTextFiles(path).values()
    # Pass the schema through json() rather than reader.schema(): the latter
    # rejects None, which made the schema=None default unusable.
    df = session.read.json(rdd, schema=schema)
    return df.select(explode(explodeKey))

In [106]:
# Schema of the people dump: a creation timestamp plus a "people" map whose
# values are themselves string-to-string maps.
apache_people_schema = (
    StructType()
    .add("lastCreateTimestamp", StringType())
    .add("people", MapType(StringType(), MapType(StringType(), StringType())))
)
# Local copy of http://people.apache.org/public/public_ldap_people.json
apache_people_df = loadFlatJsonFile(
    path="http_data_sources/public_ldap_people.json",
    explodeKey="people",
    schema=apache_people_schema,
)

In [117]:
# Schema of the committees dump: a creation timestamp plus a "committees" map
# whose values are themselves string-to-string maps.
apache_committees_schema = (
    StructType()
    .add("lastCreateTimestamp", StringType())
    .add("committees", MapType(StringType(), MapType(StringType(), StringType())))
)
# NOTE(review): the original comment cited
# http://people.apache.org/public/public_ldap_people.json here, which looks
# copy-pasted from the people cell; presumably the source is
# public_ldap_committees.json in the same location - confirm.
apache_committees_df = loadFlatJsonFile(
    path="http_data_sources/public_ldap_committees.json",
    explodeKey="committees",
    schema=apache_committees_schema,
)
# Keep only the exploded map key, exposed under the column name "project".
committee_names_df = apache_committees_df.select(
    apache_committees_df["key"].alias("project")
)

Attempt to fetch relevant past & upcoming meetups for each project. The idea is based on the listing at https://www.apache.org/events/meetups.html, but the code is different.

In [129]:
# Meetup API key, read from the environment so it never lives in the notebook.
# Evaluates to None when MEETUP_APIKEY is not set.
meetup_key = os.environ.get("MEETUP_APIKEY")

In [200]:
# Root logger for the notebook; only warnings and above are emitted.
# Relies on `import logging` in the notebook's import cell.
logger = logging.getLogger()
logger.setLevel(logging.WARNING)
def lookup_relevant_meetup(project_name):
    """Lookup relevant meetups for a specific project.

    Queries ``GetOpenEvents`` on a Meetup client built from the module-level
    ``meetup_key``, searching for the text ``"apache " + project_name``, once
    for the status ``"upcoming"`` and once for ``"past"``.

    Args:
        project_name: Name of the project to search meetups for.

    Returns:
        A tuple ``(project_name, results)``. ``results`` is a dict with the
        keys ``"upcoming"`` and ``"past"``; each value is a list with one
        entry (the ``results`` attribute of the response) per fetched page.
        A status whose requests all failed keeps an empty list.
    """
    # Imports are kept local to the function; it is mapped over an RDD, so
    # presumably this keeps it self-contained on the executors - confirm.
    import logging
    import time
    import meetup.api
    logger = logging.getLogger()
    full_page_count = 200  # a page with exactly this many results means more may follow
    max_attempts = 10
    meetup_delay = 30
    meetup_reset_delay = 1800 # 30 minutes
    standard_keys = {"text_format": "plain", "trending": "desc=true", "and_text": "true", "city": "san francisco", "country": "usa", "text": "apache " + project_name, "radius": 10000}
    # One client is enough for all requests made by this call.
    client = meetup.api.Client(meetup_key)
    results = {"upcoming": [], "past": []}
    for status in ["upcoming", "past"]:
        keys = dict(standard_keys)  # shallow copy so the shared template is not mutated
        keys["status"] = status
        count = full_page_count
        base = 0
        while count == full_page_count:
            logger.debug("Fetch {0} meetups for {1} on base {2}".format(status, project_name, base))
            if base > 0:
                # NOTE(review): Meetup's pagination is believed to use "offset"
                # for the page index and "page" for the page size; if so this
                # re-requests the first page. Left unchanged - confirm against
                # the API documentation.
                keys["page"] = base
            # Manually sleep for meetup_reset_delay on failure, the meetup-api package retry logic sometimes breaks :(
            response = None
            for attempt in range(1, max_attempts + 1):
                try:
                    response = client.GetOpenEvents(**keys)
                    break
                except Exception:
                    # Exception (not a bare except) so Ctrl-C still interrupts.
                    logger.warning(
                        "Meetup request for {0} ({1}) failed on attempt {2} of {3}".format(
                            project_name, status, attempt, max_attempts),
                        exc_info=True)
                    if attempt < max_attempts:
                        time.sleep(meetup_reset_delay)
            if response is None:
                # Either every attempt raised or the client returned nothing;
                # give up on this status instead of retrying forever.
                break
            try:
                count = response.meta['count']
            except (AttributeError, KeyError, TypeError):
                # Response without usable paging metadata: stop paging.
                break
            base = base + count
            results[status].append(response.results)
            # Pause between pages to stay friendly with the API rate limit.
            time.sleep(meetup_delay)
    return (project_name, results)

# Lazily pair every project name with its meetup lookup result. Despite the
# `_df` suffix this is an RDD, since it is built from `.rdd.map(...)`.
project_meetups_df = (
    committee_names_df.rdd
    .map(lambda row: row.project)
    .map(lookup_relevant_meetup)
)

In [None]:
# Mark the RDD for caching so the lookups are not recomputed on later
# actions, then pull a single element to trigger evaluation and display it.
project_meetups_df.cache().take(1)

For each of the provided projects, attempt to look up its GitHub repository.

In [None]:
def lookup_project_github(project):
    """Returns a list with the projects GitHub and empty list if no GitHub found."""
    