In [51]:
# Create Spark Context
from pyspark import SparkContext
from pyspark.sql import SparkSession, functions
from pyspark.sql.types import StringType
from pyspark.sql.functions import col, split, udf
import gender_guesser.detector as gender
from pyspark.sql.functions import udf
import re
import requests
#sc = SparkContext()
# Reuse the already-running Spark session if there is one, otherwise start a
# new one with default settings.
spark = SparkSession.builder.getOrCreate()
# Low-level context backing the session above.
sc = spark.sparkContext
# First-name -> gender lookup from gender_guesser; used by the gender UDF.
gdetector = gender.Detector()

def download(date):
    """Download the 24 hourly GH Archive dumps of one day into the working directory.

    Args:
        date: Day prefix used to build the archive names, e.g. ``'2022-09-19'``.
            Files are requested as ``<date>-<hour>.json.gz`` for hours 0..23
            (hour is not zero-padded).

    Returns:
        List of the local file names written, in hour order.

    Raises:
        requests.HTTPError: If any archive responds with an HTTP error status.
            Failing here avoids saving an error page under a ``.json.gz`` name.
        requests.RequestException: On connection problems or timeouts.
    """
    paths = []
    for hour in range(24):
        path = f"{date}-{hour}.json.gz"
        url = 'https://data.gharchive.org/' + path
        # Stream the body to disk in chunks so a whole archive is never held
        # in memory; both the connection and the file are closed on exit.
        with requests.get(url, stream=True, timeout=60) as response:
            response.raise_for_status()
            with open(path, 'wb') as out:
                for chunk in response.iter_content(chunk_size=1 << 20):
                    out.write(chunk)
        paths.append(path)
    return paths

def detectGender():
    """Build a Spark UDF that maps a login/owner name to a gender label.

    The UDF takes the first word of the name, strips everything that is not
    an ASCII letter, and passes the result to ``gdetector.get_gender``.

    Returns:
        A ``StringType`` UDF taking a single string column.
    """
    def detectGender_(name):
        # Null column values are expected to arrive as None (assumption about
        # PySpark UDF null handling); treat them like an empty name instead
        # of letting re.sub raise TypeError.
        text = name or ''
        # Put a space in front of every capital letter so CamelCase logins
        # ("JohnSmith") split into separate words, then keep the first word.
        names = re.sub(r"([A-Z])", r" \1", text).split()
        # Empty / whitespace-only input yields no words; fall back to '' so
        # the detector decides the label rather than raising IndexError.
        first = names[0] if names else ''
        clean = re.sub('[^a-zA-Z]+', '', first)
        return gdetector.get_gender(clean)
    return udf(detectGender_, StringType())

# Fetch one day of GH Archive dumps and register them for Spark SQL as "github".
files = download('2022-09-19')
# NOTE(review): `json` would shadow the stdlib module if it were imported;
# the name is kept because later cells may reference it.
json = spark.read.json(files)
json.createOrReplaceTempView("github")

# One row per event with columns: type, submitter (actor.login), owner, repo.
# repo.name is split on "/" into its first and second components.
# The result feeds several independent actions below, so it is cached to
# avoid re-reading and re-parsing the gzipped JSON for each of them.
eventsDF = (
    spark.sql("SELECT type,repo.name as owner_repo,actor.login as submitter FROM github")
    .withColumn("owner", split(col("owner_repo"), "/").getItem(0))
    .withColumn("repo", split(col("owner_repo"), "/").getItem(1))
    .drop("owner_repo")
    .cache()
)

# Push events only.
commitsDF = eventsDF.where("type='PushEvent'")
# Number of PushEvent rows per submitter.
# NOTE(review): this counts push events, not the individual commits inside a
# push -- confirm that is the intended "commit" metric.
submitsDF = commitsDF.groupBy('submitter').count()

# List of Developers that own more than one repository;
# (distinct (owner, repo) pairs, counted per owner)
devsWithMoreThanOneRepoDF = (
    eventsDF.drop('submitter', 'type')
    .distinct()
    .groupBy('owner')
    .count()
    .where("count > 1")
    .orderBy(col('count').desc())
)

# List of Developers who did more than one commit in a day, ordered by name and number of commits;
# Sorted by count (highest first), then by submitter name so that ties come
# out in a stable, name-ordered sequence.
moreThanOneCommitSubmittersDF = (
    submitsDF.where("count > 1")
    .orderBy(col('count').desc(), col('submitter').asc())
)

# List of Developers with less than one commit in a day;
# NOTE(review): submitsDF only holds submitters with at least one PushEvent,
# so this filter keeps those with exactly one; submitters with no PushEvent
# at all are not listed -- confirm this matches the requirement.
lessThanOneCommitSubmittersDF = submitsDF.where("count <= 1").orderBy(col('count').desc())

# Total Developers grouped by gender;
# API does not provide gender
# Every distinct owner or submitter name, labelled by the gender UDF.
namesDF = (
    eventsDF.select('owner').withColumnRenamed('owner', 'name')
    .union(eventsDF.select('submitter').withColumnRenamed('submitter', 'name'))
    .distinct()
)
gendersDF = namesDF.withColumn("gender", detectGender()("name")).groupBy("gender").count()

# Total projects with more than 10 members;
# "Members" here are the distinct submitters seen on any event of the repo.
reposWithMoreThanTenMembersDF = (
    eventsDF.drop('type')
    .distinct()
    .groupBy("owner", "repo")
    .count()
    .where("count > 10")
    .orderBy(col('count').desc())
)



# Print each report as a heading line followed by DataFrame.show() with its
# default arguments, in the same order as the requirements list.
reportSections = (
    ("List of Developers that own more than one repository",
     devsWithMoreThanOneRepoDF),
    ("List of Developers who did more than one commit in a day, ordered by name and number of commits",
     moreThanOneCommitSubmittersDF),
    ("List of Developers with less than one commit in a day",
     lessThanOneCommitSubmittersDF),
    ("Total Developers grouped by gender",
     gendersDF),
    ("Total projects with more than 10 members",
     reposWithMoreThanTenMembersDF),
)
for heading, reportDF in reportSections:
    print(heading)
    reportDF.show()

                                                                                

List of Developers that own more than one repository
List of Developers who did more than one commit in a day, ordered by name and number of commits
List of Developers with less than one commit in a day
Total Developers grouped by gender


                                                                                

+-------------+------+
|       gender| count|
+-------------+------+
|mostly_female|  3506|
|      unknown|683426|
|       female| 14542|
|         andy|  7972|
|         male| 56585|
|  mostly_male|  4400|
+-------------+------+

Total projects with more than 10 members
