In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from numpy.testing import assert_equal

In [2]:
# Get (or create) the Spark session shared by every cell below, using a
# local master.
spark = SparkSession.builder.master('local[*]').getOrCreate()

22/02/04 18:09:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/02/04 18:09:52 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/02/04 18:09:52 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/02/04 18:09:52 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


## Problem 2a [10 pts]

Create a function `most_submits` that accepts a globstring to [GitHub Archive](https://www.gharchive.org) data and returns a Spark DataFrame with columns `type` and `login`. The `type` column holds the event type, and the `login` column holds the `actor` `login` that initiated that event type the most times. Sort by `type`.

In [3]:
def most_submits(paths):
    """Return, for every event type, the actor login with the most events.

    Parameters
    ----------
    paths : str
        Path or globstring of GitHub Archive JSON files; passed straight to
        ``spark.read.json``.

    Returns
    -------
    pyspark.sql.DataFrame
        One row per event ``type`` with columns ``type`` and ``login``,
        sorted by ``type``. ``login`` is the ``actor.login`` value that
        occurs most often for that type; ties on the count are resolved in
        favour of the alphabetically smallest login so the result is
        deterministic.
    """
    from pyspark.sql import Window
    from pyspark.sql import functions as f

    # Count events per (type, login) with the DataFrame API instead of a
    # round trip through the RDD API.
    event_counts = (spark.read.json(paths)
                    .select('type', f.col('actor.login').alias('login'))
                    .groupBy('type', 'login')
                    .count())

    # Rank explicitly inside each type. Sorting by count and then calling
    # dropDuplicates(['type']) is not reliable: dropDuplicates keeps an
    # arbitrary row per key, not necessarily the first one of a prior sort.
    by_count = (Window
                .partitionBy('type')
                .orderBy(f.col('count').desc(), f.col('login').asc()))

    return (event_counts
            .withColumn('rank', f.row_number().over(by_count))
            .filter(f.col('rank') == 1)
            .select('type', 'login')
            .orderBy('type'))

In [4]:
# Run most_submits on the 2020-01-01 archive files and pull the result into
# pandas for checking.
pdf_ms = most_submits(
    '/mnt/localdata/public/gharchive/2020-01-01-*.json.gz'
).toPandas()

# Column names first, then the first five rows of the type-sorted result.
assert_equal(list(pdf_ms.columns), ['type', 'login'])
assert_equal(
    pdf_ms.head(5).values.tolist(),
    [
        ['CommitCommentEvent', 'now[bot]'],
        ['CreateEvent', 'svc-software-factory'],
        ['DeleteEvent', 'dependabot-preview[bot]'],
        ['ForkEvent', 'fagan2888'],
        ['GollumEvent', 'joric'],
    ],
)

22/02/04 18:11:11 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

## Problem 2b [15 pts]

In this problem, you will be working with Wikipedia clickstream dataset files. It has the following [format](https://meta.wikimedia.org/wiki/Research:Wikipedia_clickstream#Format):

* `prev`: the result of mapping the referer URL to the fixed set of values described below.
* `curr`: the title of the article the client requested
* `type`: describes (`prev`, `curr`)
  * `link`: if the referer and request are both articles and the referer links to the request
  * `external`: if the referer host is not `en(.m)?.wikipedia.org`
  * `other`: if the referer and request are both articles but the referer does not link to the request. This can happen when clients search or spoof their referer.
* `n`: the number of occurrences of the (referer, resource) pair
  
The `prev` column has the following values:

* an article in the main namespace -> the article title
* a page from any other Wikimedia project -> `other-internal`
* an external search engine -> `other-search`
* any other external site -> `other-external`
* an empty referer -> `other-empty`
* anything else -> `other-other`

Create a function `count_inbound_articles` that accepts a globstring to Wikipedia clickstream dataset files and returns a Spark DataFrame with two columns: `cur`, an article title, and `inbound`, the number of unique (case-sensitive) articles from which that article was visited. Referers in the `other-*` categories listed above are not articles and do not count. Sort by decreasing `inbound` and return only the 10 rows with the largest `inbound`.

In [5]:
def count_inbound_articles(path):
    """Return the 10 articles reached from the most distinct referer articles.

    Parameters
    ----------
    path : str
        Path or globstring of tab-separated clickstream files; passed to
        ``spark.read.csv`` with ``sep='\\t'``.

    Returns
    -------
    pyspark.sql.DataFrame
        Columns ``cur`` and ``inbound``, at most 10 rows, ordered by
        decreasing ``inbound``. ``inbound`` is the number of distinct
        ``prev`` values for that ``cur``, counted after dropping rows whose
        ``prev`` starts with ``other-``.
    """
    from pyspark.sql.functions import col, countDistinct

    # The files are read without a header, so name the four columns here.
    clicks = spark.read.csv(path, sep='\t').toDF('prev', 'cur', 'type', 'n')

    # Negative lookahead: keep only rows whose referer does not begin with
    # 'other-'.
    from_articles = clicks.filter(col('prev').rlike(r'^(?!other-.*)'))

    inbound_counts = (from_articles
                      .groupby(['cur'])
                      .agg(countDistinct('prev').alias('inbound')))

    return inbound_counts.orderBy('inbound', ascending=False).limit(10)
    

In [6]:
# path = '/mnt/localdata/public/wikipedia/clickstream/clickstream/2019-10/clickstream-enwiki-2019-10.tsv.gz'
# wiki = spark.read.csv(path, sep='\t', ).toDF('prev', 'curr', 'type', 'n')
# from pyspark.sql.functions import col, countDistinct
# (spark.read.csv(path, sep='\t', ).toDF('prev', 'curr', 'type', 'n')
#  .filter(col('prev').rlike(r'^(?!other-.*)'))
#  .groupby(['curr'])
#  .agg(countDistinct('prev'))
#  .orderBy('count(prev)', ascending=False)
#  .limit(10)
#  .toPandas()
# )

In [7]:
# wiki.limit(10).toPandas()

In [8]:
# Run count_inbound_articles on the 2019-10 enwiki clickstream file and pull
# the result into pandas for checking.
df_cia = count_inbound_articles(
    '/mnt/localdata/public/wikipedia/clickstream/clickstream/2019-10/'
    'clickstream-enwiki-2019-10.tsv.gz'
).toPandas()

# Ten rows, two columns, with the expected column names.
assert_equal(df_cia.shape, (10, 2))
assert_equal(list(df_cia.columns), ['cur', 'inbound'])

# The five articles with the largest inbound counts, in order.
assert_equal(
    df_cia.head(5).values.tolist(),
    [
        ['Hyphen-minus', 135466],
        ['Main_Page', 127434],
        ['United_States', 7168],
        ['India', 4002],
        ['United_Kingdom', 3619],
    ],
)

                                                                                