In [4]:
from pyspark import SparkContext
from pyspark.sql import SQLContext

Manually managing `SparkContext` and `SQLContext` objects has been replaced by Spark sessions (`SparkSession`), which come with convenient builders. We may want to modify the following config variables:

    spark.sql.hive.metastore.version
    spark.sql.hive.metastore.jars

In [None]:
# Create the Spark entry point for this notebook, then wrap it in a
# SQLContext so the following cells can issue SQL queries.
sc = SparkContext(appName='Notebook')
sqlContext = SQLContext(sparkContext=sc)

In [None]:
# Load the full pageviews table and mark it for caching; the cells below
# reuse this DataFrame repeatedly.
pageviews_tbl = (
    sqlContext
    .sql("SELECT * FROM u_juliet.wiki_pageviews")
    .cache()
)

# Collect the distinct project names to the driver as a list of Rows.
project_names = (
    pageviews_tbl
    .select("project_name")
    .distinct()
    .collect()
)
project_names

In [None]:
# Count rows per project and collect them, largest count first.
project_page_counts = (
    pageviews_tbl
    .select("project_name")
    .groupBy("project_name")
    .count()
    .sort("count", ascending=False)
    .collect()
)
project_page_counts

In [None]:
# Show the projects whose name contains 'en'.
# Each element of project_names is a Row, and `'en' in row` does not do a
# substring match on the project name (Row membership tests field names),
# so compare against the project_name value itself. NULL names are skipped
# because `in` cannot be applied to None.
[
    row
    for row in project_names
    if row.project_name is not None and 'en' in row.project_name
]

In [None]:
# Preview the first 10 rows belonging to the 'en' project.
en_preview = pageviews_tbl.where("project_name = 'en'")
en_preview.show(10)

In [None]:
# Keep only the 'en' project rows; the project_name column is then constant,
# so drop it.
en_pageviews = (
    pageviews_tbl
    .where("project_name= 'en'")
    .drop("project_name")
)
en_pageviews.show(10)

In [None]:
# Sort the 'en' rows by view count, highest first, and display the top 10.
# NOTE(review): the name suggests each row is an hourly count - confirm
# against the table schema.
top_10_pg_views_hourly = en_pageviews.sort("n_views", ascending=False)
top_10_pg_views_hourly.show(10)

In [None]:
# Inspect the rows whose view count is missing.
null_pg_views = en_pageviews.where("n_views IS NULL")
null_pg_views.show()

In [None]:
# Drop rows with a missing view count, then display the 10 highest counts.
nn_pg_views = en_pageviews.where("n_views IS NOT NULL")
(
    nn_pg_views
    .sort("n_views", ascending=False)
    .show(10)
)

In [None]:
# Select the rows for the page named 'champagne' (case-insensitive match)
# and display them ordered by day and hour.
# NOTE(review): the sort ignores "month"; if the table spans several months
# the displayed order may interleave them - confirm.
champagne_df = nn_pg_views.where("LOWER(page_name) = 'champagne'")
(
    champagne_df
    .sort("day", "hour")
    .show(10)
)

In [None]:
# Sum the view counts per page, month and day, and rank the totals from
# highest to lowest.
w_daily_views = (
    nn_pg_views
    .groupBy("page_name", "month", "day")
    .sum("n_views")
    .withColumnRenamed("sum(n_views)", "daily_views")
    .sort("daily_views", ascending=False)
)
w_daily_views.show(10)

In [None]:
# Sum the view counts per page over the whole dataset and rank the totals
# from highest to lowest.
tot_view = (
    nn_pg_views
    .groupBy("page_name")
    .sum("n_views")
    .withColumnRenamed("sum(n_views)", "all_views")
    .sort("all_views", ascending=False)
)
tot_view.show(30)