In [1]:
%%time


## Load context
# `display` and `HTML` come from the public `IPython.display` module.
# Importing `display` from `IPython.core.display` is deprecated (IPython warns
# about it since 7.14) and may be unavailable in newer releases.
from IPython.display import display, HTML

# Inject a CSS rule that forces elements with class `container` to 100% width.
display(HTML("<style>.container { width:100% !important; }</style>"))
import re
import os
import pandas as pd
import numpy as np
from time import time

from pyspark.sql import functions as F
from pyspark.ml.fpm import FPGrowth
from pyspark.sql.types import ArrayType, FloatType, StringType

# Node name of the machine running this kernel; it is embedded in the Spark
# application name and echoed at the end of this cell.
hostname = os.uname().nodename

from pyspark.sql import SparkSession
from pyspark import SparkConf

# BEN'S MASTER: 192.168.2.87
# OUR MASTER:   192.168.2.203

# New API: describe the application with a SparkConf, then build the session
# from it.
conf = (
    SparkConf()
    .setMaster("spark://192.168.2.203:7077")
    .setAppName(f"wordcount; hostname: {hostname}")
    .set("spark.executor.cores", 2)
    .set("spark.dynamicAllocation.enabled", False)
    .set("spark.shuffle.service.enabled", False)
)

# getOrCreate() hands back an existing session when there is one, so re-running
# this cell does not start a second application.
spark_session = SparkSession.builder.config(conf=conf).getOrCreate()

# Old API (RDD)
spark_context = spark_session.sparkContext

# Only the last expression of a cell is displayed automatically; the UI address
# was previously a bare expression in mid-cell and therefore never shown.
print(f'Spark web UI: {spark_context.uiWebUrl}')
print(f'hostname for this machine: {hostname}')

hostname for this machine: jadalidaoudproject1-1
CPU times: user 461 ms, sys: 150 ms, total: 611 ms
Wall time: 4.95 s


In [2]:
### START TIMER
# Announce the start, then record the wall-clock timestamp (seconds since the
# epoch) that a later cell subtracts from its own reading.
print('Starting timer')
start_clock = time()

Starting timer


In [3]:
%%time

# Fraction passed to `.sample()` below; also reported in the final summary.
load_fraction = 1.0

# Read the JSON file from HDFS, take a seeded sample without replacement, and
# mark the result for caching so later cells can reuse it.
# The earlier `.option("header", "true")` was dropped: `header` is a CSV reader
# option and has no meaning for the JSON reader.
df = (
    spark_session.read
    .json('hdfs://192.168.2.203:9000/RC_2010-01')
    .sample(withReplacement=False, fraction=load_fraction, seed=1234)
    .cache()
)


CPU times: user 8.64 ms, sys: 117 µs, total: 8.76 ms
Wall time: 14.5 s


In [4]:
%%time

# Number of rows in `df`; reported in the final summary message.
# NOTE(review): `df` was marked with `.cache()` when it was loaded, so this
# count is presumably what populates the cache - confirm.
sampled_count = df.count()

CPU times: user 8.26 ms, sys: 0 ns, total: 8.26 ms
Wall time: 16.4 s


In [5]:
%%time


def _strip_lower(comment):
    """Normalise one comment body for word counting.

    Every run of non-word characters (regex ``\\W+``) is replaced by a single
    space, the text is lower-cased, and leading/trailing whitespace is removed.

    Args:
        comment: Raw comment text, or None when the row has no body.

    Returns:
        The normalised text, or None if ``comment`` is None.
    """
    if comment is None:
        # re.sub() raises TypeError on None; pass nulls through instead of
        # failing the whole Spark job on a single row without a body.
        return None
    return re.sub(r'\W+', ' ', comment).lower().strip()


# Wrap the normaliser as a Spark UDF that returns a string column.
udf_strip_lower = F.udf(_strip_lower, 'string')

# Keep only the `body` column and replace it with its normalised form.
df_1 = df.select('body').withColumn('body', udf_strip_lower('body'))

CPU times: user 21 ms, sys: 6.45 ms, total: 27.5 ms
Wall time: 353 ms


In [6]:
%%time


# Split every normalised body on single spaces (one row per token), count the
# occurrences of each token, and order the result by descending count.
df_2 = (
    df_1
    .withColumn('word', F.explode(F.split(F.col('body'), ' ')))
    # A body that is empty after normalisation splits into a single
    # empty-string token; that is not a word, so it is dropped before counting.
    .where(F.col('word') != '')
    .groupBy('word')
    .count()
    .sort('count', ascending=False)
)

# Prints the first rows of the result (the most frequent words).
df_2.show()

+----+-------+
|word|  count|
+----+-------+
| the|3767491|
|   i|2665340|
|  to|2481057|
|   a|2308136|
| and|1944342|
|  of|1756286|
|  it|1688628|
| you|1617607|
|that|1526095|
|  is|1320633|
|  in|1220979|
|   s| 900558|
|   t| 887123|
| for| 872725|
|  on| 636708|
|have| 629236|
|this| 628567|
| not| 614794|
| but| 611240|
|with| 601854|
+----+-------+
only showing top 20 rows

CPU times: user 22.5 ms, sys: 17.7 ms, total: 40.3 ms
Wall time: 23.6 s


In [7]:
### END TIMER
# Take the closing timestamp and derive the seconds elapsed since `start_clock`.
end_clock = time()
runtime = end_clock - start_clock

# Summarise the run: row count, sampling fraction and total wall-clock runtime.
print(f'Run finished. Experiment run on {sampled_count} comments on fraction {load_fraction}. Runtime resulted in {runtime} seconds.')

Run finished. Experiment run on 2884096 comments on fraction 1.0. Runtime resulted in 55.909826040267944 seconds.
