In [1]:
## Load context
# Standard library
import os
import re
from time import time

# Third-party
import numpy as np
import pandas as pd
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import HTML, display
from pyspark import SparkConf
from pyspark.ml.fpm import FPGrowth
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, FloatType, StringType

# Stretch the notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Node name of the machine running this kernel; it is embedded in the Spark
# application name and printed at the end of the cell.
hostname = os.uname()[1]

# BEN'S MASTER: 192.168.2.87
# OUR MASTER:   192.168.2.203

# New API
conf = (
    SparkConf()
    .setMaster("spark://192.168.2.203:7077")
    .setAppName(f"load_local_comments; hostname: {hostname}")
    .set("spark.executor.cores", 2)
    .set("spark.dynamicAllocation.enabled", False)
    .set("spark.shuffle.service.enabled", False)
)

# Reuses an already-running session if there is one, otherwise creates it.
spark_session = SparkSession.builder.config(conf=conf).getOrCreate()

# Old API (RDD)
spark_context = spark_session.sparkContext
# Bare expression: it is not the last statement of the cell, so its value is
# not displayed in the notebook output.
spark_context.uiWebUrl
print(f'hostname for this machine: {hostname}')

hostname for this machine: host-192-168-2-247-ldsa


In [2]:
# Fraction of the input rows kept for this run.
load_fraction = 0.3

# Read the JSON data from HDFS, keep a seeded sample drawn without
# replacement, and mark the result for caching.
# NOTE(review): "header" is normally a CSV reader option; it is kept unchanged
# here — confirm whether it has any effect when reading JSON.
df = (
    spark_session.read
    .option("header", "true")
    .json('hdfs://192.168.2.203:9000/RC_2010-01')
    .sample(withReplacement=False, fraction=load_fraction, seed=1234)
    .cache()
)

# count() is an action, so the read and the sampling actually run here.
sampled_count = df.count()

In [3]:
### START TIMER

# Report the run parameters, then record the wall-clock start time.
print(
    f'Starting timer. Fraction of dataset is {load_fraction} '
    f'and sampled size is {sampled_count}'
)
start_clock = time()

Starting timer. Fraction of dataset is 0.3 and sampled size is 864201


In [4]:
def _strip_lower(comment):
    """Normalize one comment body for word counting.

    Each run of non-word characters is replaced by a single space, then the
    text is lower-cased and surrounding whitespace is stripped.

    Returns None when ``comment`` is None, so a null ``body`` does not raise
    ``TypeError`` inside ``re.sub``.
    """
    if comment is None:
        return None
    return re.sub(r'\W+', ' ', comment).lower().strip()


# Spark UDF wrapper around the normalizer; the result column type is string.
udf_strip_lower = F.udf(_strip_lower, 'string')

# Keep only the comment text and replace it with its normalized form.
df_1 = df.select('body').withColumn('body', udf_strip_lower('body'))

In [5]:
# Word frequency table: one row per distinct word, most frequent first.
df_2 = (
    df_1
    # One output row per space-separated token of each comment body.
    .withColumn('word', F.explode(F.split(F.col('body'), ' ')))
    # A body that is empty after normalization splits into a single empty
    # string; drop it so '' is not counted as a word.
    .filter(F.col('word') != '')
    .groupBy('word')
    .count()
    .sort(F.col('count').desc())
)

# Display the 20 most frequent words.
df_2.show()

+----+-------+
|word|  count|
+----+-------+
| the|1127670|
|   i| 796135|
|  to| 742246|
|   a| 690505|
| and| 581765|
|  of| 525752|
|  it| 505679|
| you| 484401|
|that| 457100|
|  is| 394441|
|  in| 366683|
|   s| 270160|
|   t| 265447|
| for| 261303|
|  on| 189951|
|have| 188659|
|this| 187976|
| not| 184321|
| but| 183467|
|with| 179896|
+----+-------+
only showing top 20 rows



In [6]:
### END TIMER

# Elapsed wall-clock seconds since the start timer was recorded.
end_clock = time()
runtime = end_clock - start_clock

print(
    f'Run finished. Experiment run on {sampled_count} comments. '
    f'Runtime resulted in {runtime} seconds.'
)

Run finished. Experiment run on 864201 comments. Runtime resulted in 26.012290954589844 seconds.
