## Creating the Spark session

In [1]:
from pyspark.sql import SparkSession

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Build (or reuse, via getOrCreate) the session used by every cell below.
# The master is pinned to local mode with 8 worker threads.
# NOTE(review): "spark.some.config.option" looks like the placeholder from the
# Spark documentation example -- confirm whether it can simply be removed.
spark = (
    SparkSession.builder
    .master("local[8]")
    .appName("Higgs Twitter ETL")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

23/10/03 15:52:33 WARN Utils: Your hostname, me-virtual-machine resolves to a loopback address: 127.0.1.1; using 192.168.42.129 instead (on interface ens33)
23/10/03 15:52:33 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/03 15:52:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Reading edgelist files

In [4]:
from pyspark.sql.types import IntegerType, StructField, StructType

In [5]:
# Explicit two-column integer schema for the edge list: one row per
# (follower, followed) pair.
schema = StructType(
    [
        StructField('follower', IntegerType(), True),
        StructField('followed', IntegerType(), True),
    ]
)

# The edge list is space-separated; rows with a null in either column are dropped.
socialDF = spark.read.csv(
    'data/higgs-social_network.edgelist.gz',
    schema=schema,
    sep=" ",
).dropna()

# Row count and column count, followed by a small preview.
print(socialDF.count(), len(socialDF.columns), '\n')
socialDF.show(5)

                                                                                

14855842 2 

+--------+--------+
|follower|followed|
+--------+--------+
|       1|       2|
|       1|       3|
|       1|       4|
|       1|       5|
|       1|       6|
+--------+--------+
only showing top 5 rows



## DataFrame API: cached vs. uncached

In [12]:
%%time
followerCountDF = socialDF.groupBy('followed')\
.count()\
.withColumnRenamed('followed', 'user') \
.withColumnRenamed('count', 'num_followers')\
.show(5)

[Stage 11:>                                                         (0 + 1) / 1]

+----+-------------+
|user|num_followers|
+----+-------------+
| 148|          738|
| 463|        10953|
| 471|         1584|
| 496|           49|
| 833|            8|
+----+-------------+
only showing top 5 rows

CPU times: user 16.6 ms, sys: 27.4 ms, total: 44.1 ms
Wall time: 1min 18s


                                                                                

In [13]:
# cache() only marks the DataFrame for caching; the data is stored the first
# time an action runs over it. Trigger that action here, outside the timed
# cell, so the next benchmark measures reads from the cache rather than the
# cost of populating it.
socialDF.cache().count()
# Keep the DataFrame as the cell's displayed value.
socialDF

23/10/03 15:57:02 WARN CacheManager: Asked to cache already cached data.


DataFrame[follower: int, followed: int]

In [14]:
%%time
followerCountDF = socialDF.groupBy('followed')\
.count()\
.withColumnRenamed('followed', 'user') \
.withColumnRenamed('count', 'num_followers') \
.show(5)

[Stage 14:>                                                         (0 + 1) / 1]

+----+-------------+
|user|num_followers|
+----+-------------+
| 148|          738|
| 463|        10953|
| 471|         1584|
| 496|           49|
| 833|            8|
+----+-------------+
only showing top 5 rows

CPU times: user 6.52 ms, sys: 5.28 ms, total: 11.8 ms
Wall time: 4.15 s


                                                                                

In [15]:
# Release the cached copy of socialDF so the Spark SQL comparison that follows
# starts from an uncached DataFrame.
socialDF.unpersist()

DataFrame[follower: int, followed: int]

## Spark SQL: cached vs. uncached

In [19]:
%%time
# Register socialDF under the name 'social' so it can be queried with SQL.
socialDF.createOrReplaceTempView('social')
# Followers per user, expressed in SQL and sorted by user id, so the five rows
# shown are the lowest user ids.
spark.sql(
     """
    select 
    followed as user, 
    count(follower) as num_followers
    from social
    group by followed
    order by user asc
     """
 ).show(5)

[Stage 26:>                                                         (0 + 1) / 1]

+----+-------------+
|user|num_followers|
+----+-------------+
|   1|        16280|
|   2|         4707|
|   3|          137|
|   4|         8643|
|   5|         2194|
+----+-------------+
only showing top 5 rows

CPU times: user 3.07 ms, sys: 22.3 ms, total: 25.4 ms
Wall time: 45.3 s


                                                                                

In [22]:
%%time
# Re-register the view; the name 'social' still refers to socialDF.
socialDF.createOrReplaceTempView('social')

# NOTE(review): cache() is lazy, and the recorded run logged "Asked to cache
# already cached data", i.e. the DataFrame was already cached beforehand.
# On a fresh kernel this timed cell would presumably also pay for populating
# the cache -- consider caching and running an action in a separate cell.
socialDF.cache()

# Same follower-count query as the uncached SQL run, timed for comparison.
spark.sql(
     """
    select 
    followed as user, 
    count(follower) as num_followers
    from social
    group by followed
    order by user asc
     """
 ).show(5)

23/10/03 16:03:55 WARN CacheManager: Asked to cache already cached data.
[Stage 32:>                                                         (0 + 1) / 1]

+----+-------------+
|user|num_followers|
+----+-------------+
|   1|        16280|
|   2|         4707|
|   3|          137|
|   4|         8643|
|   5|         2194|
+----+-------------+
only showing top 5 rows

CPU times: user 3.89 ms, sys: 11.7 ms, total: 15.6 ms
Wall time: 4.79 s


                                                                                