## Setup

In [1]:
!pip install pyspark
!pip install findspark

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting py4j==0.10.9.7 (from pyspark)
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.5/200.5 kB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25ldone
[?25h  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285398 sha256=b6acbd459b1e2fbf04e1c46db686dfc2f51d14abebebfb3176ab07f2e1276c80
  Stored in directory: /home/jupyterlab/.cache/pip/wheels/b7/8e/8f/ba5d017af5f502964eb1358e1d496a8519de1645936b01810e
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.7 pyspark-3.4.1
C

In [2]:
import findspark
findspark.init()

In [5]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

## Initialize Spark Session

In [10]:
sc = SparkContext()
spark = SparkSession \
    .builder \
    .appName("Python Spark DataFrames basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [11]:
spark

## Parallelize data

In [18]:
data= range(1,30)
xrangeRDD=sc.parallelize(data, 4)
xrangeRDD

PythonRDD[12] at RDD at PythonRDD.scala:53

## Transformation in RDD

In [19]:
subRDD= xrangeRDD.map(lambda x:x-1)
filerRDD=xrangeRDD.filter(lambda x:x<10)

## Action

In [22]:
print(subRDD.collect())
filerRDD.collect()


[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28]


[1, 2, 3, 4, 5, 6, 7, 8, 9]

## Cache

In [38]:
import time
testRDD= sc.parallelize(range(1, 50000), 4)
testRDD.cache()

t1=time.time()
count1=testRDD.count()
print(count1)
dt1=time.time()-t1
print(dt1)

t2=time.time()
count2=testRDD.count()
print(count2)
dt2=time.time()-t2
print(dt2)


49999
0.35214805603027344
49999
0.14197826385498047


## Download people.json

In [39]:


!curl https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-BD0225EN-SkillsNetwork/labs/data/people.json >> people.json



  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100    73  100    73    0     0    820      0 --:--:-- --:--:-- --:--:--   820


## Create Dataframe

In [40]:
df=spark.read.json("people.json").cache()

In [43]:
df.show()
df.printSchema()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [44]:
df.createTempView("people")

## Perform Queries using Spark SQL

In [51]:
df.select("name").show()
df.select(df["name"]).show()
spark.sql("select * from people").show()
df.select(df["age"]>10).show()
df.groupby("age").count().show()

+-------+
|   name|
+-------+
|Michael|
|   Andy|
| Justin|
+-------+

+-------+
|   name|
+-------+
|Michael|
|   Andy|
| Justin|
+-------+

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+

+----------+
|(age > 10)|
+----------+
|      null|
|      true|
|      true|
+----------+



                                                                                

+----+-----+
| age|count|
+----+-----+
|  19|    1|
|null|    1|
|  30|    1|
+----+-----+



## Create RDD of range 1 to 50 and multiply by 2

In [54]:
practiceRDD=sc.parallelize(range(1, 50), 4)
mul=practiceRDD.map(lambda x: x*2)


In [57]:
mul.collect()


[2,
 4,
 6,
 8,
 10,
 12,
 14,
 16,
 18,
 20,
 22,
 24,
 26,
 28,
 30,
 32,
 34,
 36,
 38,
 40,
 42,
 44,
 46,
 48,
 50,
 52,
 54,
 56,
 58,
 60,
 62,
 64,
 66,
 68,
 70,
 72,
 74,
 76,
 78,
 80,
 82,
 84,
 86,
 88,
 90,
 92,
 94,
 96,
 98]

## Create Dataframe and get Average Age 

In [58]:

!curl https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-BD0225EN-SkillsNetwork/labs/people2.json >> people2.json

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   136  100   136    0     0   1942      0 --:--:-- --:--:-- --:--:--  1942


In [66]:
df2=spark.read.json("people2.json").cache()

23/08/16 06:10:43 WARN execution.CacheManager: Asked to cache already cached data.


In [68]:
df2.createTempView("people2")
spark.sql("select AVG(age) from people2")

DataFrame[avg(age): double]

## Stop Spark Session

In [71]:
spark.stop()