In [None]:
# Create Spark Session and Spark Context

In [1]:
from pyspark.sql import SparkSession
# Reuse the active session if there is one; otherwise start one named 'spark-intro'.
spark = (
    SparkSession.builder
    .appName('spark-intro')
    .getOrCreate()
)
# The SparkContext gives access to the RDD API used in the cells below.
sc = spark.sparkContext

## Create RDDs from Python variables

In [2]:
rdd = sc.parallelize(range(20))

In [3]:
rdd

PythonRDD[1] at RDD at PythonRDD.scala:53

In [4]:
rdd.first()

0

In [5]:
rdd.take(2)

[0, 1]

In [6]:
rdd.collect()

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]

We can apply functions to each element

In [7]:
def less_than_10(x):
    """Return True when ``x`` is strictly smaller than 10, False otherwise.

    Used as a predicate for ``rdd.filter``.
    """
    # The comparison already yields a bool, so no if/else is needed.
    return x < 10

In [8]:
# show that it is lazy evaluation
rdd.filter(less_than_10)

PythonRDD[4] at RDD at PythonRDD.scala:53

In [9]:
rdd.filter(less_than_10).collect()

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [10]:
rdd.filter(less_than_10).count()

10

In [11]:
rdd.collect()

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]

In [12]:
def square(x):
    """Return ``x`` multiplied by itself."""
    squared = x * x  # same result as x**2
    return squared

In [13]:
rdd.map(square)

PythonRDD[7] at RDD at PythonRDD.scala:53

In [14]:
rdd.map(square).collect()

[0,
 1,
 4,
 9,
 16,
 25,
 36,
 49,
 64,
 81,
 100,
 121,
 144,
 169,
 196,
 225,
 256,
 289,
 324,
 361]

In [15]:
def multiple_of_10(x):
    """Return True when ``x`` is evenly divisible by 10, False otherwise.

    Used as a predicate for ``rdd.filter``.
    """
    # The equality test already yields a bool, so no if/else is needed.
    return x % 10 == 0

In [16]:
rdd.map(square).filter(multiple_of_10).collect()

[0, 100]

## Read from HDFS

In [17]:
# Load the text file as an RDD of strings, one element per line of the file.
# NOTE(review): the `sotu_` prefix looks like a leftover from another dataset;
# the file loaded here is the Shakespeare corpus — consider renaming.
sotu_rdd = sc.textFile('/datasets/shakespeare.txt')

In [18]:
sotu_rdd.id()

11

In [19]:
sotu_rdd.first()

'The Project Gutenberg EBook of The Complete Works of William Shakespeare, by '

In [20]:
sotu_rdd.take(10)

['The Project Gutenberg EBook of The Complete Works of William Shakespeare, by ',
 'William Shakespeare',
 '',
 'This eBook is for the use of anyone anywhere at no cost and with',
 'almost no restrictions whatsoever.  You may copy it, give it away or',
 're-use it under the terms of the Project Gutenberg License included',
 'with this eBook or online at www.gutenberg.org',
 '',
 '** This is a COPYRIGHTED Project Gutenberg eBook, Details Below **',
 '**     Please follow the copyright guidelines in this file.     **']

1- Check how many times the word `love` appears

In [None]:
def count_love(line):
    """Count how many times the word ``love`` occurs in one line of text.

    Matching is case-insensitive and on whole words only, so ``Love`` and
    ``love,`` are counted while ``glove`` and ``lovely`` are not.

    Parameters
    ----------
    line : str
        A single line of text.

    Returns
    -------
    int
        Number of whole-word occurrences of ``love`` in ``line``.
    """
    # Local import keeps this cell runnable on its own.
    import re
    return len(re.findall(r'\blove\b', line, flags=re.IGNORECASE))

In [None]:
sotu_rdd.map(count_love).take(10)

In [None]:
sotu_rdd.map(count_love).sum()

In [None]:
def has_love(line):
    """Return True if ``line`` contains the word ``love``, and False otherwise.

    Matching is case-insensitive and on whole words only, so ``Love`` and
    ``love,`` match while ``glove`` and ``lovely`` do not.

    Parameters
    ----------
    line : str
        A single line of text.

    Returns
    -------
    bool
        Whether the word ``love`` appears in ``line``.
    """
    # Local import keeps this cell runnable on its own.
    import re
    return re.search(r'\blove\b', line, flags=re.IGNORECASE) is not None

In [None]:
sotu_rdd.filter(has_love).take(3)

# My first map reduce job

The classic MapReduce paradigm can be implemented with `map` (or `flatMap`, when a record generates multiple key-value pairs) followed by `reduceByKey`.

In [2]:
# Toy dataset: a list of 3-element records.
# NOTE(review): the fields look like [month, state abbreviation, number of orders] — confirm.
example_dataset = [
['JAN', 'NY', 3.],
['JAN', 'PA', 1.],
['JAN', 'NJ', 2.],
['JAN', 'CT', 4.],
['FEB', 'PA', 1.],
['FEB', 'NJ', 1.],
['FEB', 'NY', 2.],
['FEB', 'VT', 1.],
['MAR', 'NJ', 2.],
['MAR', 'NY', 1.],
['MAR', 'VT', 2.],
['MAR', 'PA', 3.]]

In [3]:
dataset_rdd = sc.parallelize(example_dataset)

## Compute the number of orders per month

1- Generate key-value pairs

In [4]:
dataset_rdd

ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:195

In [6]:
def map_func(row):
    """Build a ``[key, value]`` pair from a record.

    The first field of ``row`` becomes the key and the third field the value.
    """
    key = row[0]
    value = row[2]
    return [key, value]

In [7]:
dataset_rdd.map(map_func).take(5)

[['JAN', 3.0], ['JAN', 1.0], ['JAN', 2.0], ['JAN', 4.0], ['FEB', 1.0]]

2- Reduce by summing the orders for each month

In [8]:
def reduce_func(value1, value2):
    """Combine two values that share the same key by adding them together."""
    combined = value1 + value2
    return combined

In [9]:
dataset_rdd.map(map_func).reduceByKey(reduce_func).collect()

[('JAN', 10.0), ('FEB', 5.0), ('MAR', 8.0)]

## Compute the average number of orders per month

In [10]:
dataset_rdd.take(4)

[['JAN', 'NY', 3.0],
 ['JAN', 'PA', 1.0],
 ['JAN', 'NJ', 2.0],
 ['JAN', 'CT', 4.0]]

In [14]:
def avg_map_func(row):
    """Build a ``[key, [value, 1]]`` pair from a record.

    The first field of ``row`` is the key. The third field is paired with a
    count of 1, i.e. the average of a single observation and its sample size.
    """
    key, value = row[0], row[2]
    average_and_count = [value, 1]
    return [key, average_and_count]

In [15]:
def avg_reduce_func(value1, value2):
    """Merge two ``[average, count]`` pairs into the pair for their combined data.

    Returns a list ``[combined_average, combined_count]``.
    """
    avg_a, count_a = value1
    avg_b, count_b = value2
    # Weight each average by its count to recover the overall sum.
    weighted_sum = avg_a * count_a + avg_b * count_b
    total_count = count_a + count_b
    return [weighted_sum / total_count, total_count]
    

In [16]:
dataset_rdd.map(avg_map_func).reduceByKey(avg_reduce_func).collect()

[('JAN', [2.5, 4]), ('FEB', [1.25, 4]), ('MAR', [2.0, 4])]

In [17]:
dataset_rdd.collect()

[['JAN', 'NY', 3.0],
 ['JAN', 'PA', 1.0],
 ['JAN', 'NJ', 2.0],
 ['JAN', 'CT', 4.0],
 ['FEB', 'PA', 1.0],
 ['FEB', 'NJ', 1.0],
 ['FEB', 'NY', 2.0],
 ['FEB', 'VT', 1.0],
 ['MAR', 'NJ', 2.0],
 ['MAR', 'NY', 1.0],
 ['MAR', 'VT', 2.0],
 ['MAR', 'PA', 3.0]]

## Count the frequency of words appearing in the Shakespeare sonnets

## Explore the effect of caching in RAM

## Try again the count from before

# Spark 2.0

You can create `DataFrames` programmatically

In [None]:
from pyspark.sql import Row

In [None]:
raw_data = [Row(state='NY', month='JAN', orders=3),
            Row(state='NJ', month='JAN', orders=4),
            Row(state='NY', month='FEB', orders=5),
           ]

In [None]:
raw_data

In [None]:
data_df = spark.createDataFrame(raw_data)

In [None]:
data_df

In [None]:
data_df.printSchema()

In [None]:
data_df.show()

In [None]:
raw_data2 = [Row(state='NY', month='MAR', orders=10),
             Row(state='NJ', month='MAR', orders=3),
             Row(state='NY', month='APR', orders=1),
           ]

In [None]:
data_df2 = spark.createDataFrame(raw_data2)

You can merge them:

In [None]:
all_data_df = data_df.union(data_df2)

In [None]:
all_data_df.show()

Or you can also display with Pandas

In [None]:
# make sure you limit first
all_data_df.limit(10).toPandas()

Or use `display`, which renders a rich table in Databricks notebooks

In [None]:
display(all_data_df)

### Access columns

In [None]:
all_data_df['month']

In [None]:
all_data_df.month

In [None]:
all_data_df['month'] + 1

### Selections

In [None]:
condition_month_jan = (all_data_df['month'] == "JAN")

In [None]:
condition_month_jan

In [None]:
all_data_df.where(condition_month_jan)

In [None]:
all_data_df[condition_month_jan]

In [None]:
all_data_df[condition_month_jan].show()

The conditions are symbolic objects

In [None]:
(all_data_df['month']  == 'MAR') & (all_data_df['orders'] > 5)

In [None]:
all_data_df[(all_data_df['month']  == 'MAR') & (all_data_df['orders'] > 5)].show()

You can create new columns

In [None]:
all_data_df.show()

In [None]:
all_data_df['orders'] + 1

In [None]:
all_data_df.withColumn('order_plus_1', all_data_df['orders'] + 1).printSchema()

In [None]:
all_data_df.withColumn('order_plus_1', all_data_df['orders'] + 1).show()

You can perform some basic grouping operations

In [None]:
all_data_df.groupBy('month')

In [None]:
all_data_df.groupBy('month').count()

In [None]:
all_data_df.groupBy('month').count().show()

You can order by a certain column or group of columns

In [None]:
all_data_df.orderBy('orders').show()

In [None]:
all_data_df.orderBy('orders', ascending=False).show()

You can register as tables and perform SQL

In [None]:
# `registerTempTable` has been deprecated since Spark 2.0; `createOrReplaceTempView`
# is its replacement and registers the DataFrame under the same name for SQL queries.
all_data_df.createOrReplaceTempView('orders')

In [None]:
spark.sql('select count(*) from orders')

In [None]:
spark.sql('select count(*) from orders').show()

The `DataFrame` object can read from multiple sources.

In [None]:
%ls /datasets/

In [None]:
spotify_df = spark.read.csv('/datasets/spotify_songs.csv', 
                             header=True, 
                             inferSchema=True)

In [None]:
spotify_df.printSchema()

In [None]:
# big dataset
semantic_scholar = spark.read.json('/datasets/semantic_scholar/s2-corpus-00.json')

In [None]:
# complex schema
semantic_scholar.printSchema()