In [1]:
# Sparkcontext = sc

```
class pyspark.SparkContext (
   master = None,
   appName = None, 
   sparkHome = None, 
   pyFiles = None, 
   environment = None, 
   batchSize = 0, 
   serializer = PickleSerializer(), 
   conf = None, 
   gateway = None, 
   jsc = None, 
   profiler_cls = <class 'pyspark.profiler.BasicProfiler'>
)
```

Following are the parameters of a SparkContext.

**master** − It is the URL of the cluster it connects to.

**appName** − Name of your job.

**sparkHome** − Spark installation directory.

**pyFiles** − The .zip or .py files to send to the cluster and add to the PYTHONPATH.

**environment** − Worker nodes environment variables.

**batchSize** − The number of Python objects represented as a single Java object. Set 1 to disable batching, 0 to automatically choose the batch size based on object sizes, or -1 to use an unlimited batch size.

**serializer** − RDD serializer.

**conf** − A `SparkConf` object used to set all the Spark properties.

**gateway** − Use an existing gateway and JVM; otherwise a new JVM is initialized.

**jsc** − The JavaSparkContext instance.

**profiler_cls** − A class of custom Profiler used to do profiling (the default is `pyspark.profiler.BasicProfiler`).

In [8]:
# Of the parameters above, master and appName are the ones most commonly set.
# The first two lines of a PySpark program usually look as shown below.
from pyspark import SparkContext
from pyspark import SparkConf

from pyspark.sql import SparkSession
# Get (or create) a SparkSession whose master is set to "local[*]".
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [9]:
# Stop the SparkSession before the next section creates a standalone SparkContext.
spark.stop()

##### Creating a Spark Context

In [10]:
from pyspark import SparkConf
from pyspark import SparkContext

# Describe the application: master "local", application name "Spark-basic".
# The SparkConf setters return the conf itself, so they can be chained.
conf = SparkConf().setMaster("local").setAppName("Spark-basic")

# getOrCreate() returns the already-running context if there is one, so
# re-running this cell no longer fails with
# "Cannot run multiple SparkContexts at once". Note that an existing context
# keeps its own configuration; `conf` only applies when a new one is created.
sc = SparkContext.getOrCreate(conf=conf)

In [6]:
def mod(x):
    """Pair ``x`` with its remainder modulo 2.

    Returns:
        A 2-tuple ``(x, numpy.mod(x, 2))``.
    """
    # NumPy is imported inside the function body rather than at module level.
    import numpy

    remainder = numpy.mod(x, 2)
    return (x, remainder)

In [11]:
# Distribute a local Python collection to form an RDD. Passing a lazy range
# (Python 3's counterpart of xrange) is recommended for performance when the
# input represents a range.

rdd = sc.parallelize(range(1,1000,2))
# print() on an RDD shows its description, not its elements.
print(rdd)

PythonRDD[1] at RDD at PythonRDD.scala:53


In [12]:
# Peek at the first 10 elements instead of collecting the whole RDD.
rdd.take(10)

[1, 3, 5, 7, 9, 11, 13, 15, 17, 19]

In [17]:
# Remove duplicate elements, then show 10 of the remaining values.
rdd.distinct().take(10)

[1, 3, 5, 7, 9, 11, 13, 15, 17, 19]

In [19]:
# Histogram with 5 buckets; the result is a pair
# (bucket boundaries, element count per bucket).
rdd.histogram(5)

([1.0, 200.6, 400.2, 599.8, 799.4, 999], [100, 100, 100, 100, 100])

In [20]:
rdd.min()

1

In [21]:
rdd.max()

999

In [24]:
# cache() hands back the RDD, so an action such as take(3) can be chained onto it.
rdd.cache().take(3)

[1, 3, 5]

In [25]:
rdd.isEmpty()

False

In [33]:
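# Left commented out: lookup() and sortByKey() operate on RDDs of (key, value)
# pairs, and the RDD above holds plain integers.
#rdd.lookup()
#rdd.sortByKey(ascending=False).take(10)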

In [34]:
# Rebind `rdd` to a new RDD built from a small in-memory list.
values = [1,2,3,4,5]
rdd = sc.parallelize(values)

In [35]:
rdd.min()

1

In [36]:
# Load a text file as an RDD of strings, one element per line.
# NOTE(review): the path is relative — presumably to the notebook's working
# directory; confirm before running elsewhere.
rdd = sc.textFile("datas/text_count.txt")

In [38]:
rdd.take(2)

['hello my name is Krishna', 'I am from Nepal']

In [39]:
rdd.collect() # Return a list that contains all of the elements in rdd 

['hello my name is Krishna', 'I am from Nepal', 'I love data analysis']

#### rdd.persist()
Set this RDD's storage level to persist its values across operations
after the first time it is computed. This can only be used to assign
a new storage level if the RDD does not have a storage level set yet.
If no storage level is specified, it defaults to `MEMORY_ONLY`.

In [40]:
rdd.persist()

datas/text_count.txt MapPartitionsRDD[36] at textFile at <unknown>:0

###### rdd.map(function)
Return a new RDD by applying a function to each element of the given RDD.

In [None]:
#x.map(function)

In [41]:
# Source RDD of words; "example" appears twice.
x = sc.parallelize(["spark", "rdd", "example", "sample", "example"])
# Pair every word with the number 1 -> (word, 1). The lambda parameter gets
# its own name so it no longer shadows the RDD variable `x`.
y = x.map(lambda word: (word, 1))

In [49]:
print(x.collect())
print(y.collect())

['spark', 'rdd', 'example', 'sample', 'example']
[('spark', 1), ('rdd', 1), ('example', 1), ('sample', 1), ('example', 1)]


##### rdd.flatMap(function)
Return a new RDD by first applying a function to all elements of this
RDD, and then flattening the results. **Use it when the function returns multiple values per element.**

In [50]:
rdd = sc.parallelize([2, 3, 4])
# Each element n is expanded to range(1, n); flatMap concatenates those
# ranges, and sorted() orders the collected values for display.
sorted(rdd.flatMap(lambda n: range(1, n)).collect())

[1, 1, 1, 2, 2, 3]

In [52]:
# Same expansion without sorting: the collected values follow element order,
# i.e. 2 -> [1], 3 -> [1, 2], 4 -> [1, 2, 3].
rdd.flatMap(lambda n: range(1, n)).collect()

[1, 1, 2, 1, 2, 3]

#### filter()
Return a new RDD containing only the elements that satisfy the filter condition.

In [54]:
rdd = sc.parallelize([1, 2, 3, 4, 5, 6])
# Keep only the even numbers.
rdd.filter(lambda n: n % 2 == 0).collect()

[2, 4, 6]

##### sample()
Return a sampled subset of the RDD.

In [57]:
parallel = sc.parallelize(range(9))
# The positional arguments are sample(withReplacement, fraction). No seed is
# passed, so the sampled count can differ from run to run.
parallel.sample(True, 0.2).count()

6

In [66]:
parallel.sample(True, 0.5).collect()
# Tip: press Shift+Tab inside the call in Jupyter to read sample()'s docstring.

[0, 4]

##### Union() & intersection()
Return the union and the intersection of two RDDs.

In [67]:
x = sc.parallelize([1,2,3,4,5,6,7,8])
y = sc.parallelize([1,3,10,12])
# union() keeps duplicates: 1 and 3 are in both inputs and appear twice in the result.
x.union(y).collect()

[1, 2, 3, 4, 5, 6, 7, 8, 1, 3, 10, 12]

In [69]:
x.intersection(y).collect()

[1, 3]

##### distinct()
Return a new rdd containing the distinct elements in the rdd

In [70]:
# distinct() drops the duplicates that union() kept; the result is not sorted.
x.union(y).distinct().collect()

[2, 4, 6, 8, 10, 12, 1, 3, 5, 7]

#### sortBy()


In [77]:
# Identity key with ascending=False: order the elements themselves, largest first.
x.sortBy(lambda value: value, ascending=False).collect()

[8, 7, 6, 5, 4, 3, 2, 1]

In [79]:
z = sc.parallelize([("a", 10), ("m", 20), ("t", 20), ("c", 15)])
# Order the (letter, number) pairs by the number stored at index 1.
z.sortBy(lambda pair: pair[1]).collect()

[('a', 10), ('c', 15), ('m', 20), ('t', 20)]

##### mapPartitions()

In [None]:
rdd = sc.parallelize([1,2,3,4],4)

# mapPartitions() needs a function that receives an iterator over one
# partition's elements and returns an iterable of results; calling it with no
# argument raises a TypeError. Here every partition is reduced to its sum.
# With 4 elements spread over 4 partitions the expected result is [1, 2, 3, 4];
# lower the partition count (e.g. 2) to see several elements summed together.
rdd.mapPartitions(lambda partition: [sum(partition)]).collect()

##### GroupBy()

In [80]:
rdd = sc.parallelize([1, 2, 3, 5, 6, 8])
# Group the numbers by their remainder modulo 2 (0 = even, 1 = odd).
result = rdd.groupBy(lambda n: n % 2).collect()
# Sort the keys and each group's members so the display is stable.
sorted((key, sorted(members)) for key, members in result)

[(0, [2, 6, 8]), (1, [1, 3, 5])]