### Sanjoy Biswas

#### Set Up Spark and Hadoop

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-2.4.8/spark-2.4.8-bin-hadoop2.7.tgz
!tar xf spark-2.4.8-bin-hadoop2.7.tgz
!pip install -q findspark

In [2]:
!pip install pyspark



#### Import Libraries

In [3]:
from pyspark import SparkContext, SparkConf

In [4]:
# Configure a local Spark application named 'Youtube'.
conf = SparkConf().setAppName('Youtube').setMaster('local')
# Only one SparkContext may be active at a time; getOrCreate reuses the running
# context, so re-running this cell no longer raises
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

#### Apply Map Function

In [5]:
# Turn a small list of integers into an RDD, then bring it back to the driver.
num = sc.parallelize([2, 5, 7, 9, 8])
num.collect()

[2, 5, 7, 9, 8]

In [6]:
# map applies the function to every element: here each number is doubled.
num.map(lambda n: n * 2).collect()

[4, 10, 14, 18, 16]

In [7]:
# Cube every element of the RDD.
num.map(lambda n: n ** 3).collect()

[8, 125, 343, 729, 512]

In [8]:
# An RDD of short strings, used by the string examples that follow.
nm = sc.parallelize(["abc", "xyz", "mnp"])
nm.collect()

['abc', 'xyz', 'mnp']

In [9]:
# Prepend a title to every string in the RDD.
nm.map(lambda name: "Mr. " + name).collect()

['Mr. abc', 'Mr. xyz', 'Mr. mnp']

#### Apply FlatMap

In [10]:
# Input RDD for the flatMap example.
rdd = sc.parallelize([2, 3, 4])
rdd.collect()

[2, 3, 4]

In [11]:
# range excludes its stop value, so range(1, 3) yields only 1 and 2.
a = range(1, 3)
for i in a:
    print(i)

1
2


In [12]:
# flatMap expands each element into range(1, element) and flattens the pieces
# into a single list.
rdd.flatMap(lambda stop: range(1, stop)).collect()

[1, 1, 2, 1, 2, 3]

In [13]:
# Rebind `a` to a three-element RDD for the next flatMap example.
a = sc.parallelize([1, 2, 3])

In [14]:
# Each element yields three values (itself, ten times itself, and the constant 57);
# flatMap concatenates them in order.
a.flatMap(lambda v: (v, v * 10, 57)).collect()

[1, 10, 57, 2, 20, 57, 3, 30, 57]

#### Apply Filter

In [15]:
# Show the contents of `num` again before filtering it.
num.collect()

[2, 5, 7, 9, 8]

In [16]:
# Keep only the even numbers.
num.filter(lambda n: n % 2 == 0).collect()

[2, 8]

In [17]:
# Show the contents of `nm` again before filtering it.
nm.collect()

['abc', 'xyz', 'mnp']

In [18]:
# Keep only the strings that contain the letter "b".
nm.filter(lambda word: "b" in word).collect()

['abc']

#### Apply Union and Sample

In [19]:
# First operand for the union example.
num1 = sc.parallelize([2, 3, 4, 5])
num1.collect()

[2, 3, 4, 5]

In [20]:
# Second operand for the union example; it shares the values 3 and 4 with num1.
num2 = sc.parallelize([6, 7, 8, 9, 3, 4])
num2.collect()

[6, 7, 8, 9, 3, 4]

In [21]:
# union concatenates the two RDDs; as the output shows, values present in both
# (3 and 4) appear twice, i.e. duplicates are not removed.
num1.union(num2).collect()

[2, 3, 4, 5, 6, 7, 8, 9, 3, 4]

In [22]:
# RDD holding the integers 1 through 9, used by the sampling examples.
parallel = sc.parallelize(range(1, 10))
parallel.collect()

[1, 2, 3, 4, 5, 6, 7, 8, 9]

In [23]:
# Sample with replacement at a fraction of 0.2. A fixed seed makes the result
# reproducible across Restart & Run All; without it every run returns a
# different sample.
parallel.sample(withReplacement=True, fraction=0.2, seed=23).collect()

[6]

In [24]:
# Sample without replacement at a fraction of 0.2, seeded for a repeatable result.
parallel.sample(withReplacement=False, fraction=0.2, seed=23).collect()

[4, 5]