### SparkContext - 1

In [9]:
import pyspark
from pyspark import SparkContext, SparkConf

# Start a SparkContext with no explicit settings, then show the object and its type.
sc = SparkContext()
for shown in (sc, type(sc)):
    print(shown)

<SparkContext master=local[*] appName=pyspark-shell>
<class 'pyspark.context.SparkContext'>


In [49]:
# The context object was created successfully. What happens if we create one more SparkContext?
# It raises an error.

# new_sc = SparkContext()
# ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=pyspark-shell, master=local[*]) created by __init__ at /var/folders/5p/l3b0fr1x5wn7450vgnp88r080000gn/T/ipykernel_1755/3605226842.py:4


In [10]:
# Stop the SparkContext so that a new one can be created in the next section
# (only one SparkContext may run at a time).
sc.stop()

### SparkContext - 2

In [11]:
# Create a context again, this time passing the master URL and the application name.
sc = SparkContext('local', 'pyspark test')
print(sc)

<SparkContext master=local appName=pyspark test>


In [12]:
# Use .getConf().getAll() to inspect the SparkContext's configuration.
sc.getConf().getAll()

[('spark.master', 'local'),
 ('spark.app.startTime', '1664642777215'),
 ('spark.driver.host', '172.16.227.180'),
 ('spark.app.id', 'local-1664642777295'),
 ('spark.app.name', 'pyspark test'),
 ('spark.rdd.compress', 'True'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.submit.pyFiles', ''),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.ui.showConsoleProgress', 'true'),
 ('spark.driver.port', '56178')]

In [13]:
# Show the application name and master URL of the active context, then shut it down.
for setting in (sc.appName, sc.master):
    print(setting)
sc.stop()

pyspark test
local


### SparkContext - 3

In [14]:
# A SparkContext can also be created from a SparkConf object that carries its configuration.
# .setMaster() and .setAppName() set the master URL and the application name.

conf = (
    SparkConf()
    .setMaster('local')
    .setAppName("Pyspark Test1")
)
sc = SparkContext(conf=conf)
print(sc, sc.appName, sc.master, sep="\n")
sc.stop()

<SparkContext master=local appName=Pyspark Test1>
Pyspark Test1
local


### SparkContext - 4

In [15]:
# Create Spark Context
from pyspark import SparkConf, SparkContext
# Build the configuration in a single chained expression and obtain the
# context through SparkContext.getOrCreate().
conf = SparkConf().setMaster("local").setAppName("Spark Example App")

sc = SparkContext.getOrCreate(conf)
print(sc.appName)
sc.stop()

Spark Example App


### SparkContext - 5

In [16]:
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf

# Configure step by step (the setters update `conf` in place), create the
# context, show it, and stop it again.
conf = SparkConf()
conf.setAppName("Pyspark Test1")
conf.setMaster('local')

sc = SparkContext(conf=conf)
print(sc)
sc.stop()

<SparkContext master=local appName=Pyspark Test1>


### SparkSession - 1

In [17]:
from pyspark.sql import SparkSession

# Create the SparkSession used by the rest of the notebook; the bare `spark`
# on the last line displays it as the cell output.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName("Pyspark-2")
    .getOrCreate()
)
spark

In [18]:
# Look at the SparkContext that backs the SparkSession.
context = spark.sparkContext
print(context)
print(context.appName)

# To stop the underlying SparkContext:
# spark.sparkContext.stop()

<SparkContext master=local[*] appName=Pyspark-2>
Pyspark-2


In [19]:
# Create an RDD from a range of integers and bring its elements back as a list.
rdd = spark.sparkContext.range(1, 5)
elements = rdd.collect()
print(elements)

[Stage 0:>                                                          (0 + 8) / 8]

[1, 2, 3, 4]


                                                                                

In [26]:
# Report the application id, the web UI address and the Spark version.
context = spark.sparkContext
for detail in (context.applicationId, context.uiWebUrl, context.version):
    print(detail)

local-1664642816099
http://172.16.227.180:4040
3.2.1


In [50]:
import os
# Write a small text file and load it back as an RDD of lines.
# The working directory is used instead of a user-specific absolute path so
# that the cell runs on any machine, not only on the original author's.
tempdir = os.getcwd()
path = os.path.join(tempdir, 'sample.txt')

with open(path, "w") as testFile:
    testFile.write("Hello world!")

rdd = spark.sparkContext.textFile(path)
rdd.collect()

['Hello world!']

In [52]:
# Load another text file (path relative to the working directory) as an RDD of
# lines and display the result of top(10).
# NOTE: this rebinds `rdd`, replacing the RDD created from sample.txt above.
rdd = spark.sparkContext.textFile('./test.txt')
rdd.top(10)

['macOS 12.5                       March 13, 2021                       macOS 12.5',
 'S\x08SY\x08YN\x08NO\x08OP\x08PS\x08SI\x08IS\x08S',
 'S\x08ST\x08TA\x08AN\x08ND\x08DA\x08AR\x08RD\x08DS\x08S',
 'S\x08SE\x08EE\x08E A\x08AL\x08LS\x08SO\x08O',
 'P\x08PR\x08RI\x08IM\x08MA\x08AR\x08RI\x08IE\x08ES\x08S',
 'O\x08OP\x08PE\x08ER\x08RA\x08AT\x08TO\x08OR\x08RS\x08S',
 'N\x08NA\x08AM\x08ME\x08E',
 'H\x08HI\x08IS\x08ST\x08TO\x08OR\x08RY\x08Y',
 'FIND(1)                      General Commands Manual                     FIND(1)',
 'E\x08EX\x08XA\x08AM\x08MP\x08PL\x08LE\x08ES\x08S']

In [53]:
# Distribute a local Python list as an RDD and read it back in three ways.
myList = list(range(1, 6))
myRdd = spark.sparkContext.parallelize(myList)
print(myRdd.first())    # the first element
print(myRdd.take(3))    # the first three elements
print(myRdd.collect())  # every element

1
[1, 2, 3]
[1, 2, 3, 4, 5]


In [54]:
%%writefile ./test2.txt
my name is danny
your name is john
we are friend
good morning
the king
may the force be with you

Writing ./test2.txt


In [56]:
# Read the file written by the `%%writefile ./test2.txt` cell. The same relative
# path is used here so that this cell does not depend on `tempdir` happening to
# be the working directory.
myRdd1 = spark.sparkContext.textFile('./test2.txt')
print(myRdd1.first())    # the first line
print(myRdd1.take(3))    # the first three lines
print(myRdd1.collect())  # all lines


my name is danny
['my name is danny', 'your name is john', 'we are friend']
['my name is danny', 'your name is john', 'we are friend', 'good morning', 'the king', 'may the force be with you']


In [61]:
# Materialise the integers 1..9 as a plain Python list and confirm its type.
dataRange = [*range(1, 10)]
print(dataRange, type(dataRange), sep="\n")

[1, 2, 3, 4, 5, 6, 7, 8, 9]
<class 'list'>


In [67]:
# Turn the list into an RDD; show the RDD object, its Python type, and its contents.
rangeRdd = spark.sparkContext.parallelize(dataRange)
print(rangeRdd, type(rangeRdd), sep="\n")
print(rangeRdd.collect())

ParallelCollectionRDD[47] at readRDDFromFile at PythonRDD.scala:274
<class 'pyspark.rdd.RDD'>
[1, 2, 3, 4, 5, 6, 7, 8, 9]


In [69]:
def sub(n):
    """Return the value that is one less than ``n``."""
    decremented = n - 1
    return decremented

# Apply `sub` to every element of the RDD and print the collected results.
subRdd = rangeRdd.map(sub)
print(subRdd.collect())

[0, 1, 2, 3, 4, 5, 6, 7, 8]


flatMap()

map과 비슷하지만, map에서는 중첩 리스트(리스트 안의 리스트)로 표현되는 결과를 flatMap은 하나의 리스트로 펼쳐서 전부 나열해줍니다.

아래 4개의 문자열이 있는 리스트를 RDD로 생성하고 동일한 lambda 함수를 인자로 각각 map과 flatMap을 사용합니다.

map을 사용한 RDD는 lambda 함수로 인해 문자열과 문자열 끝에 s가 붙은 문자열의 쌍으로 이루어져 있지만, flatMap을 사용한 RDD는 모든 문자열이 하나의 리스트 안에 나열되어 있습니다.

각 RDD에 count 함수를 사용하면 결과가 다르게 나오는 것을 확인할 수 있습니다.

이렇게 모든 데이터를 하나의 리스트로 변환해주기 때문에 워드카운트와 같은 작업에 flatMap이 유용하게 사용됩니다.

In [78]:
# Build an RDD from a list of four animal names and display its contents.
animals = 'cat dog elephant tiger'.split()
animalsRdd = spark.sparkContext.parallelize(animals)
animalsRdd.collect()

['cat', 'dog', 'elephant', 'tiger']

In [80]:
def _with_plural(word):
    """Return the pair (word, word + 's')."""
    return (word, word + 's')


# The same function is passed to map and to flatMap so the two results can be compared.
animalsRDDMap = animalsRdd.map(_with_plural)
animalsRDDFlatmap = animalsRdd.flatMap(_with_plural)

print(animalsRDDMap.collect())
print(animalsRDDFlatmap.collect())

[('cat', 'cats'), ('dog', 'dogs'), ('elephant', 'elephants'), ('tiger', 'tigers')]
['cat', 'cats', 'dog', 'dogs', 'elephant', 'elephants', 'tiger', 'tigers']


In [81]:
# Compare the number of elements in the map result and in the flatMap result.
for result in (animalsRDDMap, animalsRDDFlatmap):
    print(result.count())

4
8


In [82]:
# Stop the SparkContext behind the session; the next cell creates a standalone SparkContext.
spark.sparkContext.stop()

In [83]:
from pyspark import SparkContext
# Keep only the words that contain the substring 'interview'.
# NOTE(review): the app name looks like a typo for "Transformation Demo"; left unchanged here.
sc = SparkContext("local", "Transdormation Demo")

words = ["pyspark", "interview", "questions", "at", "interviewbit"]
words_list = sc.parallelize(words)

filtered_words = words_list.filter(lambda word: 'interview' in word)
filtered = filtered_words.collect()
print(filtered)

[Stage 0:>                                                          (0 + 1) / 1]

['interview', 'interviewbit']


                                                                                

In [85]:
# Count the elements of the filtered RDD and report the total.
counts = filtered_words.count()
print("Count of elements in RDD -> ",  counts)

Count of elements in RDD ->  2


In [4]:
# Title : PySpark Script Template
# Description : This template can be used to create pyspark script
# Author : sqlandhadoop.com
# Date : 30-June-2021
# Version : 1.0 (Initial Draft)
# Usage : spark-submit --executor-memory 4G --executor-cores 4 PySpark_Script_Template.py > ./PySpark_Script_Template.log 2>&1 &

In [7]:
# import modules
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import sys, logging
from datetime import datetime

In [None]:
# Logging configuration: send INFO-and-above records from the root logger to
# stdout, formatted as "[time] LEVEL @ line N: message".
# Fixes: the format field is written `%(lineno)d` (the stray space in
# `% (lineno)d` raised an error when a record was formatted) and the method is
# `setLevel` (`setLevle` raised AttributeError).
formatter = logging.Formatter('[%(asctime)s] %(levelname)s @ line %(lineno)d: %(message)s')
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.INFO)
handler.setFormatter(formatter)

logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.addHandler(handler)



In [None]:
# import modules
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import sys,logging
from datetime import datetime

# Logging configuration: the root logger emits INFO-and-above records to stdout
# as "[time] LEVEL @ line N: message".
logger = logging.getLogger()
logger.setLevel(logging.INFO)

handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.INFO)

formatter = logging.Formatter(
    '[%(asctime)s] %(levelname)s @ line %(lineno)d: %(message)s'
)
handler.setFormatter(formatter)
logger.addHandler(handler)

# Application name; change it to your own.
AppName = "MyPySparkApp"
# Timestamp taken once when this code runs, formatted as YYYY_MM_DD_HH_MM_SS.
dt_string = "{:%Y_%m_%d_%H_%M_%S}".format(datetime.now())


# Placeholder step; change or remove it.
def some_function1():
    """Log a message showing that the first placeholder step ran."""
    message = "Inside some_function 1"
    logger.info(message)

# Placeholder step; change or remove it.
def some_function2():
    """Log a message showing that the second placeholder step ran."""
    message = "Inside some_function 2"
    logger.info(message)

def main():
    """Run the template job: start a session, call the placeholder steps,
    read and preview a pipe-delimited file, then stop the session.

    The session is stopped in a ``finally`` block, so it is released even
    when one of the steps raises; the exception still propagates to the caller.

    Returns:
        None
    """
    # start spark code
    spark = SparkSession.builder.appName(AppName+"_"+str(dt_string)).getOrCreate()
    try:
        spark.sparkContext.setLogLevel("ERROR")
        logger.info("Starting spark application")

        # calling function 1
        some_function1()

        # calling function 2
        some_function2()

        # do something here
        logger.info("Reading CSV File")
        df_category = spark.read.option("delimiter","|").csv("hdfs:///var/data/category_pipe.txt")
        logger.info("Previewing CSV File Data")
        df_category.show(truncate=False)

        logger.info("Ending spark application")
    finally:
        # end spark code: always release the session, even after a failure
        spark.stop()
    return None

# Starting point for PySpark: when this file is run as a script, execute main()
# and then exit the interpreter.
if __name__ == '__main__':
    main()
    sys.exit()