In [1]:
# Run findspark's initialisation before anything imports pyspark.
# NOTE(review): presumably this puts the local Spark installation on sys.path so
# that the `pyspark` import in the next cell resolves - confirm for this environment.
import findspark
findspark.init()



In [2]:
from pyspark.sql import SparkSession
# Build (or reuse) a SparkSession for this notebook, using the "local" master.
spark = (
    SparkSession.builder
    .appName("transformations_actions")
    .master("local")
    .getOrCreate()
)

# Turn on eager evaluation for the SQL REPL on the running session.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

# The RDD API used in the rest of the notebook is reached through the SparkContext.
sc = spark.sparkContext

# Show the effective configuration as a list of (key, value) pairs.
sc.getConf().getAll()

[('spark.master', 'local'),
 ('spark.driver.port', '2991'),
 ('spark.rdd.compress', 'True'),
 ('spark.driver.host', '10.138.59.4'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.executor.id', 'driver'),
 ('spark.app.name', 'transformations_actions'),
 ('spark.submit.deployMode', 'client'),
 ('spark.ui.showConsoleProgress', 'true'),
 ('spark.app.id', 'local-1684989682789')]

In [3]:
# Turn a small in-memory list of dish names (with deliberate duplicates) into an RDD.
foods = sc.parallelize(["짜장면", "마라탕", "짬뽕", "떡볶이", "쌀국수", "짬뽕", "짜장면", "짜장면", "짜장면", "라면", "우동", "라면"])

# collect() brings every element back to the driver as a Python list.
# (A bare `foods` expression used to sit before this line; it was removed because
# only the last expression of a cell is displayed, so it had no effect.)
foods.collect()

['짜장면', '마라탕', '짬뽕', '떡볶이', '쌀국수', '짬뽕', '짜장면', '짜장면', '짜장면', '라면', '우동', '라면']

In [4]:
# Count how many times each distinct dish occurs; the result is a dict-like
# mapping of value -> count on the driver.
foods.countByValue()

defaultdict(int,
            {'짜장면': 4,
             '마라탕': 1,
             '짬뽕': 2,
             '떡볶이': 1,
             '쌀국수': 1,
             '라면': 2,
             '우동': 1})

In [5]:
# Fetch only the first 3 elements as a list instead of collecting the whole RDD.
foods.take(3)

['짜장면', '마라탕', '짬뽕']

In [6]:
# Return the first element of the RDD as a single value (not wrapped in a list).
foods.first()

'짜장면'

In [7]:
# Total number of elements in the RDD, duplicates included.
foods.count()

12

In [8]:
# distinct() drops duplicate dishes; collect() then returns the unique values as a list.
foods.distinct().collect()

['짜장면', '마라탕', '짬뽕', '떡볶이', '쌀국수', '라면', '우동']

In [12]:
# foreach() applies the function to every element purely for its side effect and
# returns nothing to the notebook.
# NOTE(review): no output is recorded for this cell - presumably print() runs in the
# Spark worker process, so its text goes to the worker's stdout and not to the
# notebook; confirm, and iterate over foods.collect() if the values should be shown here.
foods.foreach(lambda x: print(x))

In [14]:
# map() applies the function to each element one-to-one: here every number gets 2 added.
sc.parallelize([1, 2, 3]).map(lambda x: x + 2).collect()

[3, 4, 5]

In [16]:
# Same one-to-one map() pattern, this time doubling each element.
sc.parallelize([1, 2, 3]).map(lambda x: x * 2).collect()

[2, 4, 6]

In [19]:
# Movie titles used for the flatMap / filter examples; most contain spaces.
movies = [
    "그린 북", "매트릭스", "토이 스토리",
    "캐스트 어웨이", "포드 V 페라리", "보헤미안 랩소디",
    "빽 투 더 퓨처", "반지의 제왕", "죽은 시인의 사회",
]

# Distribute the titles as an RDD and echo them back to check the contents.
moviesRDD = sc.parallelize(movies)
moviesRDD.collect()

['그린 북',
 '매트릭스',
 '토이 스토리',
 '캐스트 어웨이',
 '포드 V 페라리',
 '보헤미안 랩소디',
 '빽 투 더 퓨처',
 '반지의 제왕',
 '죽은 시인의 사회']

In [21]:
# flatMap() maps each title to a list of words and flattens all the lists into a
# single RDD of words (one-to-many, unlike map()).
# str.split() with no argument splits on any run of whitespace, so a title with
# leading, trailing or doubled spaces cannot produce empty-string "words", which
# split(" ") would. For the titles above the result is identical.
flatMovies = moviesRDD.flatMap(lambda title: title.split())
flatMovies.collect()

['그린',
 '북',
 '매트릭스',
 '토이',
 '스토리',
 '캐스트',
 '어웨이',
 '포드',
 'V',
 '페라리',
 '보헤미안',
 '랩소디',
 '빽',
 '투',
 '더',
 '퓨처',
 '반지의',
 '제왕',
 '죽은',
 '시인의',
 '사회']

In [22]:
# Keep every word except the exact token "매트릭스".
filteredMovies = flatMovies.filter(
    lambda word: word != "매트릭스"
)
filteredMovies.collect()

['그린',
 '북',
 '토이',
 '스토리',
 '캐스트',
 '어웨이',
 '포드',
 'V',
 '페라리',
 '보헤미안',
 '랩소디',
 '빽',
 '투',
 '더',
 '퓨처',
 '반지의',
 '제왕',
 '죽은',
 '시인의',
 '사회']

In [24]:
# Two integer RDDs for the set-style operations; 4 is the only value present in both.
num1 = sc.parallelize([1, 2, 3, 4])
num2 = sc.parallelize([4, 5, 6, 7, 8, 9, 10])

In [27]:
# Set-style operations on the two RDDs.
# Only the last expression of a cell is displayed, so the intersection and union
# results below are computed and then discarded; the recorded output is the
# subtract() result alone.
# intersection(): elements present in both RDDs.
num1.intersection(num2).collect()

# union(): all elements of num1 and num2 together.
num1.union(num2).collect()
# subtract(): elements of num1 that do not appear in num2 (this is what is displayed).
num1.subtract(num2).collect()

[2, 1, 3]

In [29]:
# Keep the union of num1 and num2 under a name so the sampling cell can use it.
numUnion = num1.union(num2)


In [32]:
# Draw a random sample with replacement (an element may be picked more than once).
# The fixed seed makes the draw repeatable.
numUnion.sample(withReplacement=True, fraction=0.5, seed=2).collect()

[1, 1, 2, 4, 4, 5, 6, 7, 9]

In [56]:
# Rebind `foods` to a longer list (17 dishes) for the groupBy examples.
# This replaces the 12-element RDD of the same name defined earlier.
foods = sc.parallelize([
    "짜장면", "마라탕", "짬뽕", "떡볶이", "쌀국수", "짬뽕",
    "짜장면", "짜장면", "짜장면", "라면", "우동", "라면",
    "치킨", "돈까스", "회", "햄버거", "피자",
])

In [57]:
# groupBy: bucket the dishes by the key the function returns - here the first
# character of each name. The result pairs each key with an iterable of members.
foodsGroup = foods.groupBy(lambda x: x[0])

In [59]:
# Bring the grouped result to the driver as a list of (key, iterable-of-members) pairs.
res = foodsGroup.collect()

In [60]:
# Show each group key followed by its members; the member iterable is turned into
# a list so the actual dish names are printed.
for k, v in res:
    print(f"{k} {list(v)}")

짜 ['짜장면', '짜장면', '짜장면', '짜장면']
마 ['마라탕']
짬 ['짬뽕', '짬뽕']
떡 ['떡볶이']
쌀 ['쌀국수']
라 ['라면', '라면']
우 ['우동']
치 ['치킨']
돈 ['돈까스']
회 ['회']
햄 ['햄버거']
피 ['피자']


In [65]:
nums = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

# Group by remainder mod 2 (0 = even, 1 = odd) and turn the (key, members) pairs
# into a dict. The odd group is then looked up by its key rather than by its
# position in the collected list, because the order in which groupBy() returns
# groups is not guaranteed.
groups_by_parity = dict(nums.groupBy(lambda x: x % 2).collect())
list(groups_by_parity[1])

[1, 3, 5, 7, 9]

In [77]:
# Group by a composite (tuple) key: (is the name longer than 2 characters?, first character).
r = foods.groupBy(lambda x: (len(x) > 2, x[0]))
# collect() shows each group's members as an opaque ResultIterable object;
# wrap a value in list(...) to see the dish names it holds.
r.collect()

[((True, '짜'), <pyspark.resultiterable.ResultIterable at 0x1a27954d9e8>),
 ((True, '마'), <pyspark.resultiterable.ResultIterable at 0x1a27954da90>),
 ((False, '짬'), <pyspark.resultiterable.ResultIterable at 0x1a27954db00>),
 ((True, '떡'), <pyspark.resultiterable.ResultIterable at 0x1a27954dac8>),
 ((True, '쌀'), <pyspark.resultiterable.ResultIterable at 0x1a27954db70>),
 ((False, '라'), <pyspark.resultiterable.ResultIterable at 0x1a27954dba8>),
 ((False, '우'), <pyspark.resultiterable.ResultIterable at 0x1a27954dbe0>),
 ((False, '치'), <pyspark.resultiterable.ResultIterable at 0x1a27954dc18>),
 ((True, '돈'), <pyspark.resultiterable.ResultIterable at 0x1a27954dc50>),
 ((False, '회'), <pyspark.resultiterable.ResultIterable at 0x1a27954dc88>),
 ((True, '햄'), <pyspark.resultiterable.ResultIterable at 0x1a27954dcc0>),
 ((False, '피'), <pyspark.resultiterable.ResultIterable at 0x1a27954dcf8>)]