In [1]:
# Print the path of the `python` executable that the shell resolves first on PATH.
!which python

/opt/conda/bin/python


In [2]:
# Print the version of the `python` executable found by the shell.
!python --version

Python 3.11.6


In [3]:
# Print the version of the Java runtime found by the shell.
!java -version

openjdk version "17.0.8.1" 2023-08-24
OpenJDK Runtime Environment (build 17.0.8.1+1-Ubuntu-0ubuntu122.04)
OpenJDK 64-Bit Server VM (build 17.0.8.1+1-Ubuntu-0ubuntu122.04, mixed mode, sharing)


In [4]:
# Print the path of the `java` executable that the shell resolves first on PATH.
!which java

/usr/bin/java


In [5]:
pip show pyspark

Name: pyspark
Version: 3.5.0
Summary: Apache Spark Python API
Home-page: https://github.com/apache/spark/tree/master/python
Author: Spark Developers
Author-email: dev@spark.apache.org
License: http://www.apache.org/licenses/LICENSE-2.0
Location: /usr/local/spark/python
Requires: py4j
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [6]:
# SPARK_HOME as seen by the kernel process (None when the variable is unset).
import os
os.getenv('SPARK_HOME')

'/usr/local/spark'

In [8]:
# JAVA_HOME as seen by the kernel process (None when the variable is unset).
os.getenv('JAVA_HOME')

In [10]:
# PYTHONPATH as seen by the kernel process (None when the variable is unset).
os.getenv('PYTHONPATH')

'/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip:/usr/local/spark/python:'

In [11]:
import pyspark

In [12]:
from pyspark.sql import SparkSession
# Build (or reuse) a SparkSession through the chained builder API.
spark = (
    SparkSession.builder
    .appName('pyspark example1')
    .getOrCreate()
)
# NOTE(review): the original note here read "SparkContext.SparkSession" — presumably a
# reminder that the session exposes a SparkContext (spark.sparkContext); confirm intent.

In [13]:
# Display the active SparkSession object.
spark

In [14]:
# Stop the current session; the following cell creates a new one with the same app name.
spark.stop()

In [15]:
# Create a session again after the previous one was stopped (chained builder calls).
spark = (
    SparkSession.builder
    .appName('pyspark example1')
    .getOrCreate()
)

In [16]:
# Display the newly created SparkSession object.
spark

In [17]:
# Local (name, value) records; this is an ordinary Python list, not a Spark object.
data = [
    ('Alice', 1),
    ('Bob', 2),
    ('Charlie', 3),
]
type(data)

list

In [20]:
# Create a Spark DataFrame (a distributed object) — this is NOT a pandas DataFrame.
data1 = spark.createDataFrame(data, schema=['Name', 'Value'])
# Integer indexing does not return a cell value as it would in pandas: data1[0] selects
# the first column and [1] builds an item-access expression, so this evaluates to a
# Column expression rather than data.
data1[0][1]

Column<'Name[1]'>

In [21]:
# Print the DataFrame's rows as a text table.
data1.show()

+-------+-----+
|   Name|Value|
+-------+-----+
|  Alice|    1|
|    Bob|    2|
|Charlie|    3|
+-------+-----+



In [40]:
# Keep only the rows whose Name equals 'Bob' and print them.
data1.where(data1['Name'] == 'Bob').show()

+----+-----+
|Name|Value|
+----+-----+
| Bob|    2|
+----+-----+



In [41]:
# Keep only the rows whose Value is greater than 2 and print them.
data1.where(data1['Value'] > 2).show()

+-------+-----+
|   Name|Value|
+-------+-----+
|Charlie|    3|
+-------+-----+



In [42]:
# Register data1 as a temporary view named 'people' so it can be queried via spark.sql.
data1.createOrReplaceTempView('people')

In [44]:
# Query every row of the 'people' temporary view with SQL and print the result.
spark.sql('select * from people').show()

+-------+-----+
|   Name|Value|
+-------+-----+
|  Alice|    1|
|    Bob|    2|
|Charlie|    3|
+-------+-----+



In [45]:
# SQL equivalent of a DataFrame filter on a numeric column: rows where Value is 2.
spark.sql('select * from people where Value = 2').show()

+----+-----+
|Name|Value|
+----+-----+
| Bob|    2|
+----+-----+



In [46]:
# SQL filter on a string column: rows where Name is "Charlie".
spark.sql('select * from people where Name = "Charlie"').show()

+-------+-----+
|   Name|Value|
+-------+-----+
|Charlie|    3|
+-------+-----+



# MLlib

In [47]:
from pyspark.ml.regression import LinearRegression

In [48]:
import numpy as np

In [49]:
from pyspark.ml.feature import VectorAssembler

In [53]:
# Sample (name, age) records for the MLlib example, loaded into a Spark DataFrame.
data_age = [
    ('Alice', 25),
    ('Bob', 30),
    ('Charlie', 33),
]
data2 = spark.createDataFrame(data_age, schema=['Name', 'Age'])
# The last expression displays the DataFrame's repr (column names and types).
data2

DataFrame[Name: string, Age: bigint]

In [57]:
# Pack the listed input columns into a single vector column named 'features'.
assembler = VectorAssembler(
    inputCols=['Age'],
    outputCol='features',
)
vector_df = assembler.transform(data2)
# Display the resulting schema; a 'features' vector column is appended.
vector_df

DataFrame[Name: string, Age: bigint, features: vector]

## 설명
- inputCols: 리스트 형태로 입력 컬럼들을 넣어요. (*여러 개 가능*)    
- outputCol: 벡터로 묶인 결과가 저장될 단일 컬럼명이에요. (*복수형 아님 주의*)

In [59]:
# Fit a linear regression on the assembled features.
# NOTE(review): the only feature is built from 'Age' and the label is also 'Age', so the
# model simply learns the identity mapping; this is an API walkthrough, not a real model.
lr = LinearRegression(
    featuresCol='features',
    labelCol='Age',
)
model = lr.fit(vector_df)

In [60]:
# Apply the fitted model to the same DataFrame it was trained on; the result carries an
# additional 'prediction' column.
pred = model.transform(vector_df)
pred

DataFrame[Name: string, Age: bigint, features: vector, prediction: double]

In [61]:
# Print the rows with their predictions; values match Age up to floating-point error.
pred.show()

+-------+---+--------+-----------------+
|   Name|Age|features|       prediction|
+-------+---+--------+-----------------+
|  Alice| 25|  [25.0]|24.99999999999993|
|    Bob| 30|  [30.0]|30.00000000000001|
|Charlie| 33|  [33.0]|33.00000000000006|
+-------+---+--------+-----------------+



# 병렬로 관리되는 RDD 객체 생성

In [23]:
# Obtain the session for the RDD examples (getOrCreate reuses an active session if any).
spark = (
    SparkSession.builder
    .appName('pyspark example1')
    .getOrCreate()
)

In [24]:
# Distribute a local Python list as an RDD through the session's SparkContext.
rdd = spark.sparkContext.parallelize([1,2,3,4,5])
# Displaying an RDD shows only its repr, not its elements.
rdd

ParallelCollectionRDD[17] at readRDDFromFile at PythonRDD.scala:289

In [25]:
# Display the DataFrame's repr (column names and types) for comparison with the RDD repr.
data1

DataFrame[Name: string, Value: bigint]

In [26]:
# take(n) returns the first n elements of the RDD as a list; n must be given.
rdd.take(5)

[1, 2, 3, 4, 5]

In [27]:
# map operation: apply a function to every element of the RDD (here, squaring it).
squared_rdd = rdd.map(lambda value: value ** 2)
# The cell displays only the new RDD's repr; elements are fetched by take()/collect().
squared_rdd

PythonRDD[20] at RDD at PythonRDD.scala:53

In [30]:
# The source RDD is unchanged by the map above; take the first three elements.
rdd.take(3)

[1, 2, 3]

In [28]:
# First three elements of the squared RDD.
squared_rdd.take(3)

[1, 4, 9]

In [31]:
# collect() returns every element of the RDD as a local list (fine for tiny data like this).
squared_rdd.collect()

[1, 4, 9, 16, 25]

In [62]:
# Stop the session before the Streaming section creates a new one.
spark.stop()

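spark(SparkSession)는 컨텍스트 매니저로 사용할 수 있다: `with SparkSession.builder.appName('pyspark example1').getOrCreate() as spark:` 형태로 쓰면 블록을 벗어날 때 `spark.stop()`이 자동으로 호출된다.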

# Streaming

In [63]:
from pyspark.sql.functions import explode
from pyspark.sql.functions import split

In [64]:
# Create a session for the streaming example (chained builder calls).
spark = (
    SparkSession.builder
    .appName('pyspark example1')
    .getOrCreate()
)

In [65]:
# Define a streaming DataFrame backed by a socket source at localhost:9999.
lines = (
    spark.readStream
    .format('socket')
    .option('host', 'localhost')
    .option('port', 9999)
    .load()  # streaming data
)

In [None]:
# Split each incoming line on single spaces and emit one row per token in column 'word'.
words = lines.select(
    explode(split(lines['value'], ' ')).alias('word')
)

In [66]:
# Stop the session.
# NOTE(review): in the visible cells no streaming query is ever started on `lines`/`words`
# (no writeStream ... start()), so nothing was actually streamed before this stop.
spark.stop()