<a href="https://colab.research.google.com/github/hyelin606/spark/blob/main/ch01_basic_240415_%EC%98%A4%ED%9B%84.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Google Drive 연동

In [1]:
from google.colab import drive

# Directory inside the Colab VM where Google Drive is mounted.
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


## Spark 설치

In [2]:
# Install a headless Java 8 JDK for Spark. -y answers the confirmation prompt,
# -qq plus the redirect keep the package-manager log out of the notebook
# (errors are still printed on stderr).
!apt-get install -y -qq openjdk-8-jdk-headless > /dev/null
# Download Spark 3.5.1 from the permanent Apache archive. dlcdn.apache.org only
# hosts the current releases, so a version-pinned URL there stops working as soon
# as a newer release ships (and `wget -q` would hide that failure).
# -nc skips the download when the tarball already exists, so re-running is safe.
!wget -q -nc https://archive.apache.org/dist/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3.tgz
# Unpack the distribution into the current working directory.
!tar -zxf spark-3.5.1-bin-hadoop3.tgz

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libxtst6 openjdk-8-jre-headless
Suggested packages:
  openjdk-8-demo openjdk-8-source libnss-mdns fonts-dejavu-extra fonts-nanum fonts-ipafont-gothic
  fonts-ipafont-mincho fonts-wqy-microhei fonts-wqy-zenhei fonts-indic
The following NEW packages will be installed:
  libxtst6 openjdk-8-jdk-headless openjdk-8-jre-headless
0 upgraded, 3 newly installed, 0 to remove and 45 not upgraded.
Need to get 39.7 MB of archives.
After this operation, 144 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 libxtst6 amd64 2:1.2.3-1build4 [13.4 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 openjdk-8-jre-headless amd64 8u402-ga-2ubuntu1~22.04 [30.8 MB]
Get:3 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 openjdk-8-jdk-headless amd64 8u402-ga-2ubuntu1~22.04 [8,873 kB]

In [3]:
import os

# Tell the Spark tooling where the JDK and the unpacked Spark distribution live.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.5.1-bin-hadoop3",
    }
)

In [4]:
# %pip (rather than !pip) installs into the environment of the running kernel;
# the version is pinned so a fresh run resolves the same package.
%pip install -q findspark==2.0.1

In [5]:
import findspark
# Called without arguments, so findspark has to discover the Spark installation
# on its own (presumably through the SPARK_HOME variable set in an earlier
# cell — confirm against the findspark documentation).
findspark.init()

In [6]:
import pyspark

# Sanity check: report the version of the pyspark package that was imported.
spark_version = pyspark.__version__
print("Apache Spark 버전 확인: ", spark_version, sep="")

Apache Spark 버전 확인: 3.5.1


## 기본예제 1. 회귀 (Regression)

In [11]:
from pyspark.sql import SparkSession
import seaborn as sns

# Create (or reuse) the Spark session for the regression example.
spark = (
    SparkSession.builder
    .appName("regression")
    .getOrCreate()
)
# A bare trailing expression makes the notebook display the session object.
spark

## 데이터 불러오기

In [12]:
# Load seaborn's "tips" sample dataset and hand it to Spark as a DataFrame.
tips = sns.load_dataset("tips")
tips_df = spark.createDataFrame(data=tips)
# Print only the first row to confirm the columns.
tips_df.show(n=1)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
+----------+----+------+------+---+------+----+
only showing top 1 row



## 데이터 변환
- 머신러닝 수행을 위해 반드시 해야하는 과정
- VectorAssembler : https://spark.apache.org/docs/3.5.1/api/python/reference/api/pyspark.ml.feature.VectorAssembler.html

In [14]:
from pyspark.ml.feature import VectorAssembler

# The data has no 'features' column yet, so it is created here.
# The target variable is 'tip'.
feature_columns = ['total_bill', 'size']  # numeric columns

# Pack the listed columns into a single vector column named 'features'.
assembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol='features',
)
df = assembler.transform(tips_df)
df.show(n=1)
# String columns such as sex and day have to be handled separately.

+----------+----+------+------+---+------+----+-----------+
|total_bill| tip|   sex|smoker|day|  time|size|   features|
+----------+----+------+------+---+------+----+-----------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|[16.99,2.0]|
+----------+----+------+------+---+------+----+-----------+
only showing top 1 row



In [15]:
# Keep only the assembled feature vector and the 'tip' target column.
train = df.select('features', 'tip')
# Print the first row to confirm the reduced layout.
train.show(1)

+-----------+----+
|   features| tip|
+-----------+----+
|[16.99,2.0]|1.01|
+-----------+----+
only showing top 1 row



## 데이터셋 분리
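- `randomSplit`은 단순 무작위 분할이며, 층화추출(stratified split)은 디폴트로 지원하지 않음
  + pyspark 내부에는 층화 train/test 분리를 한 번에 해주는 메서드가 존재 X
  + 층화추출이 필요하면 `DataFrame.sampleBy` 등을 이용해 직접 사용자 정의 함수를 개발하여 적용해야 함
    - 불편하기에 pyspark로 층화 분리 코드를 직접 짜는 사람은 거의 없음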

In [19]:
# Random split into two DataFrames with weights 0.8 / 0.2; the fixed seed is
# there so the same split can be requested again.
train_data, test_data = train.randomSplit([0.8, 0.2], seed=42)

## 회귀모형 학습

In [20]:
# Peek at the first row of the training split.
train_data.show(1)

+----------+---+
|  features|tip|
+----------+---+
|[3.07,1.0]|1.0|
+----------+---+
only showing top 1 row



In [30]:
from pyspark.ml.regression import LinearRegression

# Regularised linear regression of 'tip' on the assembled feature vector.
lr = LinearRegression(
    featuresCol='features',
    labelCol='tip',
    maxIter=10,           # upper bound on optimiser iterations
    regParam=0.3,         # regularisation strength
    elasticNetParam=0.8,  # elastic-net mixing: 0 = pure L2, 1 = pure L1
)
lr_model = lr.fit(train_data)

In [32]:
# Fitted coefficients, one per entry of the 'features' vector.
lr_model.coefficients

DenseVector([0.0702, 0.1024])

In [33]:
# Fitted intercept term of the regression model.
lr_model.intercept

1.3529536822318204

## 예측

In [34]:
# Score the held-out split with the fitted model.
predictions = lr_model.transform(test_data)
# Print the first scored row.
predictions.show(1)

+----------+---+------------------+
|  features|tip|        prediction|
+----------+---+------------------+
|[7.25,1.0]|1.0|1.9641720903635371|
+----------+---+------------------+
only showing top 1 row



In [35]:
from pyspark.ml.evaluation import RegressionEvaluator

# Compare the 'prediction' column against the true 'tip' values using RMSE.
evaluator = RegressionEvaluator(
    metricName='rmse',
    predictionCol='prediction',
    labelCol='tip',
)
rmse = evaluator.evaluate(predictions)
# Display the metric as the cell result.
rmse

1.0336986607859564

## 기본예제 2. 분류 (Classification)

In [36]:
# Shut down the session used by the regression example; the next cell creates
# a new one for the classification example.
spark.stop()

In [37]:
from pyspark.sql import SparkSession
import seaborn as sns

# Create (or reuse) the Spark session for the classification example.
spark = (
    SparkSession.builder
    .appName("classification")
    .getOrCreate()
)
# A bare trailing expression makes the notebook display the session object.
spark

In [38]:
# Reload the "tips" sample and register it with the new Spark session.
tips = sns.load_dataset("tips")
# NOTE: 'df' is rebound here to the raw tips data (in the regression example it
# held the frame with the assembled 'features' column).
df = spark.createDataFrame(tips)
df.show(1)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
+----------+----+------+------+---+------+----+
only showing top 1 row



## 문자 데이터 처리

In [42]:
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline

# String-valued columns that need a numeric encoding before modelling.
categorical_columns = ['sex', 'smoker', 'day', 'time']

# One *unfitted* StringIndexer per column, each writing to '<column>_index'.
# The Pipeline fits every stage itself, so fitting each indexer up front as
# well would only add redundant passes over the data.
indexers = [
    StringIndexer(inputCol=column, outputCol=column + "_index")
    for column in categorical_columns
]

pipeline = Pipeline(stages=indexers)
# fit() learns the string -> index mapping of every stage; transform() appends
# the '<column>_index' columns to the original frame.
tips_df = pipeline.fit(df).transform(df)

tips_df.show(10)

+----------+----+------+------+---+------+----+---------+------------+---------+----------+
|total_bill| tip|   sex|smoker|day|  time|size|sex_index|smoker_index|day_index|time_index|
+----------+----+------+------+---+------+----+---------+------------+---------+----------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|      1.0|         0.0|      1.0|       0.0|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|      0.0|         0.0|      1.0|       0.0|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|      0.0|         0.0|      1.0|       0.0|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|      0.0|         0.0|      1.0|       0.0|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|      1.0|         0.0|      1.0|       0.0|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|      0.0|         0.0|      1.0|       0.0|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|      0.0|         0.0|      1.0|       0.0|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|      0.0|         0.0|      1.0|

## VectorAssembler 사용
- features: 독립변수
- target 변수 구분

In [44]:
# Predictor columns for the classifier: the numeric columns plus the indexed
# categorical ones. 'sex_index' is not included in this list.
classification_inputs = ['total_bill', 'tip', 'size', 'smoker_index', 'day_index', 'time_index']
assembler = VectorAssembler(inputCols=classification_inputs, outputCol='features')

# Append the packed 'features' vector column and print the first row.
train = assembler.transform(tips_df)
train.show(n=1)

+----------+----+------+------+---+------+----+---------+------------+---------+----------+--------------------+
|total_bill| tip|   sex|smoker|day|  time|size|sex_index|smoker_index|day_index|time_index|            features|
+----------+----+------+------+---+------+----+---------+------------+---------+----------+--------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|      1.0|         0.0|      1.0|       0.0|[16.99,1.01,2.0,0...|
+----------+----+------+------+---+------+----+---------+------------+---------+----------+--------------------+
only showing top 1 row



## 최종 데이터셋

In [45]:
# Reduce the frame to the feature vector and the 'sex_index' column.
final_data = train.select("features", "sex_index")
# Print the first row to confirm the final layout.
final_data.show(1)

+--------------------+---------+
|            features|sex_index|
+--------------------+---------+
|[16.99,1.01,2.0,0...|      1.0|
+--------------------+---------+
only showing top 1 row



## 데이터셋 분리 및 모형 학습

In [46]:
from pyspark.ml.classification import LogisticRegression

# Random 0.8 / 0.2 split with a fixed seed, then fit on the first part.
train_data, test_data = final_data.randomSplit([0.8, 0.2], seed=42)
lr = LogisticRegression(labelCol='sex_index', featuresCol='features')
lr_model = lr.fit(train_data)

# Report the fitted parameters.
print("Coefficients: \n", lr_model.coefficients, sep="")
print("Intercept: ", lr_model.intercept, sep="")

Coefficients: 
[-0.029542135575379266,0.050979340226452716,-0.0055652867997908265,0.19993263323453486,0.08135603506558281,0.6843956307498666]
Intercept: -0.5273389488527473


## 모형 평가

In [47]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Score the held-out split with the fitted classifier.
predictions = lr_model.transform(test_data)

# Binary evaluator left on its default metric, reported below as area under ROC.
evaluator = BinaryClassificationEvaluator(labelCol='sex_index')
test_auc = evaluator.evaluate(predictions)
print('Test Area Under ROC', test_auc)

# Accuracy: compares the 'prediction' column with the 'sex_index' label.
accuracy_evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="sex_index",
)
accuracy = accuracy_evaluator.evaluate(predictions)
print("Accuracy: %.3f" % accuracy)

Test Area Under ROC 0.65625
Accuracy: 0.690
