In [4]:
!apt-get install openjdk-8-jdk-headless
!wget -q https://dlcdn.apache.org/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3.tgz
!tar -zxf spark-3.5.1-bin-hadoop3.tgz

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libxtst6 openjdk-8-jre-headless
Suggested packages:
  openjdk-8-demo openjdk-8-source libnss-mdns fonts-dejavu-extra fonts-nanum fonts-ipafont-gothic
  fonts-ipafont-mincho fonts-wqy-microhei fonts-wqy-zenhei fonts-indic
The following NEW packages will be installed:
  libxtst6 openjdk-8-jdk-headless openjdk-8-jre-headless
0 upgraded, 3 newly installed, 0 to remove and 45 not upgraded.
Need to get 39.7 MB of archives.
After this operation, 144 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 libxtst6 amd64 2:1.2.3-1build4 [13.4 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 openjdk-8-jre-headless amd64 8u402-ga-2ubuntu1~22.04 [30.8 MB]
Get:3 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 openjdk-8-jdk-headless amd64 8u402-ga-2ubuntu1~22.04 [8,873 kB]

In [5]:
import os

# Export the locations of the Java runtime and the unpacked Spark distribution
# so that later cells can find them through the process environment.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.5.1-bin-hadoop3",
    }
)

In [6]:
!pip install findspark -q

In [7]:
import findspark
# Make the Spark installation's Python packages importable from this kernel.
# Called without arguments, so findspark has to locate Spark on its own
# (presumably via the SPARK_HOME environment variable — confirm it is set first).
findspark.init()

In [8]:
import pyspark

# Report which PySpark version the kernel actually imported.
spark_version = pyspark.__version__
print("Apache Spark 버전 확인: ", spark_version, sep="")

Apache Spark 버전 확인: 3.5.1


In [9]:
from pyspark.sql import SparkSession
import seaborn as sns

# Create (or reuse) a Spark session named "regression".
session_builder = SparkSession.builder.appName("regression")
spark = session_builder.getOrCreate()
# Leave the session as the cell's last expression so its summary is displayed.
spark

In [10]:
# Load seaborn's "tips" example dataset.
tips = sns.load_dataset("tips")
# Convert it into a Spark DataFrame for the rest of the section.
tips_df = spark.createDataFrame(tips)
# Preview the first row to check the column layout.
tips_df.show(1)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
+----------+----+------+------+---+------+----+
only showing top 1 row



## 데이터 변환
- 머신러닝을 수행하기 위해 반드시 거쳐야 하는 과정
- VectorAssembler :  https://spark.apache.org/docs/3.5.1/api/python/reference/api/pyspark.ml.feature.VectorAssembler.html

In [13]:
from pyspark.ml.feature import VectorAssembler

# The frame has no "features" column yet, so build one here.
# The target variable is "tip".
feature_columns = ["total_bill", "size"]  # numeric columns

# Pack the selected columns into a single vector column named "features".
assembler = VectorAssembler(outputCol="features", inputCols=feature_columns)
df = assembler.transform(tips_df)
df.show(n=1)

+----------+----+------+------+---+------+----+-----------+
|total_bill| tip|   sex|smoker|day|  time|size|   features|
+----------+----+------+------+---+------+----+-----------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|[16.99,2.0]|
+----------+----+------+------+---+------+----+-----------+
only showing top 1 row



In [14]:
# Keep only the assembled feature vector and the "tip" column.
train = df.select('features', 'tip')
# Preview the first row of the reduced frame.
train.show(1)

+-----------+----+
|   features| tip|
+-----------+----+
|[16.99,2.0]|1.01|
+-----------+----+
only showing top 1 row



## 데이터셋 분리
- `randomSplit`은 층화추출이 아니라 가중치 비율에 따른 단순 무작위 분할을 수행함 (`seed`를 지정하면 재현 가능)

In [15]:
# Split the assembled frame (features + tip) rather than the raw tips_df: the
# "features" vector column only exists after VectorAssembler has been applied,
# and the regression estimator cannot be fit without it.
train_data, test_data = train.randomSplit([0.8, 0.2], seed=42)

## 회귀모형 학습

In [None]:
# LinearRegression lives in pyspark.ml.regression; there is no pyspark.regression module.
from pyspark.ml.regression import LinearRegression

# The estimator reads its predictors from the "features" vector column, so
# assemble that column here when the training split only carries raw columns.
lr_train = (
    train_data
    if "features" in train_data.columns
    else assembler.transform(train_data)
)

# labelCol must be given explicitly: it defaults to "label", but the target here is "tip".
lr = LinearRegression(
    featuresCol="features",
    labelCol="tip",
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)
lr_model = lr.fit(lr_train)



# Classification

In [16]:
# Stop the current Spark session; a new one is created for the classification section.
spark.stop()

In [17]:
from pyspark.sql import SparkSession
import seaborn as sns

# Create (or reuse) a Spark session named "classification".
session_builder = SparkSession.builder.appName("classification")
spark = session_builder.getOrCreate()
# Leave the session as the cell's last expression so its summary is displayed.
spark

In [18]:
# Reload seaborn's "tips" example dataset.
tips = sns.load_dataset("tips")
# Convert it into a Spark DataFrame for the classification section.
df = spark.createDataFrame(tips)
# Preview the first row to check the column layout.
df.show(1)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
+----------+----+------+------+---+------+----+
only showing top 1 row



## 문자 데이터 처리

In [21]:
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline

# String columns to encode as numeric indices (each gets a "<name>_index" column).
categorical_columns = ['sex', 'smoker', 'day', 'time']

# Keep the indexers unfitted: Pipeline.fit() fits every stage itself, so fitting
# them here as well would scan the data twice and would freeze the label mapping
# before the pipeline ever sees the data.
indexers = [
    StringIndexer(inputCol=column, outputCol=column + "_index")
    for column in categorical_columns
]

pipeline = Pipeline(stages=indexers)
tips_df = pipeline.fit(df).transform(df)

tips_df.show(1)


+----------+----+------+------+---+------+----+---------+------------+---------+----------+
|total_bill| tip|   sex|smoker|day|  time|size|sex_index|smoker_index|day_index|time_index|
+----------+----+------+------+---+------+----+---------+------------+---------+----------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|      1.0|         0.0|      1.0|       0.0|
+----------+----+------+------+---+------+----+---------+------------+---------+----------+
only showing top 1 row



## VectorAssembler 사용
- features : 독립변수
- target 변수 구분

In [25]:
# Columns packed into the "features" vector; "sex_index" is deliberately not among them.
classifier_inputs = ['total_bill', 'tip', 'size', 'smoker_index', 'day_index', 'time_index']
assembler = VectorAssembler(outputCol='features', inputCols=classifier_inputs)

# Append the assembled vector column to the indexed frame and preview one row.
train = assembler.transform(tips_df)
train.show(n=1)

+----------+----+------+------+---+------+----+---------+------------+---------+----------+--------------------+
|total_bill| tip|   sex|smoker|day|  time|size|sex_index|smoker_index|day_index|time_index|            features|
+----------+----+------+------+---+------+----+---------+------------+---------+----------+--------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|      1.0|         0.0|      1.0|       0.0|[16.99,1.01,2.0,0...|
+----------+----+------+------+---+------+----+---------+------------+---------+----------+--------------------+
only showing top 1 row



## 최종 데이터셋

In [26]:
# Keep only the feature vector and the indexed "sex" column.
final_data = train.select("features", "sex_index")
# Preview the first row of the modelling frame.
final_data.show(1)

+--------------------+---------+
|            features|sex_index|
+--------------------+---------+
|[16.99,1.01,2.0,0...|      1.0|
+--------------------+---------+
only showing top 1 row



## 데이터셋 분리 및 모형 학습

In [27]:
from pyspark.ml.classification import LogisticRegression
# Randomly split with 0.8 / 0.2 weights; the fixed seed keeps the split reproducible.
splits = final_data.randomSplit([0.8, 0.2], seed=42)
train_data, test_data = splits

# Fit a logistic regression that predicts "sex_index" from the assembled features.
lr = LogisticRegression(labelCol='sex_index', featuresCol='features')
lr_model = lr.fit(train_data)

# Show the fitted parameters.
print("Coefficients: \n" + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

Coefficients: 
[-0.029542135575379266,0.050979340226452716,-0.0055652867997908265,0.19993263323453486,0.08135603506558281,0.6843956307498666]
Intercept: -0.5273389488527473


## 모형평가

In [28]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Score the held-out rows with the fitted model.
predictions = lr_model.transform(test_data)

# Binary evaluator with its default metric, reported below as area under ROC.
evaluator = BinaryClassificationEvaluator(labelCol='sex_index')
print('Test Area Under ROC', evaluator.evaluate(predictions))

# Accuracy computed from the same predictions.
accuracy_evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="sex_index",
)
accuracy = accuracy_evaluator.evaluate(predictions)
print("Accuracy: %.3f" % accuracy)

Test Area Under ROC 0.65625
Accuracy: 0.690
