In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("DataFrame").config("spark.sql.repl.eagerEval.enabled", True).getOrCreate()
spark

In [3]:
from sklearn.datasets import fetch_california_housing
import pandas as pd
import numpy as np
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
import warnings
warnings.filterwarnings('ignore')

housing = fetch_california_housing()
df = pd.DataFrame(data=housing.data, columns=housing.feature_names)
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [4]:
df['PRICE'] = housing.target

df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,PRICE
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [5]:
## Pandas DataFrame을 Spark DataFrame으로 변환
spark_df = spark.createDataFrame(df)
spark_df.limit(5)

MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,PRICE
8.3252,41.0,6.984126984126984,1.0238095238095235,322.0,2.555555555555556,37.88,-122.23,4.526
8.3014,21.0,6.238137082601054,0.9718804920913884,2401.0,2.109841827768014,37.86,-122.22,3.585
7.2574,52.0,8.288135593220339,1.073446327683616,496.0,2.8022598870056497,37.85,-122.24,3.521
5.6431,52.0,5.817351598173516,1.0730593607305936,558.0,2.547945205479452,37.85,-122.25,3.413
3.8462,52.0,6.281853281853282,1.0810810810810811,565.0,2.1814671814671813,37.85,-122.25,3.422


In [6]:
print(spark_df.columns[:-1])

['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']


In [9]:
featureAssembler = VectorAssembler(inputCols=spark_df.columns[:-1], outputCol="features")
output = featureAssembler.transform(spark_df)
output.limit(5)

MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,PRICE,features
8.3252,41.0,6.984126984126984,1.0238095238095235,322.0,2.555555555555556,37.88,-122.23,4.526,"[8.3252,41.0,6.98..."
8.3014,21.0,6.238137082601054,0.9718804920913884,2401.0,2.109841827768014,37.86,-122.22,3.585,"[8.3014,21.0,6.23..."
7.2574,52.0,8.288135593220339,1.073446327683616,496.0,2.8022598870056497,37.85,-122.24,3.521,"[7.2574,52.0,8.28..."
5.6431,52.0,5.817351598173516,1.0730593607305936,558.0,2.547945205479452,37.85,-122.25,3.413,"[5.6431,52.0,5.81..."
3.8462,52.0,6.281853281853282,1.0810810810810811,565.0,2.1814671814671813,37.85,-122.25,3.422,"[3.8462,52.0,6.28..."


In [10]:
## feature scaling
from pyspark.ml.feature import StandardScaler

scaler = StandardScaler(inputCol="features", outputCol="standardized")
data_scaled = scaler.fit(output).transform(output)
data_scaled.show(5)

+------+--------+------------------+------------------+----------+------------------+--------+---------+-----+--------------------+--------------------+
|MedInc|HouseAge|          AveRooms|         AveBedrms|Population|          AveOccup|Latitude|Longitude|PRICE|            features|        standardized|
+------+--------+------------------+------------------+----------+------------------+--------+---------+-----+--------------------+--------------------+
|8.3252|    41.0| 6.984126984126984|1.0238095238095237|     322.0|2.5555555555555554|   37.88|  -122.23|4.526|[8.3252,41.0,6.98...|[4.38209539419521...|
|8.3014|    21.0| 6.238137082601054|0.9718804920913884|    2401.0| 2.109841827768014|   37.86|  -122.22|3.585|[8.3014,21.0,6.23...|[4.36956790291790...|
|7.2574|    52.0| 8.288135593220339| 1.073446327683616|     496.0|2.8022598870056497|   37.85|  -122.24|3.521|[7.2574,52.0,8.28...|[3.82004265529144...|
|5.6431|    52.0|5.8173515981735155|1.0730593607305936|     558.0| 2.5479452054794

In [11]:
## final data
ml_data = data_scaled.select("standardized", "PRICE")
ml_data.show(5, truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|standardized                                                                                                                                            |PRICE|
+--------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|[4.382095394195214,3.257702301608302,2.8228125480951687,2.160341990754154,0.2843362208866197,0.246056553095331,17.734477624640416,-61.00726959606983]   |4.526|
|[4.369567902917905,1.6685792276530327,2.521301756615352,2.050766464104957,2.120159212263273,0.20314189867185023,17.72511412008675,-61.002278409814735]  |3.585|
|[3.8200426552914455,4.1317199922837,3.349860792340827,2.265080684038648,0.4379837439744204,0.269809986002839,17.720432367809916,-61.01226078232493]     |3.521|
|[2.9703313456713363,4.13171999228

In [12]:
# train/test split
train_data, test_data = ml_data.randomSplit([0.75, 0.25])
train_data.count(), test_data.count()

(15592, 5048)

In [13]:
# pyspark.ml 을 이용한 선형회귀
regressor = LinearRegression(featuresCol="standardized", labelCol='PRICE')
model = regressor.fit(train_data)

In [14]:
model.intercept

-36.57716042470901

In [15]:
# Prediction
pred = model.transform(test_data)
pred.show(5)

+--------------------+-----+------------------+
|        standardized|PRICE|        prediction|
+--------------------+-----+------------------+
|[0.42172378199072...|0.818|1.0389370471856125|
|[0.42246069324233...|1.125|0.9844144050449941|
|[0.42503988262295...|1.375|1.5149826648179925|
|[0.47430766344463...|0.861|1.4602404938952702|
|[0.50036273984072...|1.875| 1.396282995398714|
+--------------------+-----+------------------+
only showing top 5 rows



In [16]:
model.summary.meanAbsoluteError, model.summary.meanSquaredError, model.summary.r2

(0.5326023721765731, 0.5255341830480305, 0.6053519350315231)

In [17]:
## spark DataFrame을 numpy array로 변경
y_test = np.array(test_data.select('PRICE').collect())
y_pred = np.array(pred.select('prediction').collect())

In [19]:
from sklearn.metrics import r2_score
# R2 계산 
print("결정계수: {:.2f}".format(r2_score(y_test, y_pred)))

결정계수: 0.60


## 2. Linear Regression - Tip 금액 예측
- computing -> cluster -> library 에서 `seaborn` install

In [22]:
import seaborn as sns
tips = sns.load_dataset("tips")
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [23]:
## spark dataframe으로 변환
df = spark.createDataFrame(tips)
df.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [24]:
df.groupBy('time').count().show()
df.groupBy('smoker').count().show()
df.groupBy('day').count().show()
df.groupBy('time').count().show()

+------+-----+
|  time|count|
+------+-----+
|Dinner|  176|
| Lunch|   68|
+------+-----+

+------+-----+
|smoker|count|
+------+-----+
|    No|  151|
|   Yes|   93|
+------+-----+

+----+-----+
| day|count|
+----+-----+
| Sun|   76|
| Sat|   87|
|Thur|   62|
| Fri|   19|
+----+-----+

+------+-----+
|  time|count|
+------+-----+
|Dinner|  176|
| Lunch|   68|
+------+-----+



### Categorical Feature를 One-Hot 벡터로 변환

In [25]:
## 여러개의 feature를 한번에 index 변환
indexer = StringIndexer(inputCols=['sex', 'smoker', 'day', 'time'], outputCols=['sex_', 'smoker_', 'day_', 'time_']).fit(df)
df_r = indexer.transform(df)
df_r.show(n=100)

+----------+----+------+------+----+------+----+----+-------+----+-----+
|total_bill| tip|   sex|smoker| day|  time|size|sex_|smoker_|day_|time_|
+----------+----+------+------+----+------+----+----+-------+----+-----+
|     16.99|1.01|Female|    No| Sun|Dinner|   2| 1.0|    0.0| 1.0|  0.0|
|     10.34|1.66|  Male|    No| Sun|Dinner|   3| 0.0|    0.0| 1.0|  0.0|
|     21.01| 3.5|  Male|    No| Sun|Dinner|   3| 0.0|    0.0| 1.0|  0.0|
|     23.68|3.31|  Male|    No| Sun|Dinner|   2| 0.0|    0.0| 1.0|  0.0|
|     24.59|3.61|Female|    No| Sun|Dinner|   4| 1.0|    0.0| 1.0|  0.0|
|     25.29|4.71|  Male|    No| Sun|Dinner|   4| 0.0|    0.0| 1.0|  0.0|
|      8.77| 2.0|  Male|    No| Sun|Dinner|   2| 0.0|    0.0| 1.0|  0.0|
|     26.88|3.12|  Male|    No| Sun|Dinner|   4| 0.0|    0.0| 1.0|  0.0|
|     15.04|1.96|  Male|    No| Sun|Dinner|   2| 0.0|    0.0| 1.0|  0.0|
|     14.78|3.23|  Male|    No| Sun|Dinner|   2| 0.0|    0.0| 1.0|  0.0|
|     10.27|1.71|  Male|    No| Sun|Dinner|   2| 0.

In [26]:
df_r.select(['sex_']).distinct().show()
df_r.select(['smoker_']).distinct().show()
df_r.select(['day_']).distinct().show()
df_r.select(['time_']).distinct().show()

+----+
|sex_|
+----+
| 0.0|
| 1.0|
+----+

+-------+
|smoker_|
+-------+
|    0.0|
|    1.0|
+-------+

+----+
|day_|
+----+
| 1.0|
| 0.0|
| 2.0|
| 3.0|
+----+

+-----+
|time_|
+-----+
|  0.0|
|  1.0|
+-----+



In [27]:
print(df_r.columns)

['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size', 'sex_', 'smoker_', 'day_', 'time_']


In [28]:
from pyspark.ml.feature import OneHotEncoder

## OneHot encoding
ohe_encoder = OneHotEncoder(inputCols=['sex_', 'smoker_', 'day_', 'time_'], outputCols=['sex_ohe', 'smoker_ohe', 'day_ohe', 'time_ohe']).fit(df_r)
df_ohe = ohe_encoder.transform(df_r)
df_ohe.show(5)

+----------+----+------+------+---+------+----+----+-------+----+-----+-------------+-------------+-------------+-------------+
|total_bill| tip|   sex|smoker|day|  time|size|sex_|smoker_|day_|time_|      sex_ohe|   smoker_ohe|      day_ohe|     time_ohe|
+----------+----+------+------+---+------+----+----+-------+----+-----+-------------+-------------+-------------+-------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2| 1.0|    0.0| 1.0|  0.0|    (1,[],[])|(1,[0],[1.0])|(3,[1],[1.0])|(1,[0],[1.0])|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3| 0.0|    0.0| 1.0|  0.0|(1,[0],[1.0])|(1,[0],[1.0])|(3,[1],[1.0])|(1,[0],[1.0])|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3| 0.0|    0.0| 1.0|  0.0|(1,[0],[1.0])|(1,[0],[1.0])|(3,[1],[1.0])|(1,[0],[1.0])|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2| 0.0|    0.0| 1.0|  0.0|(1,[0],[1.0])|(1,[0],[1.0])|(3,[1],[1.0])|(1,[0],[1.0])|
|     24.59|3.61|Female|    No|Sun|Dinner|   4| 1.0|    0.0| 1.0|  0.0|    (1,[],[])|(1,[0],[1.0])|(3,[1

In [29]:
## feature vector화
feature_Assembler = VectorAssembler(inputCols=['total_bill', 'size', 'sex_ohe', 'smoker_ohe', 'day_ohe', 'time_ohe'], outputCol='features')
output = feature_Assembler.transform(df_ohe)
output.show()

+----------+----+------+------+---+------+----+----+-------+----+-----+-------------+-------------+-------------+-------------+--------------------+
|total_bill| tip|   sex|smoker|day|  time|size|sex_|smoker_|day_|time_|      sex_ohe|   smoker_ohe|      day_ohe|     time_ohe|            features|
+----------+----+------+------+---+------+----+----+-------+----+-----+-------------+-------------+-------------+-------------+--------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2| 1.0|    0.0| 1.0|  0.0|    (1,[],[])|(1,[0],[1.0])|(3,[1],[1.0])|(1,[0],[1.0])|[16.99,2.0,0.0,1....|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3| 0.0|    0.0| 1.0|  0.0|(1,[0],[1.0])|(1,[0],[1.0])|(3,[1],[1.0])|(1,[0],[1.0])|[10.34,3.0,1.0,1....|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3| 0.0|    0.0| 1.0|  0.0|(1,[0],[1.0])|(1,[0],[1.0])|(3,[1],[1.0])|(1,[0],[1.0])|[21.01,3.0,1.0,1....|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2| 0.0|    0.0| 1.0|  0.0|(1,[0],[1.0])|(1,[0],[1.0])|(3,[1],

In [30]:
## feature scaling
scaler = StandardScaler(inputCol="features", outputCol="standardized")
data_scaled = scaler.fit(output).transform(output)
data_scaled.show(5)

+----------+----+------+------+---+------+----+----+-------+----+-----+-------------+-------------+-------------+-------------+--------------------+--------------------+
|total_bill| tip|   sex|smoker|day|  time|size|sex_|smoker_|day_|time_|      sex_ohe|   smoker_ohe|      day_ohe|     time_ohe|            features|        standardized|
+----------+----+------+------+---+------+----+----+-------+----+-----+-------------+-------------+-------------+-------------+--------------------+--------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2| 1.0|    0.0| 1.0|  0.0|    (1,[],[])|(1,[0],[1.0])|(3,[1],[1.0])|(1,[0],[1.0])|[16.99,2.0,0.0,1....|[1.90847155648990...|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3| 0.0|    0.0| 1.0|  0.0|(1,[0],[1.0])|(1,[0],[1.0])|(3,[1],[1.0])|(1,[0],[1.0])|[10.34,3.0,1.0,1....|[1.16148298376136...|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3| 0.0|    0.0| 1.0|  0.0|(1,[0],[1.0])|(1,[0],[1.0])|(3,[1],[1.0])|(1,[0],[1.0])|[21.01,3.0,1.0,1....|[2

In [31]:
data = data_scaled.select("standardized", "tip")
data.show(5, truncate=False)

+------------------------------------------------------------------------------------------------------------------------+----+
|standardized                                                                                                            |tip |
+------------------------------------------------------------------------------------------------------------------------+----+
|[1.9084715564899046,2.102828735795046,0.0,2.0547931158599058,0.0,2.1549470973741336,0.0,2.225805748475487]              |1.01|
|[1.1614829837613665,3.154243103692569,2.083476457489454,2.0547931158599058,0.0,2.1549470973741336,0.0,2.225805748475487]|1.66|
|[2.3600345733874573,3.154243103692569,2.083476457489454,2.0547931158599058,0.0,2.1549470973741336,0.0,2.225805748475487]|3.5 |
|[2.659953293565682,2.102828735795046,2.083476457489454,2.0547931158599058,0.0,2.1549470973741336,0.0,2.225805748475487] |3.31|
|[2.762172782465377,4.205657471590092,0.0,2.0547931158599058,0.0,2.1549470973741336,0.0,2.22580574847548

In [32]:
# train/test split
train_data, test_data = data.randomSplit([0.75, 0.25])
train_data.count(), test_data.count()

(181, 63)

In [33]:
## Linear Regression
regressor = LinearRegression(featuresCol="standardized", labelCol='tip')
model = regressor.fit(train_data)

In [34]:
model.coefficients, model.intercept

(DenseVector([0.9666, 0.1016, -0.0959, 0.0355, -0.0526, -0.0302, -0.0832, -0.0445]),
 0.8926421380188156)

In [35]:
### prediction
pred = model.transform(test_data)
pred.show(5)

+--------------------+----+------------------+
|        standardized| tip|        prediction|
+--------------------+----+------------------+
|[1.73211485586076...|1.57|2.4893981825009464|
|[2.07022547299052...| 3.0|3.0299263105497456|
|[1.16035969267455...|1.67| 2.243339174029719|
|[2.01293762756321...|4.08|2.7163570220486433|
|[2.20726698558132...| 3.0| 3.103934214498545|
+--------------------+----+------------------+
only showing top 5 rows



In [36]:
model.summary.meanSquaredError, model.summary.meanAbsoluteError, model.summary.r2

(1.014077828741951, 0.7345702090253423, 0.49354727285377686)