### Examples of ML


In [1]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem at /content/drive.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [2]:
cd /content/drive/MyDrive/Colab/ssafy_second_pjt/spark

/content/drive/MyDrive/Colab/ssafy_second_pjt/spark


In [3]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 44 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 49.5 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.0-py2.py3-none-any.whl size=281764026 sha256=5dab8e796fb001cbf5315c38e47dc509168ab51f1fdb5b4424dc2f7a917bb42f
  Stored in directory: /root/.cache/pip/wheels/7a/8e/1b/f73a52650d2e5f337708d9f6a1750d451a7349a867f928b885
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.0


In [4]:
from pyspark.sql import SparkSession

# Get the active session if there is one, otherwise create it under this app name.
spark = (
    SparkSession.builder
    .appName('tutorial_06')
    .getOrCreate()
)

In [5]:
# Read the CSV with its first row as the header and let Spark infer column types.
csv_reader = spark.read.options(header=True, inferSchema=True)
training = csv_reader.csv('test1.csv')
training.show()

+--------+---+----------+------+
|    Name|Age|Experience|Salary|
+--------+---+----------+------+
|   Krish| 31|        10| 50000|
|Sudhansh| 30|         7| 40000|
|   Sunny| 29|         2| 32000|
|      Ji| 29|         1| 28000|
|    Park| 26|         3| 27000|
|       Q| 23|         2| 18000|
+--------+---+----------+------+



In [6]:
# Print each column's name, inferred type and nullability.
training.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [7]:
# List the column names of the loaded DataFrame.
training.columns

['Name', 'Age', 'Experience', 'Salary']

### Assembler
+ input columns를 1개의 column으로 합치기

In [8]:
from pyspark.ml.feature import VectorAssembler

# Pack the two numeric predictor columns into a single vector column 'Features'.
assembler = VectorAssembler(
    inputCols=['Age', 'Experience'],
    outputCol='Features',
)

In [9]:
# Apply the assembler: the result keeps the original columns and adds the
# 'Features' vector column.
output = assembler.transform(training)
output.show()

+--------+---+----------+------+-----------+
|    Name|Age|Experience|Salary|   Features|
+--------+---+----------+------+-----------+
|   Krish| 31|        10| 50000|[31.0,10.0]|
|Sudhansh| 30|         7| 40000| [30.0,7.0]|
|   Sunny| 29|         2| 32000| [29.0,2.0]|
|      Ji| 29|         1| 28000| [29.0,1.0]|
|    Park| 26|         3| 27000| [26.0,3.0]|
|       Q| 23|         2| 18000| [23.0,2.0]|
+--------+---+----------+------+-----------+



In [10]:
# Keep only the feature vector and the label column for modelling.
model_columns = ['Features', 'Salary']
final_data = output.select(model_columns)
final_data.show()

+-----------+------+
|   Features|Salary|
+-----------+------+
|[31.0,10.0]| 50000|
| [30.0,7.0]| 40000|
| [29.0,2.0]| 32000|
| [29.0,1.0]| 28000|
| [26.0,3.0]| 27000|
| [23.0,2.0]| 18000|
+-----------+------+



### Train/test split (randomSplit) & linear regression (LR)

In [31]:
from pyspark.ml.regression import LinearRegression

# Train/test split: 0.75 / 0.25 weights, with a fixed seed so the split is repeatable.
train, test = final_data.randomSplit(weights=[0.75, 0.25], seed=1)

# Configure the linear-regression estimator, then fit it on the training rows
# to obtain the model.
lr = LinearRegression(
    featuresCol='Features',
    labelCol='Salary',
)
lr_model = lr.fit(train)

In [32]:
# Coefficients of the fitted model (one value per entry of the 'Features' vector)
lr_model.coefficients

DenseVector([2103.8805, 1881.3175])

In [33]:
# Intercept of the fitted model
lr_model.intercept

-33839.84531270862

In [34]:
# Prediction 1: transform() returns the test rows with a 'prediction' column added.
pred = lr_model.transform(test)
pred.show()

+----------+------+------------------+
|  Features|Salary|        prediction|
+----------+------+------------------+
|[30.0,7.0]| 40000|42445.792772369656|
+----------+------+------------------+



In [35]:
# Show the rows that ended up in the training split.
train.show()

+-----------+------+
|   Features|Salary|
+-----------+------+
| [23.0,2.0]| 18000|
| [26.0,3.0]| 27000|
| [29.0,1.0]| 28000|
| [29.0,2.0]| 32000|
|[31.0,10.0]| 50000|
+-----------+------+



In [39]:
# Prediction 2: using evaluate() instead returns a summary object (pred2) from
# which the predictions, statistics, error metrics and other attributes can be read.
pred2 = lr_model.evaluate(test)
pred2.predictions.show()

+----------+------+------------------+
|  Features|Salary|        prediction|
+----------+------+------------------+
|[30.0,7.0]| 40000|42445.792772369656|
+----------+------+------------------+



In [40]:
# Check what kind of object evaluate() returned.
type(pred2)

pyspark.ml.regression.LinearRegressionSummary

In [41]:
# Mean absolute error and mean squared error from the evaluation summary on the test split.
pred2.meanAbsoluteError, pred2.meanSquaredError

(2445.7927723696557, 5981902.285375646)