# Logistic Regression

In [2]:
from pyspark.sql import SparkSession

# Start (or reuse) a local Spark session named 'logistic-regression'.
spark = (
    SparkSession.builder
    .master('local')
    .appName('logistic-regression')
    .getOrCreate()
)

In [3]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression

In [22]:
# Toy training set: each row is (label, dense feature vector with three entries).
training = spark.createDataFrame(
    data=[
        (1.0, Vectors.dense([0.0, 1.1, 0.1])),
        (0.0, Vectors.dense([2.0, 1.0, -1.0])),
        (0.0, Vectors.dense([2.0, 1.3, 1.0])),
        (1.0, Vectors.dense([0.0, 1.2, -0.5])),
    ],
    schema=['label', 'features'],
)

training.show()

+-----+--------------+
|label|      features|
+-----+--------------+
|  1.0| [0.0,1.1,0.1]|
|  0.0|[2.0,1.0,-1.0]|
|  0.0| [2.0,1.3,1.0]|
|  1.0|[0.0,1.2,-0.5]|
+-----+--------------+



In [23]:
# Create a LogisticRegression estimator (up to 30 iterations, regParam 0.01).
lr = LogisticRegression(maxIter=30, regParam=0.01)
# Bind the fitted model to `model`: the scoring cell further down calls
# model.transform(test). The earlier name `mode` left `model` undefined when
# the notebook was run top-to-bottom on a fresh kernel.
model = lr.fit(training)

In [25]:
# Rows to score: same (label, features) layout as the training frame.
test = spark.createDataFrame(
    data=[
        (1.0, Vectors.dense([-1.0, 1.5, 1.3])),
        (0.0, Vectors.dense([3.0, 2.0, -0.1])),
        (1.0, Vectors.dense([0.0, 2.2, -1.5])),
    ],
    schema=['label', 'features'],
)

test.show()

+-----+--------------+
|label|      features|
+-----+--------------+
|  1.0|[-1.0,1.5,1.3]|
|  0.0|[3.0,2.0,-0.1]|
|  1.0|[0.0,2.2,-1.5]|
+-----+--------------+



In [26]:
# Score the test rows with the fitted model and display every resulting column.
prediction = model.transform(dataset=test)
prediction.show()

+-----+--------------+--------------------+--------------------+----------+
|label|      features|       rawPrediction|         probability|prediction|
+-----+--------------+--------------------+--------------------+----------+
|  1.0|[-1.0,1.5,1.3]|[-6.2435550918400...|[0.00193916823498...|       1.0|
|  0.0|[3.0,2.0,-0.1]|[5.45228608726759...|[0.99573180142693...|       0.0|
|  1.0|[0.0,2.2,-1.5]|[-4.4104172202339...|[0.01200425500655...|       1.0|
+-----+--------------+--------------------+--------------------+----------+



# Pipeline

In [1]:
from pyspark.sql import SparkSession
# Start (or reuse) a local Spark session named 'logistic-regression'.
spark = (
    SparkSession.builder
    .master('local')
    .appName('logistic-regression')
    .getOrCreate()
)

23/06/11 20:16:39 WARN Utils: Your hostname, Keemyoui-MacBookPro.local resolves to a loopback address: 127.0.0.1; using 192.168.35.79 instead (on interface en0)
23/06/11 20:16:39 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/06/11 20:16:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/06/11 20:16:40 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
from pyspark.ml import Pipeline
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer

In [4]:
# Simple rule for the toy data: the label is 1.0 when the text contains 'spark'.
# NOTE(review): 'maprecedure' looks like a typo for 'mapreduce'; it is left
# unchanged here so the fitted model and the displayed outputs stay the same.
training = spark.createDataFrame(
    data=[
        (0, 'a b c d e spark', 1.0),
        (1, 'b d', 0.0),
        (2, 'spark f g h', 1.0),
        (3, 'hadoop maprecedure', 0.0),
    ],
    schema=['id', 'text', 'label'],
)

In [6]:
# Stage 1: split the 'text' column into tokens, written to a 'words' column.
tokenizer = Tokenizer(inputCol='text', outputCol='words')

# Stage 2: term-frequency hashing. Its input column is whatever column the
# tokenizer writes to, so it is read from the tokenizer instead of hard-coded.
hashingTF = HashingTF(
    inputCol=tokenizer.getOutputCol(),
    outputCol='features',
)

# Stage 3: the classifier fitted on the 'features' column.
lr = LogisticRegression(maxIter=30, regParam=0.001)

In [19]:
# Chain the three stages; fitting the pipeline on the training data yields the model.
pipeline = Pipeline(
    stages=[
        tokenizer,
        hashingTF,
        lr,
    ]
)
model = pipeline.fit(training)

In [22]:
# Unlabelled documents to score: only 'id' and 'text' are provided.
test = spark.createDataFrame(
    data=[
        (4, 'spark i j k'),
        (5, 'l m n'),
        (6, 'spark hadoop spark'),
        (7, 'apache hadoop'),
    ],
    schema=['id', 'text'],
)

In [24]:
# Run the fitted pipeline on the test documents and show only the columns of interest.
prediction = model.transform(test)
prediction.select('id', 'text', 'probability', 'prediction').show()

+---+------------------+--------------------+----------+
| id|              text|         probability|prediction|
+---+------------------+--------------------+----------+
|  4|       spark i j k|[0.63102699631690...|       0.0|
|  5|             l m n|[0.98489377609773...|       0.0|
|  6|spark hadoop spark|[0.13563147748816...|       1.0|
|  7|     apache hadoop|[0.99563405823116...|       0.0|
+---+------------------+--------------------+----------+

