**From the documentation**

In [2]:
import org.apache.spark.ml.classification.LogisticRegression

// Read the sample LIBSVM file into a DataFrame.
val training = spark.read
  .format("libsvm")
  .load("./datasets/sample_libsvm_data.txt")

// Estimator capped at 10 iterations, with regParam 0.3 and elasticNetParam 0.8.
val lr = new LogisticRegression().setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8)

// Train on the data loaded above.
val lrModel = lr.fit(training)

// Show the fitted coefficients and intercept.
println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}")

// Same hyper-parameters as `lr`, but with the family set to "multinomial",
// which can also be used for a binary problem.
val mlr = new LogisticRegression()
  .setFamily("multinomial")
  .setMaxIter(10)
  .setRegParam(0.3)
  .setElasticNetParam(0.8)

val mlrModel = mlr.fit(training)

// The multinomial model exposes a coefficient matrix and an intercept vector
// instead of a single coefficient vector and scalar intercept.
println(s"Multinomial coefficients: ${mlrModel.coefficientMatrix}")
println(s"Multinomial intercepts: ${mlrModel.interceptVector}")

Coefficients: (692,[244,263,272,300,301,328,350,351,378,379,405,406,407,428,433,434,455,456,461,462,483,484,489,490,496,511,512,517,539,540,568],[-7.353983524188197E-5,-9.102738505589466E-5,-1.9467430546904298E-4,-2.0300642473486668E-4,-3.1476183314863995E-5,-6.842977602660743E-5,1.5883626898239883E-5,1.4023497091372047E-5,3.5432047524968605E-4,1.1443272898171087E-4,1.0016712383666666E-4,6.014109303795481E-4,2.840248179122762E-4,-1.1541084736508837E-4,3.85996886312906E-4,6.35019557424107E-4,-1.1506412384575676E-4,-1.5271865864986808E-4,2.804933808994214E-4,6.070117471191634E-4,-2.008459663247437E-4,-1.421075579290126E-4,2.739010341160883E-4,2.7730456244968115E-4,-9.838027027269332E-5,-3.808522443517704E-4,-2.5315198008555033E-4,2.7747714770754307E-4,-2.443619763919199E-4,-0.0015394744687597765,-2.3073328411331293E-4]) Intercept: 0.22456315961250325
Multinomial coefficients: 2 x 692 CSCMatrix
(0,244) 4.290365458958277E-5
(1,244) -4.290365458958294E-5
(0,263) 6.488313287833108E-5
(1,263)

import org.apache.spark.ml.classification.LogisticRegression
training: org.apache.spark.sql.DataFrame = [label: double, features: vector]
lr: org.apache.spark.ml.classification.LogisticRegression = logreg_cc65aa20ed1d
lrModel: org.apache.spark.ml.classification.LogisticRegressionModel = LogisticRegressionModel: uid=logreg_cc65aa20ed1d, numClasses=2, numFeatures=692
mlr: org.apache.spark.ml.classification.LogisticRegression = logreg_705b41b798d7
mlrModel: org.apache.spark.ml.classification.LogisticRegressionModel = LogisticRegressionModel: uid=logreg_705b41b798d7, numClasses=2, numFeatures=692


**Real dataset**

In [3]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.log4j._

// Only let ERROR-level messages through for loggers under the "org" namespace,
// which keeps the notebook output readable.
Logger.getLogger("org").setLevel(Level.ERROR)
// Obtain the session handle through the builder's getOrCreate().
val spark = SparkSession.builder().getOrCreate()

import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.log4j._
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@701ece27


In [4]:
// Load the Titanic CSV: the first line is a header row, and column types are
// inferred from the data rather than all being read as strings.
val data = spark.read
  .options(Map("header" -> "true", "inferSchema" -> "true"))
  .csv("./datasets/titanic.csv")

// Preview the first rows.
data.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|      

data: org.apache.spark.sql.DataFrame = [PassengerId: int, Survived: int ... 10 more fields]


In [5]:
// Print the column names and the types that schema inference assigned to them.
data.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [6]:
// Fetch the first row to the driver (as an Array[Row]) to eyeball a full record.
data.head(1)

res4: Array[org.apache.spark.sql.Row] = Array([1,0,3,Braund, Mr. Owen Harris,male,22.0,1,0,A/5 21171,7.25,null,S])


In [7]:
// Keep only the columns used for modelling; `Survived` is the target and is
// renamed to `label`.
val df = data.select(
  $"Survived".as("label"),
  $"Pclass",
  $"Sex",
  $"Age",
  $"SibSp",
  $"Parch",
  $"Fare",
  $"Embarked"
)
df.show()

+-----+------+------+----+-----+-----+-------+--------+
|label|Pclass|   Sex| Age|SibSp|Parch|   Fare|Embarked|
+-----+------+------+----+-----+-----+-------+--------+
|    0|     3|  male|22.0|    1|    0|   7.25|       S|
|    1|     1|female|38.0|    1|    0|71.2833|       C|
|    1|     3|female|26.0|    0|    0|  7.925|       S|
|    1|     1|female|35.0|    1|    0|   53.1|       S|
|    0|     3|  male|35.0|    0|    0|   8.05|       S|
|    0|     3|  male|null|    0|    0| 8.4583|       Q|
|    0|     1|  male|54.0|    0|    0|51.8625|       S|
|    0|     3|  male| 2.0|    3|    1| 21.075|       S|
|    1|     3|female|27.0|    0|    2|11.1333|       S|
|    1|     2|female|14.0|    1|    0|30.0708|       C|
|    1|     3|female| 4.0|    1|    1|   16.7|       S|
|    1|     1|female|58.0|    0|    0|  26.55|       S|
|    0|     3|  male|20.0|    0|    0|   8.05|       S|
|    0|     3|  male|39.0|    1|    5| 31.275|       S|
|    0|     3|female|14.0|    0|    0| 7.8542|  

df: org.apache.spark.sql.DataFrame = [label: int, Pclass: int ... 6 more fields]


In [10]:
// `df` still has missing values (e.g. null ages), so discard incomplete rows
// before building features.
val final_df = df.na.drop()

final_df: org.apache.spark.sql.DataFrame = [label: int, Pclass: int ... 6 more fields]


In [11]:
import org.apache.spark.ml.feature.{VectorAssembler, StringIndexer, VectorIndexer, OneHotEncoder}
import org.apache.spark.ml.linalg.Vectors

import org.apache.spark.ml.feature.{VectorAssembler, StringIndexer, VectorIndexer, OneHotEncoder}
import org.apache.spark.ml.linalg.Vectors


In [13]:
// Index the two string-valued categorical columns into numeric index columns:
// Sex -> SexIndex, Embarked -> EmbarkIndex.
val genderIndexer = new StringIndexer().setInputCol("Sex").setOutputCol("SexIndex")
val embarkIndexer = new StringIndexer().setInputCol("Embarked").setOutputCol("EmbarkIndex")

genderIndexer: org.apache.spark.ml.feature.StringIndexer = strIdx_916515b42705
embarkIndexer: org.apache.spark.ml.feature.StringIndexer = strIdx_501bc4cfd167


In [14]:
// One-hot encode the index columns produced by the string indexers:
// SexIndex -> SexVec, EmbarkIndex -> EmbarkVec.
val genderEncoder = new OneHotEncoder().setInputCol("SexIndex").setOutputCol("SexVec")
val embarkEncoder = new OneHotEncoder().setInputCol("EmbarkIndex").setOutputCol("EmbarkVec")

genderEncoder: org.apache.spark.ml.feature.OneHotEncoder = oneHotEncoder_d9bdf150cd58
embarkEncoder: org.apache.spark.ml.feature.OneHotEncoder = oneHotEncoder_2989a5eb4db6


In [15]:
// Combine the numeric columns and the one-hot vectors into a single
// `features` vector column.
val assembler = {
  val featureCols = Array("Pclass", "SexVec", "Age", "SibSp", "Parch", "Fare", "EmbarkVec")
  new VectorAssembler()
    .setInputCols(featureCols)
    .setOutputCol("features")
}

assembler: org.apache.spark.ml.feature.VectorAssembler = VectorAssembler: uid=vecAssembler_38bf00bc7e8b, handleInvalid=error, numInputCols=7


In [16]:
// Randomly split the cleaned rows with weights 0.7 / 0.3 into a training and a
// test set; the seed is fixed at 42 so the split is meant to be repeatable.
val Array(train, test) = final_df.randomSplit(Array(0.7, 0.3), seed=42)

train: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [label: int, Pclass: int ... 6 more fields]
test: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [label: int, Pclass: int ... 6 more fields]


In [17]:
// Fresh logistic-regression estimator with no parameters overridden. This
// redefines the `lr` from the documentation example earlier in the notebook.
// NOTE(review): relies on the estimator's default column names matching the
// `label` column and the assembler's `features` output — confirm.
val lr = new LogisticRegression()

lr: org.apache.spark.ml.classification.LogisticRegression = logreg_3d4c5a01dfba


In [18]:
import org.apache.spark.ml.Pipeline

// Chain the stages in the order they must run: index the string columns,
// one-hot encode the indices, assemble the feature vector, then fit the classifier.
val pipeline = new Pipeline().setStages(
  Array(
    genderIndexer,
    embarkIndexer,
    genderEncoder,
    embarkEncoder,
    assembler,
    lr
  )
)

import org.apache.spark.ml.Pipeline
pipeline: org.apache.spark.ml.Pipeline = pipeline_c5a0454cf328


In [19]:
// Fit all pipeline stages (indexers, encoders, assembler, logistic regression)
// on the training split only.
val model = pipeline.fit(train)

model: org.apache.spark.ml.PipelineModel = pipeline_c5a0454cf328


In [20]:
// Run the fitted pipeline over the held-out test split; the result carries the
// original columns plus the columns added by the stages, including `prediction`.
val results = model.transform(test)

results: org.apache.spark.sql.DataFrame = [label: int, Pclass: int ... 14 more fields]


In [21]:
import org.apache.spark.mllib.evaluation.MulticlassMetrics

import org.apache.spark.mllib.evaluation.MulticlassMetrics


In [22]:
// MulticlassMetrics works on an RDD of (prediction, label) pairs, so keep only
// those two columns, view them as Double tuples and drop down to the RDD.
val predictionAndLabels =
  results.select($"prediction", $"label").as[(Double, Double)].rdd

predictionAndLabels: org.apache.spark.rdd.RDD[(Double, Double)] = MapPartitionsRDD[198] at rdd at <console>:38


In [23]:
// RDD-based evaluator built from the (prediction, label) pairs of the test set.
val metrics = new MulticlassMetrics(predictionAndLabels)

metrics: org.apache.spark.mllib.evaluation.MulticlassMetrics = org.apache.spark.mllib.evaluation.MulticlassMetrics@6093dfb6


In [24]:
// Confusion matrix of the test-set predictions.
// NOTE(review): per the Spark docs, rows should be the actual labels and columns
// the predicted labels, both in ascending label order — confirm before interpreting.
metrics.confusionMatrix

res7: org.apache.spark.mllib.linalg.Matrix =
83.0  22.0
20.0  63.0


In [25]:
// Overall accuracy on the test set: correctly predicted rows / all rows.
metrics.accuracy

res8: Double = 0.776595744680851
