# SVM

Import the required modules and initialize access to the Spark instance.

In [1]:
from pyspark import SparkContext, SQLContext

# Reuse the already-running SparkContext when there is one: a bare
# SparkContext() raises ValueError ("Cannot run multiple SparkContexts at
# once") if this cell is executed a second time in the same kernel.
spark_context = SparkContext.getOrCreate()
# SQLContext wraps the context and exposes the DataFrame reader (`.read`)
# that the data-loading cells use.
spark_sql = SQLContext(spark_context)

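LIBSVM format — each line is one sample: the class label followed by sparse `index:value` pairs for its non-zero features: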
```
0 128:51 129:159 130:253 131:159 132:50 155:48 156:238 157:252 158:252 159:252 160:237 182:54 183:227 184:253 185:252 186:239 187:233 188:252 189:57 190:6 208:10 209:60 210:224 211:252 212:253 213:252 214:202 215:84 216:252 217:253 218:122 236:163 237:252 238:252 239:252 240:253 241:252 242:252 243:96 244:189 245:253 246:167 263:51 264:238 265:253 266:253 267:190 268:114 269:253 270:228 271:47 272:79 273:255 274:168 290:48 291:238 292:252 293:252 294:179 295:12 296:75 297:121 298:21 301:253 302:243 303:50 317:38 318:165 319:253 320:233 321:208 322:84 329:253 330:252 331:165 344:7 345:178 346:252 347:240 348:71 349:19 350:28 357:253 358:252 359:195 372:57 373:252 374:252 375:63 385:253 386:252 387:195 400:198 401:253 402:190 413:255 414:253 415:196 427:76 428:246 429:252 430:112 441:253 442:252 443:148 455:85 456:252 457:230 458:25 467:7 468:135 469:253 470:186 471:12 483:85 484:252 485:223 494:7 495:131 496:252 497:225 498:71 511:85 512:252 513:145 521:48 522:165 523:252 524:173 539:86 540:253 541:225 548:114 549:238 550:253 551:162 567:85 568:252 569:249 570:146 571:48 572:29 573:85 574:178 575:225 576:253 577:223 578:167 579:56 595:85 596:252 597:252 598:252 599:229 600:215 601:252 602:252 603:252 604:196 605:130 623:28 624:199 625:252 626:252 627:253 628:252 629:252 630:233 631:145 652:25 653:128 654:252 655:253 656:252 657:141 658:37
1 159:124 160:253 161:255 162:63 186:96 187:244 188:251 189:253 190:62 214:127 215:251 216:251 217:253 218:62 241:68 242:236 243:251 244:211 245:31 246:8 268:60 269:228 270:251 271:251 272:94 296:155 297:253 298:253 299:189 323:20 324:253 325:251 326:235 327:66 350:32 351:205 352:253 353:251 354:126 378:104 379:251 380:253 381:184 382:15 405:80 406:240 407:251 408:193 409:23 432:32 433:253 434:253 435:253 436:159 460:151 461:251 462:251 463:251 464:39 487:48 488:221 489:251 490:251 491:172 515:234 516:251 517:251 518:196 519:12 543:253 544:251 545:251 546:89 570:159 571:255 572:253 573:253 574:31 597:48 598:228 599:253 600:247 601:140 602:8 625:64 626:251 627:253 628:220 653:64 654:251 655:253 656:220 681:24 682:193 683:253 684:220
```

In [2]:
from pyspark.ml.classification import LinearSVC

# Load the sample data set through Spark's "libsvm" reader.
data = spark_sql.read.format("libsvm").load("sample_libsvm_data.txt")

# 60/40 train/test split; the fixed seed (1234) keeps the split reproducible.
splits = data.randomSplit([0.6, 0.4], 1234)
training, test = splits

# DataFrame.show() prints the table itself and returns None, so it must not
# be wrapped in print() -- that emits a stray "None" line after every table.
training.show()
training.filter(training['label'] == 1).show()

# Fit a linear SVM on the training split.
lsvc = LinearSVC(maxIter=10, regParam=0.1)
lsvcModel = lsvc.fit(training)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[121,122,123...|
|  0.0|(692,[122,123,124...|
|  0.0|(692,[122,123,148...|
|  0.0|(692,[123,124,125...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[125,126,127...|
|  0.0|(692,[126,127,128...|
|  0.0|(692,[126,127,128...|
|  0.0|(692,[126,127,128...|
|  0.0|(692,[126,127,128...|
|  0.0|(692,[126,127,128...|
|  0.0|(692,[126,127,128...|
|  0.0|(692,[126,127,128...|
|  0.0|(692,[126,127,128...|
|  0.0|(692,[127,128,129...|
|  0.0|(692,[129,130,131...|
|  0.0|(692,[150,151,152...|
+-----+--------------------+
only showing top 20 rows

None
+-----+--------------------+
|label|            features|
+-----+--------------------+
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[119,120,121...|
|  1.0|(692,[123,124,125...|
|  1.0|(692,[123,124,125...|
|  1.0|(692,[123,124,125...|
|  1.0|(692,[123,124,125...|
|  1.0|(692,

Predicting on test data:

In [3]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Score the held-out split with the fitted SVM model and keep only the two
# columns the evaluator is given.
result = lsvcModel.transform(test)
predictionAndLabels = result.select("prediction", "label")

# Report the "accuracy" metric computed over the selected columns.
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))

Test set accuracy = 1.0


# Multiclass? Why not?

In [4]:
data = spark_sql.read.format("libsvm").load("sample_multiclass_classification_data.txt")

# Seed the 80/20 split so that the rows shown here and the accuracy reported
# later are the same on every Restart & Run All.
training, test = data.randomSplit([0.8, 0.2], seed=1234)

# show() prints the table and returns None; wrapping it in print() would add
# a stray "None" line to the output.
training.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(4,[0,1,2,3],[-0....|
|  0.0|(4,[0,1,2,3],[-0....|
|  0.0|(4,[0,1,2,3],[-0....|
|  0.0|(4,[0,1,2,3],[-0....|
|  0.0|(4,[0,1,2,3],[-0....|
|  0.0|(4,[0,1,2,3],[-0....|
|  0.0|(4,[0,1,2,3],[-0....|
|  0.0|(4,[0,1,2,3],[-0....|
|  0.0|(4,[0,1,2,3],[-1....|
|  0.0|(4,[0,1,2,3],[-1....|
|  0.0|(4,[0,1,2,3],[0.0...|
|  0.0|(4,[0,1,2,3],[0.1...|
|  0.0|(4,[0,1,2,3],[0.1...|
|  0.0|(4,[0,1,2,3],[0.1...|
|  0.0|(4,[0,1,2,3],[0.1...|
|  0.0|(4,[0,1,2,3],[0.1...|
|  0.0|(4,[0,1,2,3],[0.2...|
|  0.0|(4,[0,1,2,3],[0.2...|
|  0.0|(4,[0,1,2,3],[0.2...|
|  0.0|(4,[0,1,2,3],[0.3...|
+-----+--------------------+
only showing top 20 rows

None


1: Create classifier object

In [5]:
# Linear SVM estimator; it is handed to OneVsRest as its classifier in step 2.
lsvc = LinearSVC(
    maxIter=10,
    regParam=0.1,
)

2: Create OneVsRest object

In [6]:
from pyspark.ml.classification import OneVsRest

# Wrap the LinearSVC estimator (`lsvc`) in a OneVsRest estimator so it can be
# fitted on the multiclass training data.
ovr = OneVsRest(classifier=lsvc)

3: Fit training set

In [7]:
# Fit the one-vs-rest estimator on the multiclass training split.
ovrModel = ovr.fit(dataset=training)

4: Predict test set

In [8]:
# Apply the fitted model to the held-out split; the result carries a
# "prediction" column next to "label" and "features".
predictions = ovrModel.transform(dataset=test)

5: Let's check accuracy

In [12]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# show() prints the table itself and returns None, so it is called directly;
# print(predictions.show()) would append a stray "None" line to the output.
predictions.show()

# Keep only the two columns the evaluator is given, then report the
# "accuracy" metric as a percentage.
predictionAndLabels = predictions.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels) * 100) + "%")

+-----+--------------------+----------+
|label|            features|prediction|
+-----+--------------------+----------+
|  0.0|(4,[0,1,2,3],[-0....|       0.0|
|  0.0|(4,[0,1,2,3],[0.0...|       0.0|
|  0.0|(4,[0,1,2,3],[0.1...|       0.0|
|  0.0|(4,[0,1,2,3],[0.1...|       0.0|
|  0.0|(4,[0,1,2,3],[0.1...|       0.0|
|  0.0|(4,[0,1,2,3],[0.1...|       0.0|
|  0.0|(4,[0,1,2,3],[0.1...|       0.0|
|  0.0|(4,[0,1,2,3],[0.4...|       0.0|
|  0.0|(4,[0,1,2,3],[0.5...|       0.0|
|  0.0|(4,[0,1,2,3],[0.8...|       0.0|
|  0.0|(4,[0,1,2,3],[1.0...|       0.0|
|  0.0|(4,[0,2,3],[0.222...|       0.0|
|  1.0|(4,[0,1,2,3],[-0....|       1.0|
|  1.0|(4,[0,1,2,3],[-0....|       1.0|
|  1.0|(4,[0,1,2,3],[-0....|       1.0|
|  1.0|(4,[0,1,2,3],[-0....|       1.0|
|  1.0|(4,[0,1,2,3],[-0....|       1.0|
|  1.0|(4,[0,1,2,3],[-0....|       1.0|
|  1.0|(4,[0,1,2,3],[-0....|       1.0|
|  1.0|(4,[0,1,2,3],[-0....|       1.0|
+-----+--------------------+----------+
only showing top 20 rows

None
Test set 