**Part 3: Applied Machine Learning**

Preliminaries

In [None]:
!apt-get update
!apt-get install -y openjdk-8-jdk-headless -qq  
!apt-get install maven -qq 

!curl -L "https://archive.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz" > spark-2.4.5-bin-hadoop2.7.tgz 
!tar -xvf spark-2.4.5-bin-hadoop2.7.tgz 
!pip install -q findspark 


In [None]:
# --- Standard library -------------------------------------------------------
import os
from os import path

# Locations of the JDK and of the unpacked Spark distribution.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-2.4.5-bin-hadoop2.7",
})

# findspark.init() runs after the environment variables are set and before the
# pyspark imports; presumably it is what makes pyspark importable from SPARK_HOME.
import findspark
findspark.init()

# --- PySpark ----------------------------------------------------------------
from pyspark import SparkContext, SparkFiles
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import (
    IndexToString,
    StringIndexer,
    VectorAssembler,
    VectorIndexer,
)
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import DoubleType, StringType, StructType

# Local session using all cores, with the driver memory set to 16g.
session_builder = SparkSession.builder.master("local[*]")
session_builder = session_builder.config("spark.driver.memory", "16g")
spark = session_builder.getOrCreate()

# Handle on the SparkContext (get-or-create, so no second context is forced).
sc = SparkContext.getOrCreate()

In [None]:
if os.getcwd().split("/")[-1]=="src":
  %cd ..
dir = os.getcwd()
!mkdir src 
!mkdir out 
%cd ./src 

/content/src


**Task 1**

Download the Iris data set

In [None]:
# Location of the Iris data file.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"

# Explicit schema: four nullable double measurements followed by the nullable
# string class label.
schema = StructType()
for measurement in ("sepal_length", "sepal_width", "petal_length", "petal_width"):
    schema = schema.add(measurement, DoubleType(), True)
schema = schema.add("class", StringType(), True)

# Register the remote file with Spark, then read the fetched copy. The local
# file name is derived from the URL so the two cannot drift apart.
spark.sparkContext.addFile(url)
# inferSchema is not passed: the column types are fixed by the schema above.
iris = spark.read.schema(schema).csv(SparkFiles.get(os.path.basename(url)), sep=',')

**Task 2**

Predict with Spark

- Data Processing

In [None]:
# Names of the numeric measurement columns and of the label column.
num_cols = [
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
]
labelCol = 'class'

# Pack the measurement columns into a single vector column named 'features'.
assembler = VectorAssembler(outputCol='features', inputCols=num_cols)
output = assembler.transform(iris)

# Keep only the assembled vector and the label for training.
final_data = output.select('features', labelCol)

- Training

In [None]:
# Fit the label indexer up front: its learned labels are needed further down
# to map numeric predictions back to the original class names.
labelIndexer = StringIndexer(inputCol=labelCol, outputCol='indexedLabel').fit(final_data)

# Index the feature vector. maxCategories=4: per the VectorIndexer docs,
# vector slots with at most that many distinct values are treated as categorical.
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=4,
).fit(final_data)

# Classifier trained on the indexed features and the indexed label.
logr = LogisticRegression(featuresCol='indexedFeatures', labelCol='indexedLabel')

# Translate the numeric 'prediction' column back into the original label strings.
labelConverter = IndexToString(
    inputCol="prediction",
    outputCol="predictedLabel",
    labels=labelIndexer.labels,
)

# Chain the stages and fit the whole pipeline on the training data.
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, logr, labelConverter])
model = pipeline.fit(final_data)

- Predicting

In [None]:
# Two hand-written measurement rows to classify.
pred_rows = [
    (5.1, 3.5, 1.4, 0.2),
    (6.2, 3.4, 5.4, 2.3),
]
pred_columns = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
pred_data = spark.createDataFrame(pred_rows, pred_columns)

# Build the 'features' vector with the same assembler used on the training
# data, then pass only that column through the fitted pipeline.
output = assembler.transform(pred_data)
pred_features = output.select("features")
predictions = model.transform(pred_features)

In [None]:
# Display only the predicted class name for each input row.
predicted_labels = predictions.select("predictedLabel")
predicted_labels.show()

+--------------+
|predictedLabel|
+--------------+
|   Iris-setosa|
|Iris-virginica|
+--------------+



In [None]:
# Keep only the predicted label and expose it under the column name "class".
DFpredicted = predictions.select("predictedLabel").withColumnRenamed("predictedLabel", "class")

# Save as CSV under <parent>/out. The "overwrite" mode replaces output left by
# an earlier run, so a re-run always reflects the current predictions, and any
# genuine write failure is raised instead of being silently swallowed.
# NOTE(review): Spark's CSV writer is expected to create a directory at this
# path containing part files, not a single text file — confirm this is intended.
DFpredicted.write.mode("overwrite").csv(dir+"/out/out_3_2.txt")

File already created
