# Spark Preparation
First we check whether the notebook is running in Google Colab. If it is, we install all the necessary packages.

To run Spark in Colab, we first need to install its dependencies in the Colab environment: Apache Spark 3.2.1 (built for Hadoop 3.2), Java 8, and findspark, which locates the Spark installation on the system. All of these can be installed from inside the Colab notebook itself.
Learn more from [A Must-Read Guide on How to Work with PySpark on Google Colab for Data Scientists!](https://www.analyticsvidhya.com/blog/2020/11/a-must-read-guide-on-how-to-work-with-pyspark-on-google-colab-for-data-scientists/)

credit: Natawut Nupairoj

In [None]:
# Detect whether the notebook is running inside Google Colab by probing for
# the Colab-only package. Only a failed import means "not in Colab"; a bare
# ``except`` would also swallow KeyboardInterrupt/SystemExit and hide
# unrelated errors.
try:
    import google.colab  # noqa: F401  (imported only to probe availability)
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

In [None]:
if IN_COLAB:
    !apt-get install openjdk-8-jdk-headless -qq > /dev/null
    !wget -q https://dlcdn.apache.org/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
    !tar xf spark-3.2.1-bin-hadoop3.2.tgz
    !mv spark-3.2.1-bin-hadoop3.2 spark
    !pip install -q findspark

In [None]:
# Colab only: export the JDK and Spark locations through the environment.
# NOTE(review): the paths are assumed to match what the install cell produces
# (Spark extracted under /content) -- confirm if the install step changes.
if IN_COLAB:
    import os

    os.environ.update(
        {
            "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
            "SPARK_HOME": "/content/spark",
        }
    )

In [None]:
# Use findspark to locate the Spark installation so that the pyspark imports
# in the following cells can be resolved.
import findspark
findspark.init()

# Pyspark_Classification_Pipeline_Churn

In [None]:
#1 - import module
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import DecisionTreeClassifier, DecisionTreeClassificationModel
from pyspark.ml.feature import StringIndexer, IndexToString, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [None]:
#2 - Create spark context
# getOrCreate() returns the already-running SparkContext if there is one,
# so re-running this cell does not try to start a second context.
sc = SparkContext.getOrCreate()

In [None]:
# Display the SparkContext (the notebook renders the cell's last expression).
sc

In [None]:
# List every (key, value) configuration pair of the running context.
# Uses the public getConf() accessor instead of the private ``_conf`` attribute.
sc.getConf().getAll()

In [None]:
# Print the context's configuration in its printable debug form.
spark_conf = sc.getConf()
print(spark_conf.toDebugString())

In [None]:
#3 - Setup SparkSession (SparkSQL)
# Name the application on the builder, then obtain the session from it.
session_builder = SparkSession.builder.appName("Pyspark_Classification_Pipeline_Churn")
spark = session_builder.getOrCreate()
print(spark)

In [None]:
# Download the churn dataset (churn.csv) into the current working directory.
!wget https://github.com/kaopanboonyuen/GISTDA2022/raw/main/dataset/churn.csv

In [None]:
#4 - Read file to spark DataFrame
# The first CSV line is the header; column types are inferred from the data.
csv_reader = spark.read.option("header", "true").option("inferSchema", "true")
data = csv_reader.csv("churn.csv")

# NOTE: cache() only marks the DataFrame for caching; the data is actually
# materialised by the first action that runs on it.
data.cache()
print("finish caching data")

In [None]:
#5 - Understand data and problems
# Candidate input columns, grouped by kind.
category = [
    'International plan',
    'Voice mail plan',
]
continuous = [
    'Number vmail messages',
    'Total day minutes', 'Total day calls', 'Total day charge',
    'Total eve minutes', 'Total eve calls', 'Total eve charge',
    'Total night minutes', 'Total night calls', 'Total night charge',
    'Total intl minutes', 'Total intl calls', 'Total intl charge',
    'Customer service calls',
]
# Name of the target column.
label = 'churn'

# Columns that are removed before training.
unique_features = ['State', 'Account length', 'Area code']
# Per the original note, the charges are computed from the minutes
# (minutes / 22.2252), so the charge columns are treated as redundant.
unused_features = [
    'Total %s charge' % period for period in ('day', 'eve', 'night', 'intl')
]

# Number of candidate features before anything is removed.
print(len(category) + len(continuous))


In [None]:
# Summary statistics of the columns, converted to pandas for notebook display.
data.describe().toPandas()

In [None]:
# Print the column names together with their (inferred) types.
data.printSchema()

In [None]:
# Peek at a small random sample: without replacement, fraction 0.001, seed 1234.
data.sample(False, 0.001, 1234).toPandas()

In [None]:
# Row count per label value (class distribution).
data.groupBy(label).count().toPandas()

In [None]:
#6 - Change column type from boolean to string
# The label column's schema is printed before and after the cast to compare.
data.select(label).printSchema()
label_as_string = data[label].cast("string")
data = data.withColumn(label, label_as_string)
data.select(label).printSchema()

In [None]:
#8 - Remove unused variables
# Every column that is discarded before training, computed once.
dropped_features = unique_features + unused_features

# Count columns from the schema (``.columns``) instead of ``len(head())``:
# head() launches a Spark job just to count fields and returns None on an
# empty DataFrame, which would make len() raise TypeError.
print ("number of features : " + str(len(data.drop(label).columns)))
for unused_feature in dropped_features:
    print (unused_feature)
# A single drop() call removes all of the columns at once.
data = data.drop(*dropped_features)
print ("\nnumber of features remain : " + str(len(data.drop(label).columns)))


# Keep the feature-name lists in sync with the columns that are left.
category = [feature for feature in category if feature not in dropped_features]
continuous = [feature for feature in continuous if feature not in dropped_features]

print ("\nnumber of features remain : " + str(len(category) + len(continuous)))

In [None]:
#9 - split Train and Test data
data = data.sort(label)
# 70% / 30% random split with a fixed seed.
trainingData, testData = data.randomSplit([0.7, 0.3], seed=50)

# Report type, row count and label distribution for the full set and both splits.
named_frames = [
    ("data", data),
    ("trainingData", trainingData),
    ("testData", testData),
]

for frame_name, frame in named_frames:
    print(type(frame))

for frame_name, frame in named_frames:
    print(f"{frame_name} count : {frame.count()}")

for frame_name, frame in named_frames:
    frame.groupBy(label).count().show()

In [None]:
#10 - String indexer
# One indexer for the target column (written to 'label'), followed by one per
# categorical column (written to '<column>idx').
label_indexer = StringIndexer(inputCol=label, outputCol='label')
category_indexers = [
    StringIndexer(inputCol=name, outputCol=name + 'idx') for name in category
]
featureidx_list = [label_indexer, *category_indexers]

print(featureidx_list)

In [None]:
#11 - Create Vector
# Model inputs: the continuous columns plus the indexed categorical columns.
indexed_category = [name + 'idx' for name in category]
features = continuous + indexed_category
assem = VectorAssembler(inputCols=features, outputCol="features")

print(type(assem))

In [None]:
#12 - Create model
# Decision tree reading the assembled "features" vector and the indexed "label".
dt = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="label",
)

print(dt)


In [None]:
#13 - Set ML pipeline
# Show the individual building blocks first.
for component in (featureidx_list, assem, dt):
    print(component)
print("\n")

# Stage order: string indexers -> vector assembler -> classifier.
all_process_list = [*featureidx_list, assem, dt]
print(all_process_list)

pipeline = Pipeline(stages=all_process_list)
print("\n")
print(pipeline)


In [None]:
#14 - Train model
# Fit all pipeline stages on the training split; the fitted pipeline is kept in `model`.
model = pipeline.fit(trainingData)
#predictions.cache()

In [None]:
#15 - (Optional) Assign multiple parameter lists used to train multiple models
# Grid over depth, minimum instances per node and impurity measure.
grid_builder = ParamGridBuilder()
grid_builder.addGrid(dt.maxDepth, [4, 5, 6])
grid_builder.addGrid(dt.minInstancesPerNode, [1, 10])
grid_builder.addGrid(dt.impurity, ["gini", "entropy"])
paramGrid = grid_builder.build()

# Print every parameter combination, separated by blank lines.
for param in paramGrid:
    print(param)
    print("\n\n")

In [None]:
# #16 - (Optional) Train multiple models with multiple parameters
# crossval = CrossValidator(estimator=pipeline,
#                       estimatorParamMaps=paramGrid,
#                       evaluator=MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1"),
#                       numFolds=3)
# cvModel = crossval.fit(trainingData)
# model = cvModel.bestModel

# print (model)

In [None]:
#17 - Make predictions
# Run the fitted pipeline on the held-out test split.
predictions = model.transform(testData)

In [None]:
# Show the prediction table.
# NOTE: toPandas() collects every row of `predictions` to the driver, not a sample.
predictions.toPandas()

In [None]:
# Show only the model output columns next to the label and the feature vector.
# NOTE: toPandas() collects every selected row to the driver, not a sample.
predictions.select("prediction", "rawPrediction", "probability", "label", "features").toPandas()

In [None]:
#18 - Evaluate model
# Score the test predictions with each metric in turn.
for metricName in ('accuracy', 'weightedPrecision', 'weightedRecall', 'f1'):
    evaluator = MulticlassClassificationEvaluator(
        labelCol="label",
        predictionCol="prediction",
        metricName=metricName,
    )
    result = evaluator.evaluate(predictions)
    print(f'{metricName} = {result:g}')
    if metricName == 'accuracy':
        # The error rate is reported as the complement of accuracy.
        print(f"error = {1.0 - result:g} ")


In [None]:
#19 - Show tree diagram
# The classifier was added as the last pipeline stage, so the last fitted
# stage is the tree model.
fitted_stages = model.stages
treeModel = fitted_stages[-1]
treeModel_debug_str = treeModel.toDebugString
print(treeModel_debug_str)


In [None]:
#20 - Save model
model_dir = "/user/admin/"
modelFile = "dt_churn"
model_base_path = model_dir + modelFile

# Save model as Pipeline model format (an existing save is overwritten)
pipeline_writer = model.write().overwrite()
pipeline_writer.save(model_base_path + ".plmodel")

# Save model as DecisionTree model format (an existing save is overwritten)
tree_writer = treeModel.write().overwrite()
tree_writer.save(model_base_path + ".model")

print("finish save model")


In [None]:
#21 - Load Pipeline model
# Read the saved pipeline back and list its stages.
plmodel_path = model_dir + modelFile + ".plmodel"
read_plmodel = PipelineModel.read().load(plmodel_path)
print(read_plmodel.stages)


In [None]:
#22 - Load DecisionTree model
# Read the saved tree model back and report a few of its attributes.
tree_model_path = model_dir + modelFile + ".model"
read_model = DecisionTreeClassificationModel.read().load(tree_model_path)
for attribute in ("depth", "numNodes", "featureImportances"):
    print(f"{attribute} : {getattr(read_model, attribute)}")


# These attributes are available in Spark 2.1 or above:
# print(read_model.numClasses)
# print(read_model.numFeatures)
