# Advanced Analytics and Machine Learning

In [None]:
import pandas as pd
from pyspark.ml.classification import *
from pyspark.ml.clustering import KMeans, BisectingKMeans, GaussianMixture, LDA
from pyspark.ml.evaluation import BinaryClassificationEvaluator, RegressionEvaluator
from pyspark.ml.feature import *
from pyspark.ml.linalg import Vectors
from pyspark.ml import Pipeline
from pyspark.ml.recommendation import ALS
from pyspark.ml.regression import *
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit, CrossValidator
from pyspark.mllib.evaluation import BinaryClassificationMetrics, RegressionMetrics, RankingMetrics
from pyspark.sql import Row, SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [None]:
# Create (or reuse) the local SparkSession that every later cell relies on.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Hello World")
    .getOrCreate()
)

## RFormula

In [None]:
# Retail CSVs: header row, inferred schema, coalesced to 5 partitions,
# keeping only rows that have a Description.
sales = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("../data/retail-data/by-day/*.csv")
    .coalesce(5)
    .where("Description IS NOT NULL")
)

# Small helper datasets used by the feature-transformer examples below.
fakeIntDF = spark.read.parquet("../data/simple-ml-integers")
simpleDF = spark.read.json("../data/simple-ml")
scaleDF = spark.read.parquet("../data/simple-ml-scaling")

# Preview the first two rows of each dataset, in load order.
for frame in (sales, fakeIntDF, simpleDF, scaleDF):
    frame.show(2)

In [None]:
# RFormula: "lab" is the label, "." pulls in every other column, and the two
# extra terms add color:value1 and color:value2 interactions.
supervised = RFormula().setFormula("lab ~ . + color:value1 + color:value2")
supervised.fit(simpleDF).transform(simpleDF).take(2)

## SQL Transformers

In [None]:
# SQLTransformer runs the statement against the input DataFrame, which the
# statement refers to through the __THIS__ placeholder.
basicTransformation = SQLTransformer(statement="""
        SELECT sum(Quantity), count(*), CustomerID
        FROM __THIS__
        GROUP BY CustomerID
      """)
basicTransformation.transform(sales).show()

## VectorAssembler

In [None]:
# Assemble the three integer columns of fakeIntDF into one vector column.
va = VectorAssembler(inputCols=["int1", "int2", "int3"])
va.transform(fakeIntDF).show()

## Bucketing

In [None]:
# Bucket the ids 0..19 (cast to double) using explicitly chosen borders.
contDF = spark.range(20).selectExpr("cast(id as double)")
bucketBorders = [-1.0, 5.0, 10.0, 250.0, 600.0]
bucketer = Bucketizer(
    splits=bucketBorders,
    inputCol="id",
    outputCol="result",
)
bucketer.transform(contDF).show(5)

In [None]:
# Same column, but QuantileDiscretizer derives the 5 bucket borders in fit().
bucketer = QuantileDiscretizer(numBuckets=5, inputCol="id", outputCol="result")
bucketer.fit(contDF).transform(contDF).show(5)

## StandardScaler

In [None]:
# Fit a StandardScaler on the "features" column of scaleDF and show its output.
sScaler = StandardScaler(inputCol="features")
sScaler.fit(scaleDF).transform(scaleDF).show()

## MinMaxScaler

In [None]:
# Fit a MinMaxScaler configured with min=5 and max=10 on "features".
minMax = MinMaxScaler(min=5, max=10, inputCol="features")
fittedminMax = minMax.fit(scaleDF)
fittedminMax.transform(scaleDF).show()

## MaxAbsScaler

In [None]:
# Fit a MaxAbsScaler on "features", then apply the fitted model to scaleDF.
maScaler = MaxAbsScaler(inputCol="features")
fittedmaScaler = maScaler.fit(scaleDF)
fittedmaScaler.transform(scaleDF).show()

## Normalizer

In [None]:
# Normalizer with p=1 (the norm the variable name calls Manhattan distance).
# It is applied with transform() directly; no fit() step is used.
manhattanDistance = Normalizer(p=1, inputCol="features")
manhattanDistance.transform(scaleDF).show()

## StringIndexer

In [None]:
# Index the string column "lab" into the column "labelInd".
lblIndxr = StringIndexer(inputCol="lab", outputCol="labelInd")
idxRes = lblIndxr.fit(simpleDF).transform(simpleDF)
idxRes.show(5)

In [None]:
# Apply IndexToString to the "labelInd" column produced by the indexer above.
labelReverse = IndexToString(inputCol="labelInd")
labelReverse.transform(idxRes).show(5)

## OneHotEncoder

In [None]:
# Index "color" first, then one-hot encode the resulting "colorInd" column.
lblIndxr = StringIndexer(inputCol="color", outputCol="colorInd")
colorLab = lblIndxr.fit(simpleDF).transform(simpleDF.select("color"))
# NOTE(review): transform() is called without fit(), so this assumes a Spark
# version where OneHotEncoder is a plain Transformer - confirm before upgrading.
ohe = OneHotEncoder(inputCol="colorInd")
ohe.transform(colorLab).show(5)

## Tokenizer

In [None]:
# Tokenize the product descriptions; `tokenized` feeds the CountVectorizer cell.
tkn = Tokenizer(inputCol="Description", outputCol="DescOut")
tokenized = tkn.transform(sales.select("Description"))
tokenized.show(5, False)

## CountVectorizer

In [None]:
# Count-vectorize the tokens: vocabulary size 500, minTF=1, minDF=2.
cv = CountVectorizer(
    inputCol="DescOut",
    outputCol="countVec",
    vocabSize=500,
    minTF=1,
    minDF=2,
)
cv.fit(tokenized).transform(tokenized).show(5, False)

## Word2Vec

In [None]:
# Three tiny documents, each a single-column row holding a list of words.
documentDF = spark.createDataFrame(
    [
        ("Hi I hear about Spark".split(" "),),
        ("I wish Java could use case classes".split(" "),),
        ("Logistic regression models are neat".split(" "),),
    ],
    ["text"],
)
# Train a Word2Vec model with 3-dimensional vectors and show its output.
word2Vec = Word2Vec()\
    .setVectorSize(3)\
    .setMinCount(1)\
    .setInputCol("text")\
    .setOutputCol("result")
word2Vec.fit(documentDF).transform(documentDF).show(5, False)

## PCA

In [None]:
# Fit a PCA with k=2 on the "features" column and show the transformed rows.
pca = PCA(k=2, inputCol="features")
pca.fit(scaleDF).transform(scaleDF).show(5, False)

## PolynomialExpansion

In [None]:
# Degree-2 polynomial expansion of "features"; applied with transform() only.
pe = PolynomialExpansion(degree=2, inputCol="features")
pe.transform(scaleDF).show()

## Logistic Regression

In [None]:
# Binary-classification input: keep "features" and cast "label" to double.
bInput = (
    spark.read.parquet("../data/binary-classification")
    .selectExpr("features", "cast(label as double) as label")
)
bInput.show(5)

In [None]:
# Fit a LogisticRegression with default parameters on bInput, then print the
# fitted model's coefficients and intercept.
lr = LogisticRegression()
trainedModel = lr.fit(bInput)
print(trainedModel.coefficients)
print(trainedModel.intercept)

In [None]:
# Inspect the training summary of the logistic regression fitted above:
# area under ROC, followed by the ROC and precision-recall DataFrames.
summary = trainedModel.summary
print(summary.areaUnderROC)
summary.roc.show()
summary.pr.show()

In [None]:
# Display the objective history recorded in the training summary.
summary.objectiveHistory

## Decision Tree Classifier

In [None]:
# Fit a DecisionTreeClassifier with default parameters on bInput.
# Note: this rebinds the notebook-global `trainedModel`.
dt = DecisionTreeClassifier()
trainedModel = dt.fit(bInput)

## Random Forest Classifier

In [None]:
# Fit a RandomForestClassifier with default parameters on bInput.
# Note: this rebinds the notebook-global `trainedModel`.
rfClassifier = RandomForestClassifier()
trainedModel = rfClassifier.fit(bInput)

## GBT Classifier

In [None]:
# Fit a GBTClassifier with default parameters on bInput.
# Note: this rebinds the notebook-global `trainedModel`.
gbtClassifier = GBTClassifier()
trainedModel = gbtClassifier.fit(bInput)

## Naive Bayes Classifier

In [None]:
# Fit a NaiveBayes model with default parameters.
# Only rows whose label is not 0 are used for training here.
nb = NaiveBayes()
trainedModel = nb.fit(bInput.where("label != 0"))

## Evaluators for Classifiers

In [None]:
# Score bInput with whichever model `trainedModel` currently holds and turn
# the result into an RDD of (prediction, label) float pairs for the
# RDD-based metrics class.
out = (
    trainedModel.transform(bInput)
    .select("prediction", "label")
    .rdd
    .map(lambda row: (float(row[0]), float(row[1])))
)
metrics = BinaryClassificationMetrics(out)

In [None]:
# Print area under the precision-recall curve, then area under the ROC curve.
print(metrics.areaUnderPR)
print(metrics.areaUnderROC)

## Classification Pipeline

In [None]:
# Reload the simple-ml JSON data as `df` and show it ordered by value2.
df = spark.read.json("../data/simple-ml")
df.orderBy("value2").show()

In [None]:
# Hyper-parameter search over an RFormula + LogisticRegression pipeline using
# a single train/validation split, then scoring on a held-out test set.

# Hold out 30% for testing. The weights used to read [0.7, 0.4]; randomSplit
# normalizes weights that do not sum to 1, so that silently produced roughly
# a 64/36 split instead of the intended 70/30.
train, test = df.randomSplit([0.7, 0.3])

rForm = RFormula()
lr = LogisticRegression().setLabelCol("label").setFeaturesCol("features")
stages = [rForm, lr]
pipeline = Pipeline().setStages(stages)

# Grid: 2 formulas x 3 elastic-net mixes x 2 regularization strengths.
# Values are written as floats to match the double-typed parameters.
params = ParamGridBuilder()\
    .addGrid(rForm.formula, [
        "lab~ . + color:value1",
        "lab~ . + color:value1 + color:value2"])\
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])\
    .addGrid(lr.regParam, [0.1, 2.0])\
    .build()

# areaUnderROC must be computed from the continuous score column. Feeding it
# the hard 0/1 "prediction" column collapses the ROC curve to one threshold.
evaluator = BinaryClassificationEvaluator()\
    .setMetricName("areaUnderROC")\
    .setRawPredictionCol("rawPrediction")\
    .setLabelCol("label")

# 75% of `train` fits each candidate; the remaining 25% selects the best one.
tvs = TrainValidationSplit()\
    .setTrainRatio(0.75)\
    .setEstimatorParamMaps(params)\
    .setEstimator(pipeline)\
    .setEvaluator(evaluator)
tvsFitted = tvs.fit(train)
evaluator.evaluate(tvsFitted.transform(test))

## Linear Regression

In [None]:
# Load the regression dataset, rebinding the notebook-global `df`.
# NOTE(review): no format is given, so the session's default data source is
# used (presumably parquet) - confirm against the files in ../data/regression.
df = spark.read.load("../data/regression")

In [None]:
# Elastic-net linear regression: 10 iterations max, regParam 0.3, mix 0.8.
lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
trainedModel = lr.fit(df)

# Training diagnostics: residuals first, then the scalar statistics.
summary = trainedModel.summary
summary.residuals.show()
print(summary.totalIterations)
print(summary.objectiveHistory)
print(summary.rootMeanSquaredError)
print(summary.r2)

## Generalized Linear Regression

In [None]:
# Gaussian GLM with identity link; the link prediction is written to "linkOut".
glr = GeneralizedLinearRegression(
    family="gaussian",
    link="identity",
    maxIter=10,
    regParam=0.3,
    linkPredictionCol="linkOut",
)
trainedModel = glr.fit(df)

## Decision Tree Regressor

In [None]:
# Fit a DecisionTreeRegressor with default parameters on the regression data.
# Note: this rebinds the notebook-global `trainedModel`.
dtr = DecisionTreeRegressor()
trainedModel = dtr.fit(df)

## Random Forest Regressor

In [None]:
# Fit a RandomForestRegressor with default parameters on the regression data.
# Note: this rebinds the notebook-global `trainedModel`.
rf =  RandomForestRegressor()
trainedModel = rf.fit(df)

## GBT Regressor

In [None]:
# Fit a GBTRegressor with default parameters on the regression data.
# Note: this rebinds the notebook-global `trainedModel`.
gbt = GBTRegressor()
trainedModel = gbt.fit(df)

## Regression Pipeline

In [None]:
# Cross-validated search over regParam for a Gaussian / identity-link GLM.
glr = GeneralizedLinearRegression(family="gaussian", link="identity")
pipeline = Pipeline(stages=[glr])
params = ParamGridBuilder().addGrid(glr.regParam, [0, 0.5, 1]).build()

# Candidates are compared by RMSE between "prediction" and "label".
evaluator = RegressionEvaluator(
    metricName="rmse",
    predictionCol="prediction",
    labelCol="label",
)

# Note: `cv` rebinds the name used earlier for the CountVectorizer.
cv = CrossValidator(
    estimator=pipeline,
    evaluator=evaluator,
    estimatorParamMaps=params,
    numFolds=2,  # should always be 3 or more but this dataset is small
)
model = cv.fit(df)

In [None]:
# Score `df` with the cross-validated model and report regression metrics
# computed from (prediction, label) float pairs.
out = (
    model.transform(df)
    .select("prediction", "label")
    .rdd
    .map(lambda row: (float(row[0]), float(row[1])))
)
metrics = RegressionMetrics(out)

print("MSE: " + str(metrics.meanSquaredError))
print("RMSE: " + str(metrics.rootMeanSquaredError))
print("R-squared: " + str(metrics.r2))
print("MAE: " + str(metrics.meanAbsoluteError))
print("Explained variance: " + str(metrics.explainedVariance))

## Recommendation

In [None]:
# Parse the MovieLens sample file. Each line has the form
# "userId::movieId::rating::timestamp".
# spark.read.text already returns a DataFrame with a single "value" column,
# so the former `.rdd.toDF()` round trip was dropped: it changed nothing
# except forcing every row through Python serialization.
ratings = spark.read.text("../data/sample_movielens_ratings.txt")\
    .selectExpr("split(value , '::') as col")\
    .selectExpr(
        "cast(col[0] as int) as userId",
        "cast(col[1] as int) as movieId",
        "cast(col[2] as float) as rating",
        "cast(col[3] as long) as timestamp")
ratings.show(5, False)

In [None]:
# Train ALS on 80% of the ratings and predict the remaining 20%.
# Note: this rebinds the notebook-global `test`.
training, test = ratings.randomSplit([0.8, 0.2])

# coldStartStrategy="drop" removes test rows whose user or movie never
# appeared in `training`. Without it ALS emits NaN predictions for those
# rows, and any RMSE computed over `predictions` becomes NaN.
als = ALS()\
    .setMaxIter(5)\
    .setRegParam(0.01)\
    .setUserCol("userId")\
    .setItemCol("movieId")\
    .setRatingCol("rating")\
    .setColdStartStrategy("drop")
alsModel = als.fit(training)
predictions = alsModel.transform(test)

In [None]:
# Top-3 movie recommendations per user, one row per recommendation.
alsModel.recommendForAllUsers(3)\
  .selectExpr("userId", "explode(recommendations)").show(5)

In [None]:
# Top-3 user recommendations per movie, one row per recommendation.
alsModel.recommendForAllItems(3)\
  .selectExpr("movieId", "explode(recommendations)").show(5)

In [None]:
# RMSE between the true "rating" and the ALS "prediction" on the test split.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = %f" % rmse)

In [None]:
# RegressionMetrics expects (prediction, observation) pairs, in that order.
# The pairs used to be built as (rating, prediction). RMSE is symmetric and
# so was unaffected, but order-sensitive metrics such as r2 or
# explainedVariance would have been computed against the wrong series.
regComparison = predictions.select("prediction", "rating")\
  .rdd.map(lambda x: (x[0], x[1]))
metrics = RegressionMetrics(regComparison)
metrics.rootMeanSquaredError

In [None]:
# Ground truth per user: the set of movies the user rated above 2.5.
perUserActual = predictions\
    .where("rating > 2.5")\
    .groupBy("userId")\
    .agg(expr("collect_set(movieId) as movies"))

# Ranked predictions per user: movie ids sorted by predicted rating, best
# first. The ordering is done inside the aggregate because an orderBy placed
# before groupBy/collect_list is not guaranteed to survive the shuffle.
# Sorting structs compares fields left to right, so putting "prediction"
# first makes it the sort key.
perUserPredictions = predictions\
  .groupBy("userId")\
  .agg(sort_array(collect_list(struct("prediction", "movieId")), asc=False)
       .alias("ranked"))\
  .select("userId", col("ranked.movieId").alias("movies"))

In [None]:
# After the join the columns are: userId, actual movies, predicted movies.
# RankingMetrics expects (predicted ranking, ground-truth set) pairs, so the
# top-10 predictions (row[2]) go first and the actual set (row[1]) second.
# The pair used to be passed the other way round, which scored the
# unordered "actual" set as if it were the ranking.
perUserActualvPred = perUserActual.join(perUserPredictions, ["userId"]).rdd\
    .map(lambda row: (row[2][:10], row[1]))
ranks = RankingMetrics(perUserActualvPred)
print(ranks.meanAveragePrecision)
print(ranks.precisionAt(5))

## K-Means

In [None]:
# Build a small clustering dataset: 50 retail rows in one partition, rows
# without a Description removed, with Quantity and UnitPrice assembled into
# "features". Note: this rebinds the notebook-globals `va` and `sales`.
va = VectorAssembler(inputCols=["Quantity", "UnitPrice"], outputCol="features")
sales = va.transform(
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("../data/retail-data/by-day/*.csv")
    .limit(50)
    .coalesce(1)
    .where("Description IS NOT NULL")
)
sales.cache()
sales.show(5, False)

In [None]:
# Cluster the assembled sales features into 5 groups.
km = KMeans(k=5)
trainedModel = km.fit(sales)

In [None]:
# Inspect the fitted K-Means model: cluster sizes, clustering cost, centers.
summary = trainedModel.summary
print(summary.clusterSizes)  # number of points per cluster

# The cost used to be computed and then thrown away (a bare expression that
# is not the last line of a cell displays nothing). KMeansModel.computeCost
# is deprecated in Spark 2.4 and removed in 3.0, so fall back to the
# summary's trainingCost there; it is the same quantity because `sales` is
# the training set.
if hasattr(trainedModel, "computeCost"):
    cost = trainedModel.computeCost(sales)
else:
    cost = summary.trainingCost
print(cost)

centers = trainedModel.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

## Gaussian Mixtures

In [None]:
# Gaussian mixture with 5 components; print its parameter documentation
# before fitting. Note: this rebinds the notebook-global `model`.
gmm = GaussianMixture(k=5)
print(gmm.explainParams())
model = gmm.fit(sales)

In [None]:
# Inspect the fitted Gaussian mixture: component weights, the gaussians
# DataFrame, then the summary's cluster column, cluster sizes and
# probability column.
summary = model.summary
print(model.weights)
model.gaussiansDF.show()
summary.cluster.show(5)
print(summary.clusterSizes)
summary.probability.show(5)

## Latent Dirichlet Allocation

In [None]:
# Prepare LDA input: tokenize descriptions, then count-vectorize them into a
# fresh "features" column (the K-Means one is dropped first to free the name).
tkn = Tokenizer(inputCol="Description", outputCol="DescOut")
tokenized = tkn.transform(sales.drop("features"))

# Vocabulary size 500, minTF=0, minDF=0, binary output enabled.
cv = CountVectorizer(
    inputCol="DescOut",
    outputCol="features",
    vocabSize=500,
    minTF=0,
    minDF=0,
    binary=True,
)
cvFitted = cv.fit(tokenized)
prepped = cvFitted.transform(tokenized)

In [None]:
# Fit a 10-topic LDA model, limited to 5 iterations.
lda = LDA(k=10, maxIter=5)
trainedModel = lda.fit(prepped)

In [None]:
# Show each topic described by its top 3 terms.
trainedModel.describeTopics(3).show()

In [None]:
# Display the vocabulary learned by the fitted CountVectorizer.
cvFitted.vocabulary