In [None]:
from pyspark.sql import SparkSession

# Configure a session named "Spark MLlib" that targets the master at
# spark://spark-master:7077, then obtain it with getOrCreate().
session_builder = SparkSession.builder.appName("Spark MLlib").master(
    "spark://spark-master:7077"
)
spark = session_builder.getOrCreate()

# Bare expression: as the last statement of the cell, the notebook displays it.
spark

# TRANSFORMERS

In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# Sample data: one row per id with two scalar columns, one dense-vector column
# and a conversion value (0.0 / 1.0).
column_names = ["id", "sessionDuration", "bounceRate", "userMetrics", "conversion"]
rows = [
    (0, 22.0, 0.5, Vectors.dense([1.0, 20.0, 0.7]), 0.0),
    (1, 15.0, 0.3, Vectors.dense([2.0, 30.0, 0.2]), 1.0),
    (2, 25.0, 0.34, Vectors.dense([2.0, 25.0, 0.3]), 1.0),
    (3, 19.0, 0.55, Vectors.dense([1.0, 50.0, 0.4]), 0.0),
    (4, 11.0, 0.39, Vectors.dense([4.0, 20.0, 0.6]), 1.0),
]
dataset = spark.createDataFrame(rows, column_names)

# Assembler that merges the two scalar columns and the vector column into a
# single vector column called "features".
feature_columns = ["sessionDuration", "bounceRate", "userMetrics"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Run the assembler over the dataset and display the assembled vector next to
# the conversion value.
output = assembler.transform(dataset)
print("Assembled sessionDuration, bounceRate, userMetrics into column 'features'")
assembled_view = output.select("features", "conversion")
assembled_view.show(truncate=False)

# ESTIMATORS

## REGRESSION

In [None]:
from pyspark.ml.regression import LinearRegression
from pyspark.sql import functions as F

# Seed for the random train/test split. Without it, every run of this cell
# samples a different split of the (very small) dataset, so the fitted models,
# the predictions and the training metrics printed further down change from
# run to run. If this particular seed yields an unusable split (e.g. an empty
# side), pick another value here rather than removing the seed.
SPLIT_SEED = 42

# Keep only the assembled feature vector and expose "conversion" under the
# column name "label", then split the rows roughly 55% / 45%.
df_train, df_test = output.select(
    "features", F.col("conversion").alias("label")
).randomSplit([0.55, 0.45], seed=SPLIT_SEED)

# Linear regression with regParam=0.3 and elasticNetParam=0.8, limited to
# 10 iterations, fitted on the training split.
lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
trainedModel = lr.fit(df_train)

# Use the fitted model as a transformer to generate predictions on the test split.
predictions = trainedModel.transform(df_test)
predictions.show()

## CLASSIFICATION

In [None]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression with the same iteration cap and regularisation settings
# as above; note that `lr` and `trainedModel` are rebound here.
lr = LogisticRegression(
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)
trainedModel = lr.fit(df_train)

# The fitted model acts as a transformer: apply it to df_test and display
# the resulting predictions.
predictions = trainedModel.transform(df_test)
predictions.show()

In [None]:
# Show the fitted parameters in their matrix / vector form.
print("Coefficients: \n" + str(trainedModel.coefficientMatrix))
print("Intercept: " + str(trainedModel.interceptVector))

trainingSummary = trainedModel.summary

# Objective value recorded for each training iteration, one per line.
objectiveHistory = trainingSummary.objectiveHistory
print("objectiveHistory:")
for objective_value in objectiveHistory:
    print(objective_value)

# Per-label metrics. Each entry pairs the heading to print with a getter that
# is only evaluated after its heading has been printed, so the metrics are
# read from the summary in the same order as they are displayed.
per_label_reports = (
    ("False positive rate by label:", lambda: trainingSummary.falsePositiveRateByLabel),
    ("True positive rate by label:", lambda: trainingSummary.truePositiveRateByLabel),
    ("Precision by label:", lambda: trainingSummary.precisionByLabel),
    ("Recall by label:", lambda: trainingSummary.recallByLabel),
    ("F-measure by label:", lambda: trainingSummary.fMeasureByLabel()),
)
for heading, fetch_values in per_label_reports:
    print(heading)
    for label_index, value in enumerate(fetch_values()):
        print("label %d: %s" % (label_index, value))

# Aggregate (accuracy and weighted) metrics, printed together in one block.
accuracy = trainingSummary.accuracy
falsePositiveRate = trainingSummary.weightedFalsePositiveRate
truePositiveRate = trainingSummary.weightedTruePositiveRate
fMeasure = trainingSummary.weightedFMeasure()
precision = trainingSummary.weightedPrecision
recall = trainingSummary.weightedRecall

overall_metrics = (
    accuracy,
    falsePositiveRate,
    truePositiveRate,
    fMeasure,
    precision,
    recall,
)
print(
    "Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
    % overall_metrics
)

In [None]:
# Persist the fitted model to HDFS; overwrite() is requested on the writer
# before saving to the path below.
model_path = "hdfs://namenode:9000/models/lr_test"
model_writer = trainedModel.write().overwrite()
model_writer.save(model_path)

In [None]:
# Stop the Spark session that was created in the first cell of the notebook.
spark.stop()