### Regresión Logística

In [0]:
# Identifier interpolated into the /mnt/<generation>/... paths used below for input data and model output.
generation = "mod4gen13"

In [0]:
# Load the Titanic CSV from the mounted storage; the first row holds the
# column names and column types are inferred from the file contents.
data = spark.read.csv(
    f"/mnt/{generation}/input/titanic.csv",
    header=True,
    inferSchema=True,
)

In [0]:
# Shape of the dataset as a (row count, column count) tuple.
data.count(), len(data.columns)

In [0]:
# Print column names and the types inferred when the CSV was read.
data.printSchema()

In [0]:
# Render the raw DataFrame in the notebook output for a first visual inspection.
data.display()

**EDA**

In [0]:
# Class balance of the target: number of rows per value of "Survived".
data.groupBy("Survived").count().display()

In [0]:
# NOTE(review): identical to the previous cell. Presumably kept to render the
# same counts with a different output visualization (e.g. a chart) — confirm,
# or remove if it is an accidental duplicate.
data.groupBy("Survived").count().display()

In [0]:
# Show the "Sex" column on its own — presumably to chart its distribution in
# the notebook output; confirm the intended visualization.
data.select("Sex").display()

In [0]:
# Descriptive statistics for the two continuous columns: count, mean, min,
# every decile from 10% to 90%, and max.
decile_labels = [f"{pct}%" for pct in range(10, 100, 10)]
summary_stats = ["count", "mean", "min", *decile_labels, "max"]
data.summary(*summary_stats).select("summary", "Age", "Fare").display()

In [0]:
# Show the "Age" column on its own — presumably to chart its distribution in
# the notebook output; confirm the intended visualization.
data.select("Age").display()

In [0]:
# Show the "Fare" column on its own — presumably to chart its distribution in
# the notebook output; confirm the intended visualization.
data.select("Fare").display()

**Pre-procesamiento**

In [0]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer, OneHotEncoder, StringIndexer

In [0]:
# List the available column names before choosing the modelling subset.
data.columns

In [0]:
# Keep only the label ("Survived") and the columns used as model inputs.
selected_columns = [
    "Survived",
    "Pclass",
    "Sex",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "Embarked",
]
my_cols = data.select(selected_columns)

In [0]:
# Discard every row that has a null in any of the selected columns, so the
# downstream feature stages receive complete records only.
my_final_data = my_cols.na.drop()

In [0]:
# "Sex" preprocessing: string label -> numeric index -> one-hot vector.
gender_indexer = StringIndexer(
    inputCol="Sex",
    outputCol="SexIndex",
)
gender_encoder = OneHotEncoder(
    inputCol="SexIndex",
    outputCol="SexVec",
)

In [0]:
# "Embarked" preprocessing: string label -> numeric index -> one-hot vector.
embark_indexer = StringIndexer(
    inputCol="Embarked",
    outputCol="EmbarkIndex",
)
embark_encoder = OneHotEncoder(
    inputCol="EmbarkIndex",
    outputCol="EmbarkVec",
)

In [0]:
# Concatenate the numeric columns and the one-hot vectors into the single
# "features" vector column consumed by the classifier.
feature_columns = [
    "Pclass",
    "SexVec",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "EmbarkVec",
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

**Modelación**

In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression

In [0]:
# Logistic regression estimator: learns "Survived" from the assembled
# "features" vector; all other hyper-parameters keep their defaults.
log_reg = LogisticRegression(
    featuresCol="features",
    labelCol="Survived",
)

In [0]:
# Stage order matters: both indexers run before their encoders, the assembler
# needs the encoded vectors, and the classifier needs the assembled features.
pipeline_stages = [
    gender_indexer,
    embark_indexer,
    gender_encoder,
    embark_encoder,
    assembler,
    log_reg,
]
pipeline = Pipeline(stages=pipeline_stages)

In [0]:
# 80/20 train/test split. A fixed seed keeps the partition (and therefore the
# fitted model and every metric below) stable across notebook re-runs.
train_titanic_data, test_titanic_data = my_final_data.randomSplit([0.8, 0.2], seed=42)

In [0]:
# Fit every pipeline stage (indexers, encoders, assembler, classifier) on the
# training split only, so nothing is learned from the held-out rows.
fit_model = pipeline.fit(train_titanic_data)

In [0]:
# Inspect the class of the object returned by pipeline.fit().
type(fit_model)

In [0]:
# Run the fitted pipeline over the held-out split to obtain its predictions.
results = fit_model.transform(test_titanic_data)

In [0]:
# Render the scored test rows, including the columns added by the pipeline.
results.display()

**Evaluación**

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [0]:
# Evaluate on the classifier's continuous "rawPrediction" scores, not on the
# thresholded 0/1 "prediction" column: ROC/PR metrics rank rows by score, and
# feeding hard labels collapses the curve to a single operating point.
my_eval = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="Survived")

In [0]:
# Inspect the class of the evaluator object.
type(my_eval)

In [0]:
# Side-by-side view of the true label and the predicted class for each test row.
results.select("Survived", "prediction").display()

In [0]:
# Show which metric evaluate() will compute; none was set explicitly, so this
# is the evaluator's default.
my_eval.getMetricName()

In [0]:
# Score the test-set predictions with the evaluator and echo the metric value
# as the cell output.
auc = my_eval.evaluate(results)
auc

**Persistencia de modelo**

In [0]:
# Persist the fitted pipeline. A plain save() raises if the target directory
# already exists, which breaks every re-run of the notebook; going through the
# writer with overwrite() makes this cell idempotent.
fit_model.write().overwrite().save(f"/mnt/{generation}/output/lr_model/")

In [0]:
from pyspark.ml import PipelineModel

In [0]:
# Reload the persisted pipeline from the same path it was saved to, to verify
# the round trip.
loaded_model = PipelineModel.load(f"/mnt/{generation}/output/lr_model/")

In [0]:
# Score the same held-out split with the reloaded pipeline.
new_results = loaded_model.transform(test_titanic_data)

In [0]:
# Render the reloaded model's predictions for comparison with the earlier results.
new_results.display()