In [0]:
from pyspark.ml.feature import OneHotEncoder
from pyspark.sql.functions import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression

In [0]:
# Retail website dataset, used for the logistic regression example.
ubica_archivo2 = '/FileStore/tables/Log_Reg_dataset.csv'
# Read the CSV using its first row as the header and letting Spark infer the column types.
df = spark.read.csv(
    ubica_archivo2,
    header=True,
    inferSchema=True,
)
# Quick shape check: (number of rows, number of columns).
print((df.count(), len(df.columns)))

In [0]:
# Print the column names and data types of the loaded DataFrame.
df.printSchema()

In [0]:
# Preview the first 5 rows.
df.show(5)
# Last expression of the cell: the list of column names is shown as the cell output.
df.columns

In [0]:
# Status: whether the customer made a purchase or not
# Age: age of the visitor
# Repeat_Visitor: returning visitor vs. first-time visitor
# Platform: search platform used (Google, Yahoo, Bing)
# Web_pages_viewed: number of web pages viewed on the site
  

In [0]:
# Data exploration: show summary statistics for the DataFrame's columns.
df.describe().show()

In [0]:
# Row counts per distinct value of each categorical column and of the target (Status).
for column_name in ('Country', 'Platform', 'Status'):
    df.groupBy(column_name).count().show()

In [0]:
# Mean of the numeric columns, computed separately for each Status value.
df.groupBy('Status').mean().show()

In [0]:
# Convert categorical data to numerical form.
from pyspark.ml.feature import StringIndexer

# Caveat kept from the original author: indexing the search platforms is usually not a good idea.
platform_indexer = StringIndexer(inputCol="Platform", outputCol="Search_Engine_Num")
search_engine_indexer = platform_indexer.fit(df)
# Append the numeric index column to the working DataFrame and preview the result.
df = search_engine_indexer.transform(df)
df.show(n=5)

In [0]:
# Row counts per numeric platform index.
df.groupBy('Search_Engine_Num').count().show()

In [0]:
# One-hot encode the numeric platform index into a vector column.
search_engine_encoder = OneHotEncoder(
    inputCol="Search_Engine_Num",
    outputCol="Search_Engine_Vector",
)
# Left disabled in the original notebook: search_engine_encoder.setDropLast(False)
search_engine_encoder_model = search_engine_encoder.fit(df)
df = search_engine_encoder_model.transform(df)
# truncate=False keeps the column contents from being cut off in the output.
df.show(3, truncate=False)

In [0]:
# Compare category frequencies of the raw platform and its numeric index, most frequent first.
for column_name in ('Platform', 'Search_Engine_Num'):
    df.groupBy(column_name).count().orderBy('count', ascending=False).show(5, truncate=False)

In [0]:
# Compare category frequencies of the raw platform and its one-hot vector, most frequent first.
for column_name in ('Platform', 'Search_Engine_Vector'):
    df.groupBy(column_name).count().orderBy('count', ascending=False).show(5, truncate=False)

In [0]:
# Indexer (not recommended on its own).
# Recap of the platform encodings produced so far, most frequent first.
df.groupBy('Platform').count().orderBy('count',ascending=False).show(5,False)
df.groupBy('Search_Engine_Num').count().orderBy('count',ascending=False).show(5,False)
df.groupBy('Search_Engine_Vector').count().orderBy('count',ascending=False).show(5,False)
# Fit the country indexer once. The original fitted the identical StringIndexer twice,
# which ran a second Spark job only to overwrite the first result with the same model.
country_indexer = StringIndexer(inputCol="Country", outputCol="Country_Num").fit(df)
# Append the numeric country index to the working DataFrame.
df = country_indexer.transform(df)
# False disables truncation of the displayed column contents.
df.select(['Country','Country_Num']).show(3,False)


In [0]:
# One-hot encode the numeric country index into a vector column.
country_encoder = OneHotEncoder(inputCol="Country_Num", outputCol="Country_Vector")
df = country_encoder.fit(df).transform(df)
# Use the exact column name 'Country_Num'. The original selected 'country_Num', which only
# resolves while Spark's column-name matching is case-insensitive.
df.select(['Country','Country_Num','Country_Vector']).show(3,False)

In [0]:
# Compare frequencies of the raw country and its one-hot vector, most frequent first.
# ('Country_Num' was left out of this comparison in the original notebook.)
for column_name in ('Country', 'Country_Vector'):
    df.groupBy(column_name).count().orderBy('count', ascending=False).show(5, truncate=False)


In [0]:
# Combine all model inputs into a single vector column so the model can be run.
feature_columns = [
    'Search_Engine_Vector',
    'Country_Vector',
    'Age',
    'Repeat_Visitor',
    'Web_pages_viewed',
]
df_assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
df = df_assembler.transform(df)
# Confirm that the new "features" column is present in the schema.
df.printSchema()

In [0]:
# Preview the first 5 rows, now including the assembled "features" column.
df.show(n = 5)

In [0]:
# Show the assembled feature vectors next to the label, without truncating the vectors.
df.select(['features','Status']).show(10,False)

In [0]:
# Keep only the two columns used for modelling: the feature vector and the label.
model_df = df.select('features', 'Status')
model_df.show(5)

In [0]:
# 75% / 25% train/test split.
# A fixed seed is passed so that re-running the notebook requests the same split; without it
# every run drew a new random split and all downstream metrics changed from run to run.
training_df, test_df = model_df.randomSplit([0.75, 0.25], seed=42)

In [0]:
# Size of the training split, followed by its class balance per Status value.
print(training_df.count())
training_df.groupBy('Status').count().show()

In [0]:
# Size of the test split, followed by its class balance per Status value.
print(test_df.count())
test_df.groupBy('Status').count().show()

In [0]:
# Configure a logistic regression with 'Status' as the label (features column left at its
# default) and fit it on the training split; log_reg holds the fitted model.
log_reg_estimator = LogisticRegression(labelCol='Status')
log_reg = log_reg_estimator.fit(training_df)

In [0]:
# Predictions on the training data itself (noted by the original author as not very useful).
train_summary = log_reg.evaluate(training_df)
train_results = train_summary.predictions
train_results.show(5)

In [0]:
# Evaluate the fitted model on the held-out test split and keep its predictions DataFrame.
test_summary = log_reg.evaluate(test_df)
results = test_summary.predictions
# Inspect which columns the evaluation added to the test data.
results.printSchema()

In [0]:
# Show the actual label next to the predicted label for the first 10 test rows.
results.select(['Status','prediction']).show(10,False)

In [0]:
# Confusion-matrix counts on the test predictions (1 = positive class, 0 = negative class).
actual_pos = results.Status == 1
actual_neg = results.Status == 0
predicted_pos = results.prediction == 1
predicted_neg = results.prediction == 0

tp = results.filter(actual_pos & predicted_pos).count()  # true positives
tn = results.filter(actual_neg & predicted_neg).count()  # true negatives
fp = results.filter(actual_neg & predicted_pos).count()  # false positives
fn = results.filter(actual_pos & predicted_neg).count()  # false negatives

In [0]:
# Display the number of false negatives as the cell output.
fn

In [0]:
# Accuracy: share of test rows whose prediction matches the actual label.
total_predictions = results.count()
accuracy = float((tp + tn) / total_predictions)
print(accuracy)

In [0]:
# Sensitivity (recall): TP / (TP + FN), the share of actual positives that were predicted positive.
actual_positives = tp + fn
recall = float(tp) / actual_positives
print(recall)

In [0]:
# Precision: share of positive predictions that are truly positive, TP / (TP + FP).
# The original divided by (tp + fn), which is the recall denominator, so it printed recall again.
predicted_positives = tp + fp
# Guard: with no positive predictions precision is undefined; report 0.0 instead of
# raising ZeroDivisionError.
precision = float(tp) / predicted_positives if predicted_positives else 0.0
print(precision)