In [126]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer, OneHotEncoder
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
import pyspark.sql.functions as F

### Lectura datos

In [112]:
# Leer CSV desde Cloud Storage (GCS) con inferencia de tipos y encabezado
df = spark.read.format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load("gs://dataproc_spark_tfm/Base.csv")

                                                                                

### Selección de tantos casos de fraude como no fraude

In [127]:
# Step 1: Separate the fraud and non-fraud transactions
fraud_df = df.filter(df.fraud_bool == 1)
nonfraud_df = df.filter(df.fraud_bool == 0)

# Step 2: Count frauds
fraud_count = fraud_df.count()

# Step 3: Take same number of non-fraud rows randomly
nonfraud_sample = nonfraud_df.sample(withReplacement=False, fraction=1.0).limit(fraud_count)

# Step 4: Union both
balanced_df = fraud_df.union(nonfraud_sample)

# Optional: Shuffle the data
df = balanced_df.orderBy(F.rand())

# Show result
df.groupBy("fraud_bool").count().show()



+----------+-----+
|fraud_bool|count|
+----------+-----+
|         1|11029|
|         0|11029|
+----------+-----+



                                                                                

### Visualización de los datos

In [129]:
df.show(2)



+----------+------+---------------------+-------------------------+----------------------------+------------+------------------+----------------------+------------+------------+------------------+-----------------+------------------+--------------------+--------------------------------+-----------------+-----------------+-------------+--------------+----------------+------------------+-----------------+---------------+---------------------+---------------+--------+-------------------------+---------+------------------+-------------------------+------------------+-----+
|fraud_bool|income|name_email_similarity|prev_address_months_count|current_address_months_count|customer_age|days_since_request|intended_balcon_amount|payment_type|zip_count_4w|       velocity_6h|     velocity_24h|       velocity_4w|bank_branch_count_8w|date_of_birth_distinct_emails_4w|employment_status|credit_risk_score|email_is_free|housing_status|phone_home_valid|phone_mobile_valid|bank_months_count|has_other_cards|pr

                                                                                

In [130]:
df.columns

['fraud_bool',
 'income',
 'name_email_similarity',
 'prev_address_months_count',
 'current_address_months_count',
 'customer_age',
 'days_since_request',
 'intended_balcon_amount',
 'payment_type',
 'zip_count_4w',
 'velocity_6h',
 'velocity_24h',
 'velocity_4w',
 'bank_branch_count_8w',
 'date_of_birth_distinct_emails_4w',
 'employment_status',
 'credit_risk_score',
 'email_is_free',
 'housing_status',
 'phone_home_valid',
 'phone_mobile_valid',
 'bank_months_count',
 'has_other_cards',
 'proposed_credit_limit',
 'foreign_request',
 'source',
 'session_length_in_minutes',
 'device_os',
 'keep_alive_session',
 'device_distinct_emails_8w',
 'device_fraud_count',
 'month']

In [131]:
df.printSchema()

root
 |-- fraud_bool: integer (nullable = true)
 |-- income: double (nullable = true)
 |-- name_email_similarity: double (nullable = true)
 |-- prev_address_months_count: integer (nullable = true)
 |-- current_address_months_count: integer (nullable = true)
 |-- customer_age: integer (nullable = true)
 |-- days_since_request: double (nullable = true)
 |-- intended_balcon_amount: double (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- zip_count_4w: integer (nullable = true)
 |-- velocity_6h: double (nullable = true)
 |-- velocity_24h: double (nullable = true)
 |-- velocity_4w: double (nullable = true)
 |-- bank_branch_count_8w: integer (nullable = true)
 |-- date_of_birth_distinct_emails_4w: integer (nullable = true)
 |-- employment_status: string (nullable = true)
 |-- credit_risk_score: integer (nullable = true)
 |-- email_is_free: integer (nullable = true)
 |-- housing_status: string (nullable = true)
 |-- phone_home_valid: integer (nullable = true)
 |-- phone_mobil

In [132]:
categorical_cols = ["payment_type", "employment_status", "housing_status", "source",  "device_os"]
numeric_cols = [
    "days_since_request", "intended_balcon_amount", "zip_count_4w", "velocity_6h", "velocity_24h", "velocity_4w", "bank_branch_count_8w",
    "date_of_birth_distinct_emails_4w", "credit_risk_score", "phone_home_valid", "phone_mobile_valid", "bank_months_count", "has_other_cards", "proposed_credit_limit",
    "foreign_request", "session_length_in_minutes", "keep_alive_session", "device_distinct_emails_8w"
]

In [133]:
df = df.select(numeric_cols + categorical_cols + ["fraud_bool"]).dropna()

### Convertir categóricas: indexar + one-hot

In [134]:
# StringIndexer: estimador para pre-procesar variables categóricas
indexers = [StringIndexer(inputCol=col, outputCol=col+"_idx", handleInvalid="keep") for col in categorical_cols]
# OneHotEncoder: recibe un conjunto de columnas y convierte cada una (de manera independiente) a un conjunto de variables dummy con codificación onehot.
encoders = [OneHotEncoder(inputCol=col+"_idx", outputCol=col+"_vec") for col in categorical_cols]

### VectorAssembler

In [135]:
# recibe varias columnas y las concatena en una sola de tipo vector, de longitud igual al número de columnas que se quieran ensamblar.
assembler_inputs = numeric_cols + [col+"_vec" for col in categorical_cols]
assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")

### Escalado opcional (no siempre necesario con OneHotEncoder)

In [136]:
# normalizacion a un vector de características numéricas.
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")

### Modelo

In [137]:
lr = LogisticRegression(
    featuresCol="scaledFeatures",
    labelCol="fraud_bool",
    maxIter=100,    # max number of iterations
    regParam=0.01,  # regularization parameter
    elasticNetParam=0.0  # type of regularization (0=L2, 1=L1)
)

### Pipeline
Secuencia de etapas que se ejecutan en un cierto orden

In [138]:
pipeline = Pipeline(stages=indexers + encoders + [assembler, scaler, lr])

### Division de los datos para entrenamiento y testeo

In [139]:
# Division de datos
train, test = df.randomSplit([0.8, 0.2], seed=42)

### Fit
Entrenamiento de los datos

In [140]:
model = pipeline.fit(train)



### Transform
Predicción del modelo

In [141]:
predictions = model.transform(test)
predictions.select("prediction", "fraud_bool", "probability").show(10)



+----------+----------+--------------------+
|prediction|fraud_bool|         probability|
+----------+----------+--------------------+
|       1.0|         1|[0.13300108230561...|
|       0.0|         0|[0.92315277493912...|
|       0.0|         0|[0.88285142045902...|
|       0.0|         0|[0.59349387469242...|
|       1.0|         1|[0.06040499778241...|
|       1.0|         1|[0.02399978901282...|
|       0.0|         0|[0.88625457619558...|
|       0.0|         0|[0.89965933611505...|
|       0.0|         0|[0.96237373396883...|
|       0.0|         0|[0.93506155634322...|
+----------+----------+--------------------+
only showing top 10 rows



                                                                                

### Saving model to feed dataproc with flink

In [117]:
# model.write().overwrite().save("gs://dataproc_spark_tfm/modelo_fraude_csv")

                                                                                

### Individual prediction

In [153]:
test_sample = test.limit(1)
individual_prediction = model.transform(test_sample)
individual_prediction.select("prediction", "fraud_bool", "probability").show()

[Stage 535:>                                                        (0 + 1) / 1]

+----------+----------+--------------------+
|prediction|fraud_bool|         probability|
+----------+----------+--------------------+
|       1.0|         1|[0.00386084662679...|
+----------+----------+--------------------+



                                                                                

### Métricas para cuantificar la predicción del modelo

In [142]:
from pyspark.mllib.evaluation import MulticlassMetrics

In [143]:
# 'fraud_bool' debe ser numérico (0 o 1)
predictionAndLabels = predictions.select("prediction", "fraud_bool") \
                                 .rdd.map(lambda row: (float(row["prediction"]), float(row["fraud_bool"])))




In [144]:
metrics = MulticlassMetrics(predictionAndLabels)

                                                                                

In [145]:
# Matriz de confusión
confusion_matrix = metrics.confusionMatrix().toArray()
print("Matriz de confusión:\n", confusion_matrix)

# Métricas principales
print("Accuracy:", metrics.accuracy)
print("Precision para clase 1:", metrics.precision(1.0))
print("Recall para clase 1:", metrics.recall(1.0))
print("F1 Score para clase 1:", metrics.fMeasure(1.0))



Matriz de confusión:
 [[1959.  227.]
 [ 280. 1838.]]
Accuracy: 0.8822026022304833
Precision para clase 1: 0.8900726392251816
Recall para clase 1: 0.8677998111425873
F1 Score para clase 1: 0.8787951231173798


                                                                                