In [1]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285397 sha256=03611ed661d8d3c466ed2a1dbaa6d4dcf32a44fcc52b870f09bc93f2b9f6bb04
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


In [41]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from pyspark.sql.types import *

In [3]:
# Colab-only: open the browser file picker so the dataset CSV can be uploaded to the runtime.
from google.colab import files
files.upload()

Saving online_retail_II.csv to online_retail_II.csv


In [42]:
from pyspark.sql import SparkSession

# Reuse the active session if there is one; otherwise create it under this app name.
spark = (
    SparkSession.builder
    .appName('RFM segmentation with spark')
    .getOrCreate()
)

In [55]:
# Load the uploaded CSV: first line is the header, column types are inferred by Spark.
df = spark.read.csv(
    'online_retail_II.csv',
    sep=',',
    header=True,
    inferSchema=True,
)
df.show()

+-------+---------+--------------------+--------+-------------------+-----+-----------+--------------+
|Invoice|StockCode|         Description|Quantity|        InvoiceDate|Price|Customer ID|       Country|
+-------+---------+--------------------+--------+-------------------+-----+-----------+--------------+
| 489434|    85048|15CM CHRISTMAS GL...|      12|2009-12-01 07:45:00| 6.95|    13085.0|United Kingdom|
| 489434|   79323P|  PINK CHERRY LIGHTS|      12|2009-12-01 07:45:00| 6.75|    13085.0|United Kingdom|
| 489434|   79323W| WHITE CHERRY LIGHTS|      12|2009-12-01 07:45:00| 6.75|    13085.0|United Kingdom|
| 489434|    22041|"RECORD FRAME 7""...|      48|2009-12-01 07:45:00|  2.1|    13085.0|United Kingdom|
| 489434|    21232|STRAWBERRY CERAMI...|      24|2009-12-01 07:45:00| 1.25|    13085.0|United Kingdom|
| 489434|    22064|PINK DOUGHNUT TRI...|      24|2009-12-01 07:45:00| 1.65|    13085.0|United Kingdom|
| 489434|    21871| SAVE THE PLANET MUG|      24|2009-12-01 07:45:00| 1.2

In [44]:
# Print each column's name, type and nullability as Spark sees them.
df.printSchema()

root
 |-- Invoice: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- Price: double (nullable = true)
 |-- Customer ID: double (nullable = true)
 |-- Country: string (nullable = true)



In [45]:
# (rows, columns) computed by Spark itself; avoids pulling the whole dataset
# into driver memory as a pandas DataFrame just to read its shape.
(df.count(), len(df.columns))

(1067371, 8)

# **LIMPIEZA Y MANIPULACIÓN DE DATOS**

In [56]:
# Helper that counts the NON-NULL values in each column of the DataFrame.

import pyspark.sql.functions as F

def check_null(data):
    """Display a one-row table holding the non-null count of every column.

    Args:
        data: Spark DataFrame to inspect.

    Returns:
        None. The counts are printed with ``show()``.
    """
    non_null_counts = [F.count(col_name).alias(col_name) for col_name in data.columns]
    data.agg(*non_null_counts).show()

In [57]:
# Show the non-null count of every column in the current DataFrame.
check_null(df)

+-------+---------+-----------+--------+-----------+-------+-----------+-------+
|Invoice|StockCode|Description|Quantity|InvoiceDate|  Price|Customer ID|Country|
+-------+---------+-----------+--------+-----------+-------+-----------+-------+
|1067371|  1067371|    1062989| 1067371|    1067371|1067371|     824364|1067371|
+-------+---------+-----------+--------+-----------+-------+-----------+-------+



In [58]:
# Drop every row that has a null in at least one column, then re-check the counts.
df = df.na.drop(how='any')
check_null(df)

+-------+---------+-----------+--------+-----------+------+-----------+-------+
|Invoice|StockCode|Description|Quantity|InvoiceDate| Price|Customer ID|Country|
+-------+---------+-----------+--------+-----------+------+-----------+-------+
| 824364|   824364|     824364|  824364|     824364|824364|     824364| 824364|
+-------+---------+-----------+--------+-----------+------+-----------+-------+



In [59]:
# Remove cancelled invoices: keep only rows whose Invoice code has no 'C' in it.
df = df.where(~df['Invoice'].contains('C'))
df.show()

+-------+---------+--------------------+--------+-------------------+-----+-----------+--------------+
|Invoice|StockCode|         Description|Quantity|        InvoiceDate|Price|Customer ID|       Country|
+-------+---------+--------------------+--------+-------------------+-----+-----------+--------------+
| 489434|    85048|15CM CHRISTMAS GL...|      12|2009-12-01 07:45:00| 6.95|    13085.0|United Kingdom|
| 489434|   79323P|  PINK CHERRY LIGHTS|      12|2009-12-01 07:45:00| 6.75|    13085.0|United Kingdom|
| 489434|   79323W| WHITE CHERRY LIGHTS|      12|2009-12-01 07:45:00| 6.75|    13085.0|United Kingdom|
| 489434|    22041|"RECORD FRAME 7""...|      48|2009-12-01 07:45:00|  2.1|    13085.0|United Kingdom|
| 489434|    21232|STRAWBERRY CERAMI...|      24|2009-12-01 07:45:00| 1.25|    13085.0|United Kingdom|
| 489434|    22064|PINK DOUGHNUT TRI...|      24|2009-12-01 07:45:00| 1.65|    13085.0|United Kingdom|
| 489434|    21871| SAVE THE PLANET MUG|      24|2009-12-01 07:45:00| 1.2

In [50]:
# (rows, columns) computed by Spark itself; avoids pulling the whole dataset
# into driver memory as a pandas DataFrame just to read its shape.
(df.count(), len(df.columns))

(805620, 8)

In [60]:
# Spark UDF that turns a decimal-comma value (e.g. "6,95") into a float.
def _make_float(value):
    """Convert ``value`` to float, treating ',' as the decimal separator.

    Args:
        value: A string such as "6,95", or any value whose ``str()`` form is
            a number. ``None`` (a null cell) is passed through unchanged.

    Returns:
        The parsed float, or ``None`` when ``value`` is ``None``.

    Raises:
        ValueError: If the text is not a valid number after the replacement.
    """
    if value is None:
        return None
    # str() must be applied BEFORE .replace so that non-string inputs
    # (e.g. a column already typed as double) do not raise AttributeError.
    return float(str(value).replace(',', '.'))

make_float_udf = F.udf(_make_float, FloatType())

In [61]:
#The UDF is not applied here because Price is already a double; if it had been read as a string we would run:
#df = df.withColumn('Price', make_float_udf(F.col("Price")))
#df.show(5)

In [62]:
# Add the line total (unit price x quantity), rounded to two decimals.
df = df.withColumn(
    'TotalPrice',
    F.round(F.col('Price') * F.col('Quantity'), 2),
)
df.show()

+-------+---------+--------------------+--------+-------------------+-----+-----------+--------------+----------+
|Invoice|StockCode|         Description|Quantity|        InvoiceDate|Price|Customer ID|       Country|TotalPrice|
+-------+---------+--------------------+--------+-------------------+-----+-----------+--------------+----------+
| 489434|    85048|15CM CHRISTMAS GL...|      12|2009-12-01 07:45:00| 6.95|    13085.0|United Kingdom|      83.4|
| 489434|   79323P|  PINK CHERRY LIGHTS|      12|2009-12-01 07:45:00| 6.75|    13085.0|United Kingdom|      81.0|
| 489434|   79323W| WHITE CHERRY LIGHTS|      12|2009-12-01 07:45:00| 6.75|    13085.0|United Kingdom|      81.0|
| 489434|    22041|"RECORD FRAME 7""...|      48|2009-12-01 07:45:00|  2.1|    13085.0|United Kingdom|     100.8|
| 489434|    21232|STRAWBERRY CERAMI...|      24|2009-12-01 07:45:00| 1.25|    13085.0|United Kingdom|      30.0|
| 489434|    22064|PINK DOUGHNUT TRI...|      24|2009-12-01 07:45:00| 1.65|    13085.0|U

In [66]:
# Spark UDF that converts an InvoiceDate value into a timestamp.
# It hands the value to pandas.to_datetime with the 'YYYY-MM-DD HH:MM:SS' layout
# and declares the result to Spark as TimestampType.
make_time_udf = F.udf(
    lambda raw: pd.to_datetime(raw, format='%Y-%m-%d %H:%M:%S'),
    TimestampType(),
)


In [68]:
# Make sure InvoiceDate is a timestamp. A native cast runs inside the JVM, so it
# avoids shipping every row through a Python UDF; it leaves an existing timestamp
# column unchanged and parses 'yyyy-MM-dd HH:mm:ss' strings.
df = df.withColumn('InvoiceDate', F.col('InvoiceDate').cast('timestamp'))
df.show(5)

+-------+---------+--------------------+--------+-------------------+-----+-----------+--------------+----------+
|Invoice|StockCode|         Description|Quantity|        InvoiceDate|Price|Customer ID|       Country|TotalPrice|
+-------+---------+--------------------+--------+-------------------+-----+-----------+--------------+----------+
| 489434|    85048|15CM CHRISTMAS GL...|      12|2009-12-01 07:45:00| 6.95|    13085.0|United Kingdom|      83.4|
| 489434|   79323P|  PINK CHERRY LIGHTS|      12|2009-12-01 07:45:00| 6.75|    13085.0|United Kingdom|      81.0|
| 489434|   79323W| WHITE CHERRY LIGHTS|      12|2009-12-01 07:45:00| 6.75|    13085.0|United Kingdom|      81.0|
| 489434|    22041|"RECORD FRAME 7""...|      48|2009-12-01 07:45:00|  2.1|    13085.0|United Kingdom|     100.8|
| 489434|    21232|STRAWBERRY CERAMI...|      24|2009-12-01 07:45:00| 1.25|    13085.0|United Kingdom|      30.0|
+-------+---------+--------------------+--------+-------------------+-----+-----------+-

In [71]:
# Find the most recent invoice timestamp and bring it back as a pandas DataFrame.
date_max = df.agg(F.max('InvoiceDate').alias('max_date')).toPandas()  # one row, one column
date_max


Unnamed: 0,max_date
0,2011-12-09 12:50:00


In [73]:
# Days elapsed between the most recent invoice date and each row's invoice date.
# date_max is a one-row pandas DataFrame; .iloc[0, 0] reads its single cell purely
# by position (the chained .iloc[0][0] used an integer key on a label-indexed
# Series, which pandas has deprecated).
df = df.withColumn(
    'Duration',
    F.datediff(F.lit(date_max.iloc[0, 0]), 'InvoiceDate'),  # F.lit builds a constant column
)
df.show(5)

+-------+---------+--------------------+--------+-------------------+-----+-----------+--------------+----------+--------+
|Invoice|StockCode|         Description|Quantity|        InvoiceDate|Price|Customer ID|       Country|TotalPrice|Duration|
+-------+---------+--------------------+--------+-------------------+-----+-----------+--------------+----------+--------+
| 489434|    85048|15CM CHRISTMAS GL...|      12|2009-12-01 07:45:00| 6.95|    13085.0|United Kingdom|      83.4|     738|
| 489434|   79323P|  PINK CHERRY LIGHTS|      12|2009-12-01 07:45:00| 6.75|    13085.0|United Kingdom|      81.0|     738|
| 489434|   79323W| WHITE CHERRY LIGHTS|      12|2009-12-01 07:45:00| 6.75|    13085.0|United Kingdom|      81.0|     738|
| 489434|    22041|"RECORD FRAME 7""...|      48|2009-12-01 07:45:00|  2.1|    13085.0|United Kingdom|     100.8|     738|
| 489434|    21232|STRAWBERRY CERAMI...|      24|2009-12-01 07:45:00| 1.25|    13085.0|United Kingdom|      30.0|     738|
+-------+-------

In [75]:
# Rename the columns used downstream to snake_case.
df = (
    df
    .withColumnRenamed('Invoice', 'invoice')
    .withColumnRenamed('Customer ID', 'customer_id')
    .withColumnRenamed('Duration', 'duration')
    .withColumnRenamed('TotalPrice', 'total_price')
)
df.show()

+-------+---------+--------------------+--------+-------------------+-----+-----------+--------------+-----------+--------+
|invoice|StockCode|         Description|Quantity|        InvoiceDate|Price|customer_id|       Country|total_price|duration|
+-------+---------+--------------------+--------+-------------------+-----+-----------+--------------+-----------+--------+
| 489434|    85048|15CM CHRISTMAS GL...|      12|2009-12-01 07:45:00| 6.95|    13085.0|United Kingdom|       83.4|     738|
| 489434|   79323P|  PINK CHERRY LIGHTS|      12|2009-12-01 07:45:00| 6.75|    13085.0|United Kingdom|       81.0|     738|
| 489434|   79323W| WHITE CHERRY LIGHTS|      12|2009-12-01 07:45:00| 6.75|    13085.0|United Kingdom|       81.0|     738|
| 489434|    22041|"RECORD FRAME 7""...|      48|2009-12-01 07:45:00|  2.1|    13085.0|United Kingdom|      100.8|     738|
| 489434|    21232|STRAWBERRY CERAMI...|      24|2009-12-01 07:45:00| 1.25|    13085.0|United Kingdom|       30.0|     738|
| 489434

# **MODELO RFM**