<a href="https://colab.research.google.com/github/julihdez36/Analytics/blob/main/Sesion_spark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# importación de librería y módulos

from pyspark.sql import SparkSession

In [None]:
# Start the Spark session used throughout the notebook

# getOrCreate() creates a session or reuses an existing one
spark = (
    SparkSession.builder
    .appName('practise')
    .getOrCreate()
)

In [None]:
# Display the session object as this cell's output
spark

In [None]:
# Read the heart dataset into a Spark DataFrame

# Option 1 - header only, no schema indicated:
# df = spark.read.option('header', 'true').csv('heart.csv')

# Option 2 - schema declared manually:
# schema = 'Age INTEGER, Sex STRING, ChestPainType STRING'
# df = spark.read.csv('heart.csv', schema = schema, header = True)

# Option 3 (used here) - let Spark infer the column types
df = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('heart.csv')
)

# Variant - additionally read the string 'NA' as a null value:
# df = spark.read.csv('heart.csv', header = True, inferSchema= True, nullValue= 'NA')

df.printSchema()

root
 |-- Age: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- ChestPainType: string (nullable = true)
 |-- RestingBP: integer (nullable = true)
 |-- Cholesterol: integer (nullable = true)
 |-- FastingBS: integer (nullable = true)
 |-- RestingECG: string (nullable = true)
 |-- MaxHR: integer (nullable = true)
 |-- ExerciseAngina: string (nullable = true)
 |-- Oldpeak: double (nullable = true)
 |-- ST_Slope: string (nullable = true)
 |-- HeartDisease: integer (nullable = true)



In [None]:
# Save the data

# import os
# os.chdir('path')

# Write the column names as a header row so the saved data can be read back
# with header=True, the same way the input file is read in this notebook.
# mode('overwrite') replaces any previous output at this path.
df.write.format('csv').option('header', True).mode('overwrite').save('heart_save.csv')

In [None]:
# collect() returns the rows as a Python list of Row objects.
# Limit first so only a small sample is brought back and displayed,
# instead of the whole DataFrame.
df.limit(5).collect()

[Row(Age=40, Sex='M', ChestPainType='ATA', RestingBP=140, Cholesterol=289, FastingBS=0, RestingECG='Normal', MaxHR=172, ExerciseAngina='N', Oldpeak=0.0, ST_Slope='Up', HeartDisease=0),
 Row(Age=49, Sex='F', ChestPainType='NAP', RestingBP=160, Cholesterol=180, FastingBS=0, RestingECG='Normal', MaxHR=156, ExerciseAngina='N', Oldpeak=1.0, ST_Slope='Flat', HeartDisease=1),
 Row(Age=37, Sex='M', ChestPainType='ATA', RestingBP=130, Cholesterol=283, FastingBS=0, RestingECG='ST', MaxHR=98, ExerciseAngina='N', Oldpeak=0.0, ST_Slope='Up', HeartDisease=0),
 Row(Age=48, Sex='F', ChestPainType='ASY', RestingBP=138, Cholesterol=214, FastingBS=0, RestingECG='Normal', MaxHR=108, ExerciseAngina='Y', Oldpeak=1.5, ST_Slope='Flat', HeartDisease=1),
 Row(Age=54, Sex='M', ChestPainType='NAP', RestingBP=150, Cholesterol=195, FastingBS=0, RestingECG='Normal', MaxHR=122, ExerciseAngina='N', Oldpeak=0.0, ST_Slope='Up', HeartDisease=0),
 Row(Age=39, Sex='M', ChestPainType='NAP', RestingBP=120, Cholesterol=339, F

In [None]:
import pandas as pd

In [None]:
# Show the Python type of df
print(type(df))

<class 'pyspark.sql.dataframe.DataFrame'>


In [None]:
# Round trip: Spark DataFrame -> pandas DataFrame -> Spark DataFrame
pd_df = df.toPandas()
spark_df = spark.createDataFrame(pd_df)

# Print the type of each converted object, one per line
print(type(pd_df), type(spark_df), sep='\n')

<class 'pandas.core.frame.DataFrame'>
<class 'pyspark.sql.dataframe.DataFrame'>


In [None]:
# df itself is still a Spark DataFrame after the conversions above
print(type(df))

<class 'pyspark.sql.dataframe.DataFrame'>


In [None]:
# Peek at the DataFrame: first three rows as a list of Row objects

df.head(n=3)

[Row(Age=40, Sex='M', ChestPainType='ATA', RestingBP=140, Cholesterol=289, FastingBS=0, RestingECG='Normal', MaxHR=172, ExerciseAngina='N', Oldpeak=0.0, ST_Slope='Up', HeartDisease=0),
 Row(Age=49, Sex='F', ChestPainType='NAP', RestingBP=160, Cholesterol=180, FastingBS=0, RestingECG='Normal', MaxHR=156, ExerciseAngina='N', Oldpeak=1.0, ST_Slope='Flat', HeartDisease=1),
 Row(Age=37, Sex='M', ChestPainType='ATA', RestingBP=130, Cholesterol=283, FastingBS=0, RestingECG='ST', MaxHR=98, ExerciseAngina='N', Oldpeak=0.0, ST_Slope='Up', HeartDisease=0)]

In [None]:
# Print the column names and types of df
df.printSchema()

root
 |-- Age: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- ChestPainType: string (nullable = true)
 |-- RestingBP: integer (nullable = true)
 |-- Cholesterol: integer (nullable = true)
 |-- FastingBS: integer (nullable = true)
 |-- RestingECG: string (nullable = true)
 |-- MaxHR: integer (nullable = true)
 |-- ExerciseAngina: string (nullable = true)
 |-- Oldpeak: double (nullable = true)
 |-- ST_Slope: string (nullable = true)
 |-- HeartDisease: integer (nullable = true)



In [None]:
# Render the first three rows as a text table
df.show(n=3)

+---+---+-------------+---------+-----------+---------+----------+-----+--------------+-------+--------+------------+
|Age|Sex|ChestPainType|RestingBP|Cholesterol|FastingBS|RestingECG|MaxHR|ExerciseAngina|Oldpeak|ST_Slope|HeartDisease|
+---+---+-------------+---------+-----------+---------+----------+-----+--------------+-------+--------+------------+
| 40|  M|          ATA|      140|        289|        0|    Normal|  172|             N|    0.0|      Up|           0|
| 49|  F|          NAP|      160|        180|        0|    Normal|  156|             N|    1.0|    Flat|           1|
| 37|  M|          ATA|      130|        283|        0|        ST|   98|             N|    0.0|      Up|           0|
+---+---+-------------+---------+-----------+---------+----------+-----+--------------+-------+--------+------------+
only showing top 3 rows



### Casting de columnas (cambio de tipo)

In [None]:
# Column casting (changing a column's data type)

from pyspark.sql.types import FloatType, IntegerType, StringType

# Overwrite RestingBP with the same values cast to float
df = df.withColumn('RestingBP', df['RestingBP'].cast(FloatType()))

# Summary statistics for the cast column
(
    df.select('RestingBP')
    .describe()
    .show()
)

+-------+------------------+
|summary|         RestingBP|
+-------+------------------+
|  count|               918|
|   mean|132.39651416122004|
| stddev|18.514154119907808|
|    min|               0.0|
|    max|             200.0|
+-------+------------------+



In [None]:
from pyspark.sql.types import FloatType

# Overwrite Oldpeak with the same values cast to float, then check the schema
df = df.withColumn('Oldpeak', df['Oldpeak'].cast(FloatType()))

df.printSchema()

root
 |-- Age: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- ChestPainType: string (nullable = true)
 |-- RestingBP: integer (nullable = true)
 |-- Cholesterol: integer (nullable = true)
 |-- FastingBS: integer (nullable = true)
 |-- RestingECG: string (nullable = true)
 |-- MaxHR: integer (nullable = true)
 |-- ExerciseAngina: string (nullable = true)
 |-- Oldpeak: float (nullable = true)
 |-- ST_Slope: string (nullable = true)
 |-- HeartDisease: integer (nullable = true)



In [None]:
# Attribute/bracket access yields a Column; select() yields a DataFrame
print(type(df['Sex']), type(df.select('Sex')), sep='\n')

<class 'pyspark.sql.column.Column'>
<class 'pyspark.sql.dataframe.DataFrame'>


In [None]:
# Summary statistics for the columns of df
df.describe().show()

+-------+------------------+----+-------------+------------------+------------------+-------------------+----------+------------------+--------------+------------------+--------+-------------------+
|summary|               Age| Sex|ChestPainType|         RestingBP|       Cholesterol|          FastingBS|RestingECG|             MaxHR|ExerciseAngina|           Oldpeak|ST_Slope|       HeartDisease|
+-------+------------------+----+-------------+------------------+------------------+-------------------+----------+------------------+--------------+------------------+--------+-------------------+
|  count|               918| 918|          918|               918|               918|                918|       918|               918|           918|               918|     918|                918|
|   mean|53.510893246187365|NULL|         NULL|132.39651416122004| 198.7995642701525|0.23311546840958605|      NULL|136.80936819172112|          NULL|0.8873638342213787|    NULL| 0.5533769063180828|
| std

In [None]:
# Summary statistics for the Oldpeak column only
df.select('Oldpeak').describe().show()

+-------+------------------+
|summary|           Oldpeak|
+-------+------------------+
|  count|               918|
|   mean|0.8873638342213787|
| stddev|1.0665701462440484|
|    min|              -2.6|
|    max|               6.2|
+-------+------------------+



### Agregación y eliminación de columnas

In [None]:
# Add a column

# Column expression: Age plus one
AgeFixed = df.Age + 1
df = df.withColumn(colName='AgeFixed', col=AgeFixed)

df.describe().show()

+-------+------------------+----+-------------+------------------+------------------+-------------------+----------+------------------+--------------+------------------+--------+-------------------+------------------+
|summary|               Age| Sex|ChestPainType|         RestingBP|       Cholesterol|          FastingBS|RestingECG|             MaxHR|ExerciseAngina|           Oldpeak|ST_Slope|       HeartDisease|          AgeFixed|
+-------+------------------+----+-------------+------------------+------------------+-------------------+----------+------------------+--------------+------------------+--------+-------------------+------------------+
|  count|               918| 918|          918|               918|               918|                918|       918|               918|           918|               918|     918|                918|               918|
|   mean|53.510893246187365|NULL|         NULL|132.39651416122004| 198.7995642701525|0.23311546840958605|      NULL|136.80936819

In [None]:
# Removing columns

# The result of drop() is only displayed here; it is not assigned back to df
df.drop('AgeFixed').show(3)

+---+---+-------------+---------+-----------+---------+----------+-----+--------------+-------+--------+------------+
|Age|Sex|ChestPainType|RestingBP|Cholesterol|FastingBS|RestingECG|MaxHR|ExerciseAngina|Oldpeak|ST_Slope|HeartDisease|
+---+---+-------------+---------+-----------+---------+----------+-----+--------------+-------+--------+------------+
| 40|  M|          ATA|    140.0|        289|        0|    Normal|  172|             N|    0.0|      Up|           0|
| 49|  F|          NAP|    160.0|        180|        0|    Normal|  156|             N|    1.0|    Flat|           1|
| 37|  M|          ATA|    130.0|        283|        0|        ST|   98|             N|    0.0|      Up|           0|
+---+---+-------------+---------+-----------+---------+----------+-----+--------------+-------+--------+------------+
only showing top 3 rows



In [None]:
# Renaming columns

# The renamed DataFrame is only displayed here; it is not assigned back to df
df.withColumnRenamed('Age', 'Edad').select('Edad').show(1)

+----+
|Edad|
+----+
|  40|
+----+
only showing top 1 row



In [None]:
# Rename several columns; each pair is (current name, new name)
name_pairs = [('Age', 'Edad'), ('Sex','Sexo')]

for current, renamed in name_pairs:
    df = df.withColumnRenamed(current, renamed)

# Confirm the new names are in place
df.select('Edad', 'Sexo').show(n=1)

+----+----+
|Edad|Sexo|
+----+----+
|  40|   M|
+----+----+
only showing top 1 row



In [None]:
# Restore the original English names; each pair is (current name, original name)
name_original = [('Edad','Age'),('Sexo','Sex')]

for current_name, original_name in name_original:
    df = df.withColumnRenamed(current_name, original_name)

df.printSchema()

root
 |-- Age: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- ChestPainType: string (nullable = true)
 |-- RestingBP: float (nullable = true)
 |-- Cholesterol: integer (nullable = true)
 |-- FastingBS: integer (nullable = true)
 |-- RestingECG: string (nullable = true)
 |-- MaxHR: integer (nullable = true)
 |-- ExerciseAngina: string (nullable = true)
 |-- Oldpeak: float (nullable = true)
 |-- ST_Slope: string (nullable = true)
 |-- HeartDisease: integer (nullable = true)
 |-- AgeFixed: integer (nullable = true)



### Imputación de datos y llenados

- Métodos de interpolación
- Métodos de imputación simple (media móvil, suavización exponencial)
- Métodos de imputación compuesta

In [None]:
# Drop rows that contain null / NA values, printing the row count before and after

print(f'Registros sin eliminación de datos nulos: {df.count()}')
df = df.dropna()
print(f'Registros con eliminación de datos nulos: {df.count()}')


Registros sin eliminación de datos nulos: 918
Registros con eliminación de datos nulos: 918


In [None]:
# Number of rows in df
df.count()

918

In [None]:
# Variants of dropping rows based on null / NA values

# thresh=2 keeps only the rows with at least 2 non-null values
# (rows with fewer non-null values are dropped):
# df.na.drop(thresh= 2)

# Drop every row that has a null in any of the listed columns.
# The columns were renamed back to 'Age' and 'Sex' earlier in the notebook,
# so the Spanish names 'Edad' / 'Sexo' no longer exist at this point.
df.na.drop(how = 'any', subset= ['Age','Sex'])

DataFrame[Edad: int, Sexo: string, ChestPainType: string, RestingBP: float, Cholesterol: int, FastingBS: int, RestingECG: string, MaxHR: int, ExerciseAngina: string, Oldpeak: double, ST_Slope: string, HeartDisease: int, AgeFixed: int]

In [None]:
# Fill empty values (data imputation) with a fixed replacement

# The column is named 'Sex' here: the temporary rename to 'Sexo' was reverted
# earlier in the notebook. The result is not assigned back to df.
df.na.fill(value = 'Cualquiera', subset = ['Sex'])

In [None]:
# Data imputation: replace missing values using the 'mean' strategy

from pyspark.ml.feature import Imputer

# The column is named 'Age' here: the temporary rename to 'Edad' was reverted
# earlier in the notebook. Output columns reuse the input names, so the
# imputed values overwrite the originals.
# The strategy can also be 'median' or 'mode'.
imptr = Imputer(inputCols = ['Age', 'RestingBP'],
                outputCols = ['Age', 'RestingBP']).setStrategy('mean')

df = imptr.fit(df).transform(df)

### Filtrado de la información

In [None]:
# Three ways of keeping the rows with Age above 50

# 1) filter() with a SQL string:
# df.filter('Age > 50').show()

# 2) where() with a SQL string
df.where('Age > 50').show()

# 3) where() with a Column expression
df.where(df.Age > 50).show(n=3)

+---+---+-------------+---------+-----------+---------+----------+-----+--------------+-------+--------+------------+--------+
|Age|Sex|ChestPainType|RestingBP|Cholesterol|FastingBS|RestingECG|MaxHR|ExerciseAngina|Oldpeak|ST_Slope|HeartDisease|AgeFixed|
+---+---+-------------+---------+-----------+---------+----------+-----+--------------+-------+--------+------------+--------+
| 54|  M|          NAP|    150.0|        195|        0|    Normal|  122|             N|    0.0|      Up|           0|      55|
| 54|  M|          ATA|    110.0|        208|        0|    Normal|  142|             N|    0.0|      Up|           0|      55|
| 58|  M|          ATA|    136.0|        164|        0|        ST|   99|             Y|    2.0|    Flat|           1|      59|
+---+---+-------------+---------+-----------+---------+----------+-----+--------------+-------+--------+------------+--------+
only showing top 3 rows



In [None]:
# Combine conditions with logical operators (& here; | and ~ also exist)

df.where(
    (df.Age > 50) & (df.RestingECG == 'ST')
).show()

+---+---+-------------+---------+-----------+---------+----------+-----+--------------+-------+--------+------------+
|Age|Sex|ChestPainType|RestingBP|Cholesterol|FastingBS|RestingECG|MaxHR|ExerciseAngina|Oldpeak|ST_Slope|HeartDisease|
+---+---+-------------+---------+-----------+---------+----------+-----+--------------+-------+--------+------------+
| 58|  M|          ATA|      136|        164|        0|        ST|   99|             Y|    2.0|    Flat|           1|
| 53|  M|          ASY|      124|        260|        0|        ST|  112|             Y|    3.0|    Flat|           0|
| 54|  F|          NAP|      130|        294|        0|        ST|  100|             Y|    0.0|    Flat|           1|
| 52|  M|          NAP|      140|        259|        0|        ST|  170|             N|    0.0|      Up|           0|
| 58|  M|          NAP|      130|        213|        0|        ST|  140|             N|    0.0|    Flat|           1|
| 54|  M|          ASY|      150|        365|        0| 

In [None]:
# Evaluate an expression given as text (str)

from pyspark.sql.functions import expr

exp = 'Age + 2'

# Build the column from the text expression and show it under the name 'new_col'
df.select(expr(exp).alias('new_col')).show()

+-------+
|new_col|
+-------+
|     42|
|     51|
|     39|
|     50|
|     56|
|     41|
|     47|
|     56|
|     39|
|     50|
|     39|
|     60|
|     41|
|     51|
|     44|
|     56|
|     40|
|     45|
|     62|
|     38|
+-------+
only showing top 20 rows



In [None]:
# Group by: average HeartDisease for each Age

from pyspark.sql.functions import desc

# Aggregate only the column that is needed, instead of averaging every
# numeric column and then discarding all but one. The resulting columns are
# still 'Age' and 'avg(HeartDisease)'.
disease_by_age = df.groupBy('Age').mean('HeartDisease')

# Sort in descending order of Age and show the first 8 rows
disease_by_age.orderBy(desc('Age')).show(8)


+---+------------------+
|Age| avg(HeartDisease)|
+---+------------------+
| 77|               1.0|
| 76|               0.5|
| 75|0.6666666666666666|
| 74|0.7142857142857143|
| 73|               1.0|
| 72|              0.75|
| 71|               0.4|
| 70|0.8571428571428571|
+---+------------------+
only showing top 8 rows



In [None]:
from pyspark.sql.functions import asc

# Same table sorted in ascending order of Age, first 8 rows
disease_by_age.sort(asc('Age')).show(n=8)


+---+-------------------+
|Age|  avg(HeartDisease)|
+---+-------------------+
| 28|                0.0|
| 29|                0.0|
| 30|                0.0|
| 31|                0.5|
| 32|                0.4|
| 33|                0.5|
| 34| 0.2857142857142857|
| 35|0.36363636363636365|
+---+-------------------+
only showing top 8 rows



In [None]:
from pyspark.sql import functions as F

# Minimum age
df.agg(F.min('Age')).show()

+--------+
|min(Age)|
+--------+
|      28|
+--------+



In [None]:
# Minimum, maximum, average and count of Age in a single agg() call
df.agg(
    F.min('Age'),
    F.max('Age'),
    F.avg('Age'),
    F.count('Age'),
).show()

+--------+--------+------------------+----------+
|min(Age)|max(Age)|          avg(Age)|count(Age)|
+--------+--------+------------------+----------+
|      28|      77|53.510893246187365|       918|
+--------+--------+------------------+----------+



In [None]:
# Run SQL queries against the data

# Register the DataFrame as a temporary view named 'df' so SQL can refer to it
df.createOrReplaceTempView(name='df')

# A SQL query against that view
spark.sql("""SELECT sex from df""").show(n=2)

+---+
|sex|
+---+
|  M|
|  F|
+---+
only showing top 2 rows



In [None]:
# Select a boolean column 'older' (age >= 40) next to the age column
df.selectExpr('age >= 40 as older', 'age').show(5)

+-----+---+
|older|age|
+-----+---+
| true| 40|
| true| 49|
|false| 37|
| true| 48|
| true| 54|
+-----+---+
only showing top 5 rows



In [None]:
# Count rows per Age, with one column for each listed Sex value
(
    df.groupBy('Age')
    .pivot('Sex', ['M', 'F'])
    .count()
    .show()
)

+---+---+----+
|Age|  M|   F|
+---+---+----+
| 31|  1|   1|
| 65| 17|   4|
| 53| 27|   6|
| 34|  5|   2|
| 28|  1|NULL|
| 76|  1|   1|
| 44| 16|   3|
| 47| 15|   4|
| 52| 31|   5|
| 40| 12|   1|
| 57| 32|   6|
| 54| 36|  15|
| 48| 22|   9|
| 64| 16|   6|
| 41| 17|   7|
| 43| 15|   9|
| 37|  7|   4|
| 61| 27|   4|
| 72|  4|NULL|
| 35|  8|   3|
+---+---+----+
only showing top 20 rows



In [None]:
# Count rows per Sex, split by whether Age >= 40 ('true' / 'false' columns)
(
    df.selectExpr('Age >= 40 as older', 'Age', 'Sex')
    .groupBy('Sex')
    .pivot('older', ['true', 'false'])
    .count()
    .show()
)

+---+----+-----+
|Sex|true|false|
+---+----+-----+
|  F| 174|   19|
|  M| 664|   61|
+---+----+-----+



### Machine learning con pyspark

In [None]:
# Assemble the feature columns into a single vector column for the model
# (the train/test split itself happens in a later cell)

from pyspark.ml.feature import VectorAssembler

X_column_names = ['Age','Cholesterol']
target_column_name = ['MaxHR']

# Combine the feature columns into one vector column named 'Fvec'
v_asmblr = VectorAssembler(inputCols= X_column_names, outputCol= 'Fvec')
df = v_asmblr.transform(df)

# Build the selection from the lists declared above so the feature and target
# names are written in one place only
x = df.select(X_column_names + ['Fvec'] + target_column_name)
x.show(8)

+---+-----------+------------+-----+
|Age|Cholesterol|        Fvec|MaxHR|
+---+-----------+------------+-----+
| 40|        289|[40.0,289.0]|  172|
| 49|        180|[49.0,180.0]|  156|
| 37|        283|[37.0,283.0]|   98|
| 48|        214|[48.0,214.0]|  108|
| 54|        195|[54.0,195.0]|  122|
| 39|        339|[39.0,339.0]|  170|
| 45|        237|[45.0,237.0]|  170|
| 54|        208|[54.0,208.0]|  142|
+---+-----------+------------+-----+
only showing top 8 rows



In [None]:
# Split the dataset with weights 0.8 (training) and 0.2 (testing).
# A fixed seed is passed so the split can be reproduced between runs.

trainset, testset = x.randomSplit([.8,.2], seed=42)

In [None]:
# Predict MaxHR (the label column) from the 'Fvec' features with a linear regression

from pyspark.ml.regression import LinearRegression

# Keep the unfitted estimator under its own name so that `model` only ever
# refers to the fitted model
lr_estimator = LinearRegression(featuresCol= 'Fvec',labelCol= 'MaxHR') # specify the model
model = lr_estimator.fit(trainset) # train the model
print(model.coefficients)
print(model.intercept)

[-0.9987788008486029,0.041173730979745665]
181.73049189699833


In [None]:
# Evaluate the model on the test set and show the first 8 predictions

model.evaluate(testset).predictions.show(n=8)

+---+-----------+------------+-----+------------------+
|Age|Cholesterol|        Fvec|MaxHR|        prediction|
+---+-----------+------------+-----+------------------+
| 32|          0|  [32.0,0.0]|  127|149.76957026984303|
| 33|        298|[33.0,298.0]|  185|161.04056330095864|
| 34|        220|[34.0,220.0]|  150|156.83023348368988|
| 35|        161|[35.0,161.0]|  153| 153.4022045550363|
| 35|        257|[35.0,257.0]|  140|157.35488272909186|
| 36|        166|[36.0,166.0]|  180| 152.6092944090864|
| 36|        340|[36.0,340.0]|  184|159.77352359956214|
| 37|        211|[37.0,211.0]|  142|153.46333350232635|
+---+-----------+------------+-----+------------------+
only showing top 8 rows



In [None]:
from pyspark.ml.feature import StringIndexer

# Index the ChestPainType strings into a numeric ChestPainTypeIndex column
indxr = StringIndexer(inputCol= 'ChestPainType', outputCol= 'ChestPainTypeIndex')

# Fit on df, transform df, and show the first 5 indexed values
(
    indxr.fit(df)
    .transform(df)
    .select('ChestPainTypeIndex')
    .show(n=5)
)

+------------------+
|ChestPainTypeIndex|
+------------------+
|               2.0|
|               1.0|
|               2.0|
|               0.0|
|               1.0|
+------------------+
only showing top 5 rows

