In [1]:
import pyspark
from pyspark.sql import SparkSession

In [2]:
# Build the Spark session used by every cell below; getOrCreate() reuses an
# already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName('Dataframe')
    .getOrCreate()
)

In [3]:
# Bare expression: the notebook renders the session object as the cell output.
spark

#### Leer el dataset

In [5]:
# Option 1: set reader options with .option() before calling csv().
# 'header' = 'true' makes the first line of the file supply the column names.
pyspark_df = spark.read.option('header', 'true').csv('../datasets/test1.csv')
# Bare expression: shows the DataFrame repr (column names and types), not the rows.
pyspark_df

DataFrame[Name: string, age: string, Experience: string]

In [6]:
# Print the rows of the DataFrame as a text table.
pyspark_df.show()

+---------+---+----------+
|     Name|age|Experience|
+---------+---+----------+
|    Krish| 31|        10|
|Sudhanshu| 30|         8|
|    Sunny| 29|         4|
+---------+---+----------+



In [7]:
# Check the column types.
# By default the reader loads every column as string.
pyspark_df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- Experience: string (nullable = true)



In [8]:
# Re-read the file, adding a reader option so that column types are inferred
# from the data instead of being left as string.
pyspark_df = (
    spark.read
    .option('header', 'true')
    .option('inferSchema', 'true')
    .csv('../datasets/test1.csv')
)

In [9]:
# Confirm the inferred types: age and Experience should now be integer.
pyspark_df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [10]:
# Option 2: pass header and inferSchema directly as keyword arguments to csv().
pyspark_df = spark.read.csv('../datasets/test1.csv', header=True, inferSchema=True)
# Print the rows to check that the file was read as expected.
pyspark_df.show()

+---------+---+----------+
|     Name|age|Experience|
+---------+---+----------+
|    Krish| 31|        10|
|Sudhanshu| 30|         8|
|    Sunny| 29|         4|
+---------+---+----------+



In [11]:
# Check the column types produced by the keyword-argument read.
pyspark_df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [12]:
# The result is a DataFrame data structure, so several kinds of operations
# can be performed on it.
type(pyspark_df)

pyspark.sql.dataframe.DataFrame

#### Selección de columnas e indexación

In [13]:
# List the column names.
pyspark_df.columns

['Name', 'age', 'Experience']

In [18]:
# Select a single column.
# The result is a DataFrame, not a Series as it would be in pandas.
pyspark_df.select('Name')

DataFrame[Name: string]

In [19]:
# select() only builds a new DataFrame; call show() to print its rows.
pyspark_df.select('Name').show()

+---------+
|     Name|
+---------+
|    Krish|
|Sudhanshu|
|    Sunny|
+---------+



In [20]:
# Select multiple columns by passing a list of column names.
pyspark_df.select(['Name', 'Experience'])

DataFrame[Name: string, Experience: int]

In [21]:
# Print the rows of the two selected columns.
pyspark_df.select(['Name', 'Experience']).show()

+---------+----------+
|     Name|Experience|
+---------+----------+
|    Krish|        10|
|Sudhanshu|         8|
|    Sunny|         4|
+---------+----------+



In [25]:
# Another way to check the column types: a list of (column name, type) tuples.
pyspark_df.dtypes

[('Name', 'string'), ('age', 'int'), ('Experience', 'int')]

#### Análisis descriptivo de las variables

In [26]:
# Summary statistics (count, mean, stddev, min, max) for every column.
# describe() returns a DataFrame, so show() is needed to print it.
pyspark_df.describe().show()

+-------+-----+----+-----------------+
|summary| Name| age|       Experience|
+-------+-----+----+-----------------+
|  count|    3|   3|                3|
|   mean| null|30.0|7.333333333333333|
| stddev| null| 1.0|3.055050463303893|
|    min|Krish|  29|                4|
|    max|Sunny|  31|               10|
+-------+-----+----+-----------------+



#### Añadir columnas

In [28]:
# withColumn arguments: name of the new column and the expression giving its values.
# withColumn returns a new DataFrame, so the result is assigned back to pyspark_df.
pyspark_df = pyspark_df.withColumn('Experience After 2 year', pyspark_df['Experience']+2)
# Bare expression: shows the DataFrame repr with the added column.
pyspark_df

DataFrame[Name: string, age: int, Experience: int, Experience After 2 year: int]

In [29]:
# Print the rows, including the newly added column.
pyspark_df.show()

+---------+---+----------+-----------------------+
|     Name|age|Experience|Experience After 2 year|
+---------+---+----------+-----------------------+
|    Krish| 31|        10|                     12|
|Sudhanshu| 30|         8|                     10|
|    Sunny| 29|         4|                      6|
+---------+---+----------+-----------------------+



#### Eliminar columnas

In [32]:
# drop() returns a new DataFrame without the named column; reassign to keep the result.
pyspark_df = pyspark_df.drop('Experience After 2 year')
# Bare expression: shows the DataFrame repr after the column is removed.
pyspark_df

DataFrame[Name: string, age: int, Experience: int]

In [33]:
# Print the rows to confirm the column is gone.
pyspark_df.show()

+---------+---+----------+
|     Name|age|Experience|
+---------+---+----------+
|    Krish| 31|        10|
|Sudhanshu| 30|         8|
|    Sunny| 29|         4|
+---------+---+----------+



#### Renombrar columnas

In [35]:
# withColumnRenamed arguments: current column name, new column name.
# The renamed DataFrame is only displayed, not assigned, so pyspark_df itself
# still has the column 'Name'.
pyspark_df.withColumnRenamed('Name', 'New Name').show()

+---------+---+----------+
| New Name|age|Experience|
+---------+---+----------+
|    Krish| 31|        10|
|Sudhanshu| 30|         8|
|    Sunny| 29|         4|
+---------+---+----------+

