In [28]:
# Evaluate the session handle so that its repr is displayed.
# NOTE(review): `spark` is not created in any visible cell — presumably provided by the kernel; confirm.
spark

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

<pyspark.sql.session.SparkSession object at 0x7f553f578d90>

# Column Collection Type

In [10]:
from pyspark.sql import functions as F
from pyspark.sql import types as T

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Array

In [None]:
# Array type whose elements are integers and may not be null.
arrayType = T.ArrayType(elementType=T.IntegerType(), containsNull=False)

In [None]:
print(arrayType.jsonValue()) # JSON-style description of the type (Spark's own format, not a JSON Schema document)

print(arrayType.simpleString()) # compact, Hive-style type string

print(arrayType.typeName()) # short name of the type

## Map

In [None]:
# Map type with string keys and integer values; valueContainsNull is left at its default.
mapType = T.MapType(keyType=T.StringType(), valueType=T.IntegerType())

In [None]:
print(mapType.keyType)  # type of the map keys (the StringType passed to MapType)

print(mapType.valueType)  # type of the map values (the IntegerType passed to MapType)

print(mapType.valueContainsNull)  # whether values may be null; not set explicitly when mapType was built

## Struct

In [None]:
# Sample rows: five string fields followed by an integer salary.
# Some text fields are empty strings, and age is stored as a string.
data = [
    ("James", "", "Smith", "36", "M", 3000),
    ("Michael", "Rose", "", "40", "M", 4000),
    ("Robert", "", "Williams", "42", "M", 4000),
    ("Maria", "Anne", "Jones", "39", "F", 4000),
    ("Jen", "Mary", "Brown", "", "F", -1),
]

In [None]:
# Explicit schema for the six-field sample rows.
# Every field is nullable; age is declared as a string and only salary is an integer.
# The first field is "firstname" (previously misspelled "firetname"), matching the
# column name used throughout the rest of the notebook.
schema = T.StructType([
    T.StructField("firstname", T.StringType(), True),
    T.StructField("middlename", T.StringType(), True),
    T.StructField("lastname", T.StringType(), True),
    T.StructField("age", T.StringType(), True),
    T.StructField("gender", T.StringType(), True),
    T.StructField("salary", T.IntegerType(), True),
])

In [None]:
# Build the DataFrame from the sample rows, applying the explicit StructType schema.
df = spark.createDataFrame(data=data, schema=schema)

In [None]:
# Print the column names, types and nullability as a tree.
df.printSchema()

In [None]:
# truncate=False prints full cell values instead of shortening long strings.
df.show(truncate=False)

# Column Selection

In [None]:
# Small demo table used by the column-selection examples.
columns = ["firstname", "lastname", "country", "state"]
data = [
    ("James", "Smith", "USA", "CA"),
    ("Michael", "Rose", "USA", "NY"),
    ("Robert", "Williams", "USA", "CA"),
    ("Maria", "Jones", "USA", "FL"),
]

# Only column names are supplied, so the column types are inferred from the rows.
df = spark.createDataFrame(data, columns)
df.show()

## Select columns

Show all columns with `columns` property

In [None]:
# Column names as a plain Python list of strings.
df.columns

Use string as a column specifier.

In [None]:
# Plain strings are resolved to columns by name.
df.select("firstname", "lastname").show()

Use a DataFrame attribute as a column specifier. This approach has a limitation: it does not work if a column name contains whitespace or other characters that are invalid in a Python attribute name. **Not recommended**

In [None]:
# Attribute access only works for column names that are valid Python identifiers.
df.select(df.firstname, df.lastname).show()

Use DataFrame indexing (with a string) as a column specifier.

In [None]:
# Bracket indexing takes the column name as a string, so any name can be used.
df.select(df["firstname"], df["lastname"]).show()

Use the `col()` function to reference a column. It has the added benefit of letting you chain further operations on the column, for example `alias()`.

In [None]:
# alias() renames firstname to "fname" in the selected result only; df itself is not modified.
df.select(F.col("firstname").alias("fname"), F.col("lastname")).show()

### Show all columns

In [None]:
df.select("*").show() # "*" selects every column, so this shows the same table as df.show()

In [None]:
df.select(list(df.columns)).show()  # pass all column names as a single list

In [None]:
# Unpack the list so that each column name is passed as a separate argument.
df.select(*df.columns).show()

Slice the `df.columns` property to select a range of columns.

In [None]:
df.select(df.columns[:3]).show(3)  # first three columns; show(3) prints at most 3 rows

In [None]:
df.select(df.columns[2:4]).show()  # third and fourth columns (list indices 2 and 3)

### Select nested struct

In [None]:
# Each row is (name, state, gender), where name is itself a 3-tuple.
# Missing name parts appear as None or as an empty string.
data = [
    (("James", None, "Smith"), "OH", "M"),
    (("Anna", "Rose", ""), "NY", "F"),
    (("Julia", "", "Williams"), "OH", "F"),
    (("Maria", "Anne", "Jones"), "NY", "M"),
    (("Jen", "Mary", "Brown"), "NY", "M"),
    (("Mike", "Mary", "Williams"), "OH", "M"),
]

In [None]:
# Nested schema: "name" is a struct of three nullable string fields,
# followed by the top-level string columns "state" and "gender".
schema = T.StructType([
    T.StructField(
        "name",
        T.StructType([
            T.StructField("firstname", T.StringType(), nullable=True),
            T.StructField("middlename", T.StringType(), nullable=True),
            T.StructField("lastname", T.StringType(), nullable=True),
        ]),
    ),
    T.StructField("state", T.StringType(), nullable=True),
    T.StructField("gender", T.StringType(), nullable=True),
])

In [None]:
# Build the DataFrame with the nested schema; positional arguments are (data, schema).
df2 = spark.createDataFrame(data, schema)

In [None]:
# Print the schema tree, including the fields nested under "name".
df2.printSchema()

In [None]:
# Show the rows without truncating the struct column.
df2.show(truncate=False)

In [None]:
# Select the whole struct as a single column.
df2.select("name").show(truncate=False)

In [None]:
# Dotted paths pull individual fields out of the "name" struct.
df2.select("name.firstname", "name.lastname").show(truncate=False)

In [None]:
# "name.*" expands every field of the struct into its own column.
df2.select("name.*").show(truncate=False)

# Add Column

In [2]:
# Sample rows for the add/drop/rename examples: five string fields
# (the fourth is a date written as text) followed by an integer salary.
data = [
    ("James", "", "Smith", "1991-04-01", "M", 3000),
    ("Michael", "Rose", "", "2000-05-19", "M", 4000),
    ("Robert", "", "Williams", "1978-09-05", "M", 4000),
    ("Maria", "Anne", "Jones", "1967-12-01", "F", 4000),
    ("Jen", "Mary", "kkkBrown", "1980-02-17", "F", -1),
]

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
# Column names, in the same order as the fields of each row in data.
columns = [
    "firstname",
    "middlename",
    "lastname",
    "dob",
    "gender",
    "salary",
]

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [19]:
# Only column names are given as the schema, so the types are inferred from the rows:
# the printed schema shows salary as long and dob as a plain string.
df = spark.createDataFrame(data=data, schema=columns)
df.printSchema()
df.show(truncate=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|dob       |gender|salary|
+---------+----------+--------+----------+------+------+
|James    |          |Smith   |1991-04-01|M     |3000  |
|Michael  |Rose      |        |2000-05-19|M     |4000  |
|Robert   |          |Williams|1978-09-05|M     |4000  |
|Maria    |Anne      |Jones   |1967-12-01|F     |4000  |
|Jen      |Mary      |kkkBrown|1980-02-17|F     |-1    |
+---------+----------+--------+----------+------+------+

Cast a column to a different type while keeping the same name.

In [15]:
# withColumn with an existing name replaces that column: salary goes from long to integer
# in the printed schema while the values stay the same.
df2 = df.withColumn("salary", F.col("salary").cast("integer"))
df2.printSchema()
df2.show(truncate=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|dob       |gender|salary|
+---------+----------+--------+----------+------+------+
|James    |          |Smith   |1991-04-01|M     |3000  |
|Michael  |Rose      |        |2000-05-19|M     |4000  |
|Robert   |          |Williams|1978-09-05|M     |4000  |
|Maria    |Anne      |Jones   |1967-12-01|F     |4000  |
|Jen      |Mary      |kkkBrown|1980-02-17|F     |-1    |
+---------+----------+--------+----------+------+------+

Apply an expression to a column while keeping the same name.

In [20]:
# Reusing the name "salary" overwrites the column with salary * 100; the type stays long.
df3 = df.withColumn("salary", F.col("salary") * 100)
df3.printSchema()
df3.show(truncate=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|dob       |gender|salary|
+---------+----------+--------+----------+------+------+
|James    |          |Smith   |1991-04-01|M     |300000|
|Michael  |Rose      |        |2000-05-19|M     |400000|
|Robert   |          |Williams|1978-09-05|M     |400000|
|Maria    |Anne      |Jones   |1967-12-01|F     |400000|
|Jen      |Mary      |kkkBrown|1980-02-17|F     |-100  |
+---------+----------+--------+----------+------+------+

In [30]:
# A name that does not exist yet appends a new column at the end.
# Despite its name, CopiedColumn holds the negated salary (salary * -1), not a plain copy.
df4 = df.withColumn("CopiedColumn", F.col("salary") * -1)
df4.printSchema()
df4.show(truncate=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- CopiedColumn: long (nullable = true)

+---------+----------+--------+----------+------+------+------------+
|firstname|middlename|lastname|dob       |gender|salary|CopiedColumn|
+---------+----------+--------+----------+------+------+------------+
|James    |          |Smith   |1991-04-01|M     |3000  |-3000       |
|Michael  |Rose      |        |2000-05-19|M     |4000  |-4000       |
|Robert   |          |Williams|1978-09-05|M     |4000  |-4000       |
|Maria    |Anne      |Jones   |1967-12-01|F     |4000  |-4000       |
|Jen      |Mary      |kkkBrown|1980-02-17|F     |-1    |1           |
+---------+----------+--------+----------+------+------+------------+

Create a new column with a literal value (of any type) by using `lit()`.

In [33]:
# lit() turns the constant "USA" into a column with that value on every row;
# the printed schema reports the new column as non-nullable.
df5 = df.withColumn("Country", F.lit("USA"))
df5.printSchema()
df5.show(truncate=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- Country: string (nullable = false)

+---------+----------+--------+----------+------+------+-------+
|firstname|middlename|lastname|dob       |gender|salary|Country|
+---------+----------+--------+----------+------+------+-------+
|James    |          |Smith   |1991-04-01|M     |3000  |USA    |
|Michael  |Rose      |        |2000-05-19|M     |4000  |USA    |
|Robert   |          |Williams|1978-09-05|M     |4000  |USA    |
|Maria    |Anne      |Jones   |1967-12-01|F     |4000  |USA    |
|Jen      |Mary      |kkkBrown|1980-02-17|F     |-1    |USA    |
+---------+----------+--------+----------+------+------+-------+

A DataFrame, like the `RDD` underneath it, is immutable: every operation returns a new one, so we can chain operations on a DataFrame.

In [37]:
# Each withColumn call returns a new DataFrame, so the calls can be chained;
# the parentheses only allow the chain to span several lines.
df6 = (
    df
    .withColumn("Country", F.lit("USA"))
    .withColumn("anotherColumn", F.lit("anotherValue"))
)
df6.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- Country: string (nullable = false)
 |-- anotherColumn: string (nullable = false)

## Drop Column

In [41]:
# Starting point for the drop examples: df4 includes the extra CopiedColumn.
df4.show(truncate=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+----------+--------+----------+------+------+------------+
|firstname|middlename|lastname|dob       |gender|salary|CopiedColumn|
+---------+----------+--------+----------+------+------+------------+
|James    |          |Smith   |1991-04-01|M     |3000  |-3000       |
|Michael  |Rose      |        |2000-05-19|M     |4000  |-4000       |
|Robert   |          |Williams|1978-09-05|M     |4000  |-4000       |
|Maria    |Anne      |Jones   |1967-12-01|F     |4000  |-4000       |
|Jen      |Mary      |kkkBrown|1980-02-17|F     |-1    |1           |
+---------+----------+--------+----------+------+------+------------+

In [40]:
# drop() returns a DataFrame without the named column; the result is only shown, not assigned.
df4.drop("CopiedColumn").show(truncate=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|dob       |gender|salary|
+---------+----------+--------+----------+------+------+
|James    |          |Smith   |1991-04-01|M     |3000  |
|Michael  |Rose      |        |2000-05-19|M     |4000  |
|Robert   |          |Williams|1978-09-05|M     |4000  |
|Maria    |Anne      |Jones   |1967-12-01|F     |4000  |
|Jen      |Mary      |kkkBrown|1980-02-17|F     |-1    |
+---------+----------+--------+----------+------+------+

In [42]:
# Several column names can be dropped in a single call.
df4.drop("salary", "CopiedColumn").show(truncate=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+----------+--------+----------+------+
|firstname|middlename|lastname|dob       |gender|
+---------+----------+--------+----------+------+
|James    |          |Smith   |1991-04-01|M     |
|Michael  |Rose      |        |2000-05-19|M     |
|Robert   |          |Williams|1978-09-05|M     |
|Maria    |Anne      |Jones   |1967-12-01|F     |
|Jen      |Mary      |kkkBrown|1980-02-17|F     |
+---------+----------+--------+----------+------+

## Rename Column

In [38]:
# Rename gender to sex; the other columns and their order are unchanged.
df.withColumnRenamed("gender", "sex").show(truncate=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+----------+--------+----------+---+------+
|firstname|middlename|lastname|dob       |sex|salary|
+---------+----------+--------+----------+---+------+
|James    |          |Smith   |1991-04-01|M  |3000  |
|Michael  |Rose      |        |2000-05-19|M  |4000  |
|Robert   |          |Williams|1978-09-05|M  |4000  |
|Maria    |Anne      |Jones   |1967-12-01|F  |4000  |
|Jen      |Mary      |kkkBrown|1980-02-17|F  |-1    |
+---------+----------+--------+----------+---+------+