In [None]:
# Display the active SparkSession. NOTE(review): `spark` is not created in this
# section — presumably it is provided by the notebook environment; confirm.
spark

# Collection Column Types

In [None]:
from pyspark.sql import functions as F
from pyspark.sql import types as T

## Array

In [None]:
# Array of integers whose elements are not allowed to be null.
arrayType = T.ArrayType(elementType=T.IntegerType(), containsNull=False)

In [None]:
# jsonValue() gives Spark's own JSON-style description of the type as a Python
# object; it is not a JSON Schema document.
print(arrayType.jsonValue())

print(arrayType.simpleString()) # compact, Hive/DDL-style type string

print(arrayType.typeName()) # type name

## Map

In [None]:
# Map from string keys to integer values.
mapType = T.MapType(keyType=T.StringType(), valueType=T.IntegerType())

In [None]:
print(mapType.keyType)  # data type of the map keys

print(mapType.valueType)  # data type of the map values

# Not passed when mapType was built, so this prints the MapType default.
print(mapType.valueContainsNull)

## Struct

In [None]:
# Sample rows: (first name, middle name, last name, age as text, gender, salary).
# Empty strings stand in for missing text values.
data = [
    ("James", "", "Smith", "36", "M", 3000),
    ("Michael", "Rose", "", "40", "M", 4000),
    ("Robert", "", "Williams", "42", "M", 4000),
    ("Maria", "Anne", "Jones", "39", "F", 4000),
    ("Jen", "Mary", "Brown", "", "F", -1),
]

In [None]:
# Explicit schema for the rows in `data`. Every field is nullable; `age` is
# declared as a string because the sample rows supply it as text.
# The first field was misspelled "firetname"; it is "firstname", matching the
# column name used by the other examples in this notebook.
schema = T.StructType([
    T.StructField("firstname", T.StringType(), True),
    T.StructField("middlename", T.StringType(), True),
    T.StructField("lastname", T.StringType(), True),
    T.StructField("age", T.StringType(), True),
    T.StructField("gender", T.StringType(), True),
    T.StructField("salary", T.IntegerType(), True),
])

In [None]:
# Build the DataFrame from the sample rows using the explicit schema.
df = spark.createDataFrame(data, schema)

In [None]:
# Print the column names and types of `df`.
df.printSchema()

In [None]:
# Show the rows; truncate=False prints full cell values instead of shortening them.
df.show(truncate=False)

# Column Selection

In [None]:
# Column names for the sample rows below; the column types are left for Spark
# to infer from the values.
columns = ["firstname", "lastname", "country", "state"]

data = [
    ("James", "Smith", "USA", "CA"),
    ("Michael", "Rose", "USA", "NY"),
    ("Robert", "Williams", "USA", "CA"),
    ("Maria", "Jones", "USA", "FL"),
]

df = spark.createDataFrame(data=data, schema=columns)
df.show()

## Select columns

Show all columns with `columns` property

In [None]:
# List of the DataFrame's column names (displayed as the cell result).
df.columns

Use string as a column specifier.

In [None]:
# Columns referenced by their names as plain strings.
df.select("firstname", "lastname").show()

Use a DataFrame attribute as a column specifier. This approach breaks down when a column name contains whitespace or other characters that are not valid in a Python attribute name. **Not recommended**

In [None]:
# Columns referenced through attribute access on the DataFrame.
df.select(df.firstname, df.lastname).show()

Use DataFrame indexing (with a string key) as a column specifier.

In [None]:
# Columns referenced by indexing the DataFrame with the column name.
df.select(df["firstname"], df["lastname"]).show()

Use the `col()` function to reference a column. It has the added benefit that further operations, for example `alias()`, can be chained onto the column.

In [None]:
# `firstname` is selected under the new name `fname`; `lastname` keeps its name.
df.select(F.col("firstname").alias("fname"), F.col("lastname")).show()

### Show all columns

In [None]:
# "*" selects every column, so this is equivalent to calling df.show() directly.
df.select("*").show() # or only show()

In [None]:
# `df.columns` is already a list of column names, so it is passed to select()
# as-is rather than being copied element by element in a comprehension.
df.select(df.columns).show()  # List of column names

In [None]:
# `*` unpacks the list so each column name is passed as a separate argument.
df.select(*df.columns).show()

Slice the `df.columns` property to select a range of columns.

In [None]:
# The slice picks the first three columns; show(3) additionally limits the
# output to three rows.
df.select(df.columns[:3]).show(3)  # first three columns

In [None]:
# Slice indices are zero-based and end-exclusive, so [2:4] yields the 3rd and
# 4th columns.
df.select(df.columns[2:4]).show()  # column 3 to 4

### Select nested struct

In [None]:
# Sample rows with a nested name tuple: ((first, middle, last), state, gender).
# The first row uses None for the middle name; other gaps are empty strings.
data = [
    (("James", None, "Smith"), "OH", "M"),
    (("Anna", "Rose", ""), "NY", "F"),
    (("Julia", "", "Williams"), "OH", "F"),
    (("Maria", "Anne", "Jones"), "NY", "M"),
    (("Jen", "Mary", "Brown"), "NY", "M"),
    (("Mike", "Mary", "Williams"), "OH", "M"),
]

In [None]:
# Nested schema: `name` is itself a struct of three nullable string fields,
# followed by two flat nullable string columns.
schema = T.StructType(
    fields=[
        T.StructField(
            name="name",
            dataType=T.StructType(
                fields=[
                    T.StructField(name="firstname", dataType=T.StringType(), nullable=True),
                    T.StructField(name="middlename", dataType=T.StringType(), nullable=True),
                    T.StructField(name="lastname", dataType=T.StringType(), nullable=True),
                ]
            ),
        ),
        T.StructField(name="state", dataType=T.StringType(), nullable=True),
        T.StructField(name="gender", dataType=T.StringType(), nullable=True),
    ]
)

In [None]:
# Build the DataFrame with the nested `name` struct column.
df2 = spark.createDataFrame(data=data, schema=schema)

In [None]:
# Print the schema, including the fields nested under `name`.
df2.printSchema()

In [None]:
# Show the rows without shortening the struct values.
df2.show(truncate=False)

In [None]:
# Select the whole `name` struct as a single column.
df2.select("name").show(truncate=False)

In [None]:
# Dotted paths reach individual fields inside the `name` struct.
df2.select("name.firstname", "name.lastname").show(truncate=False)

In [None]:
# "name.*" expands every field of the `name` struct into its own column.
df2.select("name.*").show(truncate=False)

## Adding Column

In [None]:
# Sample rows: (first name, middle name, last name, date of birth, gender, salary).
# The last row's surname previously read 'kkkBrown' (stray keystrokes); it is
# 'Brown', as in the other sample data sets in this notebook.
data = [
    ('James', '', 'Smith', '1991-04-01', 'M', 3000),
    ('Michael', 'Rose', '', '2000-05-19', 'M', 4000),
    ('Robert', '', 'Williams', '1978-09-05', 'M', 4000),
    ('Maria', 'Anne', 'Jones', '1967-12-01', 'F', 4000),
    ('Jen', 'Mary', 'Brown', '1980-02-17', 'F', -1),
]

In [None]:
# Column names, in the same order as the values in each sample row.
columns = [
    "firstname",
    "middlename",
    "lastname",
    "dob",
    "gender",
    "salary",
]

In [None]:
# Only column names are given, so the column types are inferred from the data.
df = spark.createDataFrame(data, columns)

df.printSchema()
df.show(truncate=False)

### Cast
Casting a column type, but still keep the same name.

In [None]:
# Reusing the existing column name makes withColumn() replace `salary` in the
# returned DataFrame with the cast version; `df` itself is left unchanged.
df2 = df.withColumn("salary", F.col("salary").cast("integer"))
df2.printSchema()
df2.show(truncate=False)

### Expression
Applying an expression with a column, but still keep the same name.

In [None]:
# Replace `salary` with its value multiplied by 100 (same column name).
df3 = df.withColumn("salary", F.col("salary") * 100)
df3.printSchema()
df3.show(truncate=False)

In [None]:
# "CopiedColumn" is not an existing column name, so a new column is added; it
# holds the negated salary and `salary` itself is kept as-is.
df4 = df.withColumn("CopiedColumn", F.col("salary") * -1)
df4.printSchema()
df4.show(truncate=False)

### Literal
Create a new column with a literal value (of any type) by using `lit()`.

In [None]:
# Add a `Country` column holding the same literal value "USA" on every row.
df5 = df.withColumn("Country", F.lit("USA"))
df5.printSchema()
df5.show(truncate=False)

More examples about `lit()`:

In [None]:
# Small employee data set: (employee id as text, salary).
columns = ["EmpId", "Salary"]
data = [
    ("111", 50000),
    ("222", 60000),
    ("333", 40000),
]

df5_1 = spark.createDataFrame(data, columns)
df5_1.printSchema()
df5_1.show(truncate=False)

In [None]:
# Keep both existing columns and append a constant string column "lit_value1".
df5_2 = df5_1.select(
    "EmpId",
    "Salary",
    F.lit("1").alias("lit_value1"),
)
df5_2.show()

### Chain adding columns
Because a DataFrame (like an `RDD`) is immutable, each operation returns a new DataFrame, so calls can be chained.

In [None]:
# Each withColumn() call returns a new DataFrame, so the calls are chained;
# the surrounding parentheses only allow one call per line.
df6 = (
    df
    .withColumn("Country", F.lit("USA"))
    .withColumn("anotherColumn", F.lit("anotherValue"))
)
df6.printSchema()

## Dropping Column

In [None]:
# Column names for the sample rows below, in value order.
columns = ["firstname", "middlename", "lastname", "id", "location", "salary"]

# Sample rows: (first name, middle name, last name, id as text, location, salary).
data = [
    ("James", "", "Smith", "36636", "NewYork", 3100),
    ("Michael", "Rose", "", "40288", "California", 4300),
    ("Robert", "", "Williams", "42114", "Florida", 1400),
    ("Maria", "Anne", "Jones", "39192", "Florida", 5500),
    ("Jen", "Mary", "Brown", "34561", "NewYork", 3000),
]

In [None]:
# Build the DataFrame used by the drop examples and inspect it.
df = spark.createDataFrame(data, columns)

df.printSchema()
df.show()

Drop a column by using `drop()` with a column name.

In [None]:
# drop() returns a new DataFrame without the named column; `df` is not modified.
df.drop("firstname").printSchema()

Use `col()` to reference a column in a dataframe.

In [None]:
# Same drop, with the column given as a Column object built by col().
df.drop(F.col("firstname")).printSchema()

Use a column property in a dataframe.

In [None]:
# Same drop, with the column given through attribute access on the DataFrame.
df.drop(df.firstname).printSchema()

To drop multiple columns at once, pass several column names as arguments to `drop()`.

In [None]:
# Drop three columns in one call. The schema of the result is printed, like the
# other drop examples, so the remaining columns are shown explicitly.
df.drop("firstname", "middlename", "lastname").printSchema()

As in the example above, the names of the columns to drop can be collected in a sequence (here a tuple) and unpacked into `drop()`.

In [None]:
# Names of the columns to remove, gathered in a tuple.
cols = ("firstname", "middlename", "lastname")

# `*cols` unpacks the tuple so each name is passed to drop() as its own argument.
df.drop(*cols).printSchema()

## Renaming Column

In [None]:
# Sample rows with a nested name tuple:
# ((first, middle, last), date of birth, gender, salary).
data = [
    (('James', '', 'Smith'), '1991-04-01', 'M', 3000),
    (('Michael', 'Rose', ''), '2000-05-19', 'M', 4000),
    (('Robert', '', 'Williams'), '1978-09-05', 'M', 4000),
    (('Maria', 'Anne', 'Jones'), '1967-12-01', 'F', 4000),
    (('Jen', 'Mary', 'Brown'), '1980-02-17', 'F', -1),
]

In [None]:
# Schema with a nested `name` struct followed by three flat columns, built
# field by field with StructType.add(). All fields are nullable.
schema = (
    T.StructType()
    .add(
        'name',
        T.StructType()
        .add('firstname', T.StringType(), True)
        .add('middlename', T.StringType(), True)
        .add('lastname', T.StringType(), True),
    )
    .add('dob', T.StringType(), True)
    .add('gender', T.StringType(), True)
    .add('salary', T.IntegerType(), True)
)

In [None]:
# Build the DataFrame used by the rename examples and print its schema.
df = spark.createDataFrame(data, schema)

df.printSchema()

**Use `withColumnRenamed()`**

In [None]:
# Rename `dob` to `DateOfBirth` in the returned DataFrame; `df` is not modified.
df.withColumnRenamed("dob","DateOfBirth").printSchema()

You can chain them together

In [None]:
# Rename two columns by chaining withColumnRenamed() calls. The new name for
# `dob` was misspelled "date_of_brith"; it is "date_of_birth".
df2 = (
    df
    .withColumnRenamed("dob", "date_of_birth")
    .withColumnRenamed("salary", "salary_amount")
)
df2.printSchema()

**Use struct to rename nested property**

In [None]:
# Target struct type for the `name` column: same three string fields, with the
# first and last fields renamed to `fname` and `lname`.
schema2 = T.StructType(
    fields=[
        T.StructField(name="fname", dataType=T.StringType()),
        T.StructField(name="middlename", dataType=T.StringType()),
        T.StructField(name="lname", dataType=T.StringType()),
    ]
)

In [None]:
# Casting the `name` struct to `schema2` renames its nested fields; the other
# columns are passed through unchanged by name.
df2 = df.select(
    F.col("name").cast(schema2),
    "dob",
    "gender",
    "salary",
)
df2.printSchema()

**Use `alias()`**

In [None]:
# Pull each nested name field up to a top-level column under a new name; the
# remaining columns are selected unchanged by name.
df3 = df.select(
    F.col("name.firstname").alias("fname"),
    F.col("name.middlename").alias("mname"),
    F.col("name.lastname").alias("lname"),
    "dob",
    "gender",
    "salary",
)
df3.printSchema()

Alternatively, use `withColumn()` to copy each nested field into its own top-level column and then `drop()` the original struct column.

In [None]:
# Copy each field of the `name` struct into a new top-level column, then drop
# the original struct so only the flat columns remain.
df4 = (
    df
    .withColumn("fname", F.col("name.firstname"))
    .withColumn("mname", F.col("name.middlename"))
    .withColumn("lname", F.col("name.lastname"))
    .drop("name")
)

df4.printSchema()