In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

In [2]:
# Obtain the Spark session for this notebook, registered under the app name "S1".
spark = (
    SparkSession.builder
    .appName("S1")
    .getOrCreate()
)

In [3]:
# Column names, in the same positional order as the fields of each record below.
columns = ["name", "age", "city", "gender"]

# Sample records: one (name, age, city, gender) tuple per person.
data = [
    ("EMA", 23, "Tokyo", "Female"),
    ("Bob", 35, "Los Angeles", "Male"),
    ("Charlie", 45, "Chicago", "Male"),
    ("Diana", 28, "Houston", "Female"),
    ("Eve", 32, "Phoenix", "Female"),
    ("Frank", 38, "San Diego", "Male"),
    ("Grace", 29, "San Francisco", "Female"),
    ("Hank", 41, "Seattle", "Male"),
    ("Ivy", 33, "Boston", "Female"),
    ("Jack", 27, "Austin", "Male"),
]

In [4]:
# Build the base DataFrame from the in-memory records; `columns` supplies the
# column names and Spark infers each column's type from the tuple values.
df = spark.createDataFrame(data, schema=columns)

In [5]:
# Print the DataFrame as a text table (defaults spelled out: up to 20 rows,
# long cell values truncated).
df.show(n=20, truncate=True)

+-------+---+-------------+------+
|   name|age|         city|gender|
+-------+---+-------------+------+
|    EMA| 23|        Tokyo|Female|
|    Bob| 35|  Los Angeles|  Male|
|Charlie| 45|      Chicago|  Male|
|  Diana| 28|      Houston|Female|
|    Eve| 32|      Phoenix|Female|
|  Frank| 38|    San Diego|  Male|
|  Grace| 29|San Francisco|Female|
|   Hank| 41|      Seattle|  Male|
|    Ivy| 33|       Boston|Female|
|   Jack| 27|       Austin|  Male|
+-------+---+-------------+------+



In [6]:
# Keep only the people strictly older than 30 (`where` is an alias of `filter`).
filter_30 = df.where(df["age"] > 30)
filter_30.show()

+-------+---+-----------+------+
|   name|age|       city|gender|
+-------+---+-----------+------+
|    Bob| 35|Los Angeles|  Male|
|Charlie| 45|    Chicago|  Male|
|    Eve| 32|    Phoenix|Female|
|  Frank| 38|  San Diego|  Male|
|   Hank| 41|    Seattle|  Male|
|    Ivy| 33|     Boston|Female|
+-------+---+-----------+------+



In [8]:
# Append a "tax" column computed as half of each person's age.
half_of_age = filter_30["age"] * 0.5
insert_tax = filter_30.withColumn("tax", half_of_age)
insert_tax.show()

+-------+---+-----------+------+----+
|   name|age|       city|gender| tax|
+-------+---+-----------+------+----+
|    Bob| 35|Los Angeles|  Male|17.5|
|Charlie| 45|    Chicago|  Male|22.5|
|    Eve| 32|    Phoenix|Female|16.0|
|  Frank| 38|  San Diego|  Male|19.0|
|   Hank| 41|    Seattle|  Male|20.5|
|    Ivy| 33|     Boston|Female|16.5|
+-------+---+-----------+------+----+



In [9]:
# Rename the "age" column to "years"; every other column is left untouched.
rename_years = insert_tax.withColumnRenamed(existing="age", new="years")
rename_years.show()

+-------+-----+-----------+------+----+
|   name|years|       city|gender| tax|
+-------+-----+-----------+------+----+
|    Bob|   35|Los Angeles|  Male|17.5|
|Charlie|   45|    Chicago|  Male|22.5|
|    Eve|   32|    Phoenix|Female|16.0|
|  Frank|   38|  San Diego|  Male|19.0|
|   Hank|   41|    Seattle|  Male|20.5|
|    Ivy|   33|     Boston|Female|16.5|
+-------+-----+-----------+------+----+



In [10]:
# Discard the city and gender columns, leaving name, years and tax.
unwanted_columns = ["city", "gender"]
dropped_df = rename_years.drop(*unwanted_columns)
dropped_df.show()

+-------+-----+----+
|   name|years| tax|
+-------+-----+----+
|    Bob|   35|17.5|
|Charlie|   45|22.5|
|    Eve|   32|16.0|
|  Frank|   38|19.0|
|   Hank|   41|20.5|
|    Ivy|   33|16.5|
+-------+-----+----+



In [11]:
# Shut down the Spark session now that the notebook is finished; any cell
# re-run after this point needs the session-creation cell to be run again first.
spark.stop()