In [11]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

In [12]:
# Build the notebook's Spark session under the application name "DVPT_1";
# getOrCreate() hands back an already-running session when one exists.
spark = (
    SparkSession.builder
    .appName("DVPT_1")
    .getOrCreate()
)

In [13]:
# Column labels, in the same positional order as the fields of each row tuple.
columns = ["name", "age", "city", "gender"]

# Small in-memory sample: one (name, age, city, gender) tuple per person.
data = [
    ("Alice", 25, "New York", "Female"),
    ("Bob", 35, "Los Angeles", "Male"),
    ("Charlie", 45, "Chicago", "Male"),
    ("Diana", 28, "Houston", "Female"),
    ("Eve", 32, "Phoenix", "Female"),
    ("Frank", 38, "San Diego", "Male"),
    ("Grace", 29, "San Francisco", "Female"),
    ("Hank", 41, "Seattle", "Male"),
    ("Ivy", 33, "Boston", "Female"),
    ("Jack", 27, "Austin", "Male"),
]

In [14]:
# Turn the row tuples into a DataFrame; the list of names in `columns`
# supplies the column labels.
df = spark.createDataFrame(data=data, schema=columns)

In [15]:
# Keep only the rows whose age is strictly greater than 30
# (`where` is an alias of `filter`), then print the result.
df_1 = df.where(col("age") > 30)
df_1.show()

+-------+---+-----------+------+
|   name|age|       city|gender|
+-------+---+-----------+------+
|    Bob| 35|Los Angeles|  Male|
|Charlie| 45|    Chicago|  Male|
|    Eve| 32|    Phoenix|Female|
|  Frank| 38|  San Diego|  Male|
|   Hank| 41|    Seattle|  Male|
|    Ivy| 33|     Boston|Female|
+-------+---+-----------+------+



In [16]:
# Append a "tax" column computed as half of each row's age, then print the result.
df_2 = df_1.withColumn(
    "tax",
    col("age") * 0.5,
)
df_2.show()

+-------+---+-----------+------+----+
|   name|age|       city|gender| tax|
+-------+---+-----------+------+----+
|    Bob| 35|Los Angeles|  Male|17.5|
|Charlie| 45|    Chicago|  Male|22.5|
|    Eve| 32|    Phoenix|Female|16.0|
|  Frank| 38|  San Diego|  Male|19.0|
|   Hank| 41|    Seattle|  Male|20.5|
|    Ivy| 33|     Boston|Female|16.5|
+-------+---+-----------+------+----+



In [17]:
# Relabel the "age" column as "years"; every other column is left as it was.
df_3 = df_2.withColumnRenamed(existing="age", new="years")
df_3.show()

+-------+-----+-----------+------+----+
|   name|years|       city|gender| tax|
+-------+-----+-----------+------+----+
|    Bob|   35|Los Angeles|  Male|17.5|
|Charlie|   45|    Chicago|  Male|22.5|
|    Eve|   32|    Phoenix|Female|16.0|
|  Frank|   38|  San Diego|  Male|19.0|
|   Hank|   41|    Seattle|  Male|20.5|
|    Ivy|   33|     Boston|Female|16.5|
+-------+-----+-----------+------+----+



In [19]:
# Remove the "city" and "gender" columns, then print what remains.
df_4 = df_3.drop(
    "city",
    "gender",
)
df_4.show()

+-------+-----+----+
|   name|years| tax|
+-------+-----+----+
|    Bob|   35|17.5|
|Charlie|   45|22.5|
|    Eve|   32|16.0|
|  Frank|   38|19.0|
|   Hank|   41|20.5|
|    Ivy|   33|16.5|
+-------+-----+----+



In [20]:
# Stop the Spark session as the notebook's final step.
spark.stop()