In [1]:
#Practical-2: Practical on the DataFrame operations
#Aim: A) Demonstrate the use of next mentioned operations: Create an empty DataFrame,
#Create an empty DataSet, use of Rename nested column, Adding or Updating a column on DataFrame,
# Drop a column on DataFrame, Adding literal constant to DataFrame,
#Changing column data type, Pivot and Unpivot a DataFrame, Create a DataFrame using StructType & StructField schema.

In [2]:
!pip install pyspark

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, split, concat, first, udf
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Start Spark session
spark = SparkSession.builder.appName("Practical2").getOrCreate()



In [3]:
# A) DataFrame Operations

# a) Create Empty DataFrame
emptyDF = spark.createDataFrame([], StructType([]))
emptyDF.show()

++
||
++
++



In [4]:
# b) Rename Nested Column (simulate simple renaming)
df = spark.createDataFrame([(1, (2, 3))], ["a", "b"])
renamedDF = df.withColumnRenamed("b", "c")
renamedDF.show()

+---+------+
|  a|     c|
+---+------+
|  1|{2, 3}|
+---+------+



In [5]:
# c) Add / Update Column
updatedDF = df.withColumn("d", lit(4))
updatedDF.show()

+---+------+---+
|  a|     b|  d|
+---+------+---+
|  1|{2, 3}|  4|
+---+------+---+



In [6]:
# d) Drop Column
droppedDF = df.drop("b")
droppedDF.show()

+---+
|  a|
+---+
|  1|
+---+



In [7]:
# e) Add Literal Constant
literalDF = df.withColumn("e", lit("constant"))
literalDF.show()

+---+------+--------+
|  a|     b|       e|
+---+------+--------+
|  1|{2, 3}|constant|
+---+------+--------+



In [8]:
# f) Change Column Data Type
changedTypeDF = df.withColumn("a", col("a").cast("string"))
changedTypeDF.printSchema()

root
 |-- a: string (nullable = true)
 |-- b: struct (nullable = true)
 |    |-- _1: long (nullable = true)
 |    |-- _2: long (nullable = true)



In [10]:
# g) Pivot
df2 = spark.createDataFrame([(1, "x"), (1, "y"), (2, "x")], ["a", "b"])
pivotDF = df2.groupBy("a").pivot("b").count()
pivotDF.show()

+---+---+----+
|  a|  x|   y|
+---+---+----+
|  1|  1|   1|
|  2|  1|NULL|
+---+---+----+



In [11]:
# h) Create DataFrame using Schema
schema = StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True)
])
data = [("Alice", 29), ("Bob", 35)]
df_schema = spark.createDataFrame(data, schema)
df_schema.show()

+-----+---+
| name|age|
+-----+---+
|Alice| 29|
|  Bob| 35|
+-----+---+



In [12]:
#Aim: B) Use of next mentioned operations: Selecting the first row of each group, Sort DataFrame,
# Union DataFrame, Drop Rows with null values from DataFrame,
# Split single to multiple columns, Concatenate multiple columns, Replace null values in DataFrame, Remove duplicate rows on DataFrame,
# Remove distinct on multiple selected columns, Spark UDF

In [13]:
# B) More DataFrame Operations

# Sample DataFrame
data = [("A", 10), ("A", 20), ("B", 30), ("B", None), ("C", 40)]
df = spark.createDataFrame(data, ["group", "value"])
df.show()

+-----+-----+
|group|value|
+-----+-----+
|    A|   10|
|    A|   20|
|    B|   30|
|    B| NULL|
|    C|   40|
+-----+-----+



In [14]:
# a) Selecting First Row of Each Group
firstRowDF = df.groupBy("group").agg(first("value"))
firstRowDF.show()

+-----+------------+
|group|first(value)|
+-----+------------+
|    A|          10|
|    B|          30|
|    C|          40|
+-----+------------+



In [15]:
# b) Sort DataFrame
sortedDF = df.sort("value")
sortedDF.show()

+-----+-----+
|group|value|
+-----+-----+
|    B| NULL|
|    A|   10|
|    A|   20|
|    B|   30|
|    C|   40|
+-----+-----+



In [16]:
# c) Union DataFrame
df1 = spark.createDataFrame([(1, "x"), (2, "y")], ["id", "col"])
df2 = spark.createDataFrame([(3, "z")], ["id", "col"])
unionDF = df1.union(df2)
unionDF.show()

+---+---+
| id|col|
+---+---+
|  1|  x|
|  2|  y|
|  3|  z|
+---+---+



In [17]:
# d) Drop Rows with Null
nonNullDF = df.na.drop()
nonNullDF.show()

+-----+-----+
|group|value|
+-----+-----+
|    A|   10|
|    A|   20|
|    B|   30|
|    C|   40|
+-----+-----+



In [18]:
# e) Split Single Column
df3 = spark.createDataFrame([("a,b,c",)], ["column"])
splitDF = df3.withColumn("split_col", split(col("column"), ","))
splitDF.show()

+------+---------+
|column|split_col|
+------+---------+
| a,b,c|[a, b, c]|
+------+---------+



In [19]:
# f) Concatenate Columns
df4 = spark.createDataFrame([("foo", "bar")], ["col1", "col2"])
concatDF = df4.withColumn("concat_col", concat(col("col1"), col("col2")))
concatDF.show()

+----+----+----------+
|col1|col2|concat_col|
+----+----+----------+
| foo| bar|    foobar|
+----+----+----------+



In [20]:
# g) Replace Null Values
replacedDF = df.na.fill("replacement")
replacedDF.show()

+-----+-----+
|group|value|
+-----+-----+
|    A|   10|
|    A|   20|
|    B|   30|
|    B| NULL|
|    C|   40|
+-----+-----+



In [21]:
# h) Remove Duplicate Rows
distinctDF = df.distinct()
distinctDF.show()

+-----+-----+
|group|value|
+-----+-----+
|    A|   20|
|    A|   10|
|    C|   40|
|    B|   30|
|    B| NULL|
+-----+-----+



In [22]:
# i) Remove Duplicates on Selected Columns
df5 = spark.createDataFrame([(1, "x"), (1, "x"), (2, "y")], ["col1", "col2"])
distinctColsDF = df5.dropDuplicates(["col1", "col2"])
distinctColsDF.show()

+----+----+
|col1|col2|
+----+----+
|   1|   x|
|   2|   y|
+----+----+



In [23]:
# j) Spark UDF
upper_udf = udf(lambda x: x.upper() if x else None, StringType())
df6 = spark.createDataFrame([("hello",), ("world",)], ["column"])
udfDF = df6.withColumn("upper_col", upper_udf(col("column")))
udfDF.show()

+------+---------+
|column|upper_col|
+------+---------+
| hello|    HELLO|
| world|    WORLD|
+------+---------+

