In [1]:
!pip install pyspark

from pyspark.sql import SparkSession

# Initialize Spark session
spark = SparkSession.builder.appName("Practical1").getOrCreate()
sc = spark.sparkContext



In [43]:
# Pratical 1A Basic Scala operations and transformations on RDD (Resilient Distributed Dataset)
#Aim: A) Demonstrate use of Parallelize, Read Text File, Read CSV, Create RDD, Actions, Pair Functions, Repartition and Coalesce, Shuffle Partitions, Broadcast Variables, Accumulator Variables, Convert RDD to DataFrame


In [32]:
#a) Parallelize
rdd = sc.parallelize([1, 2, 3, 4, 5])
print("Parallelize:", rdd.collect())

Parallelize: [1, 2, 3, 4, 5]


In [33]:
# b) Read Text File
textFileRDD = sc.textFile("Demo_data_engineering.txt")
print("TextFile:", textFileRDD.collect())

TextFile: ['2 Practical on the DataFrame operations ', 'a) Demonstrate the use of next mentioned operations: Create an empty DataFrame, ', 'Create an empty DataSet, use of Rename nested column, Adding or Updating a column ', 'on DataFrame, Drop a column on DataFrame, Adding literal constant to DataFrame, ', 'Changing column data type, Pivot and Unpivot a DataFrame, Create a DataFrame ', 'using StructType & StructField schema ', 'b) Use of next mentioned operations: Selecting the first row of each group, Sort ', 'DataFrame, Union DataFrame, Drop Rows with null values from DataFrame, Split ', 'single to multiple columns, Concatenate multiple columns, Replace null values in ', 'DataFrame, Remove duplicate rows on DataFrame, Remove distinct on multiple ', 'selected columns, Spark UDF ', '3 Practical on the Spark Array and Map operations ', 'a) Use of next mentioned operations: Create an Array (ArrayType) column on ', 'DataFrame, Create a Map (MapType) column on DataFrame, Convert an Array 

In [34]:
# c) Read CSV
from google.colab import files
uploaded = files.upload()

csvRDD = sc.textFile("data_mining_1.csv")
print("CSV:", csvRDD.collect())

Saving data_mining_1.csv to data_mining_1 (1).csv
CSV: ['empid,name,salary,gender,contact', '101,Nida,10000,female,525456252', '102,Swarnima,20000,female,25255141', '103,Sonal,30000,female,125421255', '104,Aun,55000,male,22633622', '105,Raunak,25000,male,145255224']


In [35]:
# d) Create RDD
data = [("Alice", 29), ("Bob", 35), ("Cathy", 25)]
rdd = sc.parallelize(data)
print("Created RDD:", rdd.collect())

Created RDD: [('Alice', 29), ('Bob', 35), ('Cathy', 25)]


In [36]:
# e) Actions
print("Count:", rdd.count())
print("First:", rdd.first())

Count: 3
First: ('Alice', 29)


In [37]:
# f) Pair Functions
pairRDD = rdd.map(lambda x: (x[0], x[1]))
print("Pair RDD:", pairRDD.collect())

Pair RDD: [('Alice', 29), ('Bob', 35), ('Cathy', 25)]


In [38]:
# g) Repartition and Coalesce
repartitionedRDD = rdd.repartition(4)
print("Repartition:", repartitionedRDD.getNumPartitions())

coalescedRDD = repartitionedRDD.coalesce(2)
print("Coalesce:", coalescedRDD.getNumPartitions())

Repartition: 4
Coalesce: 2


In [39]:
# h) Shuffle Partitions
shuffledRDD = rdd.sortBy(lambda x: x[1])
print("Shuffled:", shuffledRDD.collect())

Shuffled: [('Cathy', 25), ('Alice', 29), ('Bob', 35)]


In [40]:
# i) Broadcast Variables
broadcastVar = sc.broadcast([1, 2, 3])
print("Broadcast:", broadcastVar.value)

Broadcast: [1, 2, 3]


In [41]:
# j) Accumulator Variables
accum = sc.accumulator(0)
rdd.foreach(lambda x: accum.add(x[1]))
print("Accumulator (Sum of ages):", accum.value)

Accumulator (Sum of ages): 89


In [42]:
# k) Convert RDD to DataFrame
df = rdd.toDF(["Name", "Age"])
df.show()

+-----+---+
| Name|Age|
+-----+---+
|Alice| 29|
|  Bob| 35|
|Cathy| 25|
+-----+---+



In [44]:
#Partical 1B ) Demonstrate the following:
# Read Multiple Text Files into RDD,
# Read CSV File into RDD, Create an Empty RDD,
# RDD Pair Functions, Generate DataFrame from RDD


In [45]:
# a) Read Multiple Text Files
multiFilesRDD = sc.textFile("*.txt")  # wildcard
print("Multiple Files:", multiFilesRDD.collect())

Multiple Files: ['2 Practical on the DataFrame operations ', 'a) Demonstrate the use of next mentioned operations: Create an empty DataFrame, ', 'Create an empty DataSet, use of Rename nested column, Adding or Updating a column ', 'on DataFrame, Drop a column on DataFrame, Adding literal constant to DataFrame, ', 'Changing column data type, Pivot and Unpivot a DataFrame, Create a DataFrame ', 'using StructType & StructField schema ', 'b) Use of next mentioned operations: Selecting the first row of each group, Sort ', 'DataFrame, Union DataFrame, Drop Rows with null values from DataFrame, Split ', 'single to multiple columns, Concatenate multiple columns, Replace null values in ', 'DataFrame, Remove duplicate rows on DataFrame, Remove distinct on multiple ', 'selected columns, Spark UDF ', '3 Practical on the Spark Array and Map operations ', 'a) Use of next mentioned operations: Create an Array (ArrayType) column on ', 'DataFrame, Create a Map (MapType) column on DataFrame, Convert an 

In [47]:
# b) Read CSV File into RDD
csvRDD = sc.textFile("data_mining_1.csv")
print("CSV RDD:", csvRDD.collect())

CSV RDD: ['empid,name,salary,gender,contact', '101,Nida,10000,female,525456252', '102,Swarnima,20000,female,25255141', '103,Sonal,30000,female,125421255', '104,Aun,55000,male,22633622', '105,Raunak,25000,male,145255224']


In [48]:
# c) Empty RDD
emptyRDD = sc.emptyRDD()
print("Empty RDD:", emptyRDD.collect())

Empty RDD: []


In [49]:
# d) RDD Pair Functions
pairRDD2 = rdd.map(lambda x: (x[0], x[1]))
print("Pair RDD2:", pairRDD2.collect())

Pair RDD2: [('Alice', 29), ('Bob', 35), ('Cathy', 25)]


In [50]:
# e) Generate DataFrame from RDD
df2 = pairRDD2.toDF(["Name", "Age"])
df2.show()

+-----+---+
| Name|Age|
+-----+---+
|Alice| 29|
|  Bob| 35|
|Cathy| 25|
+-----+---+

