In [1]:
pip install findspark

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [4]:
import findspark
findspark.init()


In [5]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
spark


In [6]:
from pyspark.sql import SparkSession

# Create SparkSession
spark = SparkSession.builder.appName("JupyterPySpark").getOrCreate()

# Check Spark version
print(spark.version)


3.5.5


In [7]:
data = [("Alice", 25), ("Bob", 30), ("Charlie", 35)]
columns = ["Name", "Age"]

df = spark.createDataFrame(data, columns)
df.show()


+-------+---+
|   Name|Age|
+-------+---+
|  Alice| 25|
|    Bob| 30|
|Charlie| 35|
+-------+---+



In [9]:
df_csv = spark.read.csv(
   "file:///home/hp/1SI24AD401/Lab program-1/data.csv",  # path to your CSV file
    header=True,
    inferSchema=True
)
df_csv.show()


+---+-------+----------+------+
| id|   name|department|salary|
+---+-------+----------+------+
|  1|  Alice|        HR| 45000|
|  2|    Bob|        IT| 60000|
|  3|Charlie|        IT| 55000|
|  4|  Diana|        HR| 48000|
|  5| Edward|   Finance| 52000|
|  6|  Fiona|        IT| 61000|
+---+-------+----------+------+



In [10]:
df_csv.describe().show()


26/01/29 15:22:48 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+-------+------------------+-----+----------+------------------+
|summary|                id| name|department|            salary|
+-------+------------------+-----+----------+------------------+
|  count|                 6|    6|         6|                 6|
|   mean|               3.5| NULL|      NULL|           53500.0|
| stddev|1.8708286933869707| NULL|      NULL|6410.9281699298435|
|    min|                 1|Alice|   Finance|             45000|
|    max|                 6|Fiona|        IT|             61000|
+-------+------------------+-----+----------+------------------+



In [11]:
print("Total rows:", df_csv.count())


Total rows: 6


In [12]:
unique_departments = df_csv.select("department").distinct().count()
print("Unique departments:", unique_departments)


Unique departments: 3


In [13]:
df_csv_updated = df_csv.withColumn(
    "salary",
    when(col("name") == "Bob", 62000).otherwise(col("salary"))
)
df_csv_updated.show()


+---+-------+----------+------+
| id|   name|department|salary|
+---+-------+----------+------+
|  1|  Alice|        HR| 45000|
|  2|    Bob|        IT| 62000|
|  3|Charlie|        IT| 55000|
|  4|  Diana|        HR| 48000|
|  5| Edward|   Finance| 52000|
|  6|  Fiona|        IT| 61000|
+---+-------+----------+------+



In [14]:
#Experiment 2: JSON Operations

In [16]:
df_json = spark.read.json("file:///home/hp/1SI24AD401/Lab program-1/data.json")  # path to your JSON file
df_json.show()


+----------+---+-------+------+
|department| id|   name|salary|
+----------+---+-------+------+
|        HR|  1|  Alice| 45000|
|        IT|  2|    Bob| 60000|
|        IT|  3|Charlie| 55000|
|        HR|  4|  Diana| 48000|
|   Finance|  5| Edward| 52000|
|        IT|  6|  Fiona| 61000|
+----------+---+-------+------+



In [17]:
print("Total records:", df_json.count())


Total records: 6


In [18]:
new_data = [(7, "George", "Finance", 53000)]
new_df = spark.createDataFrame(new_data, df_json.columns)

df_json_new = df_json.union(new_df)
df_json_new.show()


+----------+------+-------+------+
|department|    id|   name|salary|
+----------+------+-------+------+
|        HR|     1|  Alice| 45000|
|        IT|     2|    Bob| 60000|
|        IT|     3|Charlie| 55000|
|        HR|     4|  Diana| 48000|
|   Finance|     5| Edward| 52000|
|        IT|     6|  Fiona| 61000|
|         7|George|Finance| 53000|
+----------+------+-------+------+



In [19]:
df_json_updated = df_json_new.withColumn(
    "salary",
    when(col("name") == "Alice", 47000).otherwise(col("salary"))
)
df_json_updated.show()


+----------+------+-------+------+
|department|    id|   name|salary|
+----------+------+-------+------+
|        HR|     1|  Alice| 47000|
|        IT|     2|    Bob| 60000|
|        IT|     3|Charlie| 55000|
|        HR|     4|  Diana| 48000|
|   Finance|     5| Edward| 52000|
|        IT|     6|  Fiona| 61000|
|         7|George|Finance| 53000|
+----------+------+-------+------+



In [27]:
df.write.mode("overwrite").json(
    "file:///home/hp/1SI24AD401/Lab program-1/output_json"
)