# Lab 2 - Simple PySpark Programs

In [1]:
import pyspark
import os
import sys
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum, avg
from pyspark.sql.functions import explode, split, col
from pyspark.sql.functions import col, lit, when
# Point both the Spark workers and the driver at the interpreter running this
# kernel, so they cannot pick up a different Python installation.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

## Question 1 - Transformations (filter and withColumn)

In [2]:
# Obtain the Spark session used for the transformation exercise.
spark = (
    SparkSession.builder
    .appName("Transformations")
    .getOrCreate()
)

# Small in-memory sample of (name, age) pairs.
columns = ["Name", "Age"]
data = [
    ("Alice", 25),
    ("Bob", 30),
    ("Charlie", 29),
    ("Keshav", 35),
]
df = spark.createDataFrame(data, schema=columns)

# Keep only people strictly older than 25 ...
filtered_df = df.where(df["Age"] > 25)
# ... then derive a new column holding their age plus ten.
transformed_df = filtered_df.withColumn(
    "AgePlusTen",
    filtered_df["Age"] + lit(10),
)

transformed_df.show()



+-------+---+----------+
|   Name|Age|AgePlusTen|
+-------+---+----------+
|    Bob| 30|        40|
|Charlie| 29|        39|
| Keshav| 35|        45|
+-------+---+----------+



## Question 2 - Actions (count and show)

In [3]:
# Obtain the Spark session used for the actions exercise.
spark = (
    SparkSession.builder
    .appName("Actions")
    .getOrCreate()
)

# Same (name, age) sample data as a two-column DataFrame.
columns = ["Name", "Age"]
data = [
    ("Alice", 25),
    ("Bob", 30),
    ("Charlie", 29),
    ("Keshav", 35),
]
df = spark.createDataFrame(data, schema=columns)

# count() is an action: it triggers execution and returns the row count.
count = df.count()
print(f"Number of rows: {count}")

# show() is another action: it prints the rows as a text table.
df.show()


Number of rows: 4
+-------+---+
|   Name|Age|
+-------+---+
|  Alice| 25|
|    Bob| 30|
|Charlie| 29|
| Keshav| 35|
+-------+---+



## Question 3 - Aggregations (sum and average)

In [4]:
# Obtain the Spark session used for the aggregation exercise.
spark = (
    SparkSession.builder
    .appName("Aggregations")
    .getOrCreate()
)

# Same (name, age) sample data as a two-column DataFrame.
columns = ["Name", "Age"]
data = [
    ("Alice", 25),
    ("Bob", 30),
    ("Charlie", 29),
    ("Keshav", 35),
]
df = spark.createDataFrame(data, schema=columns)

# NOTE: `sum` and `avg` are the pyspark.sql.functions column aggregates
# imported in the first cell, not Python's builtin sum().
total_age = sum("Age").alias("TotalAge")
average_age = avg("Age").alias("AverageAge")

# Aggregate over the whole DataFrame (no grouping) and print the single row.
df.agg(total_age, average_age).show()


+--------+----------+
|TotalAge|AverageAge|
+--------+----------+
|     119|     29.75|
+--------+----------+



## Question 4 - Writing a DataFrame to CSV

In [5]:
# Obtain the Spark session used for the CSV-writing exercise.
spark = SparkSession.builder.appName("WriteCSV").getOrCreate()

# Same (name, age) sample data as a two-column DataFrame.
data = [("Alice", 25), ("Bob", 30), ("Charlie", 29), ("Keshav", 35)]
columns = ["Name", "Age"]
df = spark.createDataFrame(data, columns)

# Write DataFrame to CSV.
# Spark writes a *directory* named "output.csv" containing part files, not a
# single file. The default save mode raises an error if the path already
# exists, which breaks this cell on every re-run (Restart & Run All), so
# overwrite any previous output explicitly.
df.write.csv("output.csv", header=True, mode="overwrite")


## Question 5 - Word count (split, explode, groupBy)

In [6]:
# Obtain the Spark session used for the word-count exercise.
spark = (
    SparkSession.builder
    .appName("WordCount")
    .getOrCreate()
)

# One sentence per row, in a single "Text" column.
columns = ["Text"]
data = [
    ("Hello world",),
    ("Hello from PySpark",),
]
df = spark.createDataFrame(data, schema=columns)

# Split each sentence on single spaces and emit one row per resulting token
# in a new "Word" column (the original "Text" column is kept alongside it).
words_df = df.withColumn(
    "Word",
    explode(split(df["Text"], " ")),
)

# Count how many times each distinct word occurs.
word_counts = words_df.groupBy("Word").count()

word_counts.show()

+-------+-----+
|   Word|count|
+-------+-----+
|  Hello|    2|
|  world|    1|
|   from|    1|
|PySpark|    1|
+-------+-----+

