# libraries and Spark session

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Get the active SparkSession, or create one named "SparkBasics" if none exists.
spark = (
    SparkSession.builder
    .appName("SparkBasics")
    .getOrCreate()
)
spark

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/05 10:08:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Plain-text repr of the session object (class name and memory address).
print(spark)

<pyspark.sql.session.SparkSession object at 0x10c3eb2e0>


# creating data

In [4]:
# Method 1: build a DataFrame from a plain Python list of row tuples.
# Each tuple follows the field order given in `columns`.
columns = ["name", "age", "role"]

data = [
    ("Alice", 25, "Engineer"),
    ("Bob", 30, "Manager"),
    ("Charlie", 35, "Analyst"),
]
data

[('Alice', 25, 'Engineer'), ('Bob', 30, 'Manager'), ('Charlie', 35, 'Analyst')]

In [11]:
# No column names supplied, so Spark assigns positional names (_1, _2, _3).
df = spark.createDataFrame(data)
df.show()

+-------+---+--------+
|     _1| _2|      _3|
+-------+---+--------+
|  Alice| 25|Engineer|
|    Bob| 30| Manager|
|Charlie| 35| Analyst|
+-------+---+--------+



In [12]:
# Types are inferred from the Python values; the fields carry the default names _1, _2, _3.
df.printSchema()

root
 |-- _1: string (nullable = true)
 |-- _2: long (nullable = true)
 |-- _3: string (nullable = true)



In [13]:
# Passing a list of names as the schema labels the columns; types are still inferred.
df = spark.createDataFrame(data, schema=columns)
df.show()

+-------+---+--------+
|   name|age|    role|
+-------+---+--------+
|  Alice| 25|Engineer|
|    Bob| 30| Manager|
|Charlie| 35| Analyst|
+-------+---+--------+



In [14]:
# Same data, now labelled with the names from `columns`; age is inferred as long.
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- role: string (nullable = true)



In [15]:
# Number of rows in df.
df.count()

3

In [16]:
# Column names as a plain Python list of strings.
df.columns

['name', 'age', 'role']

# explicit data types

In [17]:
from pyspark.sql.types import StringType, IntegerType, StructField,StructType

# Explicit schema: one StructField per column, given as (name, data type, nullable).
schema = StructType(
    [
        StructField("name", StringType(), nullable=True),
        StructField("age", IntegerType(), nullable=True),
        StructField("role", StringType(), nullable=True),
    ]
)
schema

StructType([StructField('name', StringType(), True), StructField('age', IntegerType(), True), StructField('role', StringType(), True)])

In [18]:
# Build the DataFrame with the explicit StructType instead of inferred types.
df_type = spark.createDataFrame(data, schema=schema)
df_type.show()

+-------+---+--------+
|   name|age|    role|
+-------+---+--------+
|  Alice| 25|Engineer|
|    Bob| 30| Manager|
|Charlie| 35| Analyst|
+-------+---+--------+



In [19]:
# With the explicit schema, age is integer rather than the inferred long.
df_type.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- role: string (nullable = true)



# selecting, filtering, sorting

In [20]:
# Project only the name and age columns.
df.select('name','age').show()

+-------+---+
|   name|age|
+-------+---+
|  Alice| 25|
|    Bob| 30|
|Charlie| 35|
+-------+---+



In [21]:
# Keep rows with age strictly greater than 30 (Bob, at exactly 30, is excluded).
df.filter(df.age > 30).show()

+-------+---+-------+
|   name|age|   role|
+-------+---+-------+
|Charlie| 35|Analyst|
+-------+---+-------+



In [22]:
# Equality filter on a string column.
df.filter(df.role =='Manager').show()

+----+---+-------+
|name|age|   role|
+----+---+-------+
| Bob| 30|Manager|
+----+---+-------+



In [23]:
# Sort by age; a bare column name sorts in ascending order.
df.orderBy('age').show()

+-------+---+--------+
|   name|age|    role|
+-------+---+--------+
|  Alice| 25|Engineer|
|    Bob| 30| Manager|
|Charlie| 35| Analyst|
+-------+---+--------+



In [24]:
# Descending sort, expressed through the column's desc() ordering.
df.orderBy(df.age.desc()).show()

+-------+---+--------+
|   name|age|    role|
+-------+---+--------+
|Charlie| 35| Analyst|
|    Bob| 30| Manager|
|  Alice| 25|Engineer|
+-------+---+--------+



In [25]:
# Explicit ascending sort; produces the same rows in the same order as orderBy('age').
df.orderBy(df.age.asc()).show()

+-------+---+--------+
|   name|age|    role|
+-------+---+--------+
|  Alice| 25|Engineer|
|    Bob| 30| Manager|
|Charlie| 35| Analyst|
+-------+---+--------+



In [27]:
# Sorting on a string column; here the roles come out alphabetically.
df.orderBy('role').show()

+-------+---+--------+
|   name|age|    role|
+-------+---+--------+
|Charlie| 35| Analyst|
|  Alice| 25|Engineer|
|    Bob| 30| Manager|
+-------+---+--------+



# column operations

In [28]:
from pyspark.sql.functions import col 


In [29]:
# Arithmetic on a column builds a new expression; alias() names the resulting column.
df.select('name', (col("age") + 5).alias('age_plus_five')).show()

+-------+-------------+
|   name|age_plus_five|
+-------+-------------+
|  Alice|           30|
|    Bob|           35|
|Charlie|           40|
+-------+-------------+



In [31]:
# Same projection as before, but adding a float literal to the age column.
df1 = df.select(
    "name",
    (col("age") + 5.2).alias("age_plus_fiveish"),
)
df1.show()

+-------+----------------+
|   name|age_plus_fiveish|
+-------+----------------+
|  Alice|            30.2|
|    Bob|            35.2|
|Charlie|            40.2|
+-------+----------------+



In [32]:
# The derived column is double: the long age plus a float literal is widened.
df1.printSchema()

root
 |-- name: string (nullable = true)
 |-- age_plus_fiveish: double (nullable = true)



# grouping and aggregations

In [33]:
# Employee records for the aggregation examples.
# Each tuple follows the field order given in `columns`.
columns = ["employee_name", "department", "state", "salary", "age", "bonus"]

data = [
    # Sales
    ("James",   "Sales",     "NY", 90000, 34, 10000),
    ("Michael", "Sales",     "NY", 86000, 56, 20000),
    ("Robert",  "Sales",     "CA", 81000, 30, 23000),
    # Finance
    ("Maria",   "Finance",   "CA", 90000, 24, 23000),
    ("Raman",   "Finance",   "CA", 99000, 40, 24000),
    ("Scott",   "Finance",   "NY", 83000, 36, 19000),
    ("Jen",     "Finance",   "NY", 79000, 53, 15000),
    # Marketing
    ("Jeff",    "Marketing", "CA", 80000, 25, 18000),
    ("Kumar",   "Marketing", "NY", 91000, 50, 21000),
]
data

[('James', 'Sales', 'NY', 90000, 34, 10000),
 ('Michael', 'Sales', 'NY', 86000, 56, 20000),
 ('Robert', 'Sales', 'CA', 81000, 30, 23000),
 ('Maria', 'Finance', 'CA', 90000, 24, 23000),
 ('Raman', 'Finance', 'CA', 99000, 40, 24000),
 ('Scott', 'Finance', 'NY', 83000, 36, 19000),
 ('Jen', 'Finance', 'NY', 79000, 53, 15000),
 ('Jeff', 'Marketing', 'CA', 80000, 25, 18000),
 ('Kumar', 'Marketing', 'NY', 91000, 50, 21000)]

In [34]:
# Rebind df to the employee dataset; numeric fields are inferred as long.
df = spark.createDataFrame(data, schema=columns)
df.printSchema()

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)
 |-- bonus: long (nullable = true)



In [36]:
# NOTE(review): this import rebinds the names sum, min and max to the Spark
# column functions for every cell that runs afterwards, hiding the Python
# builtins of the same name. avg is imported but not used in the visible cells.
from pyspark.sql.functions import sum, avg, min, max

# Per state: total salary and the highest age.
df.groupBy('state').agg(sum('salary'),max('age')).show()

+-----+-----------+--------+
|state|sum(salary)|max(age)|
+-----+-----------+--------+
|   NY|     429000|      56|
|   CA|     350000|      40|
+-----+-----------+--------+



In [37]:
# Bonus range per department; alias() gives the aggregates readable column names.
(
    df.groupBy("department")
    .agg(
        max("bonus").alias("highest_bonus"),
        min("bonus").alias("lowest_bonus"),
    )
    .show()
)

+----------+-------------+------------+
|department|highest_bonus|lowest_bonus|
+----------+-------------+------------+
|     Sales|        23000|       10000|
|   Finance|        24000|       15000|
| Marketing|        21000|       18000|
+----------+-------------+------------+



In [39]:
# Number of employees per state.
df.groupBy("state").count().show()

+-----+-----+
|state|count|
+-----+-----+
|   NY|    5|
|   CA|    4|
+-----+-----+



# window functions

In [40]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank


In [42]:
# Window over each department, ordered by ascending salary.
# Use the column's exact name as it appears in df's schema (`salary`): a
# differently-cased name only resolves while spark.sql.caseSensitive is left
# at its default of false.
window_spec = Window.partitionBy('department').orderBy('salary')
print(window_spec)

<pyspark.sql.classic.window.WindowSpec object at 0x10c4233d0>


In [43]:
# rank() restarts at 1 within each department; because the window is ordered by
# ascending salary, rank 1 is the lowest salary in the department.
df.withColumn('salary_rank', rank().over(window_spec)).show()

+-------------+----------+-----+------+---+-----+-----------+
|employee_name|department|state|salary|age|bonus|salary_rank|
+-------------+----------+-----+------+---+-----+-----------+
|          Jen|   Finance|   NY| 79000| 53|15000|          1|
|        Scott|   Finance|   NY| 83000| 36|19000|          2|
|        Maria|   Finance|   CA| 90000| 24|23000|          3|
|        Raman|   Finance|   CA| 99000| 40|24000|          4|
|         Jeff| Marketing|   CA| 80000| 25|18000|          1|
|        Kumar| Marketing|   NY| 91000| 50|21000|          2|
|       Robert|     Sales|   CA| 81000| 30|23000|          1|
|      Michael|     Sales|   NY| 86000| 56|20000|          2|
|        James|     Sales|   NY| 90000| 34|10000|          3|
+-------------+----------+-----+------+---+-----+-----------+



In [44]:
# Rebind window_spec to order each department by age, oldest first,
# so rank 1 is the oldest person in the department.
window_spec = (
    Window
    .partitionBy("department")
    .orderBy(df["age"].desc())
)
df.withColumn("aged_person", rank().over(window_spec)).show()

+-------------+----------+-----+------+---+-----+-----------+
|employee_name|department|state|salary|age|bonus|aged_person|
+-------------+----------+-----+------+---+-----+-----------+
|          Jen|   Finance|   NY| 79000| 53|15000|          1|
|        Raman|   Finance|   CA| 99000| 40|24000|          2|
|        Scott|   Finance|   NY| 83000| 36|19000|          3|
|        Maria|   Finance|   CA| 90000| 24|23000|          4|
|        Kumar| Marketing|   NY| 91000| 50|21000|          1|
|         Jeff| Marketing|   CA| 80000| 25|18000|          2|
|      Michael|     Sales|   NY| 86000| 56|20000|          1|
|        James|     Sales|   NY| 90000| 34|10000|          2|
|       Robert|     Sales|   CA| 81000| 30|23000|          3|
+-------------+----------+-----+------+---+-----+-----------+



# joins

In [46]:
# Department lookup table: (dept_name, dept_id).
dept_columns = ["dept_name", "dept_id"]
dept_data = [
    ("Finance", 10),
    ("Marketing", 20),
    ("Sales", 30),
]
dept_df = spark.createDataFrame(dept_data, schema=dept_columns)

# Employees point at a department through emp_dept_id; none of them uses dept 30.
emp_columns = ["emp_id", "name", "emp_dept_id"]
emp_data = [
    (1, "John", 10),
    (2, "Maria", 20),
    (3, "David", 10),
]
emp_df = spark.createDataFrame(emp_data, schema=emp_columns)

dept_df.show()

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|  Finance|     10|
|Marketing|     20|
|    Sales|     30|
+---------+-------+



In [47]:
# Employee side of the joins that follow.
emp_df.show()

+------+-----+-----------+
|emp_id| name|emp_dept_id|
+------+-----+-----------+
|     1| John|         10|
|     2|Maria|         20|
|     3|David|         10|
+------+-----+-----------+



In [50]:
# Inner join: only employee/department pairs that match on the department id.
# Sales (dept_id 30) has no employee and therefore does not appear.
joined = emp_df.join(
    dept_df,
    on=dept_df.dept_id == emp_df.emp_dept_id,
    how="inner",
)
joined.show()

+------+-----+-----------+---------+-------+
|emp_id| name|emp_dept_id|dept_name|dept_id|
+------+-----+-----------+---------+-------+
|     1| John|         10|  Finance|     10|
|     3|David|         10|  Finance|     10|
|     2|Maria|         20|Marketing|     20|
+------+-----+-----------+---------+-------+



In [51]:
# Left join: every employee row is kept. All three employees have a matching
# department here, so no NULLs show up.
joined = emp_df.join(
    dept_df,
    on=dept_df.dept_id == emp_df.emp_dept_id,
    how="left",
)
joined.show()

+------+-----+-----------+---------+-------+
|emp_id| name|emp_dept_id|dept_name|dept_id|
+------+-----+-----------+---------+-------+
|     1| John|         10|  Finance|     10|
|     2|Maria|         20|Marketing|     20|
|     3|David|         10|  Finance|     10|
+------+-----+-----------+---------+-------+



In [52]:
# Right join: every department row is kept. Sales has no employee, so its
# employee columns come back as NULL.
joined = emp_df.join(
    dept_df,
    on=dept_df.dept_id == emp_df.emp_dept_id,
    how="right",
)
joined.show()

+------+-----+-----------+---------+-------+
|emp_id| name|emp_dept_id|dept_name|dept_id|
+------+-----+-----------+---------+-------+
|     3|David|         10|  Finance|     10|
|     1| John|         10|  Finance|     10|
|     2|Maria|         20|Marketing|     20|
|  NULL| NULL|       NULL|    Sales|     30|
+------+-----+-----------+---------+-------+



In [53]:
# Full outer join: unmatched rows from both sides are kept. With this data only
# Sales is unmatched, so the result holds the same set of rows as the right join.
joined = emp_df.join(
    dept_df,
    on=dept_df.dept_id == emp_df.emp_dept_id,
    how="outer",
)
joined.show()

+------+-----+-----------+---------+-------+
|emp_id| name|emp_dept_id|dept_name|dept_id|
+------+-----+-----------+---------+-------+
|     1| John|         10|  Finance|     10|
|     3|David|         10|  Finance|     10|
|     2|Maria|         20|Marketing|     20|
|  NULL| NULL|       NULL|    Sales|     30|
+------+-----+-----------+---------+-------+



# caching

In [54]:
# Recap of the employee DataFrame's schema before caching it.
df.printSchema()

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)
 |-- bonus: long (nullable = true)



In [55]:
# Mark df for caching. The call returns a DataFrame (its repr is the cell output);
# per Spark's persistence docs the data is only stored once an action runs.
df.cache()

DataFrame[employee_name: string, department: string, state: string, salary: bigint, age: bigint, bonus: bigint]

In [56]:
# Action on the cached DataFrame: counts its rows. Run right after cache(), this
# is the job that populates the cache.
df.count()

9

In [57]:
from pyspark import StorageLevel

In [59]:
# Persist with an explicit storage level. `joined` is whatever the most recently
# executed join cell assigned.
joined.persist(StorageLevel.MEMORY_AND_DISK)
# count() is the action that forces the persisted data to be computed.
joined.count()

4

In [60]:
# Drop the persisted copy of `joined`; the call returns a DataFrame (repr shown as output).
joined.unpersist()

DataFrame[emp_id: bigint, name: string, emp_dept_id: bigint, dept_name: string, dept_id: bigint]

In [61]:
# Transformation (lazy): filter() only describes the result; no job runs yet.
older = df.filter(df.age>30)

# Action: show() triggers the job that actually evaluates the filter.
older.show()

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        James|     Sales|   NY| 90000| 34|10000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|        Raman|   Finance|   CA| 99000| 40|24000|
|        Scott|   Finance|   NY| 83000| 36|19000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
+-------------+----------+-----+------+---+-----+



# distributed computing

In [64]:
# Number of partitions backing df (8 in this recorded run).
df.rdd.getNumPartitions()

8

In [65]:
# Increase the partition count to 10; repartition() redistributes rows with a full shuffle.

df2 = df.repartition(10)
df2.rdd.getNumPartitions()

10

In [66]:
# Reduce to 2 partitions; coalesce() merges existing partitions rather than
# performing a full shuffle, which is why it is preferred for shrinking.
df3 = df.coalesce(2)
df3.rdd.getNumPartitions()

2