# Install PySpark

In [None]:
# Environment setup cell: installs Java, a Spark distribution and the Python
# packages, then creates the SparkSession used by the rest of the notebook.
# Lines starting with "!" are shell commands run through the notebook.
!sudo apt update
# Install a headless Java 8 JDK quietly; command output is discarded.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Check this site for the latest download link: https://www.apache.org/dyn/closer.lua/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
!wget -q https://dlcdn.apache.org/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
!tar xf spark-3.2.1-bin-hadoop3.2.tgz
!pip install -q findspark
!pip install pyspark
!pip install py4j

import os
import sys
# Explicit JAVA_HOME / SPARK_HOME overrides are left disabled; findspark.init()
# below is called without them.
# os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# os.environ["SPARK_HOME"] = "/content/spark-3.2.1-bin-hadoop3.2"


# findspark is initialised before pyspark is imported.
import findspark
findspark.init()
# The value returned by find() is not stored.
findspark.find()

import pyspark

from pyspark.sql import DataFrame, SparkSession
from typing import List
import pyspark.sql.types as T
import pyspark.sql.functions as F

# Create the session, or reuse one that already exists (getOrCreate).
spark= SparkSession \
       .builder \
       .appName("Our First Spark Example") \
       .getOrCreate()

# Bare expression as the last line of the cell: the notebook displays it as the cell output.
spark

[33m0% [Working][0m            Hit:1 https://cli.github.com/packages stable InRelease
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:4 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:6 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:8 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
44 packages can be upgraded. Run 'apt list --upgradable' to see them.
[1;33mW: [0mSkipping acquire

# Processing Column Data


Here is a breakdown of the PySpark functions from the import statement, explained as points:

### String Manipulation Functions
1. **`col`**: Refers to columns within a DataFrame. Used to apply operations on columns.
2. **`lower`**: Converts string columns to lowercase.
3. **`upper`**: Converts string columns to uppercase.
4. **`length`**: Returns the length of a string column.
5. **`substring`**: Extracts a substring from a string column, based on start and length.
6. **`split`**: Splits a string into an array of substrings based on a delimiter.
7. **`trim`**: Removes leading and trailing whitespace from a string.
8. **`ltrim`**: Removes leading whitespace from a string.
9. **`rtrim`**: Removes trailing whitespace from a string.
10. **`lpad`**: Pads the left side of a string column to a specified length with a padding character.
11. **`rpad`**: Pads the right side of a string column to a specified length with a padding character.
12. **`concat`**: Concatenates two or more columns into a single column.
13. **`concat_ws`**: Concatenates two or more columns with a separator (e.g., `-` or `,`).

### Date Manipulation Functions
14. **`current_date`**: Returns the current date.
15. **`current_timestamp`**: Returns the current timestamp.
16. **`date_add`**: Adds a specified number of days to a date column.
17. **`date_sub`**: Subtracts a specified number of days from a date column.
18. **`datediff`**: Returns the difference in days between two dates.
19. **`months_between`**: Returns the number of months between two dates.
20. **`add_months`**: Adds a specified number of months to a date column.
21. **`next_day`**: Returns the next specified day of the week after a given date.
22. **`last_day`**: Returns the last day of the month for a given date.
23. **`trunc`**: Truncates a date to the specified unit (e.g., year, month).
24. **`date_trunc`**: Truncates a timestamp to the specified unit (e.g., hour, day).
25. **`date_format`**: Formats a date according to a specified pattern (e.g., `yyyy/MM/dd`).
26. **`dayofyear`**: Extracts the day of the year from a date.
27. **`dayofmonth`**: Extracts the day of the month from a date.
28. **`dayofweek`**: Extracts the day of the week from a date.
29. **`year`**: Extracts the year from a date.
30. **`month`**: Extracts the month from a date.

### Aggregate Functions
31. **`count`**: Returns the number of non-null values of a column in a group (use `count("*")` to count all rows).
32. **`countDistinct`**: Returns the number of distinct values of a column (or combination of columns) in a group.
33. **`sum`**: Returns the sum of values in a column.
34. **`avg`**: Returns the average of values in a column.
35. **`min`**: Returns the minimum value of a column.
36. **`max`**: Returns the maximum value of a column.

### Conditional and Type Functions
37. **`when`**: Used for conditional expressions (similar to `CASE WHEN` in SQL).
38. **`lit`**: Creates a literal value in a DataFrame column.

### Special Types - ARRAY, MAP, STRUCT, CAST
39. **`array`**: Creates an array column from multiple columns.
40. **`map_from_arrays`**: Creates a map (key-value pairs) from two arrays.
41. **`struct`**: Combines multiple columns into a single struct column (nested columns).
42. **`cast`**: Casts the data type of a column to another type (e.g., `int` to `string`). In PySpark it is typically called as a column method, e.g. `col("Age").cast("string")`.

These points will help explain each function step by step to your students.

In [None]:
# Bring the column functions used throughout this notebook into the global namespace.
# NOTE: `sum`, `min` and `max` imported here rebind (shadow) Python's built-ins of
# the same name for the rest of the session.
# NOTE(review): casting is normally done through the Column.cast method -- confirm
# that `cast` is actually exported by pyspark.sql.functions in the installed version.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, upper, length, substring, split, trim, ltrim, rtrim, \
    lpad, rpad, concat, concat_ws, current_date, current_timestamp, date_add, date_sub, datediff, \
    months_between, add_months, next_day, last_day, trunc, date_trunc, date_format, dayofyear, \
    dayofmonth, dayofweek, year, month, count, countDistinct, sum, avg, min, max, when, lit, \
    array, map_from_arrays, struct, cast

In [None]:
# Load the sales CSV: the first row is the header and column types are inferred.
df = spark.read.csv("/content/sales_data_1000.csv", header=True, inferSchema=True)
df.show()

# Reader options worth knowing:
#   inferSchema=True  -> automatically infer column data types
#   encoding="UTF-8"  -> character encoding of the file
#   nullValue="NA"    -> treat the string NA as null
#   sep=","           -> field delimiter (comma is the default)

# Use Spark's length() on a column; Python's built-in len() does not work on a Column.
# df2 = df.withColumn("category_length", length(col("category")))
# df2.show()

# Keep the DataFrame in the variable and call .show() separately (.show() returns None).
# df3 = df.withColumn("date_add", date_add(col("sale_date"), 1))
# df3.show()

# The date column in this file is named `sale_date` (not `sales_date`).
df3 = df.withColumn("date_sub", date_sub(col("sale_date"), 10))
df3.show()

+--------------+-----------+----------+-----------+--------+--------+---------+----------+
|transaction_id|customer_id|   product|   category|quantity|   price|     city| sale_date|
+--------------+-----------+----------+-----------+--------+--------+---------+----------+
|             1|        178|    Tablet|Electronics|       6|29546.59|Hyderabad|2024-11-06|
|             2|        126|    Laptop|Electronics|       9|51856.81|    Delhi|2024-08-30|
|             3|        177|   Charger|Accessories|       2|60725.66|    Delhi|2024-05-26|
|             4|        271|    Camera|Electronics|       7|80424.94|   Mumbai|2024-09-19|
|             5|        281|     Watch|  Wearables|       3|15546.03|   Mumbai|2024-01-06|
|             6|         68|    Laptop|Electronics|      10| 25425.0|    Delhi|2024-11-29|
|             7|         64|    Laptop|Electronics|       2|24791.55|     Pune|2024-05-05|
|             8|        179|    Laptop|Electronics|       5| 89351.3|  Chennai|2024-02-09|

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `sales_date` cannot be resolved. Did you mean one of the following? [`sale_date`, `category`, `product`, `city`, `price`].;
'Project [transaction_id#488, customer_id#489, product#490, category#491, quantity#492, price#493, city#494, sale_date#495, date_sub('sales_date, 10) AS date_sub#546]
+- Relation [transaction_id#488,customer_id#489,product#490,category#491,quantity#492,price#493,city#494,sale_date#495] csv


In [None]:
# Print the column names and types, then preview the first five rows.
df.printSchema()
df.show(n=5)

root
 |-- transaction_id: integer (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- product: string (nullable = true)
 |-- category: string (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- price: double (nullable = true)
 |-- city: string (nullable = true)
 |-- sale_date: date (nullable = true)

+--------------+-----------+-------+-----------+--------+--------+---------+----------+
|transaction_id|customer_id|product|   category|quantity|   price|     city| sale_date|
+--------------+-----------+-------+-----------+--------+--------+---------+----------+
|             1|        178| Tablet|Electronics|       6|29546.59|Hyderabad|2024-11-06|
|             2|        126| Laptop|Electronics|       9|51856.81|    Delhi|2024-08-30|
|             3|        177|Charger|Accessories|       2|60725.66|    Delhi|2024-05-26|
|             4|        271| Camera|Electronics|       7|80424.94|   Mumbai|2024-09-19|
|             5|        281|  Watch|  Wearables|       

In [None]:
# withColumn() is used to add a new column to an existing DataFrame, or to
# update/replace an existing column with new values.

In [None]:
# Load the person data set (header row present, column types inferred).
df = spark.read.csv("/content/person_data.csv", header=True, inferSchema=True)

# Project a single column and preview five rows.
df1 = df.select(col("First_Name"))
df1.show(5)

# Derive columns one step at a time; each withColumn() call yields a new DataFrame.
df2 = df.withColumn("Upper_case_name", upper(col("First_Name")))
df3 = df2.withColumn("country", lit("India"))
df4 = df3.withColumn("age+5", col("Age") + 5)

# Break DOB into year / month / day-of-month parts.
df5 = (
    df4.withColumn("Birth_year", year("DOB"))
    .withColumn("Birth_month", month("DOB"))
    .withColumn("Birth_day", dayofmonth("DOB"))
)

# "Y" when Age >= 25, otherwise "N". Built on df4, so the Birth_* columns are not included.
df_flag = df4.withColumn(
    "Is_Adult",
    when(col("Age") >= 25, lit("Y")).otherwise(lit("N")),
)
df_flag.show()

# Exercise: extract the year into a new column called JoinYear.



In [None]:
# Plain-Python comparison: the built-in len() gives the character count of a str.
a = "i am learning pyspark"
len(a)

21

In [None]:
# Case conversion and string length on the First_Name column.
df1 = df.select(col("First_Name"))
df2 = df1.withColumn("First_Name_Lower", lower(col("First_Name")))

# withColumn() returns a new DataFrame, so the calls are chained to make sure
# the upper-case column is kept alongside the length column in the result.
df4 = (
    df2.withColumn("First_Name_upper", upper(col("First_Name")))
    .withColumn("first_name_length", length(col("First_Name")))
)
df4.show()

In [None]:
# Extract part of the first name: substring(column, start=2, length=4).
df.withColumn(
    "sub_name",
    substring(col("First_name"), 2, 4),
).show()


In [None]:
# Split DOB on "-" into an array column of its parts.
df.withColumn(
    "split_name",
    split(col("DOB"), "-"),
).show()

In [None]:
# trim / ltrim / rtrim: only the ltrim variant is active; the others are kept for reference.
# df.withColumn("trim_name", trim(col("First_Name"))).show()
df.withColumn(
    "ltrim_name",
    ltrim(col("First_Name")),
).show()
# df.withColumn("rtrim_name", rtrim(col("First_Name"))).show()

In [None]:
# lpad / rpad: pad First_Name to 15 characters on the left with "0" and on the right with "|".
df.withColumn("lpad_name", lpad(col("First_Name"), 15, "0")).show()
# The right-padded result gets its own name so the label matches the function used.
df.withColumn("rpad_name", rpad(col("First_Name"), 15, "|")).show()


In [None]:
# concat joins the columns and the literal "-" in order; the concat_ws variant is kept for reference.
# df.withColumn("concat_name", concat_ws("|", col("First_Name"), col("Last_Name"))).show()
df.withColumn(
    "concat1_name",
    concat(col("First_Name"), lit("-"), col("Last_Name")),
).show()

In [None]:
# Date functions: add the current date and the current timestamp as new columns.
df.withColumn(
    "current_date",
    current_date(),
).show()
df.withColumn(
    "current_timestamp",
    current_timestamp(),
).show()


In [None]:
# date_add: shift DOB forward by one day; the date_sub variant is kept for reference.
df.withColumn(
    "date_add",
    date_add(col("DOB"), 1),
).show()
# df.withColumn("date_sub", date_sub(col("DOB"), 10)).show()

In [None]:
# months_between / add_months

df.withColumn("months_between", months_between(current_date(), col("DOB"))).show()

# .show() returns None, so assign the DataFrame first and display it afterwards;
# otherwise `df` would be rebound to None and every later cell would fail.
df = df.withColumn("add_months", add_months(col("DOB"), 3))
df.show()


In [None]:
# next_day / last_day applied to DOB.
df.withColumn(
    "next_day",
    next_day(col("DOB"), "Sunday"),
).show()
df.withColumn(
    "last_day",
    last_day(col("DOB")),
).show()


In [None]:
# dayofyear / dayofmonth extracted from DOB.
df.withColumn(
    "dayofYear",
    dayofyear(col("DOB")),
).show()
df.withColumn(
    "dayofmonth",
    dayofmonth(col("DOB")),
).show()



In [None]:
# dayofweek / year / month extracted from DOB, each shown as its own preview.
df.withColumn(
    "dayofweek",
    dayofweek(col("DOB")),
).show()
df.withColumn(
    "year",
    year(col("DOB")),
).show()
df.withColumn(
    "month",
    month(col("DOB")),
).show()

In [None]:
# Occurrences of each first name.
df.groupBy("First_Name").agg(
    count("First_Name").alias("count"),
).show()

# Number of distinct first names per last name.
df.groupBy("Last_Name").agg(
    countDistinct("First_Name").alias("dis_count"),
).show()

In [None]:
# SUM / MAX / AVG / MIN over the whole DataFrame.
# alias() is applied to the aggregate column (inside agg) so the output column
# is renamed; calling it on the DataFrame only aliases the DataFrame itself.
df.agg(sum("age").alias("total_age")).show()
df.agg(max("age").alias("max_age")).show()
df.agg(avg("age").alias("avg_age")).show()
df.agg(min("age").alias("min_age")).show()

#

In [None]:
# Label each row "young" when Age < 30, otherwise "old".
df4 = df.withColumn(
    "age_group",
    when(col("Age") < 30, "young").otherwise("old"),
)
df4.show()

In [None]:
# when / otherwise: label each row "young" when Age < 30, otherwise "old".

# Assign the DataFrame first and display it afterwards; .show() returns None,
# so chaining it onto the assignment would leave df4 bound to None.
df4 = df.withColumn("age_group", when(col("Age") < 30, "young").otherwise("old"))
df4.show()


In [None]:
# ARRAY: both name columns collected into one array column.
df.withColumn(
    "array_col",
    array(col("First_Name"), col("Last_Name")),
).show()

# MAP: literal keys paired with the name columns as values.
df.withColumn(
    "map_col",
    map_from_arrays(
        array(lit("first_name"), lit("last_name")),
        array(col("First_Name"), col("Last_Name")),
    ),
).show()

# STRUCT: both name columns nested inside a single struct column.
df.withColumn(
    "struct_col",
    struct(col("First_Name"), col("Last_Name")),
).show()

In [None]:
# Add a lower-cased copy of First_Name and preview the result.
df2 = df.withColumn(
    "First_name_lower",
    lower(col("First_Name")),
)
df2.show()

In [None]:
# Add the lower-cased name, then pick the output column order explicitly.
df2 = df.withColumn(
    "First_name_lower",
    lower(col("First_Name")),
)

cols = df2.columns

# Order: the first column, the new lower-cased column, then the columns from
# index 2 onward (skipping the new column so it is not selected twice).
# NOTE(review): cols[1] is not part of the selection -- confirm that leaving it out is intended.
df3 = df2.select(
    cols[0],
    "First_name_lower",
    *(name for name in cols[2:] if name != "First_name_lower"),
)

df3.show()