In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import *
import pandas as pd

In [3]:
# Start (or reuse) the notebook's Spark session; the bare name on the last
# line renders the session summary as the cell output.
session_builder = SparkSession.builder.appName('Kishor')
spark = session_builder.getOrCreate()
spark

In [None]:
# What if every line has 3 keys and one line has 4 keys?
# The extra key becomes a column, and it is NULL for the rows that lack it.

In [10]:
# Read line-delimited JSON in which one record carries an extra key.
# The parse mode is spelled PERMISSIVE; a misspelled value is not recognised
# by Spark and only works through a fallback with a warning.
rj = spark.read.format('json')\
      .option('inferSchema', True)\
      .option('mode', "PERMISSIVE")\
      .load('single_file_json_with_extra_field.json')
rj.show()

+---+------+--------+------+
|age|gender|    name|salary|
+---+------+--------+------+
| 20|  NULL|  Manish| 20000|
| 25|  NULL|  Nikita| 21000|
| 16|  NULL|  Pritam| 22000|
| 35|  NULL|Prantosh| 25000|
| 67|     M|  Vikash| 40000|
+---+------+--------+------+



In [None]:
# Reading multiline json file

In [15]:
# Read a JSON file whose records span several lines (multiline=True).
# The parse mode is spelled PERMISSIVE so Spark recognises it directly.
rj = spark.read.format('json')\
      .option('inferSchema', True)\
      .option('mode', "PERMISSIVE")\
      .option('multiline',True)\
      .load('Multi_line_correct.json')
rj.show()

+---+--------+------+
|age|    name|salary|
+---+--------+------+
| 20|  Manish| 20000|
| 25|  Nikita| 21000|
| 16|  Pritam| 22000|
| 35|Prantosh| 25000|
| 67|  Vikash| 40000|
+---+--------+------+



In [None]:
# Incorrect multiline JSON file (only one record is printed; the objects need to be wrapped in a list [])

In [16]:
# Read a multiline JSON file whose objects are NOT wrapped in a list.
# As the output below shows, only the first object comes back.
# The parse mode is spelled PERMISSIVE so Spark recognises it directly.
rj = spark.read.format('json')\
      .option('inferSchema', True)\
      .option('mode', "PERMISSIVE")\
      .option('multiline',True)\
      .load('Multi_line_incorrect.json')
rj.show()

+---+------+------+
|age|  name|salary|
+---+------+------+
| 20|Manish| 20000|
+---+------+------+



In [None]:
# corrupted json file ( will create new column for corrupt data)

In [18]:
# Read a JSON file containing one malformed record. In PERMISSIVE mode the
# bad line is kept and its raw text appears in the _corrupt_record column
# (see the output below); truncate=False shows that text in full.
rj = spark.read.format('json')\
      .option('inferSchema', True)\
      .option('mode', "PERMISSIVE")\
      .load('corrupted_json.json')
rj.show(truncate = False)

+----------------------------------------+----+--------+------+
|_corrupt_record                         |age |name    |salary|
+----------------------------------------+----+--------+------+
|NULL                                    |20  |Manish  |20000 |
|NULL                                    |25  |Nikita  |21000 |
|NULL                                    |16  |Pritam  |22000 |
|NULL                                    |35  |Prantosh|25000 |
|{"name":"Vikash","age":67,"salary":40000|NULL|NULL    |NULL  |
+----------------------------------------+----+--------+------+



# Write DataFrame

In [21]:
# Load the CSV that the write examples below persist again.
# header=True takes column names from the first row; inferSchema derives types.
# The parse mode is spelled PERMISSIVE so Spark recognises it directly.
df = spark.read.format('csv')\
      .option('header', True)\
      .option('inferSchema', True)\
      .option('mode', "PERMISSIVE")\
      .load('csv_write.csv')
df.show(truncate = False)

+---+----------+--------+--------+----------+-----------+
|id | name     |     age|  salary|   address|    gender |
+---+----------+--------+--------+----------+-----------+
|1  |  Manish  |26      |75000   |    INDIA |        m  |
|2  |  Nikita  |23      |100000  |   USA    |          f|
|3  |  Pritam  |22      |150000  |   INDIA  |        m  |
|4  |  Prantosh|17      |200000  |   JAPAN  |        m  |
|5  |  Vikash  |31      |300000  |   USA    |          m|
|6  |  Rahul   |55      |300000  |   INDIA  |        m  |
|7  |  Raju    |67      |540000  |   USA    |          m|
|8  |  Praveen |28      |70000   |    JAPAN |        m  |
|9  |  Dev     |32      |150000  |   JAPAN  |        m  |
|10 | Sherin   |16      |25000   |    RUSSIA|       f   |
|11 | Ragu     |12      |35000   |    INDIA |        f  |
|12 | Sweta    |43      |200000  |   INDIA  |        f  |
|13 | Raushan  |48      |650000  |   USA    |          m|
|14 | Mukesh   |36      |95000   |    RUSSIA|       m   |
|15 | Prakash 

In [22]:
# Write the DataFrame as CSV with a header row.
# The save mode has to be set through .mode(); passing it via
# .option('mode', ...) does not set the writer's save mode, so a second run
# would fail because the output path already exists.
df.write.format('csv')\
        .option('header', True)\
        .mode('overwrite')\
        .option('path','csv_w/')\
        .save()

In [24]:
# Repartition to 3 partitions so the output directory holds 3 CSV files.
# The save mode is set through .mode() so the cell can be re-run; an
# .option('mode', ...) entry does not control overwrite behaviour.

df.repartition(3).write.format('csv')\
        .option('header', True)\
        .mode('overwrite')\
        .option('path','csv_wr/')\
        .save()

# Select column multiple ways and expr

In [10]:
# Reload the CSV with its header row and inferred column types, then preview it.
csv_reader = spark.read.format('csv').options(header=True, inferSchema=True)
df = csv_reader.load('csv_write.csv')
df.show(5)

+---+----------+---+------+---------+-----------+
| id|      name|age|salary|  address|     gender|
+---+----------+---+------+---------+-----------+
|  1|    Manish| 26| 75000|    INDIA|          m|
|  2|    Nikita| 23|100000|      USA|          f|
|  3|    Pritam| 22|150000|    INDIA|          m|
|  4|  Prantosh| 17|200000|    JAPAN|          m|
|  5|    Vikash| 31|300000|      USA|          m|
+---+----------+---+------+---------+-----------+
only showing top 5 rows



In [11]:
# Select one column by its name given as a plain string.
name_only = df.select("name")
name_only.show(5)

+----------+
|      name|
+----------+
|    Manish|
|    Nikita|
|    Pritam|
|  Prantosh|
|    Vikash|
+----------+
only showing top 5 rows



In [22]:
# "*" selects every column of the DataFrame.
all_columns = df.select("*")
all_columns.show(5)

+---+----------+---+------+---------+-----------+
| id|      name|age|salary|  address|     gender|
+---+----------+---+------+---------+-----------+
|  1|    Manish| 26| 75000|    INDIA|          m|
|  2|    Nikita| 23|100000|      USA|          f|
|  3|    Pritam| 22|150000|    INDIA|          m|
|  4|  Prantosh| 17|200000|    JAPAN|          m|
|  5|    Vikash| 31|300000|      USA|          m|
+---+----------+---+------+---------+-----------+
only showing top 5 rows



In [12]:
# Select the same column through a Column object built with col().
name_column = col("name")
df.select(name_column).show(5)

+----------+
|      name|
+----------+
|    Manish|
|    Nikita|
|    Pritam|
|  Prantosh|
|    Vikash|
+----------+
only showing top 5 rows



In [17]:
# Other ways to pick several columns:
#   df.select("name", 'age', "salary").show(5)
#   df.select(col("name"), col("age"), col("salary")).show(5)
#
# The reference styles (string, col(), df[...], attribute) can be mixed freely.
mixed_refs = ["name", col("name"), df["name"], df.id]
df.select(*mixed_refs).show(5)

+----------+----------+----------+---+
|      name|      name|      name| id|
+----------+----------+----------+---+
|    Manish|    Manish|    Manish|  1|
|    Nikita|    Nikita|    Nikita|  2|
|    Pritam|    Pritam|    Pritam|  3|
|  Prantosh|  Prantosh|  Prantosh|  4|
|    Vikash|    Vikash|    Vikash|  5|
+----------+----------+----------+---+
only showing top 5 rows



In [21]:
# expr() takes a SQL expression string, so a column can be manipulated inside select().

shifted_id = expr("id + 10")
df.select(shifted_id).show(5)

+---------+
|(id + 10)|
+---------+
|       11|
|       12|
|       13|
|       14|
|       15|
+---------+
only showing top 5 rows



In [26]:
# Aliasing and function calls written as SQL expression strings.
projections = [
    expr("id as employee_id"),
    expr('name as New_Name'),
    expr("concat(name , address)"),
]
df.select(*projections).show(5)

+-----------+----------+---------------------+
|employee_id|  New_Name|concat(name, address)|
+-----------+----------+---------------------+
|          1|    Manish|      Manish    INDIA|
|          2|    Nikita|         Nikita   USA|
|          3|    Pritam|       Pritam   INDIA|
|          4|  Prantosh|     Prantosh   JAPAN|
|          5|    Vikash|         Vikash   USA|
+-----------+----------+---------------------+
only showing top 5 rows



In [28]:
# Switch `df` to the employee data set used by the following cells.
employee_reader = spark.read.format('csv').options(header=True, inferSchema=True)
df = employee_reader.load('employee_data.csv')
df.show(5)

+---+--------+---+------+------------+--------+--------+
| id|    name|age|salary|     address| nominee|     _c6|
+---+--------+---+------+------------+--------+--------+
|  1|  Manish| 26| 75000|       bihar|nominee1|    NULL|
|  2|  Nikita| 23|100000|uttarpradesh|nominee2|    NULL|
|  3|  Pritam| 22|150000|   Bangalore|   India|nominee3|
|  4|Prantosh| 17|200000|     Kolkata|   India|nominee4|
|  5|  Vikash| 31|300000|        NULL|nominee5|    NULL|
+---+--------+---+------+------------+--------+--------+



In [30]:
# Rename a column on the fly with Column.alias() while selecting.
employee_id = col("id").alias('employee_id')
df.select(employee_id, 'name', 'salary').show(5)

+-----------+--------+------+
|employee_id|    name|salary|
+-----------+--------+------+
|          1|  Manish| 75000|
|          2|  Nikita|100000|
|          3|  Pritam|150000|
|          4|Prantosh|200000|
|          5|  Vikash|300000|
+-----------+--------+------+



In [33]:
# One boolean condition used three ways: select() shows the true/false value
# per row, while filter() and where() keep only the rows where it is true.
high_salary = col("salary") > 100000
df.select(high_salary).show(5)
df.filter(high_salary).show()
df.where(high_salary).show()

+-----------------+
|(salary > 100000)|
+-----------------+
|            false|
|            false|
|             true|
|             true|
|             true|
+-----------------+

+---+--------+---+------+---------+--------+--------+
| id|    name|age|salary|  address| nominee|     _c6|
+---+--------+---+------+---------+--------+--------+
|  3|  Pritam| 22|150000|Bangalore|   India|nominee3|
|  4|Prantosh| 17|200000|  Kolkata|   India|nominee4|
|  5|  Vikash| 31|300000|     NULL|nominee5|    NULL|
+---+--------+---+------+---------+--------+--------+

+---+--------+---+------+---------+--------+--------+
| id|    name|age|salary|  address| nominee|     _c6|
+---+--------+---+------+---------+--------+--------+
|  3|  Pritam| 22|150000|Bangalore|   India|nominee3|
|  4|Prantosh| 17|200000|  Kolkata|   India|nominee4|
|  5|  Vikash| 31|300000|     NULL|nominee5|    NULL|
+---+--------+---+------+---------+--------+--------+



In [37]:
# Combine conditions with & (each side is built separately, so no extra
# parentheses are needed around the comparisons).
earns_over_100k = col("salary") > 100000
younger_than_30 = col("age") < 30
df.filter(earns_over_100k & younger_than_30).show(5)

+---+--------+---+------+---------+-------+--------+
| id|    name|age|salary|  address|nominee|     _c6|
+---+--------+---+------+---------+-------+--------+
|  3|  Pritam| 22|150000|Bangalore|  India|nominee3|
|  4|Prantosh| 17|200000|  Kolkata|  India|nominee4|
+---+--------+---+------+---------+-------+--------+



In [38]:
# lit() wraps a constant as a Column so it can be selected alongside the data.
last_name_col = lit("Kishor").alias('Last_name')
df.select("*", last_name_col).show(5)

+---+--------+---+------+------------+--------+--------+---------+
| id|    name|age|salary|     address| nominee|     _c6|Last_name|
+---+--------+---+------+------------+--------+--------+---------+
|  1|  Manish| 26| 75000|       bihar|nominee1|    NULL|   Kishor|
|  2|  Nikita| 23|100000|uttarpradesh|nominee2|    NULL|   Kishor|
|  3|  Pritam| 22|150000|   Bangalore|   India|nominee3|   Kishor|
|  4|Prantosh| 17|200000|     Kolkata|   India|nominee4|   Kishor|
|  5|  Vikash| 31|300000|        NULL|nominee5|    NULL|   Kishor|
+---+--------+---+------+------------+--------+--------+---------+



In [40]:
# withColumn() appends a column; here every row gets the same literal value.
with_surname = df.withColumn('sur_name', lit('thakre'))
with_surname.show()

+---+--------+---+------+------------+--------+--------+--------+
| id|    name|age|salary|     address| nominee|     _c6|sur_name|
+---+--------+---+------+------------+--------+--------+--------+
|  1|  Manish| 26| 75000|       bihar|nominee1|    NULL|  thakre|
|  2|  Nikita| 23|100000|uttarpradesh|nominee2|    NULL|  thakre|
|  3|  Pritam| 22|150000|   Bangalore|   India|nominee3|  thakre|
|  4|Prantosh| 17|200000|     Kolkata|   India|nominee4|  thakre|
|  5|  Vikash| 31|300000|        NULL|nominee5|    NULL|  thakre|
+---+--------+---+------+------------+--------+--------+--------+



In [41]:
# Rename a single column; all other columns are left as they are.
renamed = df.withColumnRenamed('id', 'employee_id')
renamed.show()

+-----------+--------+---+------+------------+--------+--------+
|employee_id|    name|age|salary|     address| nominee|     _c6|
+-----------+--------+---+------+------------+--------+--------+
|          1|  Manish| 26| 75000|       bihar|nominee1|    NULL|
|          2|  Nikita| 23|100000|uttarpradesh|nominee2|    NULL|
|          3|  Pritam| 22|150000|   Bangalore|   India|nominee3|
|          4|Prantosh| 17|200000|     Kolkata|   India|nominee4|
|          5|  Vikash| 31|300000|        NULL|nominee5|    NULL|
+-----------+--------+---+------+------------+--------+--------+



In [42]:
# Print the column names, types and nullability of the employee DataFrame.
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- address: string (nullable = true)
 |-- nominee: string (nullable = true)
 |-- _c6: string (nullable = true)



In [43]:
# Cast id to string and salary to long, then confirm the new types in the schema.
retyped = (
    df.withColumn("id", col('id').cast('string'))
      .withColumn("salary", col('salary').cast('long'))
)
retyped.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: long (nullable = true)
 |-- address: string (nullable = true)
 |-- nominee: string (nullable = true)
 |-- _c6: string (nullable = true)



In [44]:
# Drop several columns at once. Plain column names are used for every
# argument: older PySpark releases reject a multi-argument drop() that mixes
# strings with Column objects.
df.drop('id', 'salary').show()

+--------+---+------------+--------+--------+
|    name|age|     address| nominee|     _c6|
+--------+---+------------+--------+--------+
|  Manish| 26|       bihar|nominee1|    NULL|
|  Nikita| 23|uttarpradesh|nominee2|    NULL|
|  Pritam| 22|   Bangalore|   India|nominee3|
|Prantosh| 17|     Kolkata|   India|nominee4|
|  Vikash| 31|        NULL|nominee5|    NULL|
+--------+---+------------+--------+--------+



# union vs unionAll

`union` and `unionAll` return the same records on a Spark DataFrame: neither removes duplicates.

In Spark SQL, however, `UNION` returns only the **unique** records from both tables, while `UNION ALL` returns **all** records, including duplicates.

In [4]:
# Employee rows: (id, name, sal, mngr_id).
data = [
    (10, 'Anil', 50000, 18),
    (11, 'Vikas', 75000, 16),
    (12, 'Nisha', 40000, 18),
    (13, 'Nidhi', 60000, 17),
    (14, 'Priya', 80000, 18),
    (15, 'Mohit', 45000, 18),
    (16, 'Rajesh', 90000, 10),
    (17, 'Raman', 55000, 16),
    (18, 'Sam', 65000, 17),
]
schema = ['id', 'name', 'sal', 'mngr_id']

m_df = spark.createDataFrame(data=data, schema=schema)
m_df.show()
# The row count is the cell's displayed result.
m_df.count()

+---+------+-----+-------+
| id|  name|  sal|mngr_id|
+---+------+-----+-------+
| 10|  Anil|50000|     18|
| 11| Vikas|75000|     16|
| 12| Nisha|40000|     18|
| 13| Nidhi|60000|     17|
| 14| Priya|80000|     18|
| 15| Mohit|45000|     18|
| 16|Rajesh|90000|     10|
| 17| Raman|55000|     16|
| 18|   Sam|65000|     17|
+---+------+-----+-------+



9

In [16]:
# Two more employees with the same column layout as m_df.
data1 = [
    (19, 'Sohan', 50000, 18),
    (20, 'Sima', 75000, 17),
]
schema1 = ['id', 'name', 'sal', 'mngr_id']
m_df1 = spark.createDataFrame(data=data1, schema=schema1)
m_df1.show()

+---+-----+-----+-------+
| id| name|  sal|mngr_id|
+---+-----+-----+-------+
| 19|Sohan|50000|     18|
| 20| Sima|75000|     17|
+---+-----+-----+-------+



In [17]:
# Stack the two frames; the count shows that every row from both is kept.
combined_df = m_df.union(m_df1)
combined_df.show()
combined_df.count()

+---+------+-----+-------+
| id|  name|  sal|mngr_id|
+---+------+-----+-------+
| 10|  Anil|50000|     18|
| 11| Vikas|75000|     16|
| 12| Nisha|40000|     18|
| 13| Nidhi|60000|     17|
| 14| Priya|80000|     18|
| 15| Mohit|45000|     18|
| 16|Rajesh|90000|     10|
| 17| Raman|55000|     16|
| 18|   Sam|65000|     17|
| 19| Sohan|50000|     18|
| 20|  Sima|75000|     17|
+---+------+-----+-------+



11

In [None]:
# unordered column name union

In [18]:
# Same employees as m_df1, but with the columns in a different order.
wrong_column_data = [
    (19, 50000, 18, 'Sohan'),
    (20, 75000, 17, 'Sima'),
]
wrong_schema = ['id', 'sal', 'mngr_id', 'Name']
w_m_df = spark.createDataFrame(wrong_column_data, wrong_schema)
w_m_df.show()

+---+-----+-------+-----+
| id|  sal|mngr_id| Name|
+---+-----+-------+-----+
| 19|50000|     18|Sohan|
| 20|75000|     17| Sima|
+---+-----+-------+-----+



In [19]:
# union() lines columns up by position, so the reordered frame's values land
# under the wrong headers (see the last two rows of the output).
positional_union = m_df1.union(w_m_df)
positional_union.show()

+---+-----+-----+-------+
| id| name|  sal|mngr_id|
+---+-----+-----+-------+
| 19|Sohan|50000|     18|
| 20| Sima|75000|     17|
| 19|50000|   18|  Sohan|
| 20|75000|   17|   Sima|
+---+-----+-----+-------+



In [21]:
# unionByName  to match column name

In [20]:
# unionByName() lines columns up by their names, so the reordered frame is
# merged correctly.
named_union = m_df1.unionByName(w_m_df)
named_union.show()

+---+-----+-----+-------+
| id| name|  sal|mngr_id|
+---+-----+-----+-------+
| 19|Sohan|50000|     18|
| 20| Sima|75000|     17|
| 19|Sohan|50000|     18|
| 20| Sima|75000|     17|
+---+-----+-----+-------+



In [None]:
# handling extra column 

In [22]:
# Reordered employees again, this time with an additional `bonus` column.
wrong_column_data = [
    (19, 50000, 18, 'Sohan', 10),
    (20, 75000, 17, 'Sima', 20),
]
wrong_schema = ['id', 'sal', 'mngr_id', 'Name', 'bonus']
w_m_df1 = spark.createDataFrame(wrong_column_data, wrong_schema)
w_m_df1.show()

+---+-----+-------+-----+-----+
| id|  sal|mngr_id| Name|bonus|
+---+-----+-------+-----+-----+
| 19|50000|     18|Sohan|   10|
| 20|75000|     17| Sima|   20|
+---+-----+-------+-----+-----+



In [27]:
# Keep only the columns both frames share before the union, leaving `bonus` out.
shared_cols = ['id', 'sal', 'mngr_id', 'Name']
w_m_df1.select(*shared_cols).union(w_m_df).show()

+---+-----+-------+-----+
| id|  sal|mngr_id| Name|
+---+-----+-------+-----+
| 19|50000|     18|Sohan|
| 20|75000|     17| Sima|
| 19|50000|     18|Sohan|
| 20|75000|     17| Sima|
+---+-----+-------+-----+



# if else in spark (case when then)

In [35]:
# Sample employees with missing values, one all-NULL row and one duplicate row.
emp_schema = ['id', 'name', 'age', 'salary', 'country', 'dept']
emp_data = [
    (1, 'manish', 26, 20000, 'india', 'IT'),
    (2, 'rahul', None, 40000, 'germany', 'engineering'),
    (3, 'pawan', 12, 60000, 'india', 'sales'),
    (4, 'roshini', 44, None, 'uk', 'engineering'),
    (5, 'raushan', 35, 70000, 'india', 'sales'),
    (6, None, 29, 200000, 'uk', 'IT'),
    (7, 'adam', 37, 65000, 'us', 'IT'),
    (8, 'chris', 16, 40000, 'us', 'sales'),
    (None, None, None, None, None, None),
    (7, 'adam', 37, 65000, 'us', 'IT'),
]
e_df = spark.createDataFrame(emp_data, emp_schema)
e_df.show()

+----+-------+----+------+-------+-----------+
|  id|   name| age|salary|country|       dept|
+----+-------+----+------+-------+-----------+
|   1| manish|  26| 20000|  india|         IT|
|   2|  rahul|NULL| 40000|germany|engineering|
|   3|  pawan|  12| 60000|  india|      sales|
|   4|roshini|  44|  NULL|     uk|engineering|
|   5|raushan|  35| 70000|  india|      sales|
|   6|   NULL|  29|200000|     uk|         IT|
|   7|   adam|  37| 65000|     us|         IT|
|   8|  chris|  16| 40000|     us|      sales|
|NULL|   NULL|NULL|  NULL|   NULL|       NULL|
|   7|   adam|  37| 65000|     us|         IT|
+----+-------+----+------+-------+-----------+



In [36]:
# CASE WHEN-style chain: under 18 -> 'No', 18 and over -> 'Yes'.
# Using >= 18 keeps an age of exactly 18 out of the fallback branch, so
# 'Novalues' is only produced when neither comparison matches (age is NULL).
e_df.withColumn(
    'adult',
    when(col('age') < 18, 'No')
    .when(col('age') >= 18, 'Yes')
    .otherwise('Novalues')
).show()

+----+-------+----+------+-------+-----------+--------+
|  id|   name| age|salary|country|       dept|   adult|
+----+-------+----+------+-------+-----------+--------+
|   1| manish|  26| 20000|  india|         IT|     Yes|
|   2|  rahul|NULL| 40000|germany|engineering|Novalues|
|   3|  pawan|  12| 60000|  india|      sales|      No|
|   4|roshini|  44|  NULL|     uk|engineering|     Yes|
|   5|raushan|  35| 70000|  india|      sales|     Yes|
|   6|   NULL|  29|200000|     uk|         IT|     Yes|
|   7|   adam|  37| 65000|     us|         IT|     Yes|
|   8|  chris|  16| 40000|     us|      sales|      No|
|NULL|   NULL|NULL|  NULL|   NULL|       NULL|Novalues|
|   7|   adam|  37| 65000|     us|         IT|     Yes|
+----+-------+----+------+-------+-----------+--------+



In [38]:
# Step 1: replace a NULL age with 19. Step 2: flag adults.
# The comparison is >= 18 so that an age of exactly 18 is labelled 'Yes'.
e_df.withColumn('age', when(col('age').isNull(), lit(19)).otherwise(col('age')))\
    .withColumn('adult', when(col('age') >= 18, 'Yes').otherwise('No')).show()

+----+-------+---+------+-------+-----------+-----+
|  id|   name|age|salary|country|       dept|adult|
+----+-------+---+------+-------+-----------+-----+
|   1| manish| 26| 20000|  india|         IT|  Yes|
|   2|  rahul| 19| 40000|germany|engineering|  Yes|
|   3|  pawan| 12| 60000|  india|      sales|   No|
|   4|roshini| 44|  NULL|     uk|engineering|  Yes|
|   5|raushan| 35| 70000|  india|      sales|  Yes|
|   6|   NULL| 29|200000|     uk|         IT|  Yes|
|   7|   adam| 37| 65000|     us|         IT|  Yes|
|   8|  chris| 16| 40000|     us|      sales|   No|
|NULL|   NULL| 19|  NULL|   NULL|       NULL|  Yes|
|   7|   adam| 37| 65000|     us|         IT|  Yes|
+----+-------+---+------+-------+-----------+-----+



In [42]:
# Bucket ages: (0, 18) -> 'Minor', [18, 30) -> 'Mid', anything else -> 'Major'.
# The 'Mid' bucket starts at >= 18 so an age of exactly 18 is not pushed into 'Major'.
# Caveat: a NULL age matches neither condition and also ends up as 'Major'.
e_df.withColumn('age_wise', when((col('age') > 0) & (col('age') < 18), 'Minor')
                           .when((col('age') >= 18) & (col('age') < 30), 'Mid')
                           .otherwise('Major')).show()

+----+-------+----+------+-------+-----------+--------+
|  id|   name| age|salary|country|       dept|age_wise|
+----+-------+----+------+-------+-----------+--------+
|   1| manish|  26| 20000|  india|         IT|     Mid|
|   2|  rahul|NULL| 40000|germany|engineering|   Major|
|   3|  pawan|  12| 60000|  india|      sales|   Minor|
|   4|roshini|  44|  NULL|     uk|engineering|   Major|
|   5|raushan|  35| 70000|  india|      sales|   Major|
|   6|   NULL|  29|200000|     uk|         IT|     Mid|
|   7|   adam|  37| 65000|     us|         IT|   Major|
|   8|  chris|  16| 40000|     us|      sales|   Minor|
|NULL|   NULL|NULL|  NULL|   NULL|       NULL|   Major|
|   7|   adam|  37| 65000|     us|         IT|   Major|
+----+-------+----+------+-------+-----------+--------+



In [None]:
# Register the employee DataFrame under a name that spark.sql() queries can use.
e_df.createOrReplaceTempView(name='table')

In [44]:
# SQL version of the when/otherwise chain, run against the temp view `table`.
# The second branch uses >= 18 so an age of exactly 18 is 'major' rather than
# falling through to 'novalue', which is left for rows matching neither branch.
spark.sql(""" select *,
             case when age < 18 then 'minor'
                  when age >= 18 then 'major'
                  else 'novalue'
             end as adult
             from table""").show()

+----+-------+----+------+-------+-----------+-------+
|  id|   name| age|salary|country|       dept|  adult|
+----+-------+----+------+-------+-----------+-------+
|   1| manish|  26| 20000|  india|         IT|  major|
|   2|  rahul|NULL| 40000|germany|engineering|novalue|
|   3|  pawan|  12| 60000|  india|      sales|  minor|
|   4|roshini|  44|  NULL|     uk|engineering|  major|
|   5|raushan|  35| 70000|  india|      sales|  major|
|   6|   NULL|  29|200000|     uk|         IT|  major|
|   7|   adam|  37| 65000|     us|         IT|  major|
|   8|  chris|  16| 40000|     us|      sales|  minor|
|NULL|   NULL|NULL|  NULL|   NULL|       NULL|novalue|
|   7|   adam|  37| 65000|     us|         IT|  major|
+----+-------+----+------+-------+-----------+-------+



# Unique & Sorted records

In [4]:
# Employee rows containing exact duplicates, plus one id (14) that appears
# with two different salaries.
sch = ['id', 'name', 'salary', 'mngr_id']
data = [
    (10, 'Anil', 50000, 18),
    (11, 'Vikas', 75000, 16),
    (12, 'Nisha', 40000, 18),
    (13, 'Nidhi', 60000, 17),
    (14, 'Priya', 80000, 18),
    (15, 'Mohit', 45000, 18),
    (16, 'Rajesh', 90000, 10),
    (17, 'Raman', 55000, 16),
    (18, 'Sam', 65000, 17),
    (15, 'Mohit', 45000, 18),
    (13, 'Nidhi', 60000, 17),
    (14, 'Priya', 90000, 18),
    (18, 'Sam', 65000, 17),
]
df = spark.createDataFrame(data=data, schema=sch)
df.show()

+---+------+------+-------+
| id|  name|salary|mngr_id|
+---+------+------+-------+
| 10|  Anil| 50000|     18|
| 11| Vikas| 75000|     16|
| 12| Nisha| 40000|     18|
| 13| Nidhi| 60000|     17|
| 14| Priya| 80000|     18|
| 15| Mohit| 45000|     18|
| 16|Rajesh| 90000|     10|
| 17| Raman| 55000|     16|
| 18|   Sam| 65000|     17|
| 15| Mohit| 45000|     18|
| 13| Nidhi| 60000|     17|
| 14| Priya| 90000|     18|
| 18|   Sam| 65000|     17|
+---+------+------+-------+



In [7]:
# Total number of rows, duplicates included.
df.count()

13

In [8]:
# Number of rows left once exact duplicate rows are removed.
unique_rows = df.distinct()
unique_rows.count()

10

In [9]:
# Distinct is applied to the selected columns only, so the two salary
# variants of id 14 collapse into one row.
id_and_name = df.select('id', 'name')
id_and_name.distinct().show()

+---+------+
| id|  name|
+---+------+
| 10|  Anil|
| 11| Vikas|
| 12| Nisha|
| 13| Nidhi|
| 15| Mohit|
| 14| Priya|
| 17| Raman|
| 16|Rajesh|
| 18|   Sam|
+---+------+



In [10]:
# Deduplicate on an explicit list of key columns (here: every column).
dedup_keys = ['id', 'name', 'salary', 'mngr_id']
df.dropDuplicates(dedup_keys).show()

+---+------+------+-------+
| id|  name|salary|mngr_id|
+---+------+------+-------+
| 10|  Anil| 50000|     18|
| 12| Nisha| 40000|     18|
| 11| Vikas| 75000|     16|
| 13| Nidhi| 60000|     17|
| 15| Mohit| 45000|     18|
| 14| Priya| 80000|     18|
| 16|Rajesh| 90000|     10|
| 17| Raman| 55000|     16|
| 18|   Sam| 65000|     17|
| 14| Priya| 90000|     18|
+---+------+------+-------+



In [11]:
# Sort by salary, lowest first (ascending is the default order).
df.orderBy('salary').show()

+---+------+------+-------+
| id|  name|salary|mngr_id|
+---+------+------+-------+
| 12| Nisha| 40000|     18|
| 15| Mohit| 45000|     18|
| 15| Mohit| 45000|     18|
| 10|  Anil| 50000|     18|
| 17| Raman| 55000|     16|
| 13| Nidhi| 60000|     17|
| 13| Nidhi| 60000|     17|
| 18|   Sam| 65000|     17|
| 18|   Sam| 65000|     17|
| 11| Vikas| 75000|     16|
| 14| Priya| 80000|     18|
| 14| Priya| 90000|     18|
| 16|Rajesh| 90000|     10|
+---+------+------+-------+



In [13]:
# Sort by salary, highest first.
df.sort('salary', ascending=False).show()

+---+------+------+-------+
| id|  name|salary|mngr_id|
+---+------+------+-------+
| 16|Rajesh| 90000|     10|
| 14| Priya| 90000|     18|
| 14| Priya| 80000|     18|
| 11| Vikas| 75000|     16|
| 18|   Sam| 65000|     17|
| 18|   Sam| 65000|     17|
| 13| Nidhi| 60000|     17|
| 13| Nidhi| 60000|     17|
| 17| Raman| 55000|     16|
| 10|  Anil| 50000|     18|
| 15| Mohit| 45000|     18|
| 15| Mohit| 45000|     18|
| 12| Nisha| 40000|     18|
+---+------+------+-------+



In [14]:
# Sort by salary descending; ties are ordered by name, also descending.
df.sort('salary', 'name', ascending=[False, False]).show()

+---+------+------+-------+
| id|  name|salary|mngr_id|
+---+------+------+-------+
| 16|Rajesh| 90000|     10|
| 14| Priya| 90000|     18|
| 14| Priya| 80000|     18|
| 11| Vikas| 75000|     16|
| 18|   Sam| 65000|     17|
| 18|   Sam| 65000|     17|
| 13| Nidhi| 60000|     17|
| 13| Nidhi| 60000|     17|
| 17| Raman| 55000|     16|
| 10|  Anil| 50000|     18|
| 15| Mohit| 45000|     18|
| 15| Mohit| 45000|     18|
| 12| Nisha| 40000|     18|
+---+------+------+-------+



In [5]:
# Add a new column from a list of values.
# `rating` holds one value per row of `a`, in row order.

from pyspark.sql import Window
from pyspark.sql.functions import monotonically_increasing_id, row_number

rating = [5, 4, 1]
animal_rows = [("Dog", "Cat"), ("Cat", "Dog"), ("Mouse", "Cat")]
a = spark.createDataFrame(animal_rows, ["Animal", "Enemy"])
a.show()

+------+-----+
|Animal|Enemy|
+------+-----+
|   Dog|  Cat|
|   Cat|  Dog|
| Mouse|  Cat|
+------+-----+



In [6]:
# Turn the plain list into a one-column DataFrame (each value becomes a 1-tuple row).
rating_rows = [(score,) for score in rating]
b = spark.createDataFrame(rating_rows, ['Rating'])
b.show()

+------+
|Rating|
+------+
|     5|
|     4|
|     1|
+------+



In [7]:
# Number the rows of `a` 1..n so they can be matched up with the ratings.
row_order = Window.orderBy(monotonically_increasing_id())
a = a.withColumn('row_idx', row_number().over(row_order))
a.show()

+------+-----+-------+
|Animal|Enemy|row_idx|
+------+-----+-------+
|   Dog|  Cat|      1|
|   Cat|  Dog|      2|
| Mouse|  Cat|      3|
+------+-----+-------+



In [8]:
# Number the rating rows the same way, giving both frames a shared key.
rating_order = Window.orderBy(monotonically_increasing_id())
b = b.withColumn('row_idx', row_number().over(rating_order))
b.show()

+------+-------+
|Rating|row_idx|
+------+-------+
|     5|      1|
|     4|      2|
|     1|      3|
+------+-------+



In [9]:
# Pair each animal row with the rating that has the same row number, then
# remove the helper key from the result.
same_row = a.row_idx == b.row_idx
paired = a.join(b, same_row)
paired.drop('row_idx').show()

+------+-----+------+
|Animal|Enemy|Rating|
+------+-----+------+
|   Dog|  Cat|     5|
|   Cat|  Dog|     4|
| Mouse|  Cat|     1|
+------+-----+------+



In [11]:
# spark.range(4) produces a single `id` column (0..3 in the output below).
df = spark.range(4)
df.show()

# A literal is evaluated once per row, so the floored value repeats for each id.
floored_literal = floor(lit(5.2))
df.select(floored_literal).show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
+---+

+----------+
|FLOOR(5.2)|
+----------+
|         5|
|         5|
|         5|
|         5|
+----------+



In [12]:
# Range from 3 up to (not including) 9, with the column renamed to `rng`.
rng_df = spark.range(3, 9).toDF('rng')
rng_df.show()

+---+
|rng|
+---+
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
+---+



In [14]:
# Two search queries belonging to the same AnonID.
query_rows = [[142, "Big House"], [142, "Big Green Frog"]]
df = spark.createDataFrame(query_rows, ["AnonID", "Query"])
df.show()

+------+--------------+
|AnonID|         Query|
+------+--------------+
|   142|     Big House|
|   142|Big Green Frog|
+------+--------------+



In [15]:
# Gather every query of an AnonID into a single array column.
queries_as_list = collect_list("Query").alias("Query")
data = df.groupBy("AnonID").agg(queries_as_list)
data.show(truncate=False)

+------+---------------------------+
|AnonID|Query                      |
+------+---------------------------+
|142   |[Big House, Big Green Frog]|
+------+---------------------------+



In [17]:
# Split each query into words, flatten the nested arrays into one array,
# then keep each word only once.
words_per_query = transform(data['Query'], lambda query: split(query, ' '))
unique_words = array_distinct(flatten(words_per_query))
data.withColumn('distinct', unique_words).show(truncate=False)

+------+---------------------------+-------------------------+
|AnonID|Query                      |distinct                 |
+------+---------------------------+-------------------------+
|142   |[Big House, Big Green Frog]|[Big, House, Green, Frog]|
+------+---------------------------+-------------------------+



In [18]:
# Converting a DataFrame column to a Python list: build the sample frame first.
data = [
    ("1", "apple"),
    ("2", "banana"),
    ("3", "cherry"),
]
df = spark.createDataFrame(data=data, schema=["id", "value"])
df.show()

+---+------+
| id| value|
+---+------+
|  1| apple|
|  2|banana|
|  3|cherry|
+---+------+



In [20]:
# Collect the single selected column to the driver and unwrap each Row.
value_rows = df.select('value').collect()
li = [row[0] for row in value_rows]
li

['apple', 'banana', 'cherry']

In [4]:
# Accumulator example: a small log table with INFO and ERROR entries.

columns = ['timestamp', 'level', 'message']
data = [
    ('2024-06-13 12:00:00', 'INFO', 'Server started'),
    ('2024-06-13 12:01:00', 'ERROR', 'Failed to connect to database'),
    ('2024-06-13 12:02:00', 'INFO', 'User login successful'),
    ('2024-06-13 12:03:00', 'ERROR', 'Timeout while reading from socket'),
    ('2024-06-13 12:04:00', 'INFO', 'File uploaded successfully'),
]
df = spark.createDataFrame(data=data, schema=columns)
df.show(truncate=False)

+-------------------+-----+---------------------------------+
|timestamp          |level|message                          |
+-------------------+-----+---------------------------------+
|2024-06-13 12:00:00|INFO |Server started                   |
|2024-06-13 12:01:00|ERROR|Failed to connect to database    |
|2024-06-13 12:02:00|INFO |User login successful            |
|2024-06-13 12:03:00|ERROR|Timeout while reading from socket|
|2024-06-13 12:04:00|INFO |File uploaded successfully       |
+-------------------+-----+---------------------------------+



In [9]:
# Accumulator that starts at 0 and collects the number of ERROR rows.
error_count_accumulator = spark.sparkContext.accumulator(0)


def count_errors(row):
    """Add 1 to the accumulator when the row's `level` is 'ERROR'."""
    if row['level'] != 'ERROR':
        return
    error_count_accumulator.add(1)


# foreach applies count_errors to every row of the DataFrame.
df.foreach(count_errors)

# Once foreach has finished, read the accumulated total.
print(f'Total number of error messages: {error_count_accumulator.value}')


Total number of error messages: 2


In [10]:
# Further processing of the DataFrame (e.g. filtering errors, saving results, etc.)

is_error = col('level') == 'ERROR'
error_df = df.filter(is_error)
error_df.show(truncate=False)

+-------------------+-----+---------------------------------+
|timestamp          |level|message                          |
+-------------------+-----+---------------------------------+
|2024-06-13 12:01:00|ERROR|Failed to connect to database    |
|2024-06-13 12:03:00|ERROR|Timeout while reading from socket|
+-------------------+-----+---------------------------------+



In [5]:
# Stop the Spark session now that the notebook is finished.
spark.stop()