In [32]:
import findspark
findspark.init()
from pyspark.sql import SparkSession, Row 
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, DecimalType, FloatType, DateType
from pyspark.sql.functions import sum, avg, max, min,count, col, lower,trim, countDistinct
import pandas as pd
import os

### 1. Createa SparkSession in PySpark

In [2]:
spark = SparkSession.builder.appName('Q').getOrCreate()
spark

### 2. Read a CSV file into a DataFrame using Pyspark

In [4]:
simpleData = [("James","Sales","NY",90000,34,10000),
    ("Michael","Sales","NY",86000,56,20000),
    ("Robert","Sales","CA",81000,30,23000),
    ("Maria","Finance","CA",90000,24,23000),
    ("Raman","Finance","CA",99000,40,24000),
    ("Scott","Finance","NY",83000,36,19000),
    ("Jen","Finance","NY",79000,53,15000),
    ("Jeff","Marketing","CA",80000,25,18000),
    ("Kumar","Marketing","NY",91000,50,21000)
  ]
schema = ["employee_name","department","state","salary","age","bonus"]
df = spark.createDataFrame(data = simpleData, schema = schema)
df.show()

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        James|     Sales|   NY| 90000| 34|10000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|        Raman|   Finance|   CA| 99000| 40|24000|
|        Scott|   Finance|   NY| 83000| 36|19000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
+-------------+----------+-----+------+---+-----+



### 3. show the schema of a Dataframe in pyspark

In [5]:
df.printSchema()

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)
 |-- bonus: long (nullable = true)



In [6]:
df.schema

StructType([StructField('employee_name', StringType(), True), StructField('department', StringType(), True), StructField('state', StringType(), True), StructField('salary', LongType(), True), StructField('age', LongType(), True), StructField('bonus', LongType(), True)])

### 4. Select specific columns from a dataframe in pyspark

In [8]:
df.select('age','salary').show()
df.select(df['age'], df['salary']).show()
df.select(df.age, df.salary).show() 

+---+------+
|age|salary|
+---+------+
| 34| 90000|
| 56| 86000|
| 30| 81000|
| 24| 90000|
| 40| 99000|
| 36| 83000|
| 53| 79000|
| 25| 80000|
| 50| 91000|
+---+------+

+---+------+
|age|salary|
+---+------+
| 34| 90000|
| 56| 86000|
| 30| 81000|
| 24| 90000|
| 40| 99000|
| 36| 83000|
| 53| 79000|
| 25| 80000|
| 50| 91000|
+---+------+

+---+------+
|age|salary|
+---+------+
| 34| 90000|
| 56| 86000|
| 30| 81000|
| 24| 90000|
| 40| 99000|
| 36| 83000|
| 53| 79000|
| 25| 80000|
| 50| 91000|
+---+------+



In [9]:
df.select('*').show()

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        James|     Sales|   NY| 90000| 34|10000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|        Raman|   Finance|   CA| 99000| 40|24000|
|        Scott|   Finance|   NY| 83000| 36|19000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
+-------------+----------+-----+------+---+-----+



In [10]:
# select first 3 column and top 3 rows 

df.select(df.columns[:3]).show(3)

+-------------+----------+-----+
|employee_name|department|state|
+-------------+----------+-----+
|        James|     Sales|   NY|
|      Michael|     Sales|   NY|
|       Robert|     Sales|   CA|
+-------------+----------+-----+
only showing top 3 rows



### 5. Filter rows based on a condition in pyspark dataframe.

In [11]:
df.filter('Salary < 25000').show()
df.filter(df['Salary'] < 25000).show()

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
+-------------+----------+-----+------+---+-----+

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
+-------------+----------+-----+------+---+-----+



In [12]:
df.filter('Salary < 25000').select(['Name', 'age']).show()

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `Name` cannot be resolved. Did you mean one of the following? [`age`, `state`, `bonus`, `salary`, `department`].;
'Project ['Name, age#4L]
+- Filter (Salary#3L < cast(25000 as bigint))
   +- LogicalRDD [employee_name#0, department#1, state#2, salary#3L, age#4L, bonus#5L], false


In [None]:
df.filter((df['Salary'] < 25000) & (df['Experience'] < 3 )).show()

In [None]:
# NOT Operation 

df.filter(~(df['Salary'] <= 25000)).show()

### 6. Group by a Column and perform an aggregation in pyspark

In [13]:
simpleData = [("James","Sales","NY",90000,34,10000),
    ("Michael","Sales","NY",86000,56,20000),
    ("Robert","Sales","CA",81000,30,23000),
    ("Maria","Finance","CA",90000,24,23000),
    ("Raman","Finance","CA",99000,40,24000),
    ("Scott","Finance","NY",83000,36,19000),
    ("Jen","Finance","NY",79000,53,15000),
    ("Jeff","Marketing","CA",80000,25,18000),
    ("Kumar","Marketing","NY",91000,50,21000)
  ]
schema = ["employee_name","department","state","salary","age","bonus"]
df = spark.createDataFrame(data = simpleData, schema = schema)
df.show()

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        James|     Sales|   NY| 90000| 34|10000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|        Raman|   Finance|   CA| 99000| 40|24000|
|        Scott|   Finance|   NY| 83000| 36|19000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
+-------------+----------+-----+------+---+-----+



In [14]:
df.groupBy('department').count().show(truncate= False)

+----------+-----+
|department|count|
+----------+-----+
|Sales     |3    |
|Finance   |4    |
|Marketing |2    |
+----------+-----+



In [15]:
df.groupBy('department').sum('salary').show(truncate = False)

+----------+-----------+
|department|sum(salary)|
+----------+-----------+
|Sales     |257000     |
|Finance   |351000     |
|Marketing |171000     |
+----------+-----------+



In [16]:
df.groupBy('department').min('salary').show(truncate = False)
df.groupBy('department').max('salary').show(truncate = False)
df.groupBy('department').avg('salary').show(truncate = False)
df.groupBy('department').mean('salary').show(truncate = False)

+----------+-----------+
|department|min(salary)|
+----------+-----------+
|Sales     |81000      |
|Finance   |79000      |
|Marketing |80000      |
+----------+-----------+

+----------+-----------+
|department|max(salary)|
+----------+-----------+
|Sales     |90000      |
|Finance   |99000      |
|Marketing |91000      |
+----------+-----------+

+----------+-----------------+
|department|avg(salary)      |
+----------+-----------------+
|Sales     |85666.66666666667|
|Finance   |87750.0          |
|Marketing |85500.0          |
+----------+-----------------+

+----------+-----------------+
|department|avg(salary)      |
+----------+-----------------+
|Sales     |85666.66666666667|
|Finance   |87750.0          |
|Marketing |85500.0          |
+----------+-----------------+



In [17]:
# GroupBy on multiple columns 

df.groupBy('department', 'state').min('salary', 'bonus').show(truncate=False)

+----------+-----+-----------+----------+
|department|state|min(salary)|min(bonus)|
+----------+-----+-----------+----------+
|Sales     |NY   |86000      |10000     |
|Sales     |CA   |81000      |23000     |
|Finance   |CA   |90000      |23000     |
|Finance   |NY   |79000      |15000     |
|Marketing |NY   |91000      |21000     |
|Marketing |CA   |80000      |18000     |
+----------+-----+-----------+----------+



In [18]:
df.groupBy('department').agg(sum('salary').alias('sum_salary'), avg('salary').alias('avg_salary'),\
                              min('bonus').alias('min_bonus'), max('bonus').alias('max_bonus')).show(truncate=False)

+----------+----------+-----------------+---------+---------+
|department|sum_salary|avg_salary       |min_bonus|max_bonus|
+----------+----------+-----------------+---------+---------+
|Sales     |257000    |85666.66666666667|10000    |23000    |
|Finance   |351000    |87750.0          |15000    |24000    |
|Marketing |171000    |85500.0          |18000    |21000    |
+----------+----------+-----------------+---------+---------+



### 7. Join two dataframes in pyspark

In [19]:
emp = [(1,"Smith",-1,"2018","10","M",3000), \
    (2,"Rose",1,"2010","20","M",4000), \
    (3,"Williams",1,"2010","10","M",1000), \
    (4,"Jones",2,"2005","10","F",2000), \
    (5,"Brown",2,"2010","40","",-1), \
      (6,"Brown",2,"2010","50","",-1) \
  ]
empColumns = ["emp_id","name","superior_emp_id","year_joined", "emp_dept_id","gender","salary"]
empdf = spark.createDataFrame(data = emp, schema = empColumns)
empdf.show(truncate=False) 

ept = [("Finance",10), ("Marketing",20), ("Sales",30), ("IT",40) ]
deptColumns = ["dept_name","dept_id"]
deptdf = spark.createDataFrame(data = ept, schema = deptColumns)
deptdf.show(truncate=False)

+------+--------+---------------+-----------+-----------+------+------+
|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|
+------+--------+---------------+-----------+-----------+------+------+
|1     |Smith   |-1             |2018       |10         |M     |3000  |
|2     |Rose    |1              |2010       |20         |M     |4000  |
|3     |Williams|1              |2010       |10         |M     |1000  |
|4     |Jones   |2              |2005       |10         |F     |2000  |
|5     |Brown   |2              |2010       |40         |      |-1    |
|6     |Brown   |2              |2010       |50         |      |-1    |
+------+--------+---------------+-----------+-----------+------+------+

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|Finance  |10     |
|Marketing|20     |
|Sales    |30     |
|IT       |40     |
+---------+-------+



In [20]:
# Pyspark Inner Join dataframe 

empdf.join(deptdf, empdf.emp_dept_id == deptdf.dept_id, 'inner').show(truncate=False)

+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|1     |Smith   |-1             |2018       |10         |M     |3000  |Finance  |10     |
|3     |Williams|1              |2010       |10         |M     |1000  |Finance  |10     |
|4     |Jones   |2              |2005       |10         |F     |2000  |Finance  |10     |
|2     |Rose    |1              |2010       |20         |M     |4000  |Marketing|20     |
|5     |Brown   |2              |2010       |40         |      |-1    |IT       |40     |
+------+--------+---------------+-----------+-----------+------+------+---------+-------+



In [21]:
# Pyspark Left Outer Join

In [22]:
empdf.join(deptdf, empdf.emp_dept_id == deptdf.dept_id, 'left').show(truncate =False)
#empdf.join(deptdf, empdf.emp_dept_id == deptdf.dept_id, 'leftouter').show()

+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|1     |Smith   |-1             |2018       |10         |M     |3000  |Finance  |10     |
|2     |Rose    |1              |2010       |20         |M     |4000  |Marketing|20     |
|3     |Williams|1              |2010       |10         |M     |1000  |Finance  |10     |
|4     |Jones   |2              |2005       |10         |F     |2000  |Finance  |10     |
|5     |Brown   |2              |2010       |40         |      |-1    |IT       |40     |
|6     |Brown   |2              |2010       |50         |      |-1    |NULL     |NULL   |
+------+--------+---------------+-----------+-----------+------+------+---------+-------+



In [23]:
# Right Outer Join 

In [24]:
empdf.join(deptdf, empdf.emp_dept_id == deptdf.dept_id, 'right').show(truncate=False)
#empdf.join(deptdf, empdf.emp_dept_id == deptdf.dept_id, 'rightouter').show(truncate=False)

+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|4     |Jones   |2              |2005       |10         |F     |2000  |Finance  |10     |
|3     |Williams|1              |2010       |10         |M     |1000  |Finance  |10     |
|1     |Smith   |-1             |2018       |10         |M     |3000  |Finance  |10     |
|2     |Rose    |1              |2010       |20         |M     |4000  |Marketing|20     |
|NULL  |NULL    |NULL           |NULL       |NULL       |NULL  |NULL  |Sales    |30     |
|5     |Brown   |2              |2010       |40         |      |-1    |IT       |40     |
+------+--------+---------------+-----------+-----------+------+------+---------+-------+



In [25]:
# PYspark Full Outer Join

empdf.join(deptdf, empdf.emp_dept_id == deptdf.dept_id, 'outer').show(truncate=False)
empdf.join(deptdf, empdf.emp_dept_id == deptdf.dept_id, 'full').show(truncate=False)
empdf.join(deptdf, empdf.emp_dept_id == deptdf.dept_id, 'fullouter').show(truncate=False)


+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|1     |Smith   |-1             |2018       |10         |M     |3000  |Finance  |10     |
|3     |Williams|1              |2010       |10         |M     |1000  |Finance  |10     |
|4     |Jones   |2              |2005       |10         |F     |2000  |Finance  |10     |
|2     |Rose    |1              |2010       |20         |M     |4000  |Marketing|20     |
|NULL  |NULL    |NULL           |NULL       |NULL       |NULL  |NULL  |Sales    |30     |
|5     |Brown   |2              |2010       |40         |      |-1    |IT       |40     |
|6     |Brown   |2              |2010       |50         |      |-1    |NULL     |NULL   |
+------+--------+---------------+-----------+-----------+------+------+---------+-------+

+------+-

In [26]:
# left semi join

empdf.join(deptdf, empdf.emp_dept_id == deptdf.dept_id, 'leftsemi').show(truncate=False)


+------+--------+---------------+-----------+-----------+------+------+
|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|
+------+--------+---------------+-----------+-----------+------+------+
|1     |Smith   |-1             |2018       |10         |M     |3000  |
|3     |Williams|1              |2010       |10         |M     |1000  |
|4     |Jones   |2              |2005       |10         |F     |2000  |
|2     |Rose    |1              |2010       |20         |M     |4000  |
|5     |Brown   |2              |2010       |40         |      |-1    |
+------+--------+---------------+-----------+-----------+------+------+



In [28]:
# left anti join 

empdf.join(deptdf, empdf.emp_dept_id == deptdf.dept_id, 'leftanti').show(truncate=False)


+------+-----+---------------+-----------+-----------+------+------+
|emp_id|name |superior_emp_id|year_joined|emp_dept_id|gender|salary|
+------+-----+---------------+-----------+-----------+------+------+
|6     |Brown|2              |2010       |50         |      |-1    |
+------+-----+---------------+-----------+-----------+------+------+



### 8. Rename columns in a Pyspark dataframe

In [33]:
dataDF = [(('James','','Smith'),'1991-04-01','M',3000),
  (('Michael','Rose',''),'2000-05-19','M',4000),
  (('Robert','','Williams'),'1978-09-05','M',4000),
  (('Maria','Anne','Jones'),'1967-12-01','F',4000),
  (('Jen','Mary','Brown'),'1980-02-17','F',-1)
]
schema = StructType([StructField('name', StructType([StructField('firstname', StringType(), True),
                                                    
                                                     StructField('middlename', StringType(), True),
                                                     StructField('lastname', StringType(), True)])),
                    StructField('dob', DateType(), True),
                    StructField('gender', StringType(), True),
                    StructField('salary', IntegerType(), True)
                    ])
df = spark.createDataFrame(data = dataDF, schema= schema)
df.printSchema()
df.show(truncate=False)

PySparkTypeError: [CANNOT_ACCEPT_OBJECT_IN_TYPE] `DateType()` can not accept object `1991-04-01` in type `str`.

In [None]:
# pyspark withcolumnRenamed() 

In [None]:
df.withColumnRenamed('dob', 'DateofBirth').show()

In [None]:
df.show()

In [None]:
# rename multiple columns

In [None]:
df2 = df.withColumnRenamed('dob', 'DateOfBirth').withColumnRenamed('salary', 'salary_amount')
df2.show()

In [None]:
# using pyspark StrutType - To rename a nested column in DF

In [None]:
schema2 = StructType([
    StructField('fname', StringType()),
    StructField('middlename', StringType()),
    StructField('Lname', StringType())
])

In [None]:
df3 = df.select(col('name').cast(schema2), col('dob'),col('gender'), col('salary'))
df3.show()

In [None]:
df3.select('name.*').show()

In [None]:
# using pyspark dataframe withcolumn - to rename nested columns 

In [None]:
df4 = df.withColumn('fname', col('name.firstname')).withColumn('mname', col('name.middlename')).withColumn('lname',col('name.lastname'))

In [None]:
df4.show()

In [None]:
# Using toDF() - to change all columns in a pyspark dataframe

In [None]:
newColumns = ['newCol1', 'newCol2', 'newCol3', 'newCol4']
df.toDF(*newColumns).show()

### 9. Handle missing or null values in DF

In [34]:
df = spark.read.csv('small_zipcode.csv', header=True, inferSchema=True)
df.show()

+---+-------+--------+-------------------+-----+----------+
| id|zipcode|    type|               city|state|population|
+---+-------+--------+-------------------+-----+----------+
|  1|    704|STANDARD|               NULL|   PR|     30100|
|  2|    704|    NULL|PASEO COSTA DEL SUR|   PR|      NULL|
|  3|    709|    NULL|       BDA SAN LUIS|   PR|      3700|
|  4|  76166|  UNIQUE|  CINGULAR WIRELESS|   TX|     84000|
|  5|  76177|STANDARD|               NULL|   TX|      NULL|
+---+-------+--------+-------------------+-----+----------+



In [35]:
# Replace 0 for null for all integer columns
df.na.fill(value=0).show()

# Replace 0 for null on only population column
df.na.fill(value=0, subset=['population']).show()

+---+-------+--------+-------------------+-----+----------+
| id|zipcode|    type|               city|state|population|
+---+-------+--------+-------------------+-----+----------+
|  1|    704|STANDARD|               NULL|   PR|     30100|
|  2|    704|    NULL|PASEO COSTA DEL SUR|   PR|         0|
|  3|    709|    NULL|       BDA SAN LUIS|   PR|      3700|
|  4|  76166|  UNIQUE|  CINGULAR WIRELESS|   TX|     84000|
|  5|  76177|STANDARD|               NULL|   TX|         0|
+---+-------+--------+-------------------+-----+----------+

+---+-------+--------+-------------------+-----+----------+
| id|zipcode|    type|               city|state|population|
+---+-------+--------+-------------------+-----+----------+
|  1|    704|STANDARD|               NULL|   PR|     30100|
|  2|    704|    NULL|PASEO COSTA DEL SUR|   PR|         0|
|  3|    709|    NULL|       BDA SAN LUIS|   PR|      3700|
|  4|  76166|  UNIQUE|  CINGULAR WIRELESS|   TX|     84000|
|  5|  76177|STANDARD|               NU

In [36]:
df.na.fill('').show()

+---+-------+--------+-------------------+-----+----------+
| id|zipcode|    type|               city|state|population|
+---+-------+--------+-------------------+-----+----------+
|  1|    704|STANDARD|                   |   PR|     30100|
|  2|    704|        |PASEO COSTA DEL SUR|   PR|      NULL|
|  3|    709|        |       BDA SAN LUIS|   PR|      3700|
|  4|  76166|  UNIQUE|  CINGULAR WIRELESS|   TX|     84000|
|  5|  76177|STANDARD|                   |   TX|      NULL|
+---+-------+--------+-------------------+-----+----------+



In [37]:
df.na.fill('--HELLO--', ['city']).na.fill('----HI----', ['type']).show()

+---+-------+----------+-------------------+-----+----------+
| id|zipcode|      type|               city|state|population|
+---+-------+----------+-------------------+-----+----------+
|  1|    704|  STANDARD|          --HELLO--|   PR|     30100|
|  2|    704|----HI----|PASEO COSTA DEL SUR|   PR|      NULL|
|  3|    709|----HI----|       BDA SAN LUIS|   PR|      3700|
|  4|  76166|    UNIQUE|  CINGULAR WIRELESS|   TX|     84000|
|  5|  76177|  STANDARD|          --HELLO--|   TX|      NULL|
+---+-------+----------+-------------------+-----+----------+



In [38]:
df.na.fill({'type':'YO', 'city':'empty'}).show()

+---+-------+--------+-------------------+-----+----------+
| id|zipcode|    type|               city|state|population|
+---+-------+--------+-------------------+-----+----------+
|  1|    704|STANDARD|              empty|   PR|     30100|
|  2|    704|      YO|PASEO COSTA DEL SUR|   PR|      NULL|
|  3|    709|      YO|       BDA SAN LUIS|   PR|      3700|
|  4|  76166|  UNIQUE|  CINGULAR WIRELESS|   TX|     84000|
|  5|  76177|STANDARD|              empty|   TX|      NULL|
+---+-------+--------+-------------------+-----+----------+



### 10. Cretae a new column derived from existing columns 

In [39]:
data = [('James','','Smith','1991-04-01','M',3000),
  ('Michael','Rose','','2000-05-19','M',4000),
  ('Robert','','Williams','1978-09-05','M',4000),
  ('Maria','Anne','Jones','1967-12-01','F',4000),
  ('Jen','Mary','Brown','1980-02-17','F',-1)]
columns = ['firstname', 'middlename','lastname', 'dob', 'gender','salary']
df = spark.createDataFrame(data=data, schema=columns)
df.printSchema()
df.show()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|
+---------+----------+--------+----------+------+------+



In [40]:
df.withColumn('copiedColumn', col('salary')* -1).show()

+---------+----------+--------+----------+------+------+------------+
|firstname|middlename|lastname|       dob|gender|salary|copiedColumn|
+---------+----------+--------+----------+------+------+------------+
|    James|          |   Smith|1991-04-01|     M|  3000|       -3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|       -4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|       -4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|       -4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|           1|
+---------+----------+--------+----------+------+------+------------+



### 11. Remove duplicate rows from a Pyspark df

In [41]:
data = [("James", "Sales", 3000), \
    ("Michael", "Sales", 4600), \
    ("Robert", "Sales", 4100), \
    ("Maria", "Finance", 3000), \
    ("James", "Sales", 3000), \
    ("Scott", "Finance", 3300), \
    ("Jen", "Finance", 3900), \
    ("Jeff", "Marketing", 3000), \
    ("Kumar", "Marketing", 2000), \
    ("Saif", "Sales", 4100) \
  ]

columns= ["employee_name", "department", "salary"]
df = spark.createDataFrame(data = data, schema = columns)
df.show()

+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|        James|     Sales|  3000|
|      Michael|     Sales|  4600|
|       Robert|     Sales|  4100|
|        Maria|   Finance|  3000|
|        James|     Sales|  3000|
|        Scott|   Finance|  3300|
|          Jen|   Finance|  3900|
|         Jeff| Marketing|  3000|
|        Kumar| Marketing|  2000|
|         Saif|     Sales|  4100|
+-------------+----------+------+



In [42]:
# Applying distinct() to remove duplicate rows 

distinctDF=df.distinct()
#df2 = df.dropDuplicate()
print('distinct count: '+str(distinctDF.count()))
distinctDF.show()

distinct count: 9
+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|        James|     Sales|  3000|
|      Michael|     Sales|  4600|
|       Robert|     Sales|  4100|
|        Maria|   Finance|  3000|
|        Scott|   Finance|  3300|
|          Jen|   Finance|  3900|
|         Jeff| Marketing|  3000|
|        Kumar| Marketing|  2000|
|         Saif|     Sales|  4100|
+-------------+----------+------+



In [43]:
dropDisDF = df.dropDuplicates(['department', 'salary'])
print('distinct count: ', str(dropDisDF.count()))
dropDisDF.show()

distinct count:  8
+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|        Maria|   Finance|  3000|
|        Scott|   Finance|  3300|
|          Jen|   Finance|  3900|
|        Kumar| Marketing|  2000|
|         Jeff| Marketing|  3000|
|        James|     Sales|  3000|
|       Robert|     Sales|  4100|
|      Michael|     Sales|  4600|
+-------------+----------+------+



### 12. Sort a dataframe based on one or multiple  columns 

In [44]:
simpleData = [("James","Sales","NY",90000,34,10000), \
    ("Michael","Sales","NY",86000,56,20000), \
    ("Robert","Sales","CA",81000,30,23000), \
    ("Maria","Finance","CA",90000,24,23000), \
    ("Raman","Finance","CA",99000,40,24000), \
    ("Scott","Finance","NY",83000,36,19000), \
    ("Jen","Finance","NY",79000,53,15000), \
    ("Jeff","Marketing","CA",80000,25,18000), \
    ("Kumar","Marketing","NY",91000,50,21000) \
  ]
columns= ["employee_name","department","state","salary","age","bonus"]
df = spark.createDataFrame(data = simpleData, schema = columns)
df.show()

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        James|     Sales|   NY| 90000| 34|10000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|        Raman|   Finance|   CA| 99000| 40|24000|
|        Scott|   Finance|   NY| 83000| 36|19000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
+-------------+----------+-----+------+---+-----+



In [45]:
# DataFrame sorting using the sort() function
# sort() takes a Boolean argument for ascending or descending order. 

#df.sort('department', 'state').show()
df.sort('department', 'state', ascending=[True, True]).show()

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        Raman|   Finance|   CA| 99000| 40|24000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|        Scott|   Finance|   NY| 83000| 36|19000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|        James|     Sales|   NY| 90000| 34|10000|
|      Michael|     Sales|   NY| 86000| 56|20000|
+-------------+----------+-----+------+---+-----+



In [46]:
df.sort(col('department'), col('state')).show()

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        Maria|   Finance|   CA| 90000| 24|23000|
|        Raman|   Finance|   CA| 99000| 40|24000|
|        Scott|   Finance|   NY| 83000| 36|19000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|        James|     Sales|   NY| 90000| 34|10000|
+-------------+----------+-----+------+---+-----+



In [47]:
# DataFrame sorting using orderBy() function 
# by default, it orders by ascending.

#df.orderBy('department', 'state').show()
df.orderBy(col('department'), col('state')).show()

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        Maria|   Finance|   CA| 90000| 24|23000|
|        Raman|   Finance|   CA| 99000| 40|24000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|        Scott|   Finance|   NY| 83000| 36|19000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|        James|     Sales|   NY| 90000| 34|10000|
+-------------+----------+-----+------+---+-----+



In [48]:
# sort by ascending ASC / DESC

df.orderBy(col('department').asc(), col('state').desc()).show()

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|          Jen|   Finance|   NY| 79000| 53|15000|
|        Scott|   Finance|   NY| 83000| 36|19000|
|        Raman|   Finance|   CA| 99000| 40|24000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|        James|     Sales|   NY| 90000| 34|10000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|       Robert|     Sales|   CA| 81000| 30|23000|
+-------------+----------+-----+------+---+-----+



### 13. Perform a simple arithmetic operation on DF columns

In [49]:
data = [(100, 2,1), (200, 3,4), (300, 4,4 )]
df = spark.createDataFrame(data).toDF('col1', 'col2', 'col3')
df.show()

+----+----+----+
|col1|col2|col3|
+----+----+----+
| 100|   2|   1|
| 200|   3|   4|
| 300|   4|   4|
+----+----+----+



In [50]:
df.select(df.col1 + df.col2).show()
df.select(df.col1 - df.col2).show()
df.select(df.col1 * df.col2).show()
df.select(df.col1 / df.col2).show()
df.select(df.col1 % df.col2).show()

df.select(df.col1 > df.col2).show()
df.select(df.col1 < df.col2).show()
df.select(df.col1 == df.col2).show()

+-------------+
|(col1 + col2)|
+-------------+
|          102|
|          203|
|          304|
+-------------+

+-------------+
|(col1 - col2)|
+-------------+
|           98|
|          197|
|          296|
+-------------+

+-------------+
|(col1 * col2)|
+-------------+
|          200|
|          600|
|         1200|
+-------------+

+-----------------+
|    (col1 / col2)|
+-----------------+
|             50.0|
|66.66666666666667|
|             75.0|
+-----------------+

+-------------+
|(col1 % col2)|
+-------------+
|            0|
|            2|
|            0|
+-------------+

+-------------+
|(col1 > col2)|
+-------------+
|         true|
|         true|
|         true|
+-------------+

+-------------+
|(col1 < col2)|
+-------------+
|        false|
|        false|
|        false|
+-------------+

+-------------+
|(col1 = col2)|
+-------------+
|        false|
|        false|
|        false|
+-------------+



### 14. calculate descriptive statistics for numeric columns

In [51]:
simpleData = [("James","Sales","NY",90000,34,10000), \
    ("Michael","Sales","NY",86000,56,20000), \
    ("Robert","Sales","CA",81000,30,23000), \
    ("Maria","Finance","CA",90000,24,23000), \
    ("Raman","Finance","CA",99000,40,24000), \
    ("Scott","Finance","NY",83000,36,19000), \
    ("Jen","Finance","NY",79000,53,15000), \
    ("Jeff","Marketing","CA",80000,25,18000), \
    ("Kumar","Marketing","NY",91000,50,21000) \
  ]
columns= ["employee_name","department","state","salary","age","bonus"]
df = spark.createDataFrame(data = simpleData, schema = columns)
df.show()

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        James|     Sales|   NY| 90000| 34|10000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|        Raman|   Finance|   CA| 99000| 40|24000|
|        Scott|   Finance|   NY| 83000| 36|19000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
+-------------+----------+-----+------+---+-----+



In [52]:
df.describe().show()

+-------+-------------+----------+-----+-----------------+------------------+------------------+
|summary|employee_name|department|state|           salary|               age|             bonus|
+-------+-------------+----------+-----+-----------------+------------------+------------------+
|  count|            9|         9|    9|                9|                 9|                 9|
|   mean|         NULL|      NULL| NULL|86555.55555555556|38.666666666666664|19222.222222222223|
| stddev|         NULL|      NULL| NULL|6540.472290116195|11.947803145348521| 4465.920335658087|
|    min|        James|   Finance|   CA|            79000|                24|             10000|
|    max|        Scott|     Sales|   NY|            99000|                56|             24000|
+-------+-------------+----------+-----+-----------------+------------------+------------------+



In [53]:
df.select('age').describe().show()

+-------+------------------+
|summary|               age|
+-------+------------------+
|  count|                 9|
|   mean|38.666666666666664|
| stddev|11.947803145348521|
|    min|                24|
|    max|                56|
+-------+------------------+



In [54]:
df.select('department', 'age').describe().show()

+-------+----------+------------------+
|summary|department|               age|
+-------+----------+------------------+
|  count|         9|                 9|
|   mean|      NULL|38.666666666666664|
| stddev|      NULL|11.947803145348521|
|    min|   Finance|                24|
|    max|     Sales|                56|
+-------+----------+------------------+



### 15. Apply user-defined functions (UDF) on DF

In [55]:
dept = [('Finance', 10), ('Marketing', 20), ('Sales', 30), ('IT', 40)]
rdd = spark.sparkContext.parallelize(dept)
rdd

ParallelCollectionRDD[314] at readRDDFromFile at PythonRDD.scala:289

In [56]:
df = rdd.toDF()
df.show()

+---------+---+
|       _1| _2|
+---------+---+
|  Finance| 10|
|Marketing| 20|
|    Sales| 30|
|       IT| 40|
+---------+---+



In [57]:
depcol = ['dept_name', 'dept_id']
df2 = rdd.toDF(depcol)
df2.printSchema()
df2.show()

root
 |-- dept_name: string (nullable = true)
 |-- dept_id: long (nullable = true)

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|  Finance|     10|
|Marketing|     20|
|    Sales|     30|
|       IT|     40|
+---------+-------+



In [58]:
dschema = StructType([StructField('dept_name', StringType(), True),
                     StructField('dept_id', StringType(), True)])
df1 = spark.createDataFrame(data = rdd, schema=dschema)
df1.show()

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|  Finance|     10|
|Marketing|     20|
|    Sales|     30|
|       IT|     40|
+---------+-------+



In [59]:
data = [('James', 3000), ('Anna',4001), ('Robert', 6200)]
d1 = spark.createDataFrame(data, ['name', 'salary'])
d1.show()

+------+------+
|  name|salary|
+------+------+
| James|  3000|
|  Anna|  4001|
|Robert|  6200|
+------+------+



In [60]:
rdd = d1.rdd
rdd.collect()

[Row(name='James', salary=3000),
 Row(name='Anna', salary=4001),
 Row(name='Robert', salary=6200)]

In [61]:
# Apply map() transformation

rdd2 = d1.rdd.map(lambda x: [x[0], x[1]*2])
rdd2.collect()

[['James', 6000], ['Anna', 8002], ['Robert', 12400]]

In [62]:
rdf = spark.createDataFrame(rdd2,['name','bonus'])
rdf.show()

+------+-----+
|  name|bonus|
+------+-----+
| James| 6000|
|  Anna| 8002|
|Robert|12400|
+------+-----+



### 16. Convert a Pyspark DF to Pandas DF

In [63]:
data = [('James','','Smith','1991-04-01','M',3000),
  ('Michael','Rose','','2000-05-19','M',4000),
  ('Robert','','Williams','1978-09-05','M',4000),
  ('Maria','Anne','Jones','1967-12-01','F',4000),
  ('Jen','Mary','Brown','1980-02-17','F',-1)]
columns = ['firstname', 'middlename','lastname', 'dob', 'gender','salary']
df = spark.createDataFrame(data=data, schema=columns)
df.printSchema()
df.show()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|
+---------+----------+--------+----------+------+------+



In [64]:
pdf = df.toPandas()
pdf

Unnamed: 0,firstname,middlename,lastname,dob,gender,salary
0,James,,Smith,1991-04-01,M,3000
1,Michael,Rose,,2000-05-19,M,4000
2,Robert,,Williams,1978-09-05,M,4000
3,Maria,Anne,Jones,1967-12-01,F,4000
4,Jen,Mary,Brown,1980-02-17,F,-1


### 17. Write a pyspark DF to a csv file

In [65]:
data = [('James','','Smith','1991-04-01','M',3000),
  ('Michael','Rose','','2000-05-19','M',4000),
  ('Robert','','Williams','1978-09-05','M',4000),
  ('Maria','Anne','Jones','1967-12-01','F',4000),
  ('Jen','Mary','Brown','1980-02-17','F',-1)]
columns = ['firstname', 'middlename','lastname', 'dob', 'gender','salary']
df = spark.createDataFrame(data=data, schema=columns)
df.show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|
+---------+----------+--------+----------+------+------+



In [66]:
pan = df.toPandas()
pan.to_csv('d1.csv')

### 18 . cache or persist a pyspark DF for better performance.

In [67]:
df4 = spark.read.format('csv')\
    .option('header', True)\
    .option('inferschema', True)\
    .option('mode', 'FAILFAST')\
    .load('employee_data.csv')
df4.show()

+---+--------+---+------+------------+--------+--------+
| id|    name|age|salary|     address| nominee|     _c6|
+---+--------+---+------+------------+--------+--------+
|  1|  Manish| 26| 75000|       bihar|nominee1|    NULL|
|  2|  Nikita| 23|100000|uttarpradesh|nominee2|    NULL|
|  3|  Pritam| 22|150000|   Bangalore|   India|nominee3|
|  4|Prantosh| 17|200000|     Kolkata|   India|nominee4|
|  5|  Vikash| 31|300000|        NULL|nominee5|    NULL|
+---+--------+---+------+------------+--------+--------+



In [68]:
df = spark.read.csv('employee_data.csv', header = True, inferSchema = True, mode = 'FAILFAST')
df.show()

+---+--------+---+------+------------+--------+--------+
| id|    name|age|salary|     address| nominee|     _c6|
+---+--------+---+------+------------+--------+--------+
|  1|  Manish| 26| 75000|       bihar|nominee1|    NULL|
|  2|  Nikita| 23|100000|uttarpradesh|nominee2|    NULL|
|  3|  Pritam| 22|150000|   Bangalore|   India|nominee3|
|  4|Prantosh| 17|200000|     Kolkata|   India|nominee4|
|  5|  Vikash| 31|300000|        NULL|nominee5|    NULL|
+---+--------+---+------+------------+--------+--------+



In [69]:
df1 = spark.read.csv('employee_data.csv', header = True, inferSchema = True, mode = 'PERMISSIVE')
df1.show()  

+---+--------+---+------+------------+--------+--------+
| id|    name|age|salary|     address| nominee|     _c6|
+---+--------+---+------+------------+--------+--------+
|  1|  Manish| 26| 75000|       bihar|nominee1|    NULL|
|  2|  Nikita| 23|100000|uttarpradesh|nominee2|    NULL|
|  3|  Pritam| 22|150000|   Bangalore|   India|nominee3|
|  4|Prantosh| 17|200000|     Kolkata|   India|nominee4|
|  5|  Vikash| 31|300000|        NULL|nominee5|    NULL|
+---+--------+---+------+------------+--------+--------+



In [70]:
spark()

TypeError: 'SparkSession' object is not callable