In [1]:
# Echo the `sc` handle. It is not created anywhere in this notebook, so it is
# presumably the SparkContext injected by the PySpark kernel/shell — confirm.
sc

In [2]:
# Echo the `spark` session object used by every read/SQL call below. It is not
# created in this notebook; presumably provided by the PySpark kernel — confirm.
spark

In [3]:
# Load People.json from the local filesystem with no explicit schema, so Spark
# infers column names and types from the data.
people_df = spark.read.format("json").load("file:///home/hadoop/Downloads/People.json")

In [4]:
# Preview the first 5 rows. With the inferred schema the columns appear in
# alphabetical order in the output.
people_df.show(5)

+---------+-----------+----------+------+---+---------+------+
|     city|    country|first_name|gender| id|last_name|salary|
+---------+-----------+----------+------+---+---------+------+
|Mulyosari|  Indonesia|     Valma|Female|  1|     Sans|983107|
|  Niihama|      Japan|     Paolo|  Male|  2|   Kiddie|649173|
|Dū Qal‘ah|Afghanistan|    Miltie|  Male|  3| De Zuani|352898|
|   Iberia|       Peru|    Jarrid|  Male|  4| Dalziell|170398|
| La Ronge|     Canada| Reinaldos|  Male|  5|   Keeffe|440989|
+---------+-----------+----------+------+---+---------+------+
only showing top 5 rows



In [5]:
# Inspect the inferred schema: the numeric fields (`id`, `salary`) come out as
# long and every column is nullable.
people_df.printSchema()

root
 |-- city: string (nullable = true)
 |-- country: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- id: long (nullable = true)
 |-- last_name: string (nullable = true)
 |-- salary: long (nullable = true)



### Create a user-defined schema for the DataFrame fields

In [6]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [7]:
# Explicit schema for People.json. The order of the fields here becomes the
# column order of the DataFrame, `id` is narrowed to IntegerType (inference
# produced long), and every field is declared nullable.
schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("first_name", StringType(), True)
    .add("last_name", StringType(), True)
    .add("gender", StringType(), True)
    .add("salary", LongType(), True)
    .add("city", StringType(), True)
    .add("country", StringType(), True)
)

In [8]:
# Re-read the same file, this time applying the user-defined `schema` instead of
# inferring one. This rebinds `people_df`.
people_df = spark.read.json("file:///home/hadoop/Downloads/People.json", schema=schema)

In [9]:
# Confirm the explicit schema took effect: `id` is now integer and the columns
# follow the declared order rather than alphabetical order.
people_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- city: string (nullable = true)
 |-- country: string (nullable = true)



In [10]:
# By default Spark's JSON reader expects one complete JSON record per line.
# The original note on this cell says the file is pretty-printed, i.e. records
# span several lines, so the multiLine option is enabled to parse it correctly.
bank_data = spark.read.option("multiLine", True).json("file:///home/hadoop/Downloads/bank_edited.json")

In [11]:
# Preview the bank dataset (show() with no arguments prints the first rows only).
bank_data.show()

+---+-------+--------+-------+---+-------+--------+---------+-------+------------+----+--------+-----+-----+--------+--------+---+
|age|balance|campaign|contact|day|default|duration|education|housing|         job|loan| marital|month|pdays|poutcome|previous|  y|
+---+-------+--------+-------+---+-------+--------+---------+-------+------------+----+--------+-----+-----+--------+--------+---+
| 58|   2143|       1|unknown|  5|     no|     261| tertiary|    yes|  management|  no| married|  may|   -1| unknown|       0| no|
| 44|     29|       1|unknown|  5|     no|     151|secondary|    yes|  technician|  no|  single|  may|   -1| unknown|       0| no|
| 33|      2|       1|unknown|  5|     no|      76|secondary|    yes|entrepreneur| yes| married|  may|   -1| unknown|       0| no|
| 47|   1506|       1|unknown|  5|     no|      92|  unknown|    yes| blue-collar|  no| married|  may|   -1| unknown|       0| no|
| 33|      1|       1|unknown|  5|     no|     198|  unknown|     no|     unknown| 

In [12]:
# Inspect the inferred schema: numeric columns such as `age` and `balance` are
# inferred as long, the rest as string.
bank_data.printSchema()

root
 |-- age: long (nullable = true)
 |-- balance: long (nullable = true)
 |-- campaign: long (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: long (nullable = true)
 |-- default: string (nullable = true)
 |-- duration: long (nullable = true)
 |-- education: string (nullable = true)
 |-- housing: string (nullable = true)
 |-- job: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- month: string (nullable = true)
 |-- pdays: long (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- previous: long (nullable = true)
 |-- y: string (nullable = true)



In [13]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

### 2. Typecast a single column

- Any transformation of an individual column goes through `withColumn()`.

In [14]:
# Narrow `age` from the inferred long to integer. Passing an existing column
# name to withColumn replaces that column, keeping its position.
bank_data = bank_data.withColumn("age", bank_data["age"].cast("int"))

In [15]:
# Verify the cast: `age` is now integer while every other column is unchanged.
bank_data.printSchema()

root
 |-- age: integer (nullable = true)
 |-- balance: long (nullable = true)
 |-- campaign: long (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: long (nullable = true)
 |-- default: string (nullable = true)
 |-- duration: long (nullable = true)
 |-- education: string (nullable = true)
 |-- housing: string (nullable = true)
 |-- job: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- month: string (nullable = true)
 |-- pdays: long (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- previous: long (nullable = true)
 |-- y: string (nullable = true)



### 3. Creating a new column from two string columns

In [16]:
from pyspark.sql.functions import col, concat, concat_ws

# Build "first last" as a derived column. concat() returns NULL as soon as any
# input is NULL, and both name columns are declared nullable in the schema, so
# concat_ws() is used instead: it joins the inputs with the separator and skips
# NULL inputs, so a row with one missing name keeps the other name rather than
# getting a NULL Full_Name (if both are NULL the result is an empty string).
# The result is only displayed; people_df itself is not modified.
people_df.withColumn(
    "Full_Name", concat_ws(" ", col("first_name"), col("last_name"))
).show()

+---+----------+---------+------+------+------------------+------------+-----------------+
| id|first_name|last_name|gender|salary|              city|     country|        Full_Name|
+---+----------+---------+------+------+------------------+------------+-----------------+
|  1|     Valma|     Sans|Female|983107|         Mulyosari|   Indonesia|       Valma Sans|
|  2|     Paolo|   Kiddie|  Male|649173|           Niihama|       Japan|     Paolo Kiddie|
|  3|    Miltie| De Zuani|  Male|352898|         Dū Qal‘ah| Afghanistan|  Miltie De Zuani|
|  4|    Jarrid| Dalziell|  Male|170398|            Iberia|        Peru|  Jarrid Dalziell|
|  5| Reinaldos|   Keeffe|  Male|440989|          La Ronge|      Canada| Reinaldos Keeffe|
|  6|        Eb|Schwanden|  Male|274126|      Kuala Lumpur|    Malaysia|     Eb Schwanden|
|  7|    Alleyn|   Paddon|  Male|681914|         Al Qurayn|Saudi Arabia|    Alleyn Paddon|
|  8|   Baryram|     Yell|  Male|250748|           Jixiang|       China|     Baryram Yell|

### 4. Renaming a column

In [17]:
# Rename `salary` to `income`. people_df is rebound here, so every later cell
# sees the column under its new name.
people_df = people_df.withColumnRenamed("salary", "income")

In [18]:
# Preview after the rename: the former `salary` column now appears as `income`.
people_df.show()

+---+----------+---------+------+------+------------------+------------+
| id|first_name|last_name|gender|income|              city|     country|
+---+----------+---------+------+------+------------------+------------+
|  1|     Valma|     Sans|Female|983107|         Mulyosari|   Indonesia|
|  2|     Paolo|   Kiddie|  Male|649173|           Niihama|       Japan|
|  3|    Miltie| De Zuani|  Male|352898|         Dū Qal‘ah| Afghanistan|
|  4|    Jarrid| Dalziell|  Male|170398|            Iberia|        Peru|
|  5| Reinaldos|   Keeffe|  Male|440989|          La Ronge|      Canada|
|  6|        Eb|Schwanden|  Male|274126|      Kuala Lumpur|    Malaysia|
|  7|    Alleyn|   Paddon|  Male|681914|         Al Qurayn|Saudi Arabia|
|  8|   Baryram|     Yell|  Male|250748|           Jixiang|       China|
|  9|     Cammy|     Axel|Female|221750|Thị Trấn Phong Thổ|     Vietnam|
| 10|       Erl|  Caldera|  Male|680801|        Kotatengah|   Indonesia|
| 11|    Miguel|   Moules|  Male|819771|        Rol

### 5. limit()

In [19]:
# limit(5) yields a new DataFrame capped at 5 rows; show() then prints it.
people_df.limit(5).show()

+---+----------+---------+------+------+---------+-----------+
| id|first_name|last_name|gender|income|     city|    country|
+---+----------+---------+------+------+---------+-----------+
|  1|     Valma|     Sans|Female|983107|Mulyosari|  Indonesia|
|  2|     Paolo|   Kiddie|  Male|649173|  Niihama|      Japan|
|  3|    Miltie| De Zuani|  Male|352898|Dū Qal‘ah|Afghanistan|
|  4|    Jarrid| Dalziell|  Male|170398|   Iberia|       Peru|
|  5| Reinaldos|   Keeffe|  Male|440989| La Ronge|     Canada|
+---+----------+---------+------+------+---------+-----------+



### 6. orderBy()

- Arrange data in ascending or descending order

In [20]:
# Primary sort key: gender, ascending.
# Rows with the same gender are then arranged by income, highest first.
people_df.orderBy(col("gender").asc(), col("income").desc()).show()

+---+----------+----------+------+------+---------------+-----------+
| id|first_name| last_name|gender|income|           city|    country|
+---+----------+----------+------+------+---------------+-----------+
|812|      Ebba|   Brandom|Female|996994|    Dhromolaxia|     Cyprus|
|689|  Harriott|Strathearn|Female|995400|          Taiba|      China|
|158|   Evelina|      Ibbs|Female|995388| Kotawaikabubak|  Indonesia|
|770|     Ruthe|      Brun|Female|992992|        Xintian|      China|
|131|     Harri|     Raper|Female|989626|        Hujigou|      China|
|872|     Rycca|     Maund|Female|987858|           Azul|  Argentina|
| 82| Hollyanne|  Heberden|Female|985194|        Citeguh|  Indonesia|
|376|     Kylie|Grigorescu|Female|984966|        La Vega|   Colombia|
|503|    Billye|  Rappport|Female|984735|      Nerópolis|     Brazil|
| 77|     Ailyn|    Palmar|Female|984515|   Lonpao Dajah|  Indonesia|
|  1|     Valma|      Sans|Female|983107|      Mulyosari|  Indonesia|
|490|  Cathlene|  Ga

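### Temporary view
- `createOrReplaceTempView()` registers the DataFrame under a name so it can be queried with `spark.sql()`. The view is temporary (scoped to the Spark session) and is not materialized: no data is stored, and the underlying DataFrame is evaluated each time the view is queried.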

In [21]:
# Register bank_data under the name `bankdata_df` so the SQL cells below can
# query it via spark.sql(). This is a temporary view; an existing view with the
# same name is replaced.
bank_data.createOrReplaceTempView("bankdata_df")

In [23]:
# Query the `bankdata_df` view with plain SQL and preview the result.
spark.sql("select * from bankdata_df").show()

+---+-------+--------+-------+---+-------+--------+---------+-------+------------+----+--------+-----+-----+--------+--------+---+
|age|balance|campaign|contact|day|default|duration|education|housing|         job|loan| marital|month|pdays|poutcome|previous|  y|
+---+-------+--------+-------+---+-------+--------+---------+-------+------------+----+--------+-----+-----+--------+--------+---+
| 58|   2143|       1|unknown|  5|     no|     261| tertiary|    yes|  management|  no| married|  may|   -1| unknown|       0| no|
| 44|     29|       1|unknown|  5|     no|     151|secondary|    yes|  technician|  no|  single|  may|   -1| unknown|       0| no|
| 33|      2|       1|unknown|  5|     no|      76|secondary|    yes|entrepreneur| yes| married|  may|   -1| unknown|       0| no|
| 47|   1506|       1|unknown|  5|     no|      92|  unknown|    yes| blue-collar|  no| married|  may|   -1| unknown|       0| no|
| 33|      1|       1|unknown|  5|     no|     198|  unknown|     no|     unknown| 

In [24]:
# Total number of rows in the `bankdata_df` view.
spark.sql("select count(*) from bankdata_df").show()

+--------+
|count(1)|
+--------+
|   45211|
+--------+



#### Show the maximum balance for each of the 10 youngest ages

In [43]:
# Maximum balance per age, youngest ages first, keeping the first 10 rows.
# GROUP BY age yields exactly one row per age, so ordering by age alone is
# already a total order. The former `MAX(balance) DESC` tie-breaker could never
# apply and has been dropped; the result set is unchanged.
spark.sql("select age, MAX(balance) from bankdata_df GROUP BY age ORDER BY age ASC").limit(10).show()

+---+------------+
|age|max(balance)|
+---+------------+
| 18|        1944|
| 19|        5368|
| 20|        8860|
| 21|        8278|
| 22|       10971|
| 23|       19690|
| 24|       23878|
| 25|       16874|
| 26|       24299|
| 27|       24025|
+---+------------+



#### Show the 5 job types with the lowest minimum balance (aliased as `minimum_salary`)

In [42]:
# Lowest balance per job, sorted ascending, keeping the 5 smallest.
# NOTE(review): the alias is `minimum_salary` but the aggregated column is
# `balance` — confirm whether "salary" is really the intended label.
spark.sql("select job,MIN(balance) as minimum_salary from bankdata_df GROUP BY job ORDER BY MIN(balance) ASC  ").limit(5).show()

+-------------+--------------+
|          job|minimum_salary|
+-------------+--------------+
|  blue-collar|         -8019|
|   management|         -6847|
|self-employed|         -3313|
|   technician|         -2827|
|     services|         -2122|
+-------------+--------------+

