In [0]:
employee_df=spark.read.format("CSV")\
                  .option("header","true")\
                  .option("inferschema","true")\
                  .option("mode","PERMISSIVE")\
                  .load("/FileStore/tables/employee_file.csv")
employee_df.show(5)

+---+--------+---+------+------------+--------+
| id|    name|age|salary|     address| nominee|
+---+--------+---+------+------------+--------+
|  1|  Manish| 26| 75000|       bihar|nominee1|
|  2|  Nikita| 23|100000|uttarpradesh|nominee2|
|  3|  Pritam| 22|150000|   Bangalore|   India|
|  4|Prantosh| 17|200000|     Kolkata|   India|
|  5|  Vikash| 31|300000|        null|nominee5|
+---+--------+---+------+------------+--------+



In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
employee_df.select("name").show()  #STRING method to select columns

+--------+
|    name|
+--------+
|  Manish|
|  Nikita|
|  Pritam|
|Prantosh|
|  Vikash|
+--------+



In [0]:
employee_df.select(col("name")).show()  #STRING method to select columns

+--------+
|    name|
+--------+
|  Manish|
|  Nikita|
|  Pritam|
|Prantosh|
|  Vikash|
+--------+



In [0]:
employee_df.select(col("id")+5).show()  # When to use COLUMN method

+--------+
|(id + 5)|
+--------+
|       6|
|       7|
|       8|
|       9|
|      10|
+--------+



In [0]:
employee_df.select("id", "name", "age").show()

+---+--------+---+
| id|    name|age|
+---+--------+---+
|  1|  Manish| 26|
|  2|  Nikita| 23|
|  3|  Pritam| 22|
|  4|Prantosh| 17|
|  5|  Vikash| 31|
+---+--------+---+



In [0]:
employee_df.select(col("id"), col("name"), col("age")).show()

+---+--------+---+
| id|    name|age|
+---+--------+---+
|  1|  Manish| 26|
|  2|  Nikita| 23|
|  3|  Pritam| 22|
|  4|Prantosh| 17|
|  5|  Vikash| 31|
+---+--------+---+



In [0]:
employee_df.select("id", col("name"), employee_df["salary"], employee_df.address).show()

+---+--------+------+------------+
| id|    name|salary|     address|
+---+--------+------+------------+
|  1|  Manish| 75000|       bihar|
|  2|  Nikita|100000|uttarpradesh|
|  3|  Pritam|150000|   Bangalore|
|  4|Prantosh|200000|     Kolkata|
|  5|  Vikash|300000|        null|
+---+--------+------+------------+



In [0]:
employee_df.select("id+5").show()

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-2145342719754695>:1[0m
[0;32m----> 1[0m [43memployee_df[49m[38;5;241;43m.[39;49m[43mselect[49m[43m([49m[38;5;124;43m"[39;49m[38;5;124;43mid+5[39;49m[38;5;124;43m"[39;49m[43m)[49m[38;5;241m.[39mshow()

File [0;32m/databricks/spark/python/pyspark/instrumentation_utils.py:48[0m, in [0;36m_wrap_function.<locals>.wrapper[0;34m(*args, **kwargs)[0m
[1;32m     46[0m start [38;5;241m=[39m time[38;5;241m.[39mperf_counter()
[1;32m     47[0m [38;5;28;01mtry[39;00m:
[0;32m---> 48[0m     res [38;5;241m=[39m [43mfunc[49m[43m([49m[38;5;241;43m*[39;49m[43margs[49m[43m,[49m[43m [49m[38;5;241;43m*[39;49m[38;5;241;43m*[39;49m[43mkwargs[49m[43m)[49m
[1;32m     49[0m     logger[38;5;241m.[39mlog_success(
[1;32m     50[0m         module_name, class_nam

In [0]:
employee_df.select(expr("id+5")).show()

+--------+
|(id + 5)|
+--------+
|       6|
|       7|
|       8|
|       9|
|      10|
+--------+



In [0]:
employee_df.select(expr("id as employee_id"), expr("name as employee_name"), expr("concat(name,address)")).show()

+-----------+-------------+---------------------+
|employee_id|employee_name|concat(name, address)|
+-----------+-------------+---------------------+
|          1|       Manish|          Manishbihar|
|          2|       Nikita|   Nikitauttarpradesh|
|          3|       Pritam|      PritamBangalore|
|          4|     Prantosh|      PrantoshKolkata|
|          5|       Vikash|                 null|
+-----------+-------------+---------------------+



In [0]:
employee_df.select("*").show()

+---+--------+---+------+------------+--------+
| id|    name|age|salary|     address| nominee|
+---+--------+---+------+------------+--------+
|  1|  Manish| 26| 75000|       bihar|nominee1|
|  2|  Nikita| 23|100000|uttarpradesh|nominee2|
|  3|  Pritam| 22|150000|   Bangalore|   India|
|  4|Prantosh| 17|200000|     Kolkata|   India|
|  5|  Vikash| 31|300000|        null|nominee5|
+---+--------+---+------+------------+--------+



Spark SQL

In [0]:
employee_df.createOrReplaceTempView("employee_tbl")

In [0]:
spark.sql("""
          select * from employee_tbl
          """).show()

+---+--------+---+------+------------+--------+
| id|    name|age|salary|     address| nominee|
+---+--------+---+------+------------+--------+
|  1|  Manish| 26| 75000|       bihar|nominee1|
|  2|  Nikita| 23|100000|uttarpradesh|nominee2|
|  3|  Pritam| 22|150000|   Bangalore|   India|
|  4|Prantosh| 17|200000|     Kolkata|   India|
|  5|  Vikash| 31|300000|        null|nominee5|
+---+--------+---+------+------------+--------+

