In [12]:
import os
import sys
from pyspark.sql import *
from pyspark.sql.functions import *

# Point both PySpark interpreter variables at the Python executable that is
# running this kernel, so Spark uses the same interpreter as the notebook.
os.environ.update({
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON': sys.executable,
})

In [2]:

# Build (or reuse, via getOrCreate) the notebook's SparkSession, using the
# `local[3]` master and with Hive support enabled.
spark = (
    SparkSession.builder
    .appName("Spark Exploring Columns Demo")
    .master("local[3]")
    .enableHiveSupport()
    .getOrCreate()
)

# Columns can be referenced either as Column objects or by column-name strings

In [3]:
# Load sample.csv through the CSV reader with the "header" option turned on.
# No schema and no inferSchema option are supplied here.
df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("sample.csv")
)

# Preview the first five rows.
df.show(5)

+--------------+---+------+--------------+-----+-------------+--------------+---------+--------------+--------------+-----------+------------+----------+------------+----------------+----------+----------+------------------+-------------------------+-----------------------+------------+----------+-----------------------+---------------------+------------------+---------------+--------+
|     Timestamp|Age|Gender|       Country|state|self_employed|family_history|treatment|work_interfere|  no_employees|remote_work|tech_company|  benefits|care_options|wellness_program| seek_help| anonymity|             leave|mental_health_consequence|phys_health_consequence|   coworkers|supervisor|mental_health_interview|phys_health_interview|mental_vs_physical|obs_consequence|comments|
+--------------+---+------+--------------+-----+-------------+--------------+---------+--------------+--------------+-----------+------------+----------+------------+----------------+----------+----------+-----------------

In [13]:
# A single select() mixing three ways of referring to a column.
df.select(
    df.Timestamp,    # attribute access on the DataFrame
    "age",           # plain column-name string (shown as "age" in the output header)
    col('Gender'),   # Column object built with col()
).show(5)

+--------------+---+------+
|     Timestamp|age|Gender|
+--------------+---+------+
|27-08-14 11:29| 37|Female|
|27-08-14 11:29| 44|     M|
|27-08-14 11:29| 32|  Male|
|27-08-14 11:29| 31|  Male|
|27-08-14 11:30| 31|  Male|
+--------------+---+------+
only showing top 5 rows



# Create a column expression by combining two or more columns

In [14]:
# Approach 1: describe the combined column as a SQL expression string and
# hand it to expr(); the alias is declared inside the SQL text itself.
df.select(
    df.Timestamp,
    "age",
    col('Gender'),
    expr("concat(Gender, Age) as Gnder_Age"),
).show(5)

+--------------+---+------+---------+
|     Timestamp|age|Gender|Gnder_Age|
+--------------+---+------+---------+
|27-08-14 11:29| 37|Female| Female37|
|27-08-14 11:29| 44|     M|      M44|
|27-08-14 11:29| 32|  Male|   Male32|
|27-08-14 11:29| 31|  Male|   Male31|
|27-08-14 11:30| 31|  Male|   Male31|
+--------------+---+------+---------+
only showing top 5 rows



In [16]:
# Approach 2: the same combined column expressed with Column-object
# functions, using concat() for the value and .alias() for the name.
df.select(
    df.Timestamp,
    "age",
    col('Gender'),
    concat("Gender", "Age").alias("Gnder_Age"),
).show(5)

+--------------+---+------+---------+
|     Timestamp|age|Gender|Gnder_Age|
+--------------+---+------+---------+
|27-08-14 11:29| 37|Female| Female37|
|27-08-14 11:29| 44|     M|      M44|
|27-08-14 11:29| 32|  Male|   Male32|
|27-08-14 11:29| 31|  Male|   Male31|
|27-08-14 11:30| 31|  Male|   Male31|
+--------------+---+------+---------+
only showing top 5 rows

