In [27]:
# Bootstrap step executed before the cell that imports pyspark.
# NOTE(review): findspark.init() presumably locates the local Spark
# installation and makes `pyspark` importable — confirm against the findspark docs.
import findspark
findspark.init()

In [28]:
from pyspark.sql import SparkSession
# Build (or reuse) the notebook's SparkSession: application name "MyApp",
# master "local[*]", with the spark-avro package requested through
# `spark.jars.packages`.
spark = (
    SparkSession.builder
    .appName("MyApp")
    .master("local[*]")
    .config("spark.jars.packages", "org.apache.spark:spark-avro_2.12:3.4.0")
    .getOrCreate()
)

# create dataframe

In [45]:
# Two sample employee rows. The trailing space in the first names and the
# leading space in ' machi' are kept on purpose: the trimming and padding
# cells further down operate on exactly that whitespace.
data = (
    (1, 'kiran ', 'chinta', 150000.00, 'usa', '+1 232 343 7889', '000 00 0000'),
    (2, 'goats ', ' machi', 100000.00, 'ind', '+91 111 222 7889', '123 45 6789'),
)

# DDL-style schema string; adjacent literals are concatenated into one string.
schema = (
    "eid INT,first_name STRING,last_name STRING,salary FLOAT,"
    "country STRING,phone_number STRING,ssn STRING"
)

df = spark.createDataFrame(data, schema)
df.show()

+---+----------+---------+--------+-------+----------------+-----------+
|eid|first_name|last_name|  salary|country|    phone_number|        ssn|
+---+----------+---------+--------+-------+----------------+-----------+
|  1|    kiran |   chinta|150000.0|    usa| +1 232 343 7889|000 00 0000|
|  2|    goats |    machi|100000.0|    ind|+91 111 222 7889|123 45 6789|
+---+----------+---------+--------+-------+----------------+-----------+



# String manipulation functions
- case conversion - <i style="color:blue">lower, upper, initcap</i>
- getting length - <i style="color:blue">length</i>
- extracting substring - <i style="color:blue">substring, split</i>
- trimming - <i style="color:blue">trim, ltrim, rtrim</i>
- padding - <i style="color:blue">lpad, rpad</i>
- concatenation - <i style="color:blue">concat, concat_ws</i>

### case conversions

In [46]:
from pyspark.sql.functions import lower,upper, col, initcap
# Apply each case-conversion function to `first_name` in turn. Every pass
# adds one derived column to a fresh DataFrame and prints its first 2 rows;
# `df` itself is never modified.
for derived_name, convert_case in (
    ('lower_first_name', lower),
    ('upper_first_name', upper),
    ('init_first_name', initcap),
):
    df.withColumn(derived_name, convert_case('first_name')).show(2)

+---+----------+---------+--------+-------+----------------+-----------+----------------+
|eid|first_name|last_name|  salary|country|    phone_number|        ssn|lower_first_name|
+---+----------+---------+--------+-------+----------------+-----------+----------------+
|  1|    kiran |   chinta|150000.0|    usa| +1 232 343 7889|000 00 0000|          kiran |
|  2|    goats |    machi|100000.0|    ind|+91 111 222 7889|123 45 6789|          goats |
+---+----------+---------+--------+-------+----------------+-----------+----------------+

+---+----------+---------+--------+-------+----------------+-----------+----------------+
|eid|first_name|last_name|  salary|country|    phone_number|        ssn|upper_first_name|
+---+----------+---------+--------+-------+----------------+-----------+----------------+
|  1|    kiran |   chinta|150000.0|    usa| +1 232 343 7889|000 00 0000|          KIRAN |
|  2|    goats |    machi|100000.0|    ind|+91 111 222 7889|123 45 6789|          GOATS |
+---+----

# length

In [47]:
from pyspark.sql.functions import length, col

# Character count of `first_name`. The trailing space in the sample data is
# counted, which is why both rows report 6 rather than 5.
with_name_length = df.withColumn('name_length', length(col('first_name')))
with_name_length.show(2)

+---+----------+---------+--------+-------+----------------+-----------+-----------+
|eid|first_name|last_name|  salary|country|    phone_number|        ssn|name_length|
+---+----------+---------+--------+-------+----------------+-----------+-----------+
|  1|    kiran |   chinta|150000.0|    usa| +1 232 343 7889|000 00 0000|          6|
|  2|    goats |    machi|100000.0|    ind|+91 111 222 7889|123 45 6789|          6|
+---+----------+---------+--------+-------+----------------+-----------+-----------+



# substring

In [48]:
# substring -> ('col', 'start position', 'length')
# The start position is 1-based: position 1 is the first character.
from pyspark.sql.functions import substring, col

# First three characters of `first_name`. Start at 1 rather than 0 so the
# call reads correctly against the 1-based convention (Spark treats 0 the
# same as 1, so the result is unchanged).
df.withColumn('sub', substring('first_name', 1, 3)).show(2)

+---+----------+---------+--------+-------+----------------+-----------+---+
|eid|first_name|last_name|  salary|country|    phone_number|        ssn|sub|
+---+----------+---------+--------+-------+----------------+-----------+---+
|  1|    kiran |   chinta|150000.0|    usa| +1 232 343 7889|000 00 0000|kir|
|  2|    goats |    machi|100000.0|    ind|+91 111 222 7889|123 45 6789|goa|
+---+----------+---------+--------+-------+----------------+-----------+---+



In [49]:
# Last four digits of the SSN. A negative start position counts from the end
# of the string, so (-4, 4) yields exactly the final four characters and does
# not pick up the separator space that precedes them in 'NNN NN NNNN'.
df.withColumn('last_4_ssn', substring('ssn', -4, 4)).show(2)

+---+----------+---------+--------+-------+----------------+-----------+----------+
|eid|first_name|last_name|  salary|country|    phone_number|        ssn|last_4_ssn|
+---+----------+---------+--------+-------+----------------+-----------+----------+
|  1|    kiran |   chinta|150000.0|    usa| +1 232 343 7889|000 00 0000|      0000|
|  2|    goats |    machi|100000.0|    ind|+91 111 222 7889|123 45 6789|      6789|
+---+----------+---------+--------+-------+----------------+-----------+----------+



# split

In [50]:
# split(col, pattern, limit=<optional>) returns an array column of the pieces
# found between matches of `pattern`.
from pyspark.sql.functions import split, col

# Break the phone number on single spaces and keep the first piece
# (the leading '+NN' prefix in the sample data).
phone_parts = split('phone_number', ' ')
df.withColumn('area_code', phone_parts.getItem(0)).show(2)

+---+----------+---------+--------+-------+----------------+-----------+---------+
|eid|first_name|last_name|  salary|country|    phone_number|        ssn|area_code|
+---+----------+---------+--------+-------+----------------+-----------+---------+
|  1|    kiran |   chinta|150000.0|    usa| +1 232 343 7889|000 00 0000|       +1|
|  2|    goats |    machi|100000.0|    ind|+91 111 222 7889|123 45 6789|      +91|
+---+----------+---------+--------+-------+----------------+-----------+---------+



# trimming
- trim : removes spaces from both ends
- ltrim : removes spaces from the left end
- rtrim : removes spaces from the right end

In [52]:
from pyspark.sql.functions import trim
# Strip spaces from both ends of `first_name` into a new column.
trimmed_df = df.withColumn('new_trim_col', trim(col('first_name')))
trimmed_df.show(2)

+---+----------+---------+--------+-------+----------------+-----------+------------+
|eid|first_name|last_name|  salary|country|    phone_number|        ssn|new_trim_col|
+---+----------+---------+--------+-------+----------------+-----------+------------+
|  1|    kiran |   chinta|150000.0|    usa| +1 232 343 7889|000 00 0000|       kiran|
|  2|    goats |    machi|100000.0|    ind|+91 111 222 7889|123 45 6789|       goats|
+---+----------+---------+--------+-------+----------------+-----------+------------+



In [55]:
from pyspark.sql.functions import ltrim
# Strip leading spaces from `last_name` (the sample value ' machi' has one).
left_trimmed_df = df.withColumn('new_ltrim_col', ltrim(col('last_name')))
left_trimmed_df.show(2)

+---+----------+---------+--------+-------+----------------+-----------+-------------+
|eid|first_name|last_name|  salary|country|    phone_number|        ssn|new_ltrim_col|
+---+----------+---------+--------+-------+----------------+-----------+-------------+
|  1|    kiran |   chinta|150000.0|    usa| +1 232 343 7889|000 00 0000|       chinta|
|  2|    goats |    machi|100000.0|    ind|+91 111 222 7889|123 45 6789|        machi|
+---+----------+---------+--------+-------+----------------+-----------+-------------+



In [56]:
from pyspark.sql.functions import rtrim
# Strip trailing spaces from `first_name` (both sample values end in a space).
right_trimmed_df = df.withColumn('new_rtrim_col', rtrim(col('first_name')))
right_trimmed_df.show(2)

+---+----------+---------+--------+-------+----------------+-----------+-------------+
|eid|first_name|last_name|  salary|country|    phone_number|        ssn|new_rtrim_col|
+---+----------+---------+--------+-------+----------------+-----------+-------------+
|  1|    kiran |   chinta|150000.0|    usa| +1 232 343 7889|000 00 0000|        kiran|
|  2|    goats |    machi|100000.0|    ind|+91 111 222 7889|123 45 6789|        goats|
+---+----------+---------+--------+-------+----------------+-----------+-------------+



# padding
- pads values so that every value in the column has the same length

In [58]:
from pyspark.sql.functions import lpad

# Left-pad `last_name` with '*' up to a total width of 10 characters.
# Existing spaces count toward the width, so ' machi' keeps its leading space.
left_padded = lpad(col('last_name'), 10, '*')
df.withColumn('lpad_col', left_padded).show()

+---+----------+---------+--------+-------+----------------+-----------+----------+
|eid|first_name|last_name|  salary|country|    phone_number|        ssn|  lpad_col|
+---+----------+---------+--------+-------+----------------+-----------+----------+
|  1|    kiran |   chinta|150000.0|    usa| +1 232 343 7889|000 00 0000|****chinta|
|  2|    goats |    machi|100000.0|    ind|+91 111 222 7889|123 45 6789|**** machi|
+---+----------+---------+--------+-------+----------------+-----------+----------+



In [59]:
from pyspark.sql.functions import rpad

# Right-pad `last_name` with '*' up to a total width of 10 characters.
right_padded = rpad(col('last_name'), 10, '*')
df.withColumn('rpad_col', right_padded).show()

+---+----------+---------+--------+-------+----------------+-----------+----------+
|eid|first_name|last_name|  salary|country|    phone_number|        ssn|  rpad_col|
+---+----------+---------+--------+-------+----------------+-----------+----------+
|  1|    kiran |   chinta|150000.0|    usa| +1 232 343 7889|000 00 0000|chinta****|
|  2|    goats |    machi|100000.0|    ind|+91 111 222 7889|123 45 6789| machi****|
+---+----------+---------+--------+-------+----------------+-----------+----------+



# concatenation

In [60]:
from pyspark.sql.functions import concat, lit

# Join first and last name with a literal '-' between them. The names are
# used as stored, so their surrounding spaces appear in the result.
hyphenated_name = concat(col('first_name'), lit('-'), col('last_name'))
df.withColumn('full_name', hyphenated_name).show()

+---+----------+---------+--------+-------+----------------+-----------+-------------+
|eid|first_name|last_name|  salary|country|    phone_number|        ssn|    full_name|
+---+----------+---------+--------+-------+----------------+-----------+-------------+
|  1|    kiran |   chinta|150000.0|    usa| +1 232 343 7889|000 00 0000|kiran -chinta|
|  2|    goats |    machi|100000.0|    ind|+91 111 222 7889|123 45 6789|goats - machi|
+---+----------+---------+--------+-------+----------------+-----------+-------------+



In [62]:
from pyspark.sql.functions import concat_ws

# Same join using concat_ws, which takes the separator ('_') as its first
# argument followed by the columns to join.
underscored_name = concat_ws('_', col('first_name'), col('last_name'))
df.withColumn('full_name', underscored_name).show()

+---+----------+---------+--------+-------+----------------+-----------+-------------+
|eid|first_name|last_name|  salary|country|    phone_number|        ssn|    full_name|
+---+----------+---------+--------+-------+----------------+-----------+-------------+
|  1|    kiran |   chinta|150000.0|    usa| +1 232 343 7889|000 00 0000|kiran _chinta|
|  2|    goats |    machi|100000.0|    ind|+91 111 222 7889|123 45 6789|goats _ machi|
+---+----------+---------+--------+-------+----------------+-----------+-------------+



In [63]:
# Final cell: shut down the SparkSession created at the top of the notebook.
spark.stop()