In [1]:
# Initialise findspark before any pyspark import in the cells that follow.
# NOTE(review): presumably this locates the local Spark installation and makes
# pyspark importable — confirm against the findspark documentation.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Notebook-wide session: application name "MyApp", master "local[*]".
# getOrCreate() is the final call in the builder chain and yields the session
# object every later cell uses as `spark`.
spark = (
    SparkSession.builder
    .appName("MyApp")
    .master("local[*]")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/06/13 11:12:05 WARN Utils: Your hostname, kirans-mac.local, resolves to a loopback address: 127.0.0.1; using 172.18.197.149 instead (on interface en0)
25/06/13 11:12:05 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/13 11:12:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 50864)
Traceback (most recent call last):
  File "/opt/miniconda3/lib/python3.11/socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/miniconda3/lib/p

# Create DataFrame from range

In [5]:
# Explicit-start spelling of spark.range(5): a single `id` column holding the
# values 0 through 4 (see the recorded output).
df = spark.range(0, 5)
df.show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+



# create single column DataFrame using list

In [7]:
# NOTE(review): explicit names are preferable to a wildcard import; it is kept
# here because other cells in this notebook may depend on the names it exposes.
from pyspark.sql.types import *

# Plain Python integers that will fill the single column.
ages_list = [10, 20, 30]

# The element type is passed explicitly: per the original note, the call fails
# when the type is left out. The resulting column is named `value`.
df = spark.createDataFrame(ages_list, schema=IntegerType())
df.show()

+-----+
|value|
+-----+
|   10|
|   20|
|   30|
+-----+



In [9]:
# NOTE(review): explicit names are preferable to a wildcard import; it is kept
# here because other cells in this notebook may depend on the names it exposes.
from pyspark.sql.types import *

# Plain Python strings that will fill the single column.
names = ['kiran', 'kuamr', 'chinta']

# As with the integer list, the element type is passed explicitly: per the
# original note, the call fails when the type is left out.
df = spark.createDataFrame(names, schema=StringType())
df.show()

+------+
| value|
+------+
| kiran|
| kuamr|
|chinta|
+------+



# create DataFrame using list of Tuples

In [10]:
# One tuple per row: (id, name, age).
user_list = [
    (1, 'kiran', 10),
    (2, 'kumar', 20),
    (3, 'chinta', 30),
]

# No schema is supplied, so the columns get the positional names _1, _2, _3
# (see the recorded output).
df3 = spark.createDataFrame(user_list)
df3.show()

+---+------+---+
| _1|    _2| _3|
+---+------+---+
|  1| kiran| 10|
|  2| kumar| 20|
|  3|chinta| 30|
+---+------+---+



In [11]:
# Reuse user_list, this time naming and typing the columns with a
# comma-separated "<name> <type>" schema string.
df4 = spark.createDataFrame(
    user_list,
    schema='user_id int, name string, age int',
)
df4.show()

+-------+------+---+
|user_id|  name|age|
+-------+------+---+
|      1| kiran| 10|
|      2| kumar| 20|
|      3|chinta| 30|
+-------+------+---+



# create DataFrame using list of Lists

In [12]:
# Same records as the tuple example, but each row is a Python list.
# This rebinds user_list for the cells that follow.
user_list = [
    [1, 'kiran', 10],
    [2, 'kumar', 20],
    [3, 'chinta', 30],
]

# Without a schema the columns again get the positional names _1, _2, _3.
df5 = spark.createDataFrame(user_list)
df5.show()

+---+------+---+
| _1|    _2| _3|
+---+------+---+
|  1| kiran| 10|
|  2| kumar| 20|
|  3|chinta| 30|
+---+------+---+



In [13]:
# Reuse user_list with an explicit "<name> <type>" schema string so the
# columns are named and typed.
df6 = spark.createDataFrame(
    user_list,
    schema='user_id int, name string, age int',
)
df6.show()

+-------+------+---+
|user_id|  name|age|
+-------+------+---+
|      1| kiran| 10|
|      2| kumar| 20|
|      3|chinta| 30|
+-------+------+---+



# create DataFrame using list of Dicts

In [15]:
# One dict per row; the keys supply the column names.
users_list = [
    {'user_id': 1, 'name': 'kiran', 'age': 10},
    {'user_id': 2, 'name': 'kumar', 'age': 20},
    {'user_id': 3, 'name': 'chinta', 'age': 30},
]

# In the recorded output the columns appear as age, name, user_id rather than
# in the order the keys were written.
df7 = spark.createDataFrame(users_list)
df7.show()

+---+------+-------+
|age|  name|user_id|
+---+------+-------+
| 10| kiran|      1|
| 20| kumar|      2|
| 30|chinta|      3|
+---+------+-------+



# create DataFrame using Row

In [16]:
# Import Row explicitly so this cell does not depend on a wildcard import or
# leftover kernel state from another cell.
from pyspark.sql import Row

# Expand each user dict into a Row (keys become field names) and pass a
# concrete list instead of a one-shot generator. The recorded output shows the
# columns in the dicts' key order: user_id, name, age.
df8 = spark.createDataFrame([Row(**user) for user in users_list])

df8.show()

+-------+------+---+
|user_id|  name|age|
+-------+------+---+
|      1| kiran| 10|
|      2| kumar| 20|
|      3|chinta| 30|
+-------+------+---+



# create DataFrame using pandas DataFrame

In [58]:
import pandas as pd

# Same three user records as in the list-of-dicts example.
users_list = [
    {'user_id': 1, 'name': 'kiran', 'age': 10},
    {'user_id': 2, 'name': 'kumar', 'age': 20},
    {'user_id': 3, 'name': 'chinta', 'age': 30},
]

# Build a pandas DataFrame first, then hand it to Spark.
pandas_df = pd.DataFrame(users_list)

# NOTE(review): this rebinds df7, replacing the DataFrame built from the list
# of dicts. The recorded run of this cell ended in ConnectionRefusedError —
# presumably the Spark backend was no longer reachable; re-run on a fresh
# session to confirm the cell itself works.
df7 = spark.createDataFrame(pandas_df)
df7.show()

ConnectionRefusedError: [Errno 61] Connection refused

# describe spark dataframe

In [17]:
# Print the rows of df8, the DataFrame the following cells inspect.
df8.show()

+-------+------+---+
|user_id|  name|age|
+-------+------+---+
|      1| kiran| 10|
|      2| kumar| 20|
|      3|chinta| 30|
+-------+------+---+



In [18]:
# Column names of df8 as a plain Python list.
df8.columns

['user_id', 'name', 'age']

In [19]:
# (column name, type name) pairs; the recorded run reports bigint for both
# numeric columns and string for name.
df8.dtypes

[('user_id', 'bigint'), ('name', 'string'), ('age', 'bigint')]

In [20]:
# Tree view of the schema: each column with its type and nullability.
df8.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)



# create DataFrame with Schema(string)

In [28]:
import datetime

# Row layout: (user_id, name, age, dob, slot, available, sal).
users_list = [
    (1, 'kiran', 10, datetime.date(1990, 10, 15),
     datetime.datetime(1990, 2, 13, 1, 15), True, 1200.34),
    (2, 'kumar', 20, datetime.date(1991, 11, 15),
     datetime.datetime(1990, 4, 1, 1, 15), False, 2200.34),
    (3, 'chinta', 30, datetime.date(1991, 12, 15),
     datetime.datetime(1990, 6, 5, 1, 15), True, 4200.34),
]

# The whole schema is one string of comma-separated "<name> <TYPE>" pairs,
# listed in the same order as the tuple fields.
schema = 'user_id INT, name STRING, age INT, dob DATE, slot TIMESTAMP, available BOOLEAN, sal FLOAT'
df9 = spark.createDataFrame(users_list, schema=schema)
df9.show()

+-------+------+---+----------+-------------------+---------+-------+
|user_id|  name|age|       dob|               slot|available|    sal|
+-------+------+---+----------+-------------------+---------+-------+
|      1| kiran| 10|1990-10-15|1990-02-13 01:15:00|     true|1200.34|
|      2| kumar| 20|1991-11-15|1990-04-01 01:15:00|    false|2200.34|
|      3|chinta| 30|1991-12-15|1990-06-05 01:15:00|     true|4200.34|
+-------+------+---+----------+-------------------+---------+-------+



In [29]:
# Confirm that the types named in the schema string were applied to df9.
df9.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- dob: date (nullable = true)
 |-- slot: timestamp (nullable = true)
 |-- available: boolean (nullable = true)
 |-- sal: float (nullable = true)



# create DataFrame with Schema(list)

In [30]:
import datetime

# Row layout: seven fields per tuple, matching the seven schema entries below.
users_list = [
    (1, 'kiran', 10, datetime.date(1990, 10, 15),
     datetime.datetime(1990, 2, 13, 1, 15), True, 1200.34),
    (2, 'kumar', 20, datetime.date(1991, 11, 15),
     datetime.datetime(1990, 4, 1, 1, 15), False, 2200.34),
    (3, 'chinta', 30, datetime.date(1991, 12, 15),
     datetime.datetime(1990, 6, 5, 1, 15), True, 4200.34),
]

# Here the schema is a list with one string per column. Each entry is used
# verbatim as the column name: the recorded output has columns literally
# called "user_id INT", "name STRING", and so on.
# NOTE(review): if plain column names were intended, list only the names
# (e.g. 'user_id') — confirm the intent of this example.
schema = [
    'user_id INT',
    'name STRING',
    'age INT',
    'dob DATE',
    'slot TIMESTAMP',
    'available BOOLEAN',
    'sal FLOAT',
]
df9 = spark.createDataFrame(users_list, schema=schema)
df9.show()

+-----------+-----------+-------+----------+-------------------+-----------------+---------+
|user_id INT|name STRING|age INT|  dob DATE|     slot TIMESTAMP|available BOOLEAN|sal FLOAT|
+-----------+-----------+-------+----------+-------------------+-----------------+---------+
|          1|      kiran|     10|1990-10-15|1990-02-13 01:15:00|             true|  1200.34|
|          2|      kumar|     20|1991-11-15|1990-04-01 01:15:00|            false|  2200.34|
|          3|     chinta|     30|1991-12-15|1990-06-05 01:15:00|             true|  4200.34|
+-----------+-----------+-------+----------+-------------------+-----------------+---------+



# create DataFrame with Schema(sparkTypes/structTypes)

In [35]:
import datetime

# Import the type classes explicitly so this cell does not depend on a
# wildcard import executed in some other cell.
from pyspark.sql.types import (
    BooleanType,
    DateType,
    FloatType,
    IntegerType,
    StringType,
    StructField,
    StructType,
    TimestampType,
)

# Row layout: seven fields per tuple, matching the seven StructFields below.
users_list = [
    (1, 'kiran', 10, datetime.date(1990, 10, 15),
     datetime.datetime(1990, 2, 13, 1, 15), True, 1200.34),
    (2, 'kumar', 20, datetime.date(1991, 11, 15),
     datetime.datetime(1990, 4, 1, 1, 15), False, 2200.34),
    (3, 'chinta', 30, datetime.date(1991, 12, 15),
     datetime.datetime(1990, 6, 5, 1, 15), True, 4200.34),
]

# Schema built from Spark type objects: one StructField (name, type) per
# column. The list-of-strings schema that used to be assigned just before this
# was overwritten immediately and never used, so it has been dropped.
# NOTE(review): the first field is named 'id' here, whereas the other examples
# call it 'user_id' — kept as-is to match the recorded output.
schema = StructType([
    StructField('id', IntegerType()),
    StructField('name', StringType()),
    StructField('age', IntegerType()),
    StructField('dob', DateType()),
    StructField('slot', TimestampType()),
    StructField('available', BooleanType()),
    StructField('sal', FloatType()),
])

df9 = spark.createDataFrame(users_list, schema=schema)
df9.show()

+---+------+---+----------+-------------------+---------+-------+
| id|  name|age|       dob|               slot|available|    sal|
+---+------+---+----------+-------------------+---------+-------+
|  1| kiran| 10|1990-10-15|1990-02-13 01:15:00|     true|1200.34|
|  2| kumar| 20|1991-11-15|1990-04-01 01:15:00|    false|2200.34|
|  3|chinta| 30|1991-12-15|1990-06-05 01:15:00|     true|4200.34|
+---+------+---+----------+-------------------+---------+-------+



# Special datatypes in Spark

- python --> spark
- list --> array
- dict --> map
- Row --> struct (a record with named fields)

# Create DataFrame with ARRAY datatype

In [38]:
# Each user carries a Python list of phone numbers; some entries are None to
# show how missing elements are displayed.
users_list = [
    {'user_id': 1, 'name': 'kiran', 'age': 10, 'phone_numbers': [1234, 5434]},
    {'user_id': 2, 'name': 'kumar', 'age': 20, 'phone_numbers': [246, 135]},
    {'user_id': 3, 'name': 'chinta', 'age': 30, 'phone_numbers': [456, 789]},
    {'user_id': 4, 'name': 'goats', 'age': 40, 'phone_numbers': [None, 789]},
    {'user_id': 5, 'name': 'manchi', 'age': 50, 'phone_numbers': [456, None]},
    {'user_id': 6, 'name': 'unknown', 'age': 60, 'phone_numbers': [None, None]},
]

# No schema is given; the recorded schema shows phone_numbers inferred as an
# array column.
df10 = spark.createDataFrame(users_list)
df10.show()

+---+-------+-------------+-------+
|age|   name|phone_numbers|user_id|
+---+-------+-------------+-------+
| 10|  kiran| [1234, 5434]|      1|
| 20|  kumar|   [246, 135]|      2|
| 30| chinta|   [456, 789]|      3|
| 40|  goats|  [NULL, 789]|      4|
| 50| manchi|  [456, NULL]|      5|
| 60|unknown| [NULL, NULL]|      6|
+---+-------+-------------+-------+



In [39]:
# Inspect the inferred schema; phone_numbers is reported as an array.
df10.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)
 |-- phone_numbers: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- user_id: long (nullable = true)



In [41]:
from pyspark.sql.functions import explode

# explode flattens the array into rows, not columns: every element gets its
# own output row, with the element in a new column named `col`. NULL elements
# also produce a row (see the recorded output).
(
    df10
    .select('*', explode('phone_numbers'))
    .show()
)

+---+-------+-------------+-------+----+
|age|   name|phone_numbers|user_id| col|
+---+-------+-------------+-------+----+
| 10|  kiran| [1234, 5434]|      1|1234|
| 10|  kiran| [1234, 5434]|      1|5434|
| 20|  kumar|   [246, 135]|      2| 246|
| 20|  kumar|   [246, 135]|      2| 135|
| 30| chinta|   [456, 789]|      3| 456|
| 30| chinta|   [456, 789]|      3| 789|
| 40|  goats|  [NULL, 789]|      4|NULL|
| 40|  goats|  [NULL, 789]|      4| 789|
| 50| manchi|  [456, NULL]|      5| 456|
| 50| manchi|  [456, NULL]|      5|NULL|
| 60|unknown| [NULL, NULL]|      6|NULL|
| 60|unknown| [NULL, NULL]|      6|NULL|
+---+-------+-------------+-------+----+



In [46]:
from pyspark.sql.functions import col

# Turn the array into two separate columns by indexing it by position:
# element 0 becomes home_number and element 1 becomes mobile_number.
(
    df10
    .select(
        'user_id',
        col('phone_numbers')[0].alias('home_number'),
        col('phone_numbers')[1].alias('mobile_number'),
    )
    .show()
)

+-------+-----------+-------------+
|user_id|home_number|mobile_number|
+-------+-----------+-------------+
|      1|       1234|         5434|
|      2|        246|          135|
|      3|        456|          789|
|      4|       NULL|          789|
|      5|        456|         NULL|
|      6|       NULL|         NULL|
+-------+-----------+-------------+



# Create DataFrame with MAP datatype

In [48]:
# Each user carries a dict of phone numbers keyed by "home" / "mobile"; some
# values are None to show how missing entries are displayed.
users_list = [
    {'user_id': 1, 'name': 'kiran', 'age': 10,
     'phone_numbers': {"home": 123, "mobile": 456}},
    {'user_id': 2, 'name': 'kumar', 'age': 20,
     'phone_numbers': {"home": 246, "mobile": 680}},
    {'user_id': 3, 'name': 'chinta', 'age': 30,
     'phone_numbers': {"home": 135, "mobile": 579}},
    {'user_id': 4, 'name': 'goats', 'age': 40,
     'phone_numbers': {"home": 111, "mobile": None}},
    {'user_id': 5, 'name': 'manchi', 'age': 50,
     'phone_numbers': {"home": None, "mobile": 222}},
    {'user_id': 6, 'name': 'unknown', 'age': 60,
     'phone_numbers': {"home": None, "mobile": None}},
]

# No schema is given; the recorded schema shows phone_numbers inferred as a
# map column. truncate=False prints the full map text in each cell.
df11 = spark.createDataFrame(users_list)
df11.show(truncate=False)

+---+-------+------------------------------+-------+
|age|name   |phone_numbers                 |user_id|
+---+-------+------------------------------+-------+
|10 |kiran  |{mobile -> 456, home -> 123}  |1      |
|20 |kumar  |{mobile -> 680, home -> 246}  |2      |
|30 |chinta |{mobile -> 579, home -> 135}  |3      |
|40 |goats  |{mobile -> NULL, home -> 111} |4      |
|50 |manchi |{mobile -> 222, home -> NULL} |5      |
|60 |unknown|{mobile -> NULL, home -> NULL}|6      |
+---+-------+------------------------------+-------+



In [49]:
# Inspect the inferred schema; phone_numbers is reported as a map with string
# keys.
df11.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)
 |-- phone_numbers: map (nullable = true)
 |    |-- key: string
 |    |-- value: long (valueContainsNull = true)
 |-- user_id: long (nullable = true)



In [50]:
from pyspark.sql.functions import explode

# On a map column, explode emits one output row per map entry and adds two
# columns, `key` and `value`. Entries whose value is NULL also produce a row
# (see the recorded output).
(
    df11
    .select('*', explode('phone_numbers'))
    .show()
)

+---+-------+--------------------+-------+------+-----+
|age|   name|       phone_numbers|user_id|   key|value|
+---+-------+--------------------+-------+------+-----+
| 10|  kiran|{mobile -> 456, h...|      1|mobile|  456|
| 10|  kiran|{mobile -> 456, h...|      1|  home|  123|
| 20|  kumar|{mobile -> 680, h...|      2|mobile|  680|
| 20|  kumar|{mobile -> 680, h...|      2|  home|  246|
| 30| chinta|{mobile -> 579, h...|      3|mobile|  579|
| 30| chinta|{mobile -> 579, h...|      3|  home|  135|
| 40|  goats|{mobile -> NULL, ...|      4|mobile| NULL|
| 40|  goats|{mobile -> NULL, ...|      4|  home|  111|
| 50| manchi|{mobile -> 222, h...|      5|mobile|  222|
| 50| manchi|{mobile -> 222, h...|      5|  home| NULL|
| 60|unknown|{mobile -> NULL, ...|      6|mobile| NULL|
| 60|unknown|{mobile -> NULL, ...|      6|  home| NULL|
+---+-------+--------------------+-------+------+-----+



In [51]:
from pyspark.sql.functions import col

# Turn the map into two separate columns by looking each key up by name:
# 'home' becomes home_number and 'mobile' becomes mobile_number.
(
    df11
    .select(
        'user_id',
        col('phone_numbers')['home'].alias('home_number'),
        col('phone_numbers')['mobile'].alias('mobile_number'),
    )
    .show()
)

+-------+-----------+-------------+
|user_id|home_number|mobile_number|
+-------+-----------+-------------+
|      1|        123|          456|
|      2|        246|          680|
|      3|        135|          579|
|      4|        111|         NULL|
|      5|       NULL|          222|
|      6|       NULL|         NULL|
+-------+-----------+-------------+



# create DataFrame using STRUCT datatype

In [53]:
# Import Row explicitly so this cell does not depend on a wildcard import or
# leftover kernel state from another cell.
from pyspark.sql import Row

# Each user carries a Row with named fields; the recorded schema shows
# phone_numbers inferred as a struct with fields `mobile` and `home`.
# Every user has the same placeholder numbers in this example.
users_list = [
    {'user_id': 1, 'name': 'kiran', 'age': 10,
     'phone_numbers': Row(mobile=123, home=456)},
    {'user_id': 2, 'name': 'kumar', 'age': 20,
     'phone_numbers': Row(mobile=123, home=456)},
    {'user_id': 3, 'name': 'chinta', 'age': 30,
     'phone_numbers': Row(mobile=123, home=456)},
    {'user_id': 4, 'name': 'goats', 'age': 40,
     'phone_numbers': Row(mobile=123, home=456)},
    {'user_id': 5, 'name': 'manchi', 'age': 50,
     'phone_numbers': Row(mobile=123, home=456)},
    {'user_id': 6, 'name': 'unknown', 'age': 60,
     'phone_numbers': Row(mobile=123, home=456)},
]

# truncate=False prints the full struct text in each cell.
df12 = spark.createDataFrame(users_list)
df12.show(truncate=False)

+---+-------+-------------+-------+
|age|name   |phone_numbers|user_id|
+---+-------+-------------+-------+
|10 |kiran  |{123, 456}   |1      |
|20 |kumar  |{123, 456}   |2      |
|30 |chinta |{123, 456}   |3      |
|40 |goats  |{123, 456}   |4      |
|50 |manchi |{123, 456}   |5      |
|60 |unknown|{123, 456}   |6      |
+---+-------+-------------+-------+



In [54]:
# Inspect the inferred schema; phone_numbers is reported as a struct with the
# nested fields mobile and home.
df12.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)
 |-- phone_numbers: struct (nullable = true)
 |    |-- mobile: long (nullable = true)
 |    |-- home: long (nullable = true)
 |-- user_id: long (nullable = true)



In [56]:
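# explode() does not work on a struct column (it expects an array or a map);
# read the struct's fields with dot notation instead, e.g. 'phone_numbers.mobile'.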

In [57]:
# Pull the nested struct fields out as top-level columns using
# "<struct column>.<field>" paths; the result columns are named after the
# fields (mobile, home).
(
    df12
    .select(
        'user_id',
        'phone_numbers.mobile',
        'phone_numbers.home',
    )
    .show()
)

+-------+------+----+
|user_id|mobile|home|
+-------+------+----+
|      1|   123| 456|
|      2|   123| 456|
|      3|   123| 456|
|      4|   123| 456|
|      5|   123| 456|
|      6|   123| 456|
+-------+------+----+

