In [1]:
import findspark
# Runs before the first `pyspark` import in the next cell.
# NOTE(review): presumably this locates the local Spark install and puts pyspark on sys.path — confirm it is still needed for this environment.
findspark.init()

In [2]:
from pyspark.sql import SparkSession
# Build a session named "MyApp" against the local[*] master, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("MyApp")
    .master("local[*]")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/06/11 23:02:14 WARN Utils: Your hostname, kirans-mac.local, resolves to a loopback address: 127.0.0.1; using 172.18.197.149 instead (on interface en0)
25/06/11 23:02:14 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/11 23:02:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/06/11 23:02:14 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/06/11 23:02:14 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


# version

In [5]:
# Version string of the running Spark context; displayed as the cell's output.
spark.sparkContext.version

'4.0.0'

# Range

In [6]:
# Single-column `id` DataFrame: 1, 3, 5, 7, 9 (start inclusive, end exclusive, step 2).
df = spark.range(start=1, end=10, step=2)

In [9]:
# Print the rows of the range DataFrame created in the previous cell.
df.show()

+---+
| id|
+---+
|  1|
|  3|
|  5|
|  7|
|  9|
+---+



In [11]:
# With a single argument the value is the exclusive end and ids start at 0 (0..9 in the output).
spark.range(10).show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
+---+



# Create dataframe

### using list

In [12]:
# Rows as a list of (string, int) tuples. No schema is supplied, so Spark infers the
# types and falls back to the positional column names _1 and _2.
lst = [('Robert', 35), ('James', 25)]
# The DataFrame is created once; the previous extra `spark.createDataFrame(data=lst),`
# statement built a second, discarded DataFrame wrapped in a one-element tuple.
df = spark.createDataFrame(data=lst)
df.show()

+------+---+
|    _1| _2|
+------+---+
|Robert| 35|
| James| 25|
+------+---+



### using dict

In [14]:
# One dict per row; the keys become the column names (the output lists them as age, name).
# The variable is called `records` instead of `dict` so the built-in `dict` type is not
# shadowed for every cell executed afterwards in this kernel.
records = ({"name": "robert", "age": 25}, {"name": "james", "age": 31})
df = spark.createDataFrame(records)
df.show()

+---+------+
|age|  name|
+---+------+
| 25|robert|
| 31| james|
+---+------+



### using rdd

In [16]:
# Turn the local rows in `lst` (defined in the "using list" cell) into an RDD first.
rdd = spark.sparkContext.parallelize(lst)
# Build the DataFrame from the RDD; without a schema the columns are again named _1 and _2.
df = spark.createDataFrame(data=rdd)
df.show()

+------+---+
|    _1| _2|
+------+---+
|Robert| 35|
| James| 25|
+------+---+



### using pandas dataframe

In [20]:
import pandas as pd
# Build a small pandas frame first, then hand it to Spark; the resulting Spark
# DataFrame keeps the pandas column names (Name, Age).
data = (('tom', 10), ('nick', 15), ('juli', 14))
df_pandas = pd.DataFrame(data, columns=['Name', 'Age'])
df = spark.createDataFrame(df_pandas)
df.show()

+----+---+
|Name|Age|
+----+---+
| tom| 10|
|nick| 15|
|juli| 14|
+----+---+



# sql()

In [22]:
# Expose the current `df` (the pandas-derived Name/Age frame) to SQL under the name
# "temp_table", creating the view or replacing an existing one with that name.
df.createOrReplaceTempView("temp_table")

In [24]:
# Query the temp view with SQL; the result is an ordinary DataFrame.
df2 = spark.sql("select * from temp_table")
df2.show()

+----+---+
|Name|Age|
+----+---+
| tom| 10|
|nick| 15|
|juli| 14|
+----+---+



# table()

In [25]:
# Load the same temp view by name, without writing a SQL statement.
df3 = spark.table("temp_table")
df3.show()

+----+---+
|Name|Age|
+----+---+
| tom| 10|
|nick| 15|
|juli| 14|
+----+---+



# conf

In [26]:
# Read the session time zone setting; the value comes back as a string.
spark.conf.get("spark.sql.session.timeZone")

'America/Chicago'

In [27]:
# Current shuffle partition setting before it is changed below ('200' in the recorded output).
spark.conf.get('spark.sql.shuffle.partitions')

'200'

In [30]:
# Override the shuffle partition setting at runtime; the int is read back as the string '300'.
spark.conf.set('spark.sql.shuffle.partitions',300)

In [31]:
# Read the setting back to confirm the override took effect.
spark.conf.get('spark.sql.shuffle.partitions')

'300'

# udf

In [35]:
from pyspark.sql.functions import udf
from pyspark.sql.types import *

@udf(IntegerType())
def name_length(name):
    """Return the number of characters in ``name`` for use as a column expression.

    Args:
        name: The string value from one row, or None when the column is null.

    Returns:
        ``len(name)``, or None (a SQL null) when ``name`` is None.
    """
    # Null column values reach a Python UDF as None; len(None) would raise a
    # TypeError inside the worker and fail the whole job.
    if name is None:
        return None
    return len(name)

# Apply the UDF to the Name column and show the result as an extra name_length column.
df.withColumn("name_length", name_length(df.Name)).show()

+----+---+-----------+
|Name|Age|name_length|
+----+---+-----------+
| tom| 10|          3|
|nick| 15|          4|
|juli| 14|          4|
+----+---+-----------+



In [38]:

def name_length(name):
    """Return the length of ``name``, or None when ``name`` is None (a SQL null).

    NOTE: this plain function rebinds the module-level name ``name_length``,
    replacing the ``@udf``-decorated version defined in the earlier cell.
    """
    # Null column values arrive as None; guard so one null row cannot fail the query.
    if name is None:
        return None
    return len(name)

# Register the function under the SQL name "name_length" so spark.sql queries can call it.
spark.udf.register("name_length", name_length, IntegerType())
# The column is referenced with its exact case (Name) so the query does not depend on
# case-insensitive column resolution.
spark.sql("SELECT *, name_length(Name) as name_length FROM temp_table").show()

25/06/11 23:28:37 WARN SimpleFunctionRegistry: The function name_length replaced a previously registered function.


+----+---+-----------+
|Name|Age|name_length|
+----+---+-----------+
| tom| 10|          3|
|nick| 15|          4|
|juli| 14|          4|
+----+---+-----------+

