In [1]:
# Bootstrap cell. findspark.init() is called before the pyspark imports below
# (presumably so the local Spark installation becomes importable -- confirm for
# your environment); keep this statement order when editing the cell.
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
# Obtain the SparkSession through the builder and bind it to the name `spark`,
# which the cells below use.
spark = SparkSession.builder.getOrCreate()
# Set the Spark SQL shuffle-partition setting to "5" on this session.
spark.conf.set("spark.sql.shuffle.partitions", "5")
print("SparkSession created with name as 'spark'")

SparkSession created with name as 'spark'


In [3]:
# Load the 2015 flight-summary JSON file into `df`; no schema is supplied here,
# so the reader determines the column types itself.
df = (
    spark.read
    .format("json")
    .load("../../../data/SparkTheDefinitiveGuide/flight-data/json/2015-summary.json")
)

#### Schema

In [4]:
# Print the DataFrame's schema as a tree: column name, type and nullability.
df.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)



In [6]:
# Read the same file again and echo only its schema object (a StructType)
# rather than keeping the resulting DataFrame.
(
    spark.read
    .format("json")
    .load("../../../data/SparkTheDefinitiveGuide/flight-data/json/2015-summary.json")
    .schema
)

StructType(List(StructField(DEST_COUNTRY_NAME,StringType,true),StructField(ORIGIN_COUNTRY_NAME,StringType,true),StructField(count,LongType,true)))

In [7]:
from pyspark.sql.types import StructField, StructType, StringType, LongType
# Hand-written schema for the flight-summary file, passed to the reader instead
# of relying on inference. The "count" field is declared non-nullable and
# carries a metadata dictionary.
myManualSchema = StructType([
    StructField("DEST_COUNTRY_NAME", StringType(), nullable=True),
    StructField("ORIGIN_COUNTRY_NAME", StringType(), nullable=True),
    StructField("count", LongType(), nullable=False, metadata={"hello": "world"}),
])
# Re-read the file using the manual schema; this rebinds `df`.
df = (
    spark.read
    .format("json")
    .schema(myManualSchema)
    .load("../../../data/SparkTheDefinitiveGuide/flight-data/json/2015-summary.json")
)

#### Columns ('col' is equivalent to 'expr' for a bare column name)

In [8]:
from pyspark.sql.functions import col, column
# Two ways to build a column reference from a name. Only the last expression
# of a cell is echoed, so the value produced by col(...) is not displayed.
col("someColumnName")
column("someColumnName")

Column<b'someColumnName'>

In [11]:
from pyspark.sql.functions import expr
# Build a Column from an expression written as a string; the column names
# inside the string are not tied to any DataFrame at this point.
expr("(((someCol + 5) * 200) - 6) < otherCol")

Column<b'((((someCol + 5) * 200) - 6) < otherCol)'>

#### Record / Row

In [12]:
# Fetch the first record of the DataFrame; it is returned as a Row object.
df.first()

Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15)

In [15]:
from pyspark.sql import Row
# Create a Row from positional values only (no field names are given).
myRow = Row("Hello", None, 1, False)
# Echo the row so its repr is shown as the cell output.
myRow

<Row('Hello', None, 1, False)>

In [17]:
# Row values can be read by position; index 0 is the first value.
myRow[0]

'Hello'

#### DataFrame Transformations

In [18]:
# Register `df` as a temporary view under the name "dfTable" so that it can be
# referred to by that name from Spark SQL.
df.createOrReplaceTempView("dfTable")

In [19]:
from pyspark.sql import Row
from pyspark.sql.types import StructField, StructType, StringType, LongType
# Three-column schema for a hand-built DataFrame: two nullable string columns
# and one non-nullable long column. This rebinds `myManualSchema`.
myManualSchema = StructType([
    StructField("some", StringType(), nullable=True),
    StructField("col", StringType(), nullable=True),
    StructField("names", LongType(), nullable=False),
])
# A single row whose positional values line up with the three schema fields.
myRow = Row("Hello", None, 1)
# Build a one-row DataFrame from the row and the schema, then print it.
myDf = spark.createDataFrame([myRow], schema=myManualSchema)
myDf.show()

+-----+----+-----+
| some| col|names|
+-----+----+-----+
|Hello|null|    1|
+-----+----+-----+

