In [None]:
# Display the active session object. `spark` is not created anywhere in this
# notebook — presumably the notebook environment injects it; TODO confirm.
spark

# Spark DataFrame

## How to create a dataframe

In [None]:
from pyspark.sql import functions as F
from pyspark.sql import types as T

#### createDataFrame and toDF
`createDataFrame()` is the primary method of `SparkSession` for creating a dataframe from other data. Another way is the `toDF()` method of an RDD object.

### Using list of tuple

In [None]:
# Each tuple is one row: (department name, department id).
dept = [
    ("Finance", 10),
    ("Marketing", 20),
    ("Sales", 30),
    ("IT", 40),
]

In [None]:
# Only column names are supplied here, so Spark infers each column's type
# from the rows themselves.
deptColumns = ["dept_name", "dept_id"]

deptDF = spark.createDataFrame(dept, schema=deptColumns)

#### printSchema
`printSchema()` is used to show column schemas of a dataframe in output

In [None]:
# Only column names were passed for deptDF, so the types printed here are the
# ones Spark inferred from the data.
deptDF.printSchema()

#### show
`show()` is used to display dataframe data in a text table. Its two main parameters are `n`, the number of rows to display (default 20), and `truncate`, a boolean or a number that controls how wide a column value may be before its output is trimmed.

In [None]:
# truncate=False prints full column values instead of trimming them.
deptDF.show(truncate=False)

To specify the schema explicitly instead of letting Spark infer it from the data, we create a `StructType` instance

In [None]:
# Explicit schema: column names, types and nullability are declared up front
# rather than inferred from the rows.
deptSchema = T.StructType([
    T.StructField(name="dept_name", dataType=T.StringType(), nullable=True),
    T.StructField(name="dept_id", dataType=T.LongType(), nullable=True),
])

deptDF1 = spark.createDataFrame(dept, schema=deptSchema)
deptDF1.printSchema()
deptDF1.show(truncate=False)

### Using list of Row (Spark data type)

In [None]:
# Same rows as the tuple-based example, but each row is a Spark Row object.
dept2 = [
    T.Row("Finance", 10),
    T.Row("Marketing", 20),
    T.Row("Sales", 30),
    T.Row("IT", 40),
]

# Pass dept2 (the Row list), not the tuple list `dept`, so this section
# really demonstrates building a DataFrame from Row input.
deptDF2 = spark.createDataFrame(data=dept2, schema=deptColumns)

In [None]:
# Print the schema of the DataFrame created in the "list of Row" section.
deptDF2.printSchema()

In [None]:
# Print the rows of deptDF2 without trimming column values.
deptDF2.show(truncate=False)

#### collect
`show()` only prints the data; it does not return the dataframe's data to the **driver** program. Just as an RDD has `collect()` to retrieve all data into the **driver**, a dataframe has one too. However, each item returned is of type `Row`.

In [None]:
# Retrieve all rows of deptDF; the cell output shows the returned Row items.
deptDF.collect()

In [None]:
# Retrieve all rows of deptDF2, for comparison with deptDF.collect().
deptDF2.collect()

## DataFrame and RDD relations

In [None]:
# (language, users_count) pairs; note that the counts are string values.
data = [
    ("Java", "20000"),
    ("Python", "100000"),
    ("Scala", "3000"),
]

# Turn the local list into an RDD through the session's SparkContext.
rdd = spark.sparkContext.parallelize(data)

### Without schema and column names

Convert an RDD into a DataFrame without column names or a schema. Spark will infer the schema and assign default column names `_n` (where n is the position of the column, starting from 1).

In [None]:
# No names or schema are passed, so both are left to Spark
# (default column names follow the `_n` pattern described above).
dfFromRDD1 = rdd.toDF()
dfFromRDD1.printSchema()

### With column names

In [None]:
columns = ["language", "users_count"]

# Rebinds dfFromRDD1, this time with explicit column names.
dfFromRDD1 = rdd.toDF(schema=columns)
dfFromRDD1.printSchema()

Another way to pass column names in `toDF()`

In [None]:
# createDataFrame(rdd) is called without names; the chained toDF(*columns)
# then receives each column name as a separate positional argument.
dfFromRDD2 = spark.createDataFrame(rdd).toDF(*columns)
dfFromRDD2.printSchema()

Alternatively, `Row` objects can be used as the data collection, with the column names passed as an argument

In [None]:
# Wrap every tuple in a Row. A list (not a lazy `map` iterator) is used so
# that rowData can be inspected or reused after the DataFrame is created;
# a `map` object would be exhausted after its first consumption.
rowData = [T.Row(*record) for record in data]

dfFromData3 = spark.createDataFrame(rowData, columns)
dfFromData3.printSchema()

## Empty DataFrame

Because an empty dataset has no rows, Spark cannot infer a schema from it. You must pass a schema argument to `createDataFrame()` and `toDF()`

In [None]:
# Three nullable string columns; with no rows to inspect, the schema has to
# be declared explicitly.
schema = T.StructType([
    T.StructField(name="firstName", dataType=T.StringType(), nullable=True),
    T.StructField(name="middleName", dataType=T.StringType(), nullable=True),
    T.StructField(name="lastName", dataType=T.StringType(), nullable=True),
])

# Start from an empty RDD and attach the schema to it.
emptyRDD = spark.sparkContext.emptyRDD()
emptyDF1 = spark.createDataFrame(emptyRDD, schema)
emptyDF1.printSchema()
print("Total rows: " + str(emptyDF1.count()))

Alternative ways to create an empty dataframe with the same schema as above

In [None]:
# Two more routes to an empty DataFrame with `schema`:
# from an empty parallelized collection, and from an empty local list.
emptyDF2 = spark.sparkContext.parallelize([]).toDF(schema)
emptyDF3 = spark.createDataFrame([], schema)

## Schema

In [None]:
# Rebinds `schema` for the city example: two string columns and one integer
# column, all nullable.
schema = T.StructType([
    T.StructField(name="city", dataType=T.StringType(), nullable=True),
    T.StructField(name="dates", dataType=T.StringType(), nullable=True),
    T.StructField(name="population", dataType=T.IntegerType(), nullable=True),
])

In [None]:
# Column-wise sample values; the inconsistent date separators and the stray
# whitespace / '~' in the city names are part of the sample data as written.
dates = [
    "1991-02-25",
    "1998-05-10",
    "1993/03/15",
    "1992/07/17",
]
cities = [
    "Caracas",
    "Ccs",
    "   São Paulo   ",
    "~Madrid",
]
population = [37800000, 19795791, 12341418, 6489162]

# Stitch the three columns together into (city, date, population) rows.
data = list(zip(cities, dates, population))

In [None]:
# Build the DataFrame from (city, date, population) rows with the explicit schema
df = spark.createDataFrame(
    list(zip(cities, dates, population)),
    schema=schema,
)

df.show(truncate=False)

## DataFrame Shape

In [None]:
# (name, age) rows; column types are left for Spark to infer.
data = [
    ("Scott", 50),
    ("Jeff", 45),
    ("Thomas", 54),
    ("Ann", 34),
]
sparkDF = spark.createDataFrame(data, schema=["name", "age"])

In [None]:
# Shape as a (row count, column count) tuple: count() gives the rows,
# len(columns) gives the columns.
print("Shape: " + str((sparkDF.count(), len(sparkDF.columns))))

In [None]:
def sparkShape(dataFrame):
    """Return the shape of a DataFrame as a ``(rows, columns)`` tuple.

    Args:
        dataFrame: Object exposing ``count()`` and a ``columns`` sequence.

    Returns:
        tuple: ``(dataFrame.count(), len(dataFrame.columns))``.
    """
    n_rows = dataFrame.count()
    n_cols = len(dataFrame.columns)
    return (n_rows, n_cols)

Extending Spark DataFrame methods

In [None]:
import pyspark
# Attach sparkShape to the DataFrame class itself so it can be called as a
# method on DataFrame instances. Because it is assigned as a plain function,
# it must be invoked with parentheses: `someDF.shape()`.
pyspark.sql.dataframe.DataFrame.shape = sparkShape

In [None]:
# Call the newly attached method; it returns the (rows, columns) tuple
# produced by sparkShape.
print(sparkDF.shape())