In [1]:
from pyspark.sql.types import *  # Necessary for creating schemas
from pyspark.sql.functions import * # Importing PySpark functions

## Creating DataFrames
### Making a Simple DataFrame from a Tuple List

In [2]:
# Sample data: a list of (letter, number) tuples, one tuple per future row
a_list = [
    ('a', 1),
    ('b', 2),
    ('c', 3),
]

# Build a Spark DataFrame straight from the tuple list. No schema is passed,
# so column names and types are left for Spark to work out.
df_from_list_no_schema = sqlContext.createDataFrame(a_list)

In [9]:
# Register the DataFrame under the name "data" so it shows up in the session catalog.
# `createOrReplaceTempView` is the replacement for `registerTempTable`, which has
# been deprecated since Spark 2.0.
df_from_list_no_schema.createOrReplaceTempView("data")

In [10]:
# List the tables/views currently known to the session catalog
spark.catalog.listTables()

[Table(name='data', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]

In [28]:
# Three views of the same DataFrame:
#   1st: its string form (column names and types only)
#   2nd: the rows gathered into a Python list via `collect()`
#   3rd: the tabular preview printed by `show()`
print("1st O/P : %s \n" % (df_from_list_no_schema,))
print("2nd O/P : %s \n" % (df_from_list_no_schema.collect(),))
df_from_list_no_schema.show()

1st O/P : DataFrame[_1: string, _2: bigint] 

2nd O/P : [Row(_1='a', _2=1), Row(_1='b', _2=2), Row(_1='c', _2=3)] 

+---+---+
| _1| _2|
+---+---+
|  a|  1|
|  b|  2|
|  c|  3|
+---+---+



### Making a Simple DataFrame from a Tuple List and a Schema

In [29]:
# Same tuple list as before, but now with a schema.
# The schema here is the simplest kind: just a list of column names.
df_from_list_with_schema = sqlContext.createDataFrame(
    a_list,
    schema=['letters', 'numbers'],
)

In [30]:
# Print a tabular preview of the DataFrame; the columns now carry the
# supplied names (`letters`, `numbers`)
df_from_list_with_schema.show()

# Print the schema as a tree: one line per column with its type and nullability
df_from_list_with_schema.printSchema()

+-------+-------+
|letters|numbers|
+-------+-------+
|      a|      1|
|      b|      2|
|      c|      3|
+-------+-------+

root
 |-- letters: string (nullable = true)
 |-- numbers: long (nullable = true)



### Making a Simple DataFrame from a List of Dictionaries

In [31]:
# Sample data as a list of dictionaries: one dict per row, keyed by column name
a_dict = [
    dict(letters='a', numbers=1),
    dict(letters='b', numbers=2),
    dict(letters='c', numbers=3),
]

In [42]:
# Create a Spark DataFrame from the list of dictionaries; the dict keys
# become the column names
df_from_dict = sqlContext.createDataFrame(a_dict) # NOTE: this call emits a warning (presumably about inferring a schema from dicts -- confirm on your Spark version)

# Print a tabular preview of the DataFrame
df_from_dict.show()

# `dtypes`: list of (column name, type string) pairs; displayed as the cell result
df_from_dict.dtypes



+-------+-------+
|letters|numbers|
+-------+-------+
|      a|      1|
|      b|      2|
|      c|      3|
+-------+-------+



[('letters', 'string'), ('numbers', 'bigint')]

## Making a Simple DataFrame Using a StructType Schema + RDD

In [41]:
# Explicit schema: one field per column, given as (name, data type, nullable)
schema = (
    StructType()
    .add('letters', StringType(), True)
    .add('numbers', IntegerType(), True)
)

In [43]:
# Turn the tuple list into an RDD using the SparkContext (`sc`)
rdd = sc.parallelize(a_list)

In [45]:
# Combine the two raw ingredients -- the RDD of tuples and the explicit
# StructType schema -- into a DataFrame
nice_df = sqlContext.createDataFrame(data=rdd, schema=schema)

### Some Functions for Inspecting the DataFrame

In [47]:
# `columns`: the DataFrame's column names, as a Python list of strings
nice_df.columns

['letters', 'numbers']

In [48]:
# `dtypes`: a list of (column name, type string) pairs, one per column
nice_df.dtypes

[('letters', 'string'), ('numbers', 'int')]

In [49]:
# `printSchema()`: prints the DataFrame's schema as a tree --
# one line per column with its type and nullability
nice_df.printSchema()

root
 |-- letters: string (nullable = true)
 |-- numbers: integer (nullable = true)



In [50]:
# `schema`: the DataFrame's schema as a `StructType` object
# (the same kind of object that was passed in when `nice_df` was created)
nice_df.schema

StructType(List(StructField(letters,StringType,true),StructField(numbers,IntegerType,true)))

In [56]:
# Three ways to fetch the leading row(s) of a DataFrame.
# Note: the DataFrame only has 3 rows, so asking for 4 returns all 3.

# `first()`: returns the first row as a single Row
print(nice_df.first()) # can't supply a value; never a list

print(nice_df.head(4)) # can optionally supply a value (default: 1);
                       # with n > 1, a list

print(nice_df.take(4)) # always expects a value; always a list

Row(letters='a', numbers=1)
[Row(letters='a', numbers=1), Row(letters='b', numbers=2), Row(letters='c', numbers=3)]
[Row(letters='a', numbers=1), Row(letters='b', numbers=2), Row(letters='c', numbers=3)]


In [57]:
# `count()`: the number of rows in the DataFrame
nice_df.count()

3

In [58]:
# `describe()`: builds a summary DataFrame (count, mean, stddev, min, max) per column;
# for the string column, mean and stddev come out as null
nice_df.describe().show() # can optionally supply a list of column names

+-------+-------+-------+
|summary|letters|numbers|
+-------+-------+-------+
|  count|      3|      3|
|   mean|   null|    2.0|
| stddev|   null|    1.0|
|    min|      a|      1|
|    max|      c|      3|
+-------+-------+-------+



In [59]:
# `explain()`: prints the physical plan Spark would use to evaluate the DataFrame
nice_df.explain()

== Physical Plan ==
Scan ExistingRDD[letters#145,numbers#146]


## Simple DataFrame Manipulation Functions

###  unionAll(): combine two DataFrames together
###  orderBy(): perform sorting of DataFrame columns
###  select(): select which DataFrame columns to retain
###  drop(): select a single DataFrame column to remove
###  filter(): retain DataFrame rows that match a condition

In [68]:
# Stack the rows of `nice_df` underneath the rows of `df_from_dict`
# (two different DataFrames that happen to hold the same data)
df_from_dict.unionAll(nice_df).show()

+-------+-------+
|letters|numbers|
+-------+-------+
|      a|      1|
|      b|      2|
|      c|      3|
|      a|      1|
|      b|      2|
|      c|      3|
+-------+-------+



In [75]:
# Chain two unions to stack three DataFrames that hold the same rows.
# The result keeps the column names of the first DataFrame in the chain.
(df_from_dict
 .unionAll(nice_df)
 .unionAll(df_from_list_no_schema)
 .show())

+-------+-------+
|letters|numbers|
+-------+-------+
|      a|      1|
|      b|      2|
|      c|      3|
|      a|      1|
|      b|      2|
|      c|      3|
|      a|      1|
|      b|      2|
|      c|      3|
+-------+-------+



In [77]:
# `unionAll()` lines columns up by POSITION, not by name. Here the left side has
# its columns swapped (`numbers`, `letters`) while the right side is in the original
# order, so letters end up under `numbers` and vice versa, and both columns are
# coerced to string to hold the mixed values.
# Build the union once and reuse it, rather than repeating the whole expression.
swapped_union_df = nice_df.select(['numbers', 'letters']).unionAll(nice_df)

swapped_union_df.show()
swapped_union_df.printSchema()

+-------+-------+
|numbers|letters|
+-------+-------+
|      1|      a|
|      2|      b|
|      3|      c|
|      a|      1|
|      b|      2|
|      c|      3|
+-------+-------+

root
 |-- numbers: string (nullable = true)
 |-- letters: string (nullable = true)



In [80]:
# Triple the DataFrame with two unions, then sort the rows by `numbers`
# (ascending is the default direction)
(nice_df
 .unionAll(nice_df)
 .unionAll(nice_df)
 .orderBy('numbers')
 .show())

+-------+-------+
|letters|numbers|
+-------+-------+
|      a|      1|
|      a|      1|
|      a|      1|
|      b|      2|
|      b|      2|
|      b|      2|
|      c|      3|
|      c|      3|
|      c|      3|
+-------+-------+



In [81]:
# Same tripled DataFrame, this time sorted by `numbers` from largest to smallest
(nice_df
 .unionAll(nice_df)
 .unionAll(nice_df)
 .orderBy('numbers', ascending=False)
 .show())

+-------+-------+
|letters|numbers|
+-------+-------+
|      c|      3|
|      c|      3|
|      c|      3|
|      b|      2|
|      b|      2|
|      b|      2|
|      a|      1|
|      a|      1|
|      a|      1|
+-------+-------+



In [82]:
# `select()` keeps only the named columns; `drop()` removes the named column.
# Here `select()` is given a single column name, so only `letters` is retained.
(nice_df
 .select('letters')
 .show())

+-------+
|letters|
+-------+
|      a|
|      b|
|      c|
+-------+



In [85]:
# `select()` also controls column order: name the columns in the order you want them
(nice_df
 .select('numbers', 'letters')
 .show())

+-------+-------+
|numbers|letters|
+-------+-------+
|      1|      a|
|      2|      b|
|      3|      c|
+-------+-------+



In [86]:
# Drop the `letters` column (the first column of the DF), leaving only `numbers`
(nice_df
 .drop('letters')
 .show())

+-------+
|numbers|
+-------+
|      1|
|      2|
|      3|
+-------+



In [87]:
# `filter()` keeps only the rows for which a column condition holds.
# Numeric columns can be compared with any of: >, <, >=, <=, ==, !=

# Keep the rows whose `numbers` value is greater than 1
(nice_df
 .filter(nice_df['numbers'] > 1)
 .show())

+-------+-------+
|letters|numbers|
+-------+-------+
|      b|      2|
|      c|      3|
+-------+-------+



In [89]:
# Filters can be chained: a row must pass every one of them to be kept
# (here: `numbers` strictly between 1 and 3)
(nice_df
 .filter(nice_df['numbers'] > 1)
 .filter(nice_df['numbers'] < 3)
 .show())

+-------+-------+
|letters|numbers|
+-------+-------+
|      b|      2|
+-------+-------+



In [90]:
# Filtering is not limited to numbers: combine `filter()` with `isin()`
# to keep rows whose string column matches one of a set of values
(nice_df
 .filter(nice_df['letters'].isin('a', 'b'))
 .show())

+-------+-------+
|letters|numbers|
+-------+-------+
|      a|      1|
|      b|      2|
+-------+-------+



In [94]:
# Explicit schema for the nycflights13 CSV: one nullable field per column.
# Supplying it up front fixes the column types instead of leaving them to the reader.
nycflights_schema = StructType([
    StructField('year', IntegerType(), True),
    StructField('month', IntegerType(), True),
    StructField('day', IntegerType(), True),
    StructField('dep_time', StringType(), True),
    StructField('dep_delay', IntegerType(), True),
    StructField('arr_time', StringType(), True),
    StructField('arr_delay', IntegerType(), True),
    StructField('carrier', StringType(), True),
    StructField('tailnum', StringType(), True),
    StructField('flight', StringType(), True),
    StructField('origin', StringType(), True),
    StructField('dest', StringType(), True),
    StructField('air_time', IntegerType(), True),
    StructField('distance', IntegerType(), True),
    StructField('hour', IntegerType(), True),
    StructField('minute', IntegerType(), True),
])

# Read the CSV with that schema, treating the first line as a header.
# Uses the built-in CSV reader (Spark 2.0+) rather than the legacy external
# `com.databricks.spark.csv` package name.
nycflights = sqlContext.read.csv(
    'nycflights13.csv',
    schema=nycflights_schema,
    header=True,
)