# DataFrame - Creation

## Prepare the Spark session

In [None]:
# Make the local Spark installation importable from this Python process
import findspark

findspark.init()

# Spark classes needed to configure and open a session
# (imported only after findspark.init() has run)
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Describe the application: its name and a local master ('local[*]')
conf = SparkConf().setAppName('mds-session').setMaster('local[*]')

# Reuse the already running session if there is one, otherwise create it
spark = SparkSession.builder.config(conf=conf).getOrCreate()

## From RDD with explicit schema

In [None]:
# Retrieve the SparkContext behind the session (needed for the RDD API)
sc = spark.sparkContext

# Read the file as an RDD of text lines
kickstarter = sc.textFile('./data/live.tsv')

# Function used to parse each line into a (country, amount) tuple
def parseKickstarter(line):
    """Extract the country and the amount from one tab-separated line.

    The country is taken from the 4th column as-is and the amount from the
    2nd column converted to int. Raises IndexError when the line has fewer
    than four columns and ValueError when the amount is not an integer.
    """
    columns = line.split('\t')
    country = columns[3]
    amount = int(columns[1])
    return (country, amount)

# Parse the required fields from the file
# (kickstarter is rebound from the RDD of raw lines to the RDD of parsed values)
kickstarter = kickstarter.map(parseKickstarter)

# Take a glimpse of the data: fetch the first 5 parsed elements
kickstarter.take(5)

In [None]:
# Import the spark objects needed
from pyspark.sql.types import StructField, StructType, StringType, LongType

# Define the schema explicitly: one StructField(name, type, nullable) per column
fields = [
    StructField('country', StringType(), True),
    StructField('amount', LongType(), False),
]
schema = StructType(fields)

# Create the DataFrame from the RDD, applying the schema defined above
kick_df = spark.createDataFrame(kickstarter, schema)

In [None]:
# Check the type of the object returned by createDataFrame
type(kick_df)

In [None]:
# Take a glimpse of the DataFrame: print its first 10 rows
kick_df.show(10)

In [None]:
# Review the schema of the DataFrame (column names, types and nullability)
kick_df.printSchema()

## From Row RDD and inferring the schema

In [None]:
from pyspark.sql.types import Row

# Retrieve the SparkContext behind the session (needed for the RDD API)
sc = spark.sparkContext

# Read the file again as an RDD of text lines
kickstarter = sc.textFile('./data/live.tsv')

# Function used to parse each line and wrap the required fields in a Row
def parseKickstarter(line):
    """Build a Row(country, amount) from one tab-separated line.

    The country is taken from the 4th column and the amount from the 2nd.
    Both values are kept as the strings produced by the split; the amount
    is not converted to a number here.
    """
    columns = line.split('\t')
    country, amount = columns[3], columns[1]
    return Row(country=country, amount=amount)

# Parse the required fields as Row (one Row per line of the file)
row_kickstarter = kickstarter.map(parseKickstarter)

# Take a glimpse of the data: fetch the first 10 Row objects
row_kickstarter.take(10)

In [None]:
# Create the DataFrame without specifying schema
# (no schema argument is passed, so Spark has to infer it from the Row objects)
kick_df = spark.createDataFrame(row_kickstarter)

In [None]:
# Check the type of the object returned by createDataFrame
type(kick_df)

In [None]:
# Take a glimpse of the DataFrame: print its first 10 rows
kick_df.show(10)

In [None]:
# Review the schema that was inferred for the DataFrame
kick_df.printSchema()

## From structured data source

In [None]:
# Read the file naming only the source format (no CSV option is set,
# so the reader works with its defaults)
kick_df = spark.read.format('csv').load('./data/live.tsv')
kick_df.printSchema()

In [None]:
# Read the file specifying CSV options: tab as separator, no header row
kick_df = spark.read.csv('./data/live.tsv', sep='\t', header=False)
kick_df.printSchema()

In [None]:
# Read the file specifying CSV options (tab separator, no header row)
# and asking Spark to infer the column types from the data
kick_df = spark.read.csv(
    './data/live.tsv',
    sep='\t',
    header=False,
    inferSchema=True,
)
kick_df.printSchema()

In [None]:
# Names to assign to the columns, listed in column order
column_names = (
    'id',
    'amt_pledged',
    'by',
    'country',
    'currency',
    'end_time',
    'location',
    'percentage_funded',
    'state',
    'title',
    'type',
    'url',
)

# Create a copy of the DataFrame setting column names
kick_df = kick_df.toDF(*column_names)
kick_df.printSchema()

## Close the session

In [None]:
# Stop the session and its underlying SparkContext.
# PySpark's SparkSession exposes stop(); it has no close() method, so the
# previous call failed with AttributeError and left the session running.
spark.stop()