# DataFrame - Operations

## Prepare the Spark session

In [None]:
# Configure the environment through findspark before pyspark is imported
import findspark

findspark.init()

# Spark classes needed to configure and open the session
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Build the configuration in a single chained expression (each setter hands back
# the configuration object), then create the session or reuse an existing one
conf = SparkConf().setAppName('mds-session').setMaster('local[*]')
spark = SparkSession.builder.config(conf=conf).getOrCreate()

## Read a sample DataFrame 

In [None]:
# Read the tab-separated file (no header row, inferred column types) and keep
# only columns _c3 and _c1, renamed to 'country' and 'amount'
kick_df = (
    spark.read
    .options(sep='\t', header=False, inferSchema=True)
    .csv('./data/live.tsv')
    .select('_c3', '_c1')
    .toDF('country', 'amount')
)
kick_df.printSchema()

## Structure inspection

In [None]:
# Inspect the column types
kick_df.dtypes

In [None]:
# Inspect the schema object
kick_df.schema

In [None]:
# Print the schema in a human-readable form
kick_df.printSchema()

In [None]:
# Inspect the column names
kick_df.columns

In [None]:
# Count the rows of the DataFrame
kick_df.count()

## Content inspection

In [None]:
# Display the first 5 rows of the DataFrame
kick_df.show(5)

In [None]:
# Fetch the top 5 rows of the DataFrame with head()
kick_df.head(5)

In [None]:
# Fetch the bottom 5 rows of the DataFrame with tail()
kick_df.tail(5)

In [None]:
# Fetch the top 5 rows of the DataFrame with take()
kick_df.take(5)

In [None]:
# Fetch only the first row of the DataFrame
kick_df.first()

In [None]:
# Display basic statistics of the columns
kick_df.describe().show()

## Column operations

### Column reference

In [None]:
# Reference the column as an attribute of the DataFrame
kick_df.amount

In [None]:
# Reference the column by name with item access
kick_df['amount']

### Column transformation

In [None]:
# Rename the column with an alias (the result is not assigned, so kick_df is unchanged)
kick_df.amount.alias('quantity')

In [None]:
# Import the target type from spark
from pyspark.sql.types import FloatType

# Cast the column to the imported type
kick_df.amount.cast(FloatType())

In [None]:
# Substring of a string column, called as substr(1, 1)
kick_df.country.substr(1, 1)

### Function application on columns

In [None]:
# Import functions from spark
from pyspark.sql import functions as F

In [None]:
# Case-when structure: 'Wrong' for negative amounts, 'High' above 1000, 'Low' otherwise
F.when(kick_df.amount < 0, 'Wrong').when(kick_df.amount > 1000, 'High').otherwise('Low')

In [None]:
# Absolute value of a column
F.abs(kick_df.amount)

In [None]:
# Square root of a column
F.sqrt(kick_df.amount)

In [None]:
# Column raised to the power of 3
F.pow(kick_df.amount, 3)

In [None]:
# Column rounded with a scale of 2
F.round(kick_df.amount, 2)

### Conditionals

In [None]:
# Condition on missing values
kick_df.amount.isNull()

In [None]:
# Condition on non-missing values
kick_df.amount.isNotNull()

In [None]:
# Condition on membership in a list of values
kick_df.country.isin(['ES', 'UK'])

In [None]:
# Pattern matching with like ('%ES%' is the pattern passed)
kick_df.country.like('%ES%')

In [None]:
# Substring matching (contains)
kick_df.country.contains('S')

In [None]:
# Prefix matching (startswith)
kick_df.country.startswith('E')

In [None]:
# Suffix matching (endswith)
kick_df.country.endswith('S')

In [None]:
# Value in the range given by the two bounds
kick_df.amount.between(1000, 2000)

### Sorting

In [None]:
# The cells in this section only build sort expressions on a column; they are
# meant to be passed to orderBy (as done later in this notebook) to sort rows.

# Ascending with default null treatment
kick_df.amount.asc()

In [None]:
# Ascending with nulls first
kick_df.amount.asc_nulls_first()

In [None]:
# Ascending with nulls last
kick_df.amount.asc_nulls_last()

In [None]:
# Descending with default null treatment
kick_df.amount.desc()

In [None]:
# Descending with nulls first
kick_df.amount.desc_nulls_first()

In [None]:
# Descending with nulls last
kick_df.amount.desc_nulls_last()

## Table operations

### Select

In [None]:
# Show all columns (no select needed)
kick_df.show(5)

In [None]:
# Single column selection by name
kick_df.select('amount').show(5)

In [None]:
# Multiple column selection by name
kick_df.select('country', 'amount').show(5)

In [None]:
# Column selection by column reference
kick_df.select(kick_df.amount).show(5)

In [None]:
# Arithmetic applied directly to a column inside select
kick_df.select((kick_df.amount / 1000)).show(5)

In [None]:
# Several column expressions in one select: the raw column, an arithmetic
# result, a boolean comparison, a case-when label and a rounded value
kick_df.select(
    kick_df.amount,
    (kick_df.amount / 1000),
    (kick_df.amount > 1000),
    F.when(kick_df.amount > 1000, 'High').otherwise('Low'),
    F.round(kick_df.amount / 1000, 1)
).show(5)

In [None]:
# The same expressions, each given an explicit output name with alias()
kick_df.select(
    kick_df.amount,
    (kick_df.amount / 1000).alias('amount_k'),
    (kick_df.amount > 1000).alias('amount_over_1000'),
    F.when(kick_df.amount > 1000, 'High').otherwise('Low').alias('amount_high'),
    F.round(kick_df.amount / 1000, 1).alias('rounded_amount')
).show(5)

### Where

In [None]:
# Filtering rows by comparing a column with a literal value
kick_df.where(kick_df.country == 'GB').show(5)

In [None]:
# Filtering rows with a conditional column function (between)
kick_df.where(kick_df.amount.between(10000, 20000)).show(5)

### Sorting

In [None]:
# Sort by a single column, descending (ascending=False)
kick_df.select('country', 'amount').orderBy('amount', ascending=False).show(5)

In [None]:
# Sort by multiple columns with one ascending flag per column (both descending here)
kick_df.select('country', 'amount').orderBy(['country', 'amount'], ascending=[False, False]).show(5)

In [None]:
# Sort by column sort expressions (both ascending here)
kick_df.select('country', 'amount').orderBy(kick_df.country.asc(), kick_df.amount.asc()).show(5)

### Grouping and aggregation

In [None]:
# Add one derived column (double_amount = amount * 2) so there is more than one
# value column to aggregate. This rebinds kick_df: the cells below that use
# 'double_amount' depend on this cell having been run.
kick_df = kick_df.select('country', 'amount', (kick_df.amount * 2).alias('double_amount'))

In [None]:
# Grouping by column (yields the grouped object that the aggregations below are applied to)
kick_df.groupBy('country')

In [None]:
# Applying aggregation functions without naming columns (all non grouped columns)
kick_df.groupBy('country').sum().show(5)
kick_df.groupBy('country').mean().show(5)
kick_df.groupBy('country').min().show(5)
kick_df.groupBy('country').max().show(5)
kick_df.groupBy('country').count().show(5)

In [None]:
# Applying an aggregation function to a chosen non grouped column only
kick_df.groupBy('country').sum('amount').show(5)

In [None]:
# Applying a different aggregation function to each column via agg()
kick_df.groupBy('country').agg(
    F.sum('amount'),
    F.mean('double_amount')
).show(5)

### Join

In [None]:
# First join input: columns _c6 and _c1 of the live file, renamed to
# 'location' and 'amount'
live_df = (
    spark.read
    .options(sep='\t', header=False, inferSchema=True)
    .csv('./data/live.tsv')
    .select('_c6', '_c1')
    .toDF('location', 'amount')
)
live_df.show(5)

In [None]:
# Second join input: columns _c6 and _c1 of the most-backed file, renamed to
# 'location' and 'amount'
most_df = (
    spark.read
    .options(sep='\t', header=False, inferSchema=True)
    .csv('./data/most-backed.tsv')
    .select('_c6', '_c1')
    .toDF('location', 'amount')
)
most_df.show(5)

In [None]:
# Cartesian product of both DataFrames. crossJoin states the intent explicitly
# and does not rely on the session accepting a join() without a condition.
live_df.crossJoin(most_df).show(5)

In [None]:
# Inner join on a column name shared by both DataFrames
live_df.join(most_df, on='location', how='inner').show(5)

In [None]:
# Left join on an explicit condition between one column of each DataFrame
# (both happen to be called 'location' here)
live_df.join(most_df, (live_df.location == most_df.location), how='left').show(5)

### Adding/replacing/renaming columns

In [None]:
# None of the cells in this section reassign kick_df: each result is only displayed.

# Adding a column under a new name
kick_df.withColumn('half_amount', kick_df.amount / 2).show(5)

In [None]:
# Replacing a column by reusing its existing name (double_amount, cast to float)
kick_df.withColumn('double_amount', kick_df.double_amount.cast(FloatType())).show(5)

In [None]:
# Renaming a column
kick_df.withColumnRenamed('double_amount', 'amount_x_2').show(5)

## Conversion

In [None]:
# From DataFrame to RDD (showing the first 5 elements)
kick_df.rdd.take(5)

In [None]:
# From DataFrame to pandas (showing the head of the converted frame)
# NOTE(review): toPandas() presumably brings the whole DataFrame into local
# memory - fine for this sample, confirm before using on large data
kick_df.toPandas().head()

## Persistence

In [None]:
# Write to file specifying format. Overwrite mode lets this cell be re-run:
# with the default save mode the write fails once the output path exists.
kick_df.write.mode('overwrite').csv('./data/output/')

In [None]:
# Write to file specifying format and CSV options (tab separator, header row).
# Overwrite mode lets this cell be re-run: with the default save mode the
# write fails once the output path exists.
kick_df.write.mode('overwrite').options(sep='\t', header=True).csv('./data/output_options/')

In [None]:
# Write to file in another format (Parquet). Overwrite mode lets this cell be
# re-run: with the default save mode the write fails once the output path exists.
kick_df.write.mode('overwrite').parquet('./data/output_parquet/')

## Close the session

In [None]:
# Stop the session. PySpark's SparkSession provides stop(); it has no close()
# method, so the previous call raised AttributeError.
spark.stop()