## DataFrame Basics

In [1]:
# SparkSession is the object every later cell uses to read data
from pyspark.sql import SparkSession

# Build (or fetch) the session for this notebook, named "basics".
# Starting Spark locally can take a couple of minutes.
spark = (
    SparkSession.builder
    .appName("basics")
    .getOrCreate()
)

In [None]:
# Load the sample data.
# addFile() registers the remote CSV with the Spark job so it is downloaded,
# and SparkFiles.get() resolves the local path of that downloaded copy.
from pyspark import SparkFiles

url = "https://s3.amazonaws.com/zepl-trilogy-test/food.csv"
spark.sparkContext.addFile(url)

# header=True takes the column names from the first line of the file.
# No schema is supplied here; the column types are inspected with
# printSchema() below before an explicit schema is applied.
# (The data is displayed by the next cell, so this cell only loads it.)
df = spark.read.csv(SparkFiles.get("food.csv"), sep=",", header=True)

In [4]:
# Print the rows of the DataFrame as a text table
df.show()

+-------+-----+
|   food|price|
+-------+-----+
|  pizza| null|
|  sushi|   12|
|chinese|   10|
+-------+-----+



In [5]:
# Print the schema as a tree: one line per column with its type and nullability
df.printSchema()

root
 |-- food: string (nullable = true)
 |-- price: long (nullable = true)



In [6]:
# List the column names (returned as a plain Python list of strings)
df.columns

['food', 'price']

In [7]:
# Describe our data.
# describe() returns a new DataFrame (summary plus one string column per input
# column), so this cell only displays that DataFrame's signature; call
# .show() on the result to print the actual statistics.
df.describe()

DataFrame[summary: string, food: string, price: string]

In [8]:
# Import struct fields that we can use
from pyspark.sql.types import StructField, StringType, IntegerType, StructType

In [9]:
# Build the list of struct fields, one StructField(name, type, nullable) per
# column. The fields are listed in the same order as the CSV header
# (food, then price).
schema = [
    StructField("food", StringType(), True),
    StructField("price", IntegerType(), True),
]
schema

[StructField(price,IntegerType,true), StructField(food,StringType,true)]

In [10]:
# Wrap the list of fields in a StructType, which is what the reader accepts
# as a schema
final = StructType(schema)
final

StructType(List(StructField(price,IntegerType,true),StructField(food,StringType,true)))

In [11]:
# Re-read the same CSV, this time supplying our explicit schema instead of
# relying on the reader's default column types
dataframe = spark.read.csv(
    SparkFiles.get("food.csv"),
    sep=",",
    header=True,
    schema=final,
)
dataframe

DataFrame[price: int, food: string]

In [12]:
# Print the schema of the re-read DataFrame to confirm that the declared
# column types (price as integer) were applied
dataframe.printSchema()

root
 |-- price: integer (nullable = true)
 |-- food: string (nullable = true)



### Accessing data

In [13]:
# Indexing by column name yields a Column expression, not the data itself
dataframe["price"]

Column<b'price'>

In [14]:
# Confirm the type returned by bracket indexing: a pyspark Column
type(dataframe["price"])

pyspark.sql.column.Column

In [15]:
# select() returns a new DataFrame containing only the requested column
dataframe.select("price")

DataFrame[price: int]

In [16]:
# Unlike bracket indexing, select() gives back a DataFrame
type(dataframe.select("price"))

pyspark.sql.dataframe.DataFrame

In [17]:
# Because select() returns a DataFrame, it can be printed with show()
(
    dataframe
    .select("price")
    .show()
)

+-----+
|price|
+-----+
| null|
|   12|
|   10|
+-----+



### Manipulating Columns

In [18]:
# Add a new column: "newprice" is a copy of the existing "price" column
(
    dataframe
    .withColumn("newprice", dataframe["price"])
    .show()
)

+-----+-------+--------+
|price|   food|newprice|
+-----+-------+--------+
| null|  pizza|    null|
|   12|  sushi|      12|
|   10|chinese|      10|
+-----+-------+--------+



In [19]:
# Rename the "price" column to "newerprice".
# The result is only displayed, not assigned, so `dataframe` itself still
# has its "price" column in the cells that follow.
(
    dataframe
    .withColumnRenamed("price", "newerprice")
    .show()
)

+----------+-------+
|newerprice|   food|
+----------+-------+
|      null|  pizza|
|        12|  sushi|
|        10|chinese|
+----------+-------+



In [20]:
# Double the price: arithmetic on a Column is applied to every row,
# and a null price stays null
(
    dataframe
    .withColumn("doubleprice", dataframe["price"] * 2)
    .show()
)

+-----+-------+-----------+
|price|   food|doubleprice|
+-----+-------+-----------+
| null|  pizza|       null|
|   12|  sushi|         24|
|   10|chinese|         20|
+-----+-------+-----------+



In [21]:
# Add one dollar to every price
(
    dataframe
    .withColumn("add_one_dollar", dataframe["price"] + 1)
    .show()
)

+-----+-------+--------------+
|price|   food|add_one_dollar|
+-----+-------+--------------+
| null|  pizza|          null|
|   12|  sushi|            13|
|   10|chinese|            11|
+-----+-------+--------------+



In [22]:
# Halve the price; note that dividing the integer column produces
# fractional values (6.0, 5.0) rather than integers
(
    dataframe
    .withColumn("half_price", dataframe["price"] / 2)
    .show()
)

+-----+-------+----------+
|price|   food|half_price|
+-----+-------+----------+
| null|  pizza|      null|
|   12|  sushi|       6.0|
|   10|chinese|       5.0|
+-----+-------+----------+



In [23]:
# Collect a column into a Python list.
# Each element is a Row object (e.g. Row(price=12)), not a bare value.
(
    dataframe
    .select("price")
    .collect()
)

[Row(price=None), Row(price=12), Row(price=10)]

### Converting a PySpark DataFrame to a pandas DataFrame

In [24]:
import pandas as pd

# toPandas() brings the contents of the Spark DataFrame into a local
# pandas DataFrame, so it is only suitable for data that fits in memory
pandas_df = dataframe.toPandas()

In [25]:
# Preview the first rows of the pandas DataFrame.
# In the recorded output the null price shows up as a missing value and the
# remaining prices are displayed as floats (12.0, 10.0).
pandas_df.head()

Unnamed: 0,price,food
0,,pizza
1,12.0,sushi
2,10.0,chinese
