# Spark API Mini Exercises

In [3]:
import pyspark

spark = pyspark.sql.SparkSession.builder.getOrCreate()

import pandas as pd
import numpy as np


## 1. Spark Dataframe Basics

In [4]:
pandas_dataframe = pd.DataFrame({
    "n": np.random.randn(20),
    "group": np.random.choice(list("xyz"), 20),
    "abool": np.random.choice([True, False], 20),
})

In [5]:
#ii. Convert the pandas dataframe to a spark dataframe. 
#From this point forward, do all of your work with the spark dataframe, not the pandas dataframe.
df = spark.createDataFrame(pandas_dataframe)
df

DataFrame[n: double, group: string, abool: boolean]

In [6]:
#iii. Show the first 3 rows of the dataframe.
df.show(3)

+-------------------+-----+-----+
|                  n|group|abool|
+-------------------+-----+-----+
|  1.345815683152521|    y|false|
|-1.0452821799941256|    x|false|
| 0.9737660452795452|    x|false|
+-------------------+-----+-----+
only showing top 3 rows



In [7]:
#iv. Show the first 7 rows of the dataframe.
df.show(7)

+-------------------+-----+-----+
|                  n|group|abool|
+-------------------+-----+-----+
|  1.345815683152521|    y|false|
|-1.0452821799941256|    x|false|
| 0.9737660452795452|    x|false|
| -0.867856173967037|    z| true|
|-0.5346956316913865|    x| true|
| 0.5058042271556487|    z|false|
| 0.8595277934341654|    x| true|
+-------------------+-----+-----+
only showing top 7 rows



In [9]:
#v. View a summary of the data using .describe.
df.describe().show()

+-------+-------------------+-----+
|summary|                  n|group|
+-------+-------------------+-----+
|  count|                 20|   20|
|   mean| 0.3743549326714791| null|
| stddev| 0.8972092768502807| null|
|    min|-1.0452821799941256|    x|
|    max|  1.989506080109543|    z|
+-------+-------------------+-----+



In [10]:
#vi. Use .select to create a new dataframe with just the n and abool columns. 
#View the first 5 rows of this dataframe.
df.select('n', 'abool').show(5)

+-------------------+-----+
|                  n|abool|
+-------------------+-----+
|  1.345815683152521|false|
|-1.0452821799941256|false|
| 0.9737660452795452|false|
| -0.867856173967037| true|
|-0.5346956316913865| true|
+-------------------+-----+
only showing top 5 rows



In [11]:
#vii. Use .select to create a new dataframe with just the group and abool columns. 
#View the first 5 rows of this dataframe.
df.select('group', 'abool').show(5)

+-----+-----+
|group|abool|
+-----+-----+
|    y|false|
|    x|false|
|    x|false|
|    z| true|
|    x| true|
+-----+-----+
only showing top 5 rows



In [14]:
#viii. Use .select to create a new dataframe with the group column and 
# the abool column renamed to a_boolean_value. Show the first 3 rows of this dataframe.
df.select(df.group, df.abool.alias('a_boolean_value')).show(3)

+-----+---------------+
|group|a_boolean_value|
+-----+---------------+
|    y|          false|
|    x|          false|
|    x|          false|
+-----+---------------+
only showing top 3 rows



In [15]:
#ix. Use .select to create a new dataframe with the group column and the n column renamed to a_numeric_value. 
#Show the first 6 rows of this dataframe.
df.select(df.group, df.n.alias("a_numeric_value")).show(6)

+-----+-------------------+
|group|    a_numeric_value|
+-----+-------------------+
|    y|  1.345815683152521|
|    x|-1.0452821799941256|
|    x| 0.9737660452795452|
|    z| -0.867856173967037|
|    x|-0.5346956316913865|
|    z| 0.5058042271556487|
+-----+-------------------+
only showing top 6 rows



## 2. Column Manipulation

In [18]:
#ii. Use .select to add 4 to the n column. Show the results.
df.select

DataFrame[n: double, group: string, abool: boolean]