    Create a SparkSession that connects to Spark in local mode. Configure the SparkSession to use two cores.
    Using the example from the lesson, create a Spark DataFrame that contains your favorite programming languages. The name of the column should be language.
    Print the schema of the DataFrame.
    View the DataFrame.
    Count the number of records using .count().

In [1]:
from pyspark.sql import SparkSession

#local-mode session restricted to two cores via the "local[2]" master URL
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("mylocalconnection")
    .getOrCreate()
)

In [2]:
#each row is a one-element tuple, giving a single column named language
language = spark.createDataFrame(
    [(name, ) for name in ("Python", "Java", "C++", "HTML", "Javascript")],
    schema=['language'],
)


In [3]:
#evaluating the DataFrame on its own displays only its column names and types, not its rows
language

DataFrame[language: string]

In [4]:
#print each column's name, type and nullability as a tree
language.printSchema()

root
 |-- language: string (nullable = true)



In [5]:
#print the rows as a text table
language.show()

+----------+
|  language|
+----------+
|    Python|
|      Java|
|       C++|
|      HTML|
|Javascript|
+----------+



In [6]:
#number of rows in the DataFrame
language.count()

5

In [7]:
#display the SparkSession object created in the first cell
spark

In [8]:
import pandas as pd
import numpy as np

#100 rows: n counts 0..99 and group is a random draw from 'a', 'b', 'c'
#the draw is not seeded, so the group column differs from run to run
pandas_dataframe = pd.DataFrame({
    'n': np.arange(100),
    'group': np.random.choice(['a', 'b', 'c'], 100),
})

In [9]:
#build a Spark DataFrame from the pandas DataFrame
df = spark.createDataFrame(pandas_dataframe)

In [10]:
#show() with no arguments prints only the first 20 rows
df.show()

+---+-----+
|  n|group|
+---+-----+
|  0|    b|
|  1|    c|
|  2|    c|
|  3|    b|
|  4|    a|
|  5|    b|
|  6|    a|
|  7|    b|
|  8|    a|
|  9|    a|
| 10|    a|
| 11|    c|
| 12|    c|
| 13|    b|
| 14|    c|
| 15|    c|
| 16|    b|
| 17|    b|
| 18|    a|
| 19|    a|
+---+-----+
only showing top 20 rows



In [11]:
#register df as a temporary view named numbers so it can be queried with SQL
df.createOrReplaceTempView('numbers')

In [12]:
#run a SQL query against the numbers temp view and print the result
spark.sql('''
SELECT * FROM numbers
''').show()

+---+-----+
|  n|group|
+---+-----+
|  0|    b|
|  1|    c|
|  2|    c|
|  3|    b|
|  4|    a|
|  5|    b|
|  6|    a|
|  7|    b|
|  8|    a|
|  9|    a|
| 10|    a|
| 11|    c|
| 12|    c|
| 13|    b|
| 14|    c|
| 15|    c|
| 16|    b|
| 17|    b|
| 18|    a|
| 19|    a|
+---+-----+
only showing top 20 rows



In [13]:
#average n per group via SQL, then convert the result to a pandas DataFrame
#NOTE: group is not in the SELECT list, so the result rows carry no group label
another_pandas_df = spark.sql('''
SELECT avg(n) as mean
FROM numbers
GROUP BY group
''').toPandas()

In [14]:
#display the pandas DataFrame returned by toPandas()
another_pandas_df

        mean
0  43.481481
1  49.290323
2  53.523810


In [15]:
from pyspark.sql.functions import expr, avg

#same per-group average as the SQL query, written with the DataFrame API;
#the aggregate is given as a SQL expression string through expr
df.groupBy('group').agg(expr('avg(n)')).show()

+-----+------------------+
|group|            avg(n)|
+-----+------------------+
|    c| 43.48148148148148|
|    b| 49.29032258064516|
|    a|53.523809523809526|
+-----+------------------+



In [16]:
#same aggregation again, using Column objects (df.group, df.n) and the avg function instead of strings
df.groupBy(df.group).agg(avg(df.n)).show()

+-----+------------------+
|group|            avg(n)|
+-----+------------------+
|    c| 43.48148148148148|
|    b| 49.29032258064516|
|    a|53.523809523809526|
+-----+------------------+



In [17]:
#keep n and add a derived column named incremented, defined by a SQL expression string
df2 = df.select('n', expr('n + 1 as incremented'))

In [18]:
#explain() prints the physical plan for the query rather than its result;
#in the plan below the n + 1 and / 2 steps appear together in one Project
df2.select(expr('incremented / 2 as x')).agg(expr('sum(x)')).explain()

== Physical Plan ==
*(2) HashAggregate(keys=[], functions=[sum(x#76)])
+- Exchange SinglePartition
   +- *(1) HashAggregate(keys=[], functions=[partial_sum(x#76)])
      +- *(1) Project [(cast((n#12L + 1) as double) / 2.0) AS x#76]
         +- Scan ExistingRDD[n#12L,group#13]


In [19]:
#alternative way: build the derived column from a Column object and name it with alias
df.select(df.n, (df.n + 1).alias('incremented')).show()

+---+-----------+
|  n|incremented|
+---+-----------+
|  0|          1|
|  1|          2|
|  2|          3|
|  3|          4|
|  4|          5|
|  5|          6|
|  6|          7|
|  7|          8|
|  8|          9|
|  9|         10|
| 10|         11|
| 11|         12|
| 12|         13|
| 13|         14|
| 14|         15|
| 15|         16|
| 16|         17|
| 17|         18|
| 18|         19|
| 19|         20|
+---+-----------+
only showing top 20 rows



In [22]:
#selecting columns by passing their names as strings
df.select('n', 'group').show()

+---+-----+
|  n|group|
+---+-----+
|  0|    b|
|  1|    c|
|  2|    c|
|  3|    b|
|  4|    a|
|  5|    b|
|  6|    a|
|  7|    b|
|  8|    a|
|  9|    a|
| 10|    a|
| 11|    c|
| 12|    c|
| 13|    b|
| 14|    c|
| 15|    c|
| 16|    b|
| 17|    b|
| 18|    a|
| 19|    a|
+---+-----+
only showing top 20 rows



In [25]:
#alternate way: select through attribute access on df
#df.group is a Column object, so it can be combined into expressions (arithmetic, alias, cast)
df.select(df.group, df.n).show()

+-----+---+
|group|  n|
+-----+---+
|    b|  0|
|    c|  1|
|    c|  2|
|    b|  3|
|    a|  4|
|    b|  5|
|    a|  6|
|    b|  7|
|    a|  8|
|    a|  9|
|    a| 10|
|    c| 11|
|    c| 12|
|    b| 13|
|    c| 14|
|    c| 15|
|    b| 16|
|    b| 17|
|    a| 18|
|    a| 19|
+-----+---+
only showing top 20 rows



In [26]:
#a Column can represent a transformation; without an alias the output column is named after the expression
df.select(df.n+1).show()

+-------+
|(n + 1)|
+-------+
|      1|
|      2|
|      3|
|      4|
|      5|
|      6|
|      7|
|      8|
|      9|
|     10|
|     11|
|     12|
|     13|
|     14|
|     15|
|     16|
|     17|
|     18|
|     19|
|     20|
+-------+
only showing top 20 rows



In [27]:
from pyspark.sql.functions import col, column, expr
#col and column do exactly the same thing

In [28]:
#col function: build a Column expression from a column name, without going through a DataFrame
my_column = (col('n') + 16)*3

#the expression is applied to df only when it is passed to select
df.select(my_column).show()

+--------------+
|((n + 16) * 3)|
+--------------+
|            48|
|            51|
|            54|
|            57|
|            60|
|            63|
|            66|
|            69|
|            72|
|            75|
|            78|
|            81|
|            84|
|            87|
|            90|
|            93|
|            96|
|            99|
|           102|
|           105|
+--------------+
only showing top 20 rows



In [29]:
#naming the created column with alias instead of keeping the generated expression name
df.select(my_column.alias('quantity')).show()

+--------+
|quantity|
+--------+
|      48|
|      51|
|      54|
|      57|
|      60|
|      63|
|      66|
|      69|
|      72|
|      75|
|      78|
|      81|
|      84|
|      87|
|      90|
|      93|
|      96|
|      99|
|     102|
|     105|
+--------+
only showing top 20 rows



In [30]:
#alternate way using df.<column> attributes; this form can only reference columns that already exist on df
df.select(((df.n + 16) * 3).alias('quantity')).show()

+--------+
|quantity|
+--------+
|      48|
|      51|
|      54|
|      57|
|      60|
|      63|
|      66|
|      69|
|      72|
|      75|
|      78|
|      81|
|      84|
|      87|
|      90|
|      93|
|      96|
|      99|
|     102|
|     105|
+--------+
only showing top 20 rows



In [32]:
#third method: write the whole expression, including the alias, as a SQL string passed to expr
df.select(expr('(n+16)*3 AS quantity')).show()

+--------+
|quantity|
+--------+
|      48|
|      51|
|      54|
|      57|
|      60|
|      63|
|      66|
|      69|
|      72|
|      75|
|      78|
|      81|
|      84|
|      87|
|      90|
|      93|
|      96|
|      99|
|     102|
|     105|
+--------+
only showing top 20 rows



In [33]:
#change type of column with cast function
#show() is not called here, so only the resulting schema is displayed
df.select(df.n.cast('string'))

DataFrame[n: string]

In [34]:
#aggregate function written as a SQL expression string; the result is a single row
df.select(expr('sum(n)')).show()

+------+
|sum(n)|
+------+
|  4950|
+------+



In [36]:
#aggregate with the sum function from pyspark.sql.functions instead of an expr string
#NOTE: this import rebinds the name sum to Spark's aggregate and hides Python's builtin sum
#in every later cell; importing the module (from pyspark.sql import functions as F) and
#calling F.sum would avoid that

from pyspark.sql.functions import sum

df.select(sum(df.n)).show()

+------+
|sum(n)|
+------+
|  4950|
+------+



In [39]:
#several aggregates can be computed in a single select, mixing Column functions and expr strings
df.select(
    sum(df.n).alias('the_sum'),
    expr('avg(n) as mean'),
    expr('min(n) as min'),
).show()

+-------+----+---+
|the_sum|mean|min|
+-------+----+---+
|   4950|49.5|  0|
+-------+----+---+

