    Create a SparkSession that connects to Spark in local mode. Configure the SparkSession to use two cores.
    Using the example from the lesson, create a Spark DataFrame that contains your favorite programming languages. The column should be named `language`.
    Print the schema of the DataFrame.
    View the contents of the DataFrame.
    Count the number of records using `.count()`.

In [1]:
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session that runs in local mode on two cores.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("mylocalconnection")
    .getOrCreate()
)

In [2]:
# Each row is a one-element tuple, so the DataFrame has a single `language` column.
language = spark.createDataFrame(
    [(name,) for name in ["Python", "Java", "C++", "HTML", "Javascript"]],
    schema=["language"],
)


In [3]:
# Evaluating the DataFrame on its own shows only its column names and types, not its rows.
language

DataFrame[language: string]

In [4]:
# Print the schema as a tree: column name, type and nullability.
language.printSchema()

root
 |-- language: string (nullable = true)



In [5]:
# Print the rows of the DataFrame as a text table.
language.show()

+----------+
|  language|
+----------+
|    Python|
|      Java|
|       C++|
|      HTML|
|Javascript|
+----------+



In [6]:
# Return the number of rows in the DataFrame.
language.count()

5

In [9]:
# Evaluate the session object on its own; no output was captured for this cell.
spark

In [10]:
import pandas as pd
import numpy as np

# Toy dataset: `n` runs from 0 to 99 and every row gets a random group label (a, b or c).
pandas_dataframe = pd.DataFrame(
    {
        "n": np.arange(100),
        "group": np.random.choice(["a", "b", "c"], size=100),
    }
)

In [12]:
# Convert the pandas DataFrame into a Spark DataFrame.
df = spark.createDataFrame(pandas_dataframe)

In [13]:
# Print the rows as a text table; the captured output stops after 20 rows.
df.show()

+---+-----+
|  n|group|
+---+-----+
|  0|    a|
|  1|    c|
|  2|    b|
|  3|    a|
|  4|    b|
|  5|    b|
|  6|    a|
|  7|    a|
|  8|    a|
|  9|    b|
| 10|    a|
| 11|    a|
| 12|    c|
| 13|    b|
| 14|    a|
| 15|    a|
| 16|    a|
| 17|    a|
| 18|    b|
| 19|    c|
+---+-----+
only showing top 20 rows



In [14]:
# Register df as the temporary view 'numbers' so it can be queried through spark.sql.
df.createOrReplaceTempView('numbers')

In [15]:
# Query the 'numbers' view with Spark SQL and print the result as a text table.
spark.sql('''
SELECT * FROM numbers
''').show()

+---+-----+
|  n|group|
+---+-----+
|  0|    a|
|  1|    c|
|  2|    b|
|  3|    a|
|  4|    b|
|  5|    b|
|  6|    a|
|  7|    a|
|  8|    a|
|  9|    b|
| 10|    a|
| 11|    a|
| 12|    c|
| 13|    b|
| 14|    a|
| 15|    a|
| 16|    a|
| 17|    a|
| 18|    b|
| 19|    c|
+---+-----+
only showing top 20 rows



In [17]:
# Average n per group with Spark SQL, then collect the result into a pandas DataFrame.
# NOTE(review): `group` is used for grouping but is not selected, so the result
# holds only the `mean` column and its rows are not labelled with their group.
another_pandas_df = spark.sql('''
SELECT avg(n) as mean
FROM numbers
GROUP BY group
''').toPandas()

In [18]:
# Display the collected pandas result (one mean per group).
another_pandas_df

Unnamed: 0,mean
0,57.2
1,48.90625
2,45.465116


In [23]:
from pyspark.sql.functions import expr, avg

# Average of n per group, with the aggregate written as a SQL expression string.
(
    df.groupBy('group')
    .agg(expr('avg(n)'))
    .show()
)

+-----+-----------------+
|group|           avg(n)|
+-----+-----------------+
|    c|             57.2|
|    b|         48.90625|
|    a|45.46511627906977|
+-----+-----------------+



In [24]:
# Same aggregation, built from Column objects instead of a SQL expression string.
(
    df.groupBy(df['group'])
    .agg(avg(df['n']))
    .show()
)

+-----+-----------------+
|group|           avg(n)|
+-----+-----------------+
|    c|             57.2|
|    b|         48.90625|
|    a|45.46511627906977|
+-----+-----------------+



In [21]:
# Show n next to n + 1, deriving the extra column from a SQL expression string.
df.select(
    'n',
    expr('n + 1 as incremented'),
).show()

+---+-----------+
|  n|incremented|
+---+-----------+
|  0|          1|
|  1|          2|
|  2|          3|
|  3|          4|
|  4|          5|
|  5|          6|
|  6|          7|
|  7|          8|
|  8|          9|
|  9|         10|
| 10|         11|
| 11|         12|
| 12|         13|
| 13|         14|
| 14|         15|
| 15|         16|
| 16|         17|
| 17|         18|
| 18|         19|
| 19|         20|
+---+-----------+
only showing top 20 rows



In [22]:
# Alternative: build the derived column with Column arithmetic and name it with alias().
df.select(
    df.n,
    (df.n + 1).alias('incremented'),
).show()

+---+-----------+
|  n|incremented|
+---+-----------+
|  0|          1|
|  1|          2|
|  2|          3|
|  3|          4|
|  4|          5|
|  5|          6|
|  6|          7|
|  7|          8|
|  8|          9|
|  9|         10|
| 10|         11|
| 11|         12|
| 12|         13|
| 13|         14|
| 14|         15|
| 15|         16|
| 16|         17|
| 17|         18|
| 18|         19|
| 19|         20|
+---+-----------+
only showing top 20 rows

