In [13]:
import pandas as pd
import numpy as np

In [3]:
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session running on the "local" master,
# registered under the application name "mylocalconnection".
spark = (
    SparkSession.builder
    .master("local")
    .appName("mylocalconnection")
    .getOrCreate()
)

In [18]:
from pyspark.sql.functions import expr, avg

In [6]:
# One-column Spark DataFrame of language names; every row is a 1-tuple.
langs = spark.createDataFrame(
    [('English',), ('Spanish',), ('French',)],
    schema=['language'],
)

In [8]:
# Print the schema of `langs` as a tree to check the column name and type.
langs.printSchema()

root
 |-- language: string (nullable = true)



In [10]:
# Print the rows of `langs` as a text table.
langs.show()

+--------+
|language|
+--------+
| English|
| Spanish|
|  French|
+--------+



In [12]:
# Return the number of rows in `langs` (displayed as the cell's output).
langs.count()

3

In [14]:
# Build a 100-row pandas DataFrame: `n` counts 0..99 and `group` is a random
# label drawn from 'a', 'b', 'c'.
# A seeded Generator makes the labels (and every per-group aggregate computed
# from them) identical on each Restart & Run All; the unseeded global
# `np.random.choice` produced different groups on every run.
pandas_dataframe = pd.DataFrame(dict(
    n=np.arange(100),
    group=np.random.default_rng(42).choice(list('abc'), 100),
))

In [17]:
# Convert the pandas DataFrame into a Spark DataFrame.
df = spark.createDataFrame(pandas_dataframe)
# Preview it; called without arguments, show() prints only the first 20 rows.
df.show()

+---+-----+
|  n|group|
+---+-----+
|  0|    b|
|  1|    c|
|  2|    a|
|  3|    a|
|  4|    b|
|  5|    b|
|  6|    c|
|  7|    b|
|  8|    c|
|  9|    c|
| 10|    b|
| 11|    c|
| 12|    c|
| 13|    b|
| 14|    a|
| 15|    a|
| 16|    a|
| 17|    c|
| 18|    b|
| 19|    b|
+---+-----+
only showing top 20 rows



In [19]:
# Mean of `n` per group, with the aggregate written as a SQL expression string.
(
    df.groupby('group')
    .agg(expr('avg(n)'))
    .show()
)

+-----+------------------+
|group|            avg(n)|
+-----+------------------+
|    c| 48.75675675675676|
|    b| 43.34615384615385|
|    a|54.567567567567565|
+-----+------------------+



In [20]:
# Mean of `n` per group, built from Column objects and the `avg` function
# rather than from a SQL expression string.
(
    df.groupby(df['group'])
    .agg(avg(df['n']))
    .show()
)

+-----+------------------+
|group|            avg(n)|
+-----+------------------+
|    c| 48.75675675675676|
|    b| 43.34615384615385|
|    a|54.567567567567565|
+-----+------------------+



### Register the DataFrame as a temporary SQL view

In [21]:
# Register `df` as a temporary view named 'numbers' so it can be queried
# by name through spark.sql(); an existing view with that name is replaced.
df.createOrReplaceTempView('numbers')

In [22]:
# Per-group mean of `n`, computed with Spark SQL against the 'numbers' view.
# The query result is kept separate from `.show()`: show() only prints the
# table and returns None, so chaining it into the assignment left
# `another_pandas_dataframe` bound to None instead of the data.
group_means = spark.sql('''
SELECT group, avg(n) as mean
FROM numbers
GROUP BY group
''')
group_means.show()

# Collect the small aggregated result to the driver as an actual pandas
# DataFrame, which is what the variable name promises.
another_pandas_dataframe = group_means.toPandas()

+-----+------------------+
|group|              mean|
+-----+------------------+
|    c| 48.75675675675676|
|    b| 43.34615384615385|
|    a|54.567567567567565|
+-----+------------------+



In [25]:
# Number of rows in each group.
(
    df.groupby('group')
    .count()
    .show()
)

+-----+-----+
|group|count|
+-----+-----+
|    c|   37|
|    b|   26|
|    a|   37|
+-----+-----+



In [26]:
# Per-group totals; sum() is called without column names, and the result
# contains a single aggregate column, sum(n).
(
    df.groupby('group')
    .sum()
    .show()
)

+-----+------+
|group|sum(n)|
+-----+------+
|    c|  1804|
|    b|  1127|
|    a|  2019|
+-----+------+



In [27]:
# Per-group total of `n`, expressed as a SQL query over the 'numbers' view.
group_sums = spark.sql('''
SELECT group, sum(n) as sum
FROM numbers
GROUP BY group
''')
group_sums.show()

+-----+----+
|group| sum|
+-----+----+
|    c|1804|
|    b|1127|
|    a|2019|
+-----+----+

