In [53]:
import pyspark
from pyspark.sql import SparkSession, functions as F


In [54]:
# Start (or reuse) the Spark session for this notebook, requesting 4g of driver memory.
spark = (
    SparkSession.builder
    .appName('square_integers')
    .config('spark.driver.memory', '4g')
    .getOrCreate()
)

# Three-row demo frame with an "Age" column and a "Name" column.
df = spark.createDataFrame(
    data=[(2, "Alice"), (35, "Bob"), (72, "Martha")],
    schema=["Age", "Name"],
)

# Show only the rows whose Age is greater than 18.
df.where(F.col("Age") > 18).show()

# Show the frame with an extra constant "Country" column; `df` itself is not rebound.
df.withColumn("Country", F.lit("USA")).show()

+---+------+
|Age|  Name|
+---+------+
| 35|   Bob|
| 72|Martha|
+---+------+

+---+------+-------+
|Age|  Name|Country|
+---+------+-------+
|  2| Alice|    USA|
| 35|   Bob|    USA|
| 72|Martha|    USA|
+---+------+-------+



In [55]:
# Count the non-null values in the Name column across the whole frame.
df.agg(F.count("Name")).show()

+-----------+
|count(Name)|
+-----------+
|          3|
+-----------+



In [56]:
# Mean of the Age column, shown as its own one-row table.
df.agg(F.avg("Age")).show()

# Total of the Age column, shown as a second one-row table.
df.agg(F.sum("Age")).show()

+------------------+
|          avg(Age)|
+------------------+
|36.333333333333336|
+------------------+

+--------+
|sum(Age)|
+--------+
|     109|
+--------+



In [59]:
# Persist `df` as CSV at 'df.csv'.
# Overwrite mode keeps this cell re-runnable: with the default save mode the
# write raises once 'df.csv' already exists, which breaks Restart & Run All.
# NOTE(review): the execution counters show this cell was last run after the
# cell below, which rebinds `df` to the word-count frame. In top-to-bottom
# order `df` is still the Age/Name frame here - confirm which one should be saved.
df.write.mode('overwrite').csv('df.csv')

In [58]:
# Word-frequency count over the text files under TextDocs/ (recursive lookup enabled).
# Tokens are produced by splitting the 'value' column on a single space character.
# NOTE(review): the recorded output's most frequent token is the empty string,
# presumably from consecutive spaces or blank lines - confirm whether empty
# tokens should be filtered out before counting.
split_on_spaces = F.split('value', ' ')

# Read the text input, then explode each split result into one row per token.
text_lines = spark.read.option("recursiveFileLookup", "true").text('TextDocs/')
tokens = text_lines.withColumn('value', F.explode(split_on_spaces))

# Tally each distinct token and sort with the most frequent first.
token_counts = tokens.groupBy('value').count()
df = token_counts.orderBy(F.col('count').desc())

# Fetch only the first ten rows and map token -> count.
top_val_dict = dict((row['value'], row['count']) for row in df.head(10))

print(top_val_dict)

{'': 81, 'the': 59, 'was': 41, 'to': 39, 'and': 33, 'he': 28, 'that': 28, 'a': 28, 'of': 20, 'her': 20}
