In [64]:
import os

from pyspark.sql import SparkSession

# Address of the Spark master to connect to. It can be overridden through the
# SPARK_MASTER_URL environment variable so the notebook is not tied to a single
# cluster; when the variable is unset the original address is used.
SPARK_MASTER_URL = os.environ.get("SPARK_MASTER_URL", "spark://spark-master:7077")

# Reuse an already-running session if there is one, otherwise create it.
spark = SparkSession.builder.master(SPARK_MASTER_URL).getOrCreate()
# SparkContext handle; the RDD cells below call sc.parallelize.
sc = spark.sparkContext

In [65]:
# Build an RDD holding the integers 0 through 1,000,000 inclusive.
upper_bound = 10**6
rdd = sc.parallelize(range(upper_bound + 1))

In [66]:
# Sum of every element in the RDD.
rdd.sum()

500000500000

In [67]:
# Number of elements in the RDD (the range includes both 0 and 1,000,000).
rdd.count()

1000001

In [68]:
# First element of the RDD.
rdd.first()

0

In [69]:
# Largest element of the RDD.
rdd.max()

1000000

In [70]:
# Sample person records. Field order matches the `columns` list defined in the
# next cell: firstname, middlename, lastname, dob, gender, salary.
# Some name fields are empty strings.
# NOTE(review): the salary of -1 on the last record looks like a placeholder
# for a missing value — confirm, since it is included in every salary statistic.
data = [
    ('James', '', 'Smith', '1991-04-01', 'M', 3000),
    ('Michael', 'Rose', '', '2000-05-19', 'M', 4000),
    ('Robert', '', 'Williams', '1978-09-05', 'M', 4000),
    ('Maria', 'Anne', 'Jones', '1967-12-01', 'F', 4000),
    ('Jen', 'Mary', 'Brown', '1980-02-17', 'F', -1),
]

In [71]:
# Column names, in the same order as the fields of each tuple in `data`.
columns = [
    "firstname",
    "middlename",
    "lastname",
    "dob",
    "gender",
    "salary",
]

In [72]:
# Build a DataFrame from the in-memory rows. Only column names are supplied,
# so the column types are whatever Spark derives from the Python values
# (the printed schema below shows `dob` as string and `salary` as long).
dataframe = spark.createDataFrame(data, schema=columns)

In [73]:
# Print the column names, types and nullability of the DataFrame.
dataframe.printSchema()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)



In [74]:
# Print the DataFrame's rows as a text table.
dataframe.show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|
+---------+----------+--------+----------+------+------+



In [75]:
# Keep only three columns and print the first 3 rows.
(
    dataframe
    .select('firstname', 'lastname', 'salary')
    .show(3)
)

+---------+--------+------+
|firstname|lastname|salary|
+---------+--------+------+
|    James|   Smith|  3000|
|  Michael|        |  4000|
|   Robert|Williams|  4000|
+---------+--------+------+
only showing top 3 rows



In [76]:
# Summary statistics (count, mean, stddev, min, max) for the salary column.
# NOTE(review): the -1 salary is part of these figures (it is the reported
# min), so mean and stddev are skewed if -1 is a placeholder — confirm.
dataframe.describe('salary').show()

+-------+------------------+
|summary|            salary|
+-------+------------------+
|  count|                 5|
|   mean|            2999.8|
| stddev|1732.4838238783068|
|    min|                -1|
|    max|              4000|
+-------+------------------+



In [77]:
# Summary statistics for every column. In the recorded output the string
# columns report null for mean/stddev and only count/min/max are populated.
dataframe.describe().show()

+-------+---------+----------+--------+----------+------+------------------+
|summary|firstname|middlename|lastname|       dob|gender|            salary|
+-------+---------+----------+--------+----------+------+------------------+
|  count|        5|         5|       5|         5|     5|                 5|
|   mean|     null|      null|    null|      null|  null|            2999.8|
| stddev|     null|      null|    null|      null|  null|1732.4838238783068|
|    min|    James|          |        |1967-12-01|     F|                -1|
|    max|   Robert|      Rose|Williams|2000-05-19|     M|              4000|
+-------+---------+----------+--------+----------+------+------------------+



In [83]:
# Expose the DataFrame to Spark SQL under the name PERSON_DATA
# (replacing any temp view that already has that name).
dataframe.createOrReplaceTempView("PERSON_DATA")

# Number of rows per gender, expressed as a SQL query against the view.
gender_count_sql = "SELECT gender, count(*) as number from PERSON_DATA group by gender"
groupDataframe = spark.sql(gender_count_sql)
groupDataframe.show()

+------+------+
|gender|number|
+------+------+
|     M|     3|
|     F|     2|
+------+------+



In [93]:
# Average salary per gender via Spark SQL. This relies on the PERSON_DATA temp
# view having been registered with createOrReplaceTempView beforehand.
# NOTE(review): the -1 salary is averaged in, which is why the F group reports
# 1999.5 — confirm whether that record should be excluded.
gender_avg_salary_sql = "SELECT gender, avg(salary) from PERSON_DATA group by gender"
groupDataframe2 = spark.sql(gender_avg_salary_sql)
groupDataframe2.show()

+------+------------------+
|gender|       avg(salary)|
+------+------------------+
|     M|3666.6666666666665|
|     F|            1999.5|
+------+------------------+



In [49]:
# Row count per gender using the DataFrame API, largest group first.
(
    dataframe
    .groupBy("gender")
    .count()
    .sort("count", ascending=False)
    .show()
)

+------+-----+
|gender|count|
+------+-----+
|     M|    3|
|     F|    2|
+------+-----+



In [50]:
# Print the full DataFrame again (same call as the earlier dataframe.show() cell).
dataframe.show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|
+---------+----------+--------+----------+------+------+



In [54]:
# Number of rows whose salary is strictly greater than 3999.
(
    dataframe
    .filter(dataframe["salary"] > 3999)
    .count()
)

3

In [56]:
# Mean salary per gender using the DataFrame API with a
# {column: aggregate} mapping.
# NOTE(review): the -1 salary is included in the F group's mean — confirm
# whether it should be filtered out first.
(
    dataframe
    .groupBy('gender')
    .agg({'salary': 'mean'})
    .show()
)

+------+------------------+
|gender|       avg(salary)|
+------+------------------+
|     M|3666.6666666666665|
|     F|            1999.5|
+------+------------------+

