In [28]:
# Environment setup. findspark.init() runs before the pyspark imports,
# as in the findspark usage docs, so that pyspark can be imported.
import findspark
findspark.init()

import numpy as np
import pandas as pd

import pyspark
from pyspark.sql import SparkSession

# Local Spark session (master "local[1]") shared by every cell below.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("SparkByExamples.com")
    .getOrCreate()
)

In [29]:
# Sample employee records as (employee_name, department, salary) tuples.
# The row ("James", "Sales", 3000) appears twice, which is handy for the
# distinct-count examples further down.
simpleData = [
    ("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("James", "Sales", 3000),
    ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900),
    ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
    ("Saif", "Sales", 4100),
]

# Only column names are supplied; the column types shown by printSchema()
# are the ones Spark derives from the data.
schema = ["employee_name", "department", "salary"]

df = spark.createDataFrame(simpleData, schema)
df.printSchema()
df.show(truncate=False)

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: long (nullable = true)

+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|James        |Sales     |3000  |
|Michael      |Sales     |4600  |
|Robert       |Sales     |4100  |
|Maria        |Finance   |3000  |
|James        |Sales     |3000  |
|Scott        |Finance   |3300  |
|Jen          |Finance   |3900  |
|Jeff         |Marketing |3000  |
|Kumar        |Marketing |2000  |
|Saif         |Sales     |4100  |
+-------------+----------+------+



In [10]:
# approx_count_distinct() returns an approximate count of the distinct values in a column.
# It is much faster than an exact distinct count, at the cost of exactness.
from pyspark.sql.functions import approx_count_distinct

approx_distinct_salaries = df.select(approx_count_distinct("salary"))
approx_distinct_salaries.collect()

[Row(approx_count_distinct(salary)=6)]

In [11]:
# Average (mean) of the salary column
from pyspark.sql.functions import avg

mean_salary = df.select(avg("salary"))
mean_salary.collect()

[Row(avg(salary)=3400.0)]

In [16]:
# To get the bare value (3400.0) instead of a list of Rows:
# take the first Row from collect(), then that Row's first field.
avg_salary_row = df.select(avg("salary")).collect()[0]
avg_salary_row[0]

3400.0

In [17]:
# Distinct values of a column, gathered into a single array.
# The elements come back in no particular sorted order (see the output).
from pyspark.sql.functions import collect_set

unique_salaries = df.select(collect_set("salary"))
unique_salaries.collect()

[Row(collect_set(salary)=[4600, 3000, 3900, 4100, 3300, 2000])]

In [18]:
# Exact count of the distinct (department, salary) combinations
from pyspark.sql.functions import countDistinct

distinct_dept_salary = countDistinct("department", "salary")
df2 = df.select(distinct_dept_salary)
df2.show(truncate=False)

+----------------------------------+
|count(DISTINCT department, salary)|
+----------------------------------+
|8                                 |
+----------------------------------+



In [20]:
# collect() returns the result as a list of Row objects.
# To grab the bare `8`, index into it: first Row, then first field (collect()[0][0]).
distinct_count_rows = df2.collect()
distinct_count_rows

[Row(count(DISTINCT department, salary)=8)]

In [21]:
# Same query as above, reduced to the scalar: first Row, then its first field.
distinct_count_row = df.select(countDistinct("department", "salary")).collect()[0]
distinct_count_row[0]

8

In [22]:
# Number of salary values. Note: count can be SLOW for big datasets.
from pyspark.sql.functions import count

salary_count = df.select(count("salary"))
salary_count.collect()

[Row(count(salary)=10)]

In [23]:
# Stats: first, last, max and min of salary, each shown as its own one-row table.
# NOTE: this import replaces Python's builtin max() and min() with the Spark
# column functions for the rest of the notebook.
# NOTE(review): first()/last() depend on row order — confirm the order is stable
# before relying on them.
from pyspark.sql.functions import first, last, max, min

for stat_fn in (first, last, max, min):
    df.select(stat_fn("salary")).show(truncate=False)

+-------------+
|first(salary)|
+-------------+
|3000         |
+-------------+

+------------+
|last(salary)|
+------------+
|4100        |
+------------+

+-----------+
|max(salary)|
+-----------+
|4600       |
+-----------+

+-----------+
|min(salary)|
+-----------+
|2000       |
+-----------+



In [24]:
# All four stats at once: one select with an aliased column per aggregate.
# (first, last, max, min here are the Spark functions imported in the previous cell.)
stat_columns = [
    stat_fn("salary").alias(label)
    for label, stat_fn in (
        ("first_sal", first),
        ("last_sal", last),
        ("max_sal", max),
        ("min_sal", min),
    )
]
df.select(*stat_columns).show(truncate=False)

+---------+--------+-------+-------+
|first_sal|last_sal|max_sal|min_sal|
+---------+--------+-------+-------+
|3000     |4100    |4600   |2000   |
+---------+--------+-------+-------+



In [25]:
# Columns can be renamed with withColumnRenamed(old_name, new_name).
# The result is displayed here but not assigned back to `df`.
compensation_preview = df.withColumnRenamed('salary', 'compensation')
compensation_preview.show()

+-------------+----------+------------+
|employee_name|department|compensation|
+-------------+----------+------------+
|        James|     Sales|        3000|
|      Michael|     Sales|        4600|
|       Robert|     Sales|        4100|
|        Maria|   Finance|        3000|
|        James|     Sales|        3000|
|        Scott|   Finance|        3300|
|          Jen|   Finance|        3900|
|         Jeff| Marketing|        3000|
|        Kumar| Marketing|        2000|
|         Saif|     Sales|        4100|
+-------------+----------+------------+



In [26]:
# Question: did the rename in the previous cell change the original df?
# Answer: no. The output below still shows the `salary` column, because the
# renamed result was never assigned back to `df`.
df.show()

+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|        James|     Sales|  3000|
|      Michael|     Sales|  4600|
|       Robert|     Sales|  4100|
|        Maria|   Finance|  3000|
|        James|     Sales|  3000|
|        Scott|   Finance|  3300|
|          Jen|   Finance|  3900|
|         Jeff| Marketing|  3000|
|        Kumar| Marketing|  2000|
|         Saif|     Sales|  4100|
+-------------+----------+------+



In [27]:
# Assign the result back to make the rename stick: `df` now has `compensation`
# instead of `salary` (see the output below).
# NOTE: because `df` is rebound here, re-run the DataFrame-creation cell before
# re-running any earlier cell that refers to the `salary` column.
df = df.withColumnRenamed('salary', 'compensation')
df.show()

+-------------+----------+------------+
|employee_name|department|compensation|
+-------------+----------+------------+
|        James|     Sales|        3000|
|      Michael|     Sales|        4600|
|       Robert|     Sales|        4100|
|        Maria|   Finance|        3000|
|        James|     Sales|        3000|
|        Scott|   Finance|        3300|
|          Jen|   Finance|        3900|
|         Jeff| Marketing|        3000|
|        Kumar| Marketing|        2000|
|         Saif|     Sales|        4100|
+-------------+----------+------------+

