In [1]:
pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 38 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 47.9 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845512 sha256=edced475e8cbc7496524582d7673d1b455287c71685ad1f8b6997e9b6bb2d175
  Stored in directory: /root/.cache/pip/wheels/43/dc/11/ec201cd671da62fa9c5cc77078235e40722170ceba231d7598
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.1


## Window functions

In [2]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql import Window
import pyspark.sql.types as T

In [3]:
# Colab-only step: files.upload() asks for a file from the local machine and
# stores it in the runtime (the recorded output shows adult_income_dataset.csv
# being saved).
# NOTE(review): `uploads` is not referenced again in the visible cells; the CSV
# is later read back from disk by path.
from google.colab import files
uploads = files.upload()

Saving adult_income_dataset.csv to adult_income_dataset.csv


In [4]:
# Obtain the SparkSession used by every cell below; getOrCreate() reuses an
# already-running session instead of starting a second one.
spark = SparkSession.builder.getOrCreate()

In [5]:
# Load the uploaded CSV. header=True takes column names from the first line;
# inferSchema=True lets Spark derive column types from the data rather than
# reading everything as strings.
df = spark.read.csv(
    path='/content/adult_income_dataset.csv',
    header=True,
    inferSchema=True,
)

In [6]:
# Preview the first 5 rows to confirm the header row became the column names.
df.show(5)

+---+---+---------+------+------------+---------------+------------------+-----------------+------------+-----+------+------------+------------+--------------+--------------+------+
|  x|age|workclass|fnlwgt|   education|educational-num|    marital-status|       occupation|relationship| race|gender|capital-gain|capital-loss|hours-per-week|native-country|income|
+---+---+---------+------+------------+---------------+------------------+-----------------+------------+-----+------+------------+------------+--------------+--------------+------+
|  1| 25|  Private|226802|        11th|              7|     Never-married|Machine-op-inspct|   Own-child|Black|  Male|           0|           0|            40| United-States| <=50K|
|  2| 38|  Private| 89814|     HS-grad|              9|Married-civ-spouse|  Farming-fishing|     Husband|White|  Male|           0|           0|            50| United-States| <=50K|
|  3| 28|Local-gov|336951|  Assoc-acdm|             12|Married-civ-spouse|  Protective-ser

In [7]:
# Print each column's name, inferred type, and nullability.
df.printSchema()

root
 |-- x: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- educational-num: integer (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- capital-gain: integer (nullable = true)
 |-- capital-loss: integer (nullable = true)
 |-- hours-per-week: integer (nullable = true)
 |-- native-country: string (nullable = true)
 |-- income: string (nullable = true)



In [8]:
# Number of rows in the loaded DataFrame.
df.count()

48842

In [14]:
# Window made of all rows sharing the same occupation. No ordering is given,
# so each aggregate below is evaluated over the whole partition and the same
# value is attached to every row of that occupation.
windowSpec = Window.partitionBy('occupation')

# NOTE(review): the new columns are labelled "salary" but are computed from
# fnlwgt -- confirm that fnlwgt really represents a salary in this dataset.
data = (
    df
    .withColumn('list_salary', F.collect_list('fnlwgt').over(windowSpec))
    .withColumn('average_salary', F.avg('fnlwgt').over(windowSpec))
    .withColumn('total_salary', F.sum('fnlwgt').over(windowSpec))
)

In [15]:
# show() with no arguments prints the first 20 rows and truncates long cell
# values (visible on the list_salary column in the output).
data.show()

+---+---+----------------+------+------------+---------------+--------------------+---------------+-------------+-----+------+------------+------------+--------------+--------------+------+--------------------+------------------+------------+
|  x|age|       workclass|fnlwgt|   education|educational-num|      marital-status|     occupation| relationship| race|gender|capital-gain|capital-loss|hours-per-week|native-country|income|         list_salary|    average_salary|total_salary|
+---+---+----------------+------+------------+---------------+--------------------+---------------+-------------+-----+------+------------+------------+--------------+--------------+------+--------------------+------------------+------------+
| 16| 43|         Private|346189|     Masters|             14|  Married-civ-spouse|Exec-managerial|      Husband|White|  Male|           0|           0|            50| United-States|  >50K|[346189, 106444, ...|186125.05142951035|  1132757063|
| 31| 46|       State-gov|10

In [16]:
# Window aggregates add columns without collapsing rows, so this count is
# expected to equal df.count().
data.count()

48842

In [21]:
# List the distinct occupation values, i.e. the partitions / groups used in
# this notebook. The output includes a '?' entry, which looks like a
# placeholder for unknown occupations -- TODO confirm and decide whether to
# filter it out.
df.select('occupation').distinct().show()

+-----------------+
|       occupation|
+-----------------+
|            Sales|
|  Exec-managerial|
|   Prof-specialty|
|Handlers-cleaners|
|  Farming-fishing|
|     Craft-repair|
| Transport-moving|
|  Priv-house-serv|
|  Protective-serv|
|    Other-service|
|     Tech-support|
|Machine-op-inspct|
|     Armed-Forces|
|                ?|
|     Adm-clerical|
+-----------------+



### Comparison of the groupBy function and the window (partitionBy) function

In [23]:
# Same three aggregates computed with groupBy: the result has one row per
# occupation instead of one row per input record.
data1 = (
    df
    .groupBy('occupation')
    .agg(
        F.collect_list('fnlwgt').alias('list_salary'),
        F.avg('fnlwgt').alias('average_salary'),
        F.sum('fnlwgt').alias('total_salary'),
    )
)

In [24]:
# Display the grouped result: only the grouping key and the three aggregate
# columns remain.
data1.show()

+-----------------+--------------------+------------------+------------+
|       occupation|         list_salary|    average_salary|total_salary|
+-----------------+--------------------+------------------+------------+
|            Sales|[188274, 120277, ...| 190483.1558866279|  1048419290|
|  Exec-managerial|[346189, 106444, ...|186125.05142951035|  1132757063|
|   Prof-specialty|[104626, 85019, 2...|185456.03110823073|  1144634624|
|Handlers-cleaners|[280215, 269705, ...|202052.54295366796|   418652869|
|  Farming-fishing|[89814, 465326, 1...| 172659.8744966443|   257263213|
|     Craft-repair|[104996, 432824, ...| 192258.9847840314|  1175086915|
| Transport-moving|[186272, 170338, ...|191550.58174097663|   451101620|
|  Priv-house-serv|[248446, 201062, ...|194482.93388429753|    47064870|
|  Protective-serv|[336951, 258120, ...|201530.26653102748|   198104252|
|    Other-service|[198693, 369667, ...| 187928.3272394881|   925171155|
|     Tech-support|[107914, 175622, ...|190454.5110

In [27]:
# Add an ordering on fnlwgt (ascending) to the occupation window.
# orderBy is part of the window specification, not a window function, and it
# does more than sort: once an ordering is defined, Spark's default frame
# becomes "start of partition up to the current row". The aggregates below are
# therefore running values (list so far, running average, running total), not
# whole-partition values. The frame is spelled out explicitly so this is
# visible in the code; it is the same frame Spark would apply by default.
windowSpec = (
    Window
    .partitionBy('occupation')
    .orderBy(F.asc('fnlwgt'))
    .rangeBetween(Window.unboundedPreceding, Window.currentRow)
)

# Because this is a RANGE frame, rows with equal fnlwgt in the same occupation
# are peers and receive identical aggregate values.
data_2 = (
    df
    .withColumn('list_salary', F.collect_list(F.col('fnlwgt')).over(windowSpec))
    .withColumn('average_salary', F.avg(F.col('fnlwgt')).over(windowSpec))
    .withColumn('total_salary', F.sum(F.col('fnlwgt')).over(windowSpec))
)

In [28]:
# Display the ordered-window result; in the recorded output the first row
# shown carries a single-element list_salary.
data_2.show()

+-----+---+----------------+------+------------+---------------+--------------------+---------------+-------------+------------------+------+------------+------------+--------------+--------------+------+--------------------+------------------+------------+
|    x|age|       workclass|fnlwgt|   education|educational-num|      marital-status|     occupation| relationship|              race|gender|capital-gain|capital-loss|hours-per-week|native-country|income|         list_salary|    average_salary|total_salary|
+-----+---+----------------+------+------------+---------------+--------------------+---------------+-------------+------------------+------+------------+------------+--------------+--------------+------+--------------------+------------------+------------+
| 7669| 44|         Private| 13769|Some-college|             10|  Married-civ-spouse|Exec-managerial|      Husband|Amer-Indian-Eskimo|  Male|           0|           0|            40| United-States|  >50K|             [13769]| 