In [1]:
pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 46 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 48.6 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845512 sha256=6475ef07d5bf33fe91bdba863dcdc8de726c3865ba82711f97d66917a9653f5a
  Stored in directory: /root/.cache/pip/wheels/43/dc/11/ec201cd671da62fa9c5cc77078235e40722170ceba231d7598
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.1


### SQL expressions with `expr` (concatenation, CASE WHEN, arithmetic, filters)

In [2]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [3]:
# Colab-only step: upload the dataset from the local machine into the runtime.
# The recorded output shows adult_income_dataset.csv being saved by this call.
from google.colab import files
uploads = files.upload()

Saving adult_income_dataset.csv to adult_income_dataset.csv


In [4]:
# Reuse the active Spark session if there is one, otherwise start a new one.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [5]:
# Load the uploaded CSV: the first row supplies the column names and the
# column types are inferred from the data instead of defaulting to string.
df = spark.read.csv(
    '/content/adult_income_dataset.csv',
    header=True,
    inferSchema=True,
)

In [6]:
# Preview the data: first 20 rows, long cell values truncated (the show() defaults, spelled out).
df.show(n=20, truncate=True)

+---+---+----------------+------+------------+---------------+------------------+-----------------+-------------+------------------+------+------------+------------+--------------+--------------+------+
|  x|age|       workclass|fnlwgt|   education|educational-num|    marital-status|       occupation| relationship|              race|gender|capital-gain|capital-loss|hours-per-week|native-country|income|
+---+---+----------------+------+------------+---------------+------------------+-----------------+-------------+------------------+------+------------+------------+--------------+--------------+------+
|  1| 25|         Private|226802|        11th|              7|     Never-married|Machine-op-inspct|    Own-child|             Black|  Male|           0|           0|            40| United-States| <=50K|
|  2| 38|         Private| 89814|     HS-grad|              9|Married-civ-spouse|  Farming-fishing|      Husband|             White|  Male|           0|           0|            50| United-

In [7]:
# Print each column's name, type and nullability as resolved when the CSV was read.
df.printSchema()

root
 |-- x: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- educational-num: integer (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- capital-gain: integer (nullable = true)
 |-- capital-loss: integer (nullable = true)
 |-- hours-per-week: integer (nullable = true)
 |-- native-country: string (nullable = true)
 |-- income: string (nullable = true)



In [9]:
# expr: evaluate a SQL expression string as a Column.
# Here `||` joins the workclass and occupation columns with a ',' separator in between.
df.withColumn(
    'work_class_occupation',
    F.expr("workclass || ',' || occupation"),
).show()

+---+---+----------------+------+------------+---------------+------------------+-----------------+-------------+------------------+------+------------+------------+--------------+--------------+------+---------------------+
|  x|age|       workclass|fnlwgt|   education|educational-num|    marital-status|       occupation| relationship|              race|gender|capital-gain|capital-loss|hours-per-week|native-country|income|work_class_occupation|
+---+---+----------------+------+------------+---------------+------------------+-----------------+-------------+------------------+------+------------+------------+--------------+--------------+------+---------------------+
|  1| 25|         Private|226802|        11th|              7|     Never-married|Machine-op-inspct|    Own-child|             Black|  Male|           0|           0|            40| United-States| <=50K| Private,Machine-o...|
|  2| 38|         Private| 89814|     HS-grad|              9|Married-civ-spouse|  Farming-fishing| 

In [13]:
# Derive workclass_new with a SQL CASE expression:
#   'State-gov' -> 'State-Govt', '?' -> 'UNKNOWN', anything else -> 'NA'.
# The fragments are adjacent string literals, so each one ends with a space to keep
# the SQL tokens separated once they are joined (the previous version produced
# "'State-Govt'WHEN" and "'?'THEN" with no whitespace between tokens).
df.withColumn(
    'workclass_new',
    F.expr(
        "CASE "
        "WHEN workclass = 'State-gov' THEN 'State-Govt' "
        "WHEN workclass = '?' THEN 'UNKNOWN' "
        "ELSE 'NA' "
        "END"
    ),
).show()

+---+---+----------------+------+------------+---------------+------------------+-----------------+-------------+------------------+------+------------+------------+--------------+--------------+------+-------------+
|  x|age|       workclass|fnlwgt|   education|educational-num|    marital-status|       occupation| relationship|              race|gender|capital-gain|capital-loss|hours-per-week|native-country|income|workclass_new|
+---+---+----------------+------+------------+---------------+------------------+-----------------+-------------+------------------+------+------------+------------+--------------+--------------+------+-------------+
|  1| 25|         Private|226802|        11th|              7|     Never-married|Machine-op-inspct|    Own-child|             Black|  Male|           0|           0|            40| United-States| <=50K|           NA|
|  2| 38|         Private| 89814|     HS-grad|              9|Married-civ-spouse|  Farming-fishing|      Husband|             White|

In [16]:
# Project two existing columns plus a computed one; the output column name
# (new_salary) is declared with `as` inside the SQL expression string itself.
df.select(
    df['workclass'],
    df['fnlwgt'],
    F.expr('fnlwgt + 10000 as new_salary'),
).show()

+----------------+------+----------+
|       workclass|fnlwgt|new_salary|
+----------------+------+----------+
|         Private|226802|    236802|
|         Private| 89814|     99814|
|       Local-gov|336951|    346951|
|         Private|160323|    170323|
|               ?|103497|    113497|
|         Private|198693|    208693|
|               ?|227026|    237026|
|Self-emp-not-inc|104626|    114626|
|         Private|369667|    379667|
|         Private|104996|    114996|
|         Private|184454|    194454|
|     Federal-gov|212465|    222465|
|         Private| 82091|     92091|
|               ?|299831|    309831|
|         Private|279724|    289724|
|         Private|346189|    356189|
|       State-gov|444554|    454554|
|         Private|128354|    138354|
|         Private| 60548|     70548|
|         Private| 85019|     95019|
+----------------+------+----------+
only showing top 20 rows



In [17]:

# Keep only the rows whose x value equals age; where() is an alias of filter().
df.where(F.expr('x == age')).show()

+---+---+---------+------+---------+---------------+------------------+--------------+------------+-----+------+------------+------------+--------------+--------------+------+
|  x|age|workclass|fnlwgt|education|educational-num|    marital-status|    occupation|relationship| race|gender|capital-gain|capital-loss|hours-per-week|native-country|income|
+---+---+---------+------+---------+---------------+------------------+--------------+------------+-----+------+------------+------------+--------------+--------------+------+
| 25| 25|  Private|205947|Bachelors|             13|Married-civ-spouse|Prof-specialty|     Husband|White|  Male|           0|           0|            40| United-States| <=50K|
+---+---+---------+------+---------+---------------+------------------+--------------+------------+-----+------+------------+------------+--------------+--------------+------+

