1. How to import PySpark and check the version?

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("PySpark 101 Exercises").getOrCreate()
print(spark.version)

24/09/23 13:50:01 WARN Utils: Your hostname, AI-CJB-LAP-460 resolves to a loopback address: 127.0.1.1; using 192.168.1.165 instead (on interface wlp0s20f3)
24/09/23 13:50:01 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/23 13:50:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


3.5.2


2. How to convert the index of a PySpark DataFrame into a column?

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql.functions import row_number
spark = SparkSession.builder \
    .appName("Convert Index to Column") \
    .getOrCreate()
df = spark.createDataFrame([
    ("Alice", 1),
    ("Bob", 2),
    ("Charlie", 3),
], ["Name", "Value"])
win= Window.orderBy("Name") 
df_with_index = df.withColumn("Index", row_number().over(win))
df_with_index.show()


24/09/23 14:07:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/23 14:07:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/23 14:07:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-------+-----+-----+
|   Name|Value|Index|
+-------+-----+-----+
|  Alice|    1|    1|
|    Bob|    2|    2|
|Charlie|    3|    3|
+-------+-----+-----+



3. How to combine many lists to form a PySpark DataFrame?

In [9]:
list1 = ["a", "b", "c", "d"]
list2 = [1, 2, 3, 4]
rdd=spark.sparkContext.parallelize(list(zip(list1,list2)))
df=rdd.toDF(["col1","col2"])
df.show()


+----+----+
|col1|col2|
+----+----+
|   a|   1|
|   b|   2|
|   c|   3|
|   d|   4|
+----+----+



4. How to get the items of list A not present in list B?

In [13]:
list_A = [1, 2, 3, 4, 5]
list_B = [4, 5, 6, 7, 8]
rdd1=spark.sparkContext.parallelize(list_A)
rdd2=spark.sparkContext.parallelize(list_B)
res=rdd1.subtract(rdd2)
res.collect()


                                                                                

[1, 2, 3]

5. How to get the items not common to both list A and list B?

In [14]:
list_A = [1, 2, 3, 4, 5]
list_B = [4, 5, 6, 7, 8]
rdd1=spark.sparkContext.parallelize(list_A)
rdd2=spark.sparkContext.parallelize(list_B)
res1=rdd1.subtract(rdd2)
res2=rdd2.subtract(rdd1)
res=res1.union(res2)
res.collect()


                                                                                

[1, 2, 3, 6, 7, 8]

6. How to get the minimum, 25th percentile, median, 75th, and max of a numeric column?

In [15]:
data = [("A", 10), ("B", 20), ("C", 30), ("D", 40), ("E", 50), ("F", 15), ("G", 28), ("H", 54), ("I", 41), ("J", 86)]
df = spark.createDataFrame(data, ["Name", "Age"])
df.show()

quantiles = df.approxQuantile("Age", [0.0, 0.25, 0.5, 0.75, 1.0], 0.01)

print("Min: ", quantiles[0])
print("25th percentile: ", quantiles[1])
print("Median: ", quantiles[2])
print("75th percentile: ", quantiles[3])
print("Max: ", quantiles[4])

+----+---+
|Name|Age|
+----+---+
|   A| 10|
|   B| 20|
|   C| 30|
|   D| 40|
|   E| 50|
|   F| 15|
|   G| 28|
|   H| 54|
|   I| 41|
|   J| 86|
+----+---+

Min:  10.0
25th percentile:  20.0
Median:  30.0
75th percentile:  50.0
Max:  86.0


7. How to get frequency counts of unique items of a column?

In [20]:
from pyspark.sql import Row

data = [
Row(name='John', job='Engineer'),
Row(name='John', job='Engineer'),
Row(name='Mary', job='Scientist'),
Row(name='Bob', job='Engineer'),
Row(name='Bob', job='Engineer'),
Row(name='Bob', job='Scientist'),
Row(name='Sam', job='Doctor'),
]
df = spark.createDataFrame(data)
df.groupBy("name").count().show()
df.groupBy("job").count().show()

+----+-----+
|name|count|
+----+-----+
|John|    2|
|Mary|    1|
| Bob|    3|
| Sam|    1|
+----+-----+

+---------+-----+
|      job|count|
+---------+-----+
| Engineer|    4|
|Scientist|    2|
|   Doctor|    1|
+---------+-----+



8. How to keep only top 2 most frequent values as it is and replace everything else as ‘Other’?

In [23]:
from pyspark.sql import Row
from pyspark.sql.functions import col, when
data = [
Row(name='John', job='Engineer'),
Row(name='John', job='Engineer'),
Row(name='Mary', job='Scientist'),
Row(name='Bob', job='Engineer'),
Row(name='Bob', job='Engineer'),
Row(name='Bob', job='Scientist'),
Row(name='Sam', job='Doctor'),
]
df = spark.createDataFrame(data)
top_2 = df.groupBy('job').count()\
    .orderBy('count', ascending=False)\
        .limit(2).select('job').rdd.flatMap(lambda x: x).collect()
df = df.withColumn('job', when(col('job')\
.isin(top_2), col('job')).otherwise('Other'))
df.show()

+----+---------+
|name|      job|
+----+---------+
|John| Engineer|
|John| Engineer|
|Mary|Scientist|
| Bob| Engineer|
| Bob| Engineer|
| Bob|Scientist|
| Sam|    Other|
+----+---------+



9. How to Drop rows with NA values specific to a particular column?

In [26]:
df = spark.createDataFrame([
("A", 1, None),
("B", None, "123" ),
("B", 3, "456"),
("D", None, None),
], ["Name", "Value", "id"])
df1=df.dropna(subset=["Value"])
df1.show()

+----+-----+----+
|Name|Value|  id|
+----+-----+----+
|   A|    1|NULL|
|   B|    3| 456|
+----+-----+----+



10. How to rename columns of a PySpark DataFrame using two lists – one containing the old column names and the other containing the new column names?

In [36]:
df = spark.createDataFrame([(1, 2, 3), (4, 5, 6)], ["col1", "col2", "col3"])
old_names = ["col1", "col2", "col3"]
new_names = ["new_col1", "new_col2", "new_col3"]
df1 = df.toDF(*new_names)
df1.show()

+--------+--------+--------+
|new_col1|new_col2|new_col3|
+--------+--------+--------+
|       1|       2|       3|
|       4|       5|       6|
+--------+--------+--------+



11. How to bin a numeric list to 10 groups of equal size?

In [45]:
from pyspark.sql.functions import rand
from pyspark.sql import functions as F
num_items = 100
df = spark.range(num_items).select\
    (rand(seed=42).alias("values"))
min_value = df.agg(F.min("values")).first()[0]
max_value = df.agg(F.max("values")).first()[0]
bin_size = (max_value - min_value) / 10
binned_df = df.withColumn(
    "bucket",
    (F.col("values") - min_value) / bin_size
)
binned_df = binned_df.withColumn("bucket", F.ceil(F.col("bucket")))
print("Binned DataFrame:")
binned_df.show(5)


Binned DataFrame:
+-------------------+------+
|             values|bucket|
+-------------------+------+
|  0.619189370225301|     7|
| 0.5096018842446481|     6|
| 0.8325259388871524|     9|
|0.26322809041172357|     3|
| 0.6702867696264135|     7|
+-------------------+------+
only showing top 5 rows



12. How to create contigency table?

In [53]:
data = [("A", "X"), ("A", "Y"), ("A", "X"), ("B", "Y"), ("B", "X"), ("C", "X"), ("C", "X"), ("C", "Y")]
df = spark.createDataFrame(data, ["category1", "category2"])
df.crosstab("category1", "category2").show()



+-------------------+---+---+
|category1_category2|  X|  Y|
+-------------------+---+---+
|                  B|  1|  1|
|                  C|  2|  1|
|                  A|  2|  1|
+-------------------+---+---+



13. How to find the numbers that are multiples of 3 from a column?

In [59]:
from pyspark.sql.functions import rand
df = spark.range(10)
df = df.withColumn("random", ((rand(seed=42) * 10) + 1).cast("int"))
df=df.withColumn("is_multiple_of_3", when(col("random") % 3 == 0,"yes").otherwise('no'))
df.show()

+---+------+----------------+
| id|random|is_multiple_of_3|
+---+------+----------------+
|  0|     7|              no|
|  1|     9|             yes|
|  2|     8|              no|
|  3|     8|              no|
|  4|     3|             yes|
|  5|     1|              no|
|  6|     7|              no|
|  7|     4|              no|
|  8|     5|              no|
|  9|     1|              no|
+---+------+----------------+



14. How to extract items at given positions from a column?

In [60]:
from pyspark.sql.functions import rand
df = df.withColumn("random", ((rand(seed=42) * 10) + 1).cast("int"))
df.show()
pos = [0, 4, 8, 5]
df.select(pos)

+---+------+----------------+
| id|random|is_multiple_of_3|
+---+------+----------------+
|  0|     7|              no|
|  1|     9|             yes|
|  2|     8|              no|
|  3|     8|              no|
|  4|     3|             yes|
|  5|     1|              no|
|  6|     7|              no|
|  7|     4|              no|
|  8|     5|              no|
|  9|     1|              no|
+---+------+----------------+



PySparkTypeError: [NOT_COLUMN_OR_STR] Argument `col` should be a Column or str, got int.