1. How to import PySpark and check the version?

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("PySpark 101 Exercises").getOrCreate()
print(spark.version)

24/09/23 13:50:01 WARN Utils: Your hostname, AI-CJB-LAP-460 resolves to a loopback address: 127.0.1.1; using 192.168.1.165 instead (on interface wlp0s20f3)
24/09/23 13:50:01 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/23 13:50:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


3.5.2


2. How to convert the index of a PySpark DataFrame into a column?

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql.functions import row_number
spark = SparkSession.builder \
    .appName("Convert Index to Column") \
    .getOrCreate()
df = spark.createDataFrame([
    ("Alice", 1),
    ("Bob", 2),
    ("Charlie", 3),
], ["Name", "Value"])
win= Window.orderBy("Name") 
df_with_index = df.withColumn("Index", row_number().over(win))
df_with_index.show()


24/09/23 14:07:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/23 14:07:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/23 14:07:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-------+-----+-----+
|   Name|Value|Index|
+-------+-----+-----+
|  Alice|    1|    1|
|    Bob|    2|    2|
|Charlie|    3|    3|
+-------+-----+-----+



3. How to combine many lists to form a PySpark DataFrame?

In [9]:
list1 = ["a", "b", "c", "d"]
list2 = [1, 2, 3, 4]
rdd=spark.sparkContext.parallelize(list(zip(list1,list2)))
df=rdd.toDF(["col1","col2"])
df.show()


+----+----+
|col1|col2|
+----+----+
|   a|   1|
|   b|   2|
|   c|   3|
|   d|   4|
+----+----+



4. How to get the items of list A not present in list B?

In [13]:
list_A = [1, 2, 3, 4, 5]
list_B = [4, 5, 6, 7, 8]
rdd1=spark.sparkContext.parallelize(list_A)
rdd2=spark.sparkContext.parallelize(list_B)
res=rdd1.subtract(rdd2)
res.collect()


                                                                                

[1, 2, 3]

5. How to get the items not common to both list A and list B?

In [14]:
list_A = [1, 2, 3, 4, 5]
list_B = [4, 5, 6, 7, 8]
rdd1=spark.sparkContext.parallelize(list_A)
rdd2=spark.sparkContext.parallelize(list_B)
res1=rdd1.subtract(rdd2)
res2=rdd2.subtract(rdd1)
res=res1.union(res2)
res.collect()


                                                                                

[1, 2, 3, 6, 7, 8]

6. How to get the minimum, 25th percentile, median, 75th, and max of a numeric column?

In [15]:
data = [("A", 10), ("B", 20), ("C", 30), ("D", 40), ("E", 50), ("F", 15), ("G", 28), ("H", 54), ("I", 41), ("J", 86)]
df = spark.createDataFrame(data, ["Name", "Age"])
df.show()

quantiles = df.approxQuantile("Age", [0.0, 0.25, 0.5, 0.75, 1.0], 0.01)

print("Min: ", quantiles[0])
print("25th percentile: ", quantiles[1])
print("Median: ", quantiles[2])
print("75th percentile: ", quantiles[3])
print("Max: ", quantiles[4])

+----+---+
|Name|Age|
+----+---+
|   A| 10|
|   B| 20|
|   C| 30|
|   D| 40|
|   E| 50|
|   F| 15|
|   G| 28|
|   H| 54|
|   I| 41|
|   J| 86|
+----+---+

Min:  10.0
25th percentile:  20.0
Median:  30.0
75th percentile:  50.0
Max:  86.0


7. How to get frequency counts of unique items of a column?

In [20]:
from pyspark.sql import Row

data = [
Row(name='John', job='Engineer'),
Row(name='John', job='Engineer'),
Row(name='Mary', job='Scientist'),
Row(name='Bob', job='Engineer'),
Row(name='Bob', job='Engineer'),
Row(name='Bob', job='Scientist'),
Row(name='Sam', job='Doctor'),
]
df = spark.createDataFrame(data)
df.groupBy("name").count().show()
df.groupBy("job").count().show()

+----+-----+
|name|count|
+----+-----+
|John|    2|
|Mary|    1|
| Bob|    3|
| Sam|    1|
+----+-----+

+---------+-----+
|      job|count|
+---------+-----+
| Engineer|    4|
|Scientist|    2|
|   Doctor|    1|
+---------+-----+



8. How to keep only top 2 most frequent values as it is and replace everything else as ‘Other’?

In [23]:
from pyspark.sql import Row
from pyspark.sql.functions import col, when
data = [
Row(name='John', job='Engineer'),
Row(name='John', job='Engineer'),
Row(name='Mary', job='Scientist'),
Row(name='Bob', job='Engineer'),
Row(name='Bob', job='Engineer'),
Row(name='Bob', job='Scientist'),
Row(name='Sam', job='Doctor'),
]
df = spark.createDataFrame(data)
top_2 = df.groupBy('job').count()\
    .orderBy('count', ascending=False)\
        .limit(2).select('job').rdd.flatMap(lambda x: x).collect()
df = df.withColumn('job', when(col('job')\
.isin(top_2), col('job')).otherwise('Other'))
df.show()

+----+---------+
|name|      job|
+----+---------+
|John| Engineer|
|John| Engineer|
|Mary|Scientist|
| Bob| Engineer|
| Bob| Engineer|
| Bob|Scientist|
| Sam|    Other|
+----+---------+



9. How to Drop rows with NA values specific to a particular column?

In [26]:
df = spark.createDataFrame([
("A", 1, None),
("B", None, "123" ),
("B", 3, "456"),
("D", None, None),
], ["Name", "Value", "id"])
df1=df.dropna(subset=["Value"])
df1.show()

+----+-----+----+
|Name|Value|  id|
+----+-----+----+
|   A|    1|NULL|
|   B|    3| 456|
+----+-----+----+



10. How to rename columns of a PySpark DataFrame using two lists – one containing the old column names and the other containing the new column names?

In [36]:
df = spark.createDataFrame([(1, 2, 3), (4, 5, 6)], ["col1", "col2", "col3"])
old_names = ["col1", "col2", "col3"]
new_names = ["new_col1", "new_col2", "new_col3"]
df1 = df.toDF(*new_names)
df1.show()

+--------+--------+--------+
|new_col1|new_col2|new_col3|
+--------+--------+--------+
|       1|       2|       3|
|       4|       5|       6|
+--------+--------+--------+



11. How to bin a numeric list to 10 groups of equal size?

In [45]:
from pyspark.sql.functions import rand
from pyspark.sql import functions as F
num_items = 100
df = spark.range(num_items).select\
    (rand(seed=42).alias("values"))
min_value = df.agg(F.min("values")).first()[0]
max_value = df.agg(F.max("values")).first()[0]
bin_size = (max_value - min_value) / 10
binned_df = df.withColumn(
    "bucket",
    (F.col("values") - min_value) / bin_size
)
binned_df = binned_df.withColumn("bucket", F.ceil(F.col("bucket")))
print("Binned DataFrame:")
binned_df.show(5)


Binned DataFrame:
+-------------------+------+
|             values|bucket|
+-------------------+------+
|  0.619189370225301|     7|
| 0.5096018842446481|     6|
| 0.8325259388871524|     9|
|0.26322809041172357|     3|
| 0.6702867696264135|     7|
+-------------------+------+
only showing top 5 rows



12. How to create contigency table?

In [53]:
data = [("A", "X"), ("A", "Y"), ("A", "X"), ("B", "Y"), ("B", "X"), ("C", "X"), ("C", "X"), ("C", "Y")]
df = spark.createDataFrame(data, ["category1", "category2"])
df.crosstab("category1", "category2").show()



+-------------------+---+---+
|category1_category2|  X|  Y|
+-------------------+---+---+
|                  B|  1|  1|
|                  C|  2|  1|
|                  A|  2|  1|
+-------------------+---+---+



13. How to find the numbers that are multiples of 3 from a column?

In [68]:
from pyspark.sql.functions import rand
df = spark.range(10)
df = df.withColumn("random", ((rand(seed=42) * 10) + 1).cast("int"))
df1=df.withColumn("is_multiple_of_3", when(col("random") % 3 == 0,"yes").otherwise('no'))
df1.show()

+---+------+----------------+
| id|random|is_multiple_of_3|
+---+------+----------------+
|  0|     7|              no|
|  1|     9|             yes|
|  2|     8|              no|
|  3|     8|              no|
|  4|     3|             yes|
|  5|     1|              no|
|  6|     7|              no|
|  7|     4|              no|
|  8|     5|              no|
|  9|     1|              no|
+---+------+----------------+



14. How to extract items at given positions from a column?

In [69]:
from pyspark.sql.functions import rand

df = df.withColumn("random", ((rand(seed=42) * 10) + 1).cast("int"))

pos = [0, 4, 8, 5]
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, monotonically_increasing_id

pos = [0, 4, 8, 5]

# Define window specification
w = Window.orderBy(monotonically_increasing_id())

# Add index
df = df.withColumn("index", row_number().over(w) - 1)

df.show()

# Filter the DataFrame based on the specified positions
df_filtered = df.filter(df.index.isin(pos))

df_filtered.show()

24/09/23 15:21:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/23 15:21:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/23 15:21:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/23 15:21:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/23 15:21:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+---+------+-----+
| id|random|index|
+---+------+-----+
|  0|     7|    0|
|  1|     9|    1|
|  2|     8|    2|
|  3|     8|    3|
|  4|     3|    4|
|  5|     1|    5|
|  6|     7|    6|
|  7|     4|    7|
|  8|     5|    8|
|  9|     1|    9|
+---+------+-----+

+---+------+-----+
| id|random|index|
+---+------+-----+
|  0|     7|    0|
|  4|     3|    4|
|  5|     1|    5|
|  8|     5|    8|
+---+------+-----+



24/09/23 15:21:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/23 15:21:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/23 15:21:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/23 15:21:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/23 15:21:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


15. How to stack two DataFrames vertically ?

In [72]:
df_A = spark.createDataFrame([("apple", 3, 5), ("banana", 1, 10), ("orange", 2, 8)], ["Name", "Col_1", "Col_2"])
df_B = spark.createDataFrame([("apple", 3, 5), ("banana", 1, 15), ("grape", 4, 6)], ["Name", "Col_1", "Col_3"])
df_A.union(df_B).show()

+------+-----+-----+
|  Name|Col_1|Col_2|
+------+-----+-----+
| apple|    3|    5|
|banana|    1|   10|
|orange|    2|    8|
| apple|    3|    5|
|banana|    1|   15|
| grape|    4|    6|
+------+-----+-----+



16. How to compute the mean squared error on a truth and predicted columns?

In [77]:
from pyspark.sql import functions as F
data = [(1, 1), (2, 4), (3, 9), (4, 16), (5, 25)]
df = spark.createDataFrame(data, ["actual", "predicted"])
df.select(F.mean((F.col("actual") - F.col("predicted")) ** 2).alias("MSE")).show()

+-----+
|  MSE|
+-----+
|116.8|
+-----+



17. How to convert the first character of each element in a series to uppercase?

In [81]:
from pyspark.sql.functions import initcap
data = [("john",), ("alice",), ("bob",)]
df = spark.createDataFrame(data, ["name"])

df = df.withColumn("name", initcap(df.name))
df.show()

+-----+
| name|
+-----+
| John|
|Alice|
|  Bob|
+-----+



18. How to compute summary statistics for all columns in a dataframe

In [83]:
data = [('James', 34, 55000),
('Michael', 30, 70000),
('Robert', 37, 60000),
('Maria', 29, 80000),
('Jen', 32, 65000)]

df = spark.createDataFrame(data, ["name", "age" , "salary"])
df.summary().show()


24/09/23 15:44:13 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+-------+------+-----------------+-----------------+
|summary|  name|              age|           salary|
+-------+------+-----------------+-----------------+
|  count|     5|                5|                5|
|   mean|  NULL|             32.4|          66000.0|
| stddev|  NULL|3.209361307176242|9617.692030835675|
|    min| James|               29|            55000|
|    25%|  NULL|               30|            60000|
|    50%|  NULL|               32|            65000|
|    75%|  NULL|               34|            70000|
|    max|Robert|               37|            80000|
+-------+------+-----------------+-----------------+




19. How to calculate the number of characters in each word in a column?

In [84]:
from pyspark.sql import functions as F
data = [("john",), ("alice",), ("bob",)]
df = spark.createDataFrame(data, ["name"])
df = df.withColumn('len', F.length(df.name))
df.show()

+-----+---+
| name|len|
+-----+---+
| john|  4|
|alice|  5|
|  bob|  3|
+-----+---+



In [91]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
spark = SparkSession.builder \
    .appName("Difference of Differences") \
    .getOrCreate()
data = [('James', 34, 55000),
        ('Michael', 30, 70000),
        ('Robert', 37, 60000),
        ('Maria', 29, 80000),
        ('Jen', 32, 65000)]
df = spark.createDataFrame(data, ["Name", "Age", "Salary"])
window_spec = Window.orderBy("Salary")
df_first_diff = df.withColumn("first_diff", F.col("Salary") - F.lag("Salary").over(window_spec))
df_second_diff = df_first_diff.withColumn("second_diff", 
    F.col("first_diff") - F.lag("first_diff").over(window_spec))
df_second_diff.show()

24/09/23 15:58:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/23 15:58:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/23 15:58:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/23 15:58:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/23 15:58:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-------+---+------+----------+-----------+
|   Name|Age|Salary|first_diff|second_diff|
+-------+---+------+----------+-----------+
|  James| 34| 55000|      NULL|       NULL|
| Robert| 37| 60000|      5000|       NULL|
|    Jen| 32| 65000|      5000|          0|
|Michael| 30| 70000|      5000|          0|
|  Maria| 29| 80000|     10000|       5000|
+-------+---+------+----------+-----------+



24/09/23 15:58:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/23 15:58:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/23 15:58:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/23 15:58:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


21. How to get the day of month, week number, day of year and day of week from a date strings?

In [102]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import to_date, dayofmonth, weekofyear, dayofyear, dayofweek
spark = SparkSession.builder \
    .appName("dateex") \
    .getOrCreate()
data = [("2023-05-18", "01 Jan 2010"), ("2023-12-31", "01 Jan 2010")]
df = spark.createDataFrame(data, ["date_str_1", "date_str_2"])

df = df.withColumn("date_1", to_date(df.date_str_1, 'yyyy-MM-dd'))
df = df.withColumn("date_2", to_date(df.date_str_2, 'dd MMM yyyy'))

df = df.withColumn("day_of_month", dayofmonth(df.date_1))\
.withColumn("week_number", weekofyear(df.date_1))\
.withColumn("day_of_year", dayofyear(df.date_1))\
.withColumn("day_of_week", dayofweek(df.date_1))

df.show()


+----------+-----------+----------+----------+------------+-----------+-----------+-----------+
|date_str_1| date_str_2|    date_1|    date_2|day_of_month|week_number|day_of_year|day_of_week|
+----------+-----------+----------+----------+------------+-----------+-----------+-----------+
|2023-05-18|01 Jan 2010|2023-05-18|2010-01-01|          18|         20|        138|          5|
|2023-12-31|01 Jan 2010|2023-12-31|2010-01-01|          31|         52|        365|          1|
+----------+-----------+----------+----------+------------+-----------+-----------+-----------+



22. How to convert year-month string to dates corresponding to the 4th day of the month?

In [104]:
from pyspark.sql.functions import expr, col
df = spark.createDataFrame([('Jan 2010',), ('Feb 2011',), ('Mar 2012',)], ['MonthYear'])
df = df.withColumn('Date', expr("to_date(MonthYear, 'MMM yyyy')"))

df = df.withColumn('Date', expr("date_add(date_sub(Date, day(Date) - 1), 3)"))

df.show()


+---------+----------+
|MonthYear|      Date|
+---------+----------+
| Jan 2010|2010-01-04|
| Feb 2011|2011-02-04|
| Mar 2012|2012-03-04|
+---------+----------+



23 How to filter words that contain atleast 2 vowels from a series?

In [108]:
df = spark.createDataFrame([('Apple',), ('Orange',), ('Plan',) , ('Python',) , ('Money',)], ['Word'])
from pyspark.sql.functions import col, length, translate

df2 = df.filter(F.col("word").rlike(r'[aeiouAEIOU].*[aeiouAEIOU]'))
df2.show()

+------+
|  Word|
+------+
| Apple|
|Orange|
| Money|
+------+



24. How to filter valid emails from a list?

In [111]:
data = ['buying books at amazom.com', 'rameses@egypt.com', 'matt@t.co', 'narendra@modi.com']
df = spark.createDataFrame(data, "string")
pattern = '[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,4}'
df2=df.filter(F.col("value").rlike(pattern))
df2.show(truncate =False)

+-----------------+
|value            |
+-----------------+
|rameses@egypt.com|
|matt@t.co        |
|narendra@modi.com|
+-----------------+



25. How to Pivot PySpark DataFrame?


In [115]:
data = [
(2021, 1, "US", 5000),
(2021, 1, "EU", 4000),
(2021, 2, "US", 5500),
(2021, 2, "EU", 4500),
(2021, 3, "US", 6000),
(2021, 3, "EU", 5000),
(2021, 4, "US", 7000),
(2021, 4, "EU", 6000),
]
columns = ["year", "quarter", "region", "revenue"]
df = spark.createDataFrame(data, columns)
pivot_df=df.groupBy("year","region").pivot("quarter").agg(F.mean("revenue"))
pivot_df.show()


+----+------+------+------+------+------+
|year|region|     1|     2|     3|     4|
+----+------+------+------+------+------+
|2021|    US|5000.0|5500.0|6000.0|7000.0|
|2021|    EU|4000.0|4500.0|5000.0|6000.0|
+----+------+------+------+------+------+



26. How to get the mean of a variable grouped by another variable?

In [118]:
data = [("1001", "Laptop", 1000),
("1002", "Mouse", 50),
("1003", "Laptop", 1200),
("1004", "Mouse", 30),
("1005", "Smartphone", 700)]
columns = ["OrderID", "Product", "Price"]
df = spark.createDataFrame(data, columns)
df2=df.groupBy("Product").agg(F.mean("Price"))
df2.show()

+----------+----------+
|   Product|avg(Price)|
+----------+----------+
|    Laptop|    1100.0|
|     Mouse|      40.0|
|Smartphone|     700.0|
+----------+----------+



27. How to compute the euclidean distance between two columns?

In [123]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
spark = SparkSession.builder \
    .appName("Euclideanex") \
    .getOrCreate()
data = [(1, 10), (2, 9), (3, 8), (4, 7), (5, 6), (6, 5), (7, 4), (8, 3), (9, 2), (10, 1)]
df = spark.createDataFrame(data, ["series1", "series2"])
df = df.withColumn("euclidean_distance", F.sqrt((F.col("series1") - F.col("series2"))**2))
df.show()

+-------+-------+------------------+
|series1|series2|euclidean_distance|
+-------+-------+------------------+
|      1|     10|               9.0|
|      2|      9|               7.0|
|      3|      8|               5.0|
|      4|      7|               3.0|
|      5|      6|               1.0|
|      6|      5|               1.0|
|      7|      4|               3.0|
|      8|      3|               5.0|
|      9|      2|               7.0|
|     10|      1|               9.0|
+-------+-------+------------------+



28. How to replace missing spaces in a string with the least frequent character?

In [135]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from collections import Counter

df = spark.createDataFrame([('dbc deb abed gade',)], ["string"])
def least_frequent_char(s):
    s = s.replace(" ", "")  
    if not s:  
        return None
    return min(Counter(s), key=Counter(s).get)
least_f = F.udf(least_frequent_char)
least_char = df.select(least_f(F.col("string")).alias("least_char")).first()[0]
if least_char:
    df = df.withColumn("replaced_string", F.regexp_replace(F.col("string"), " ", least_char))

df.show(truncate=False)

+-----------------+-----------------+
|string           |replaced_string  |
+-----------------+-----------------+
|dbc deb abed gade|dbccdebcabedcgade|
+-----------------+-----------------+



29. How to create a TimeSeries starting ‘2000-01-01’ and 10 weekends (saturdays) after that having random numbers as values?

In [144]:
from pyspark.sql.functions import expr, explode, sequence, rand

# Start date and end date (start + 10 weekends)
start_date = '2000-01-01'
end_date = '2000-03-04' 
df = spark.range(1).select(
explode(
sequence(
expr(f"date '{start_date}'"),
expr(f"date '{end_date}'"),
expr("interval 1 day")
)
).alias("date")
)
df = df.filter(expr("dayofweek(date) = 7")) 
df = df.withColumn("random_numbers", ((rand(seed=42) * 10) + 1).cast("int"))

df.show()

+----------+--------------+
|      date|random_numbers|
+----------+--------------+
|2000-01-01|             5|
|2000-01-08|             1|
|2000-01-15|             9|
|2000-01-22|             6|
|2000-01-29|             3|
|2000-02-05|             2|
|2000-02-12|             3|
|2000-02-19|             2|
|2000-02-26|             6|
|2000-03-04|             4|
+----------+--------------+



30. How to get the nrows, ncolumns, datatype of a dataframe?


In [150]:
from pyspark import SparkFiles

# Load the dataset
url = "https://raw.githubusercontent.com/selva86/datasets/master/Churn_Modelling.csv"
spark.sparkContext.addFile(url)
df = spark.read.csv(SparkFiles.get("Churn_Modelling.csv"), header=True, inferSchema=True)

# Get the number of rows and columns
nrows = df.count()
ncolumns = len(df.columns)

# Get the data types
data_types = df.dtypes

# Get summary statistics
summary_stats = df.describe().show()

# Convert to NumPy array and list
numpy_array = df.toPandas().values
list_equiv = df.collect()

# Output the results
print(f"Number of rows: {nrows}")
print(f"Number of columns: {ncolumns}")




24/09/23 17:58:07 WARN SparkContext: The path https://raw.githubusercontent.com/selva86/datasets/master/Churn_Modelling.csv has been added already. Overwriting of added paths is not supported in the current version.


+-------+------------------+-----------------+-------+-----------------+---------+------+------------------+------------------+-----------------+------------------+-------------------+-------------------+-----------------+-------------------+
|summary|         RowNumber|       CustomerId|Surname|      CreditScore|Geography|Gender|               Age|            Tenure|          Balance|     NumOfProducts|          HasCrCard|     IsActiveMember|  EstimatedSalary|             Exited|
+-------+------------------+-----------------+-------+-----------------+---------+------+------------------+------------------+-----------------+------------------+-------------------+-------------------+-----------------+-------------------+
|  count|             10000|            10000|  10000|            10000|    10000| 10000|             10000|             10000|            10000|             10000|              10000|              10000|            10000|              10000|
|   mean|            5000.5|

31. How to rename a specific columns in a dataframe?

In [153]:
df = spark.createDataFrame([('Alice', 1, 30),('Bob', 2, 35)], ["name", "age", "qty"])

old_names = ["qty", "age"]
new_names = ["user_qty", "user_age"]
for old_name, new_name in zip(old_names, new_names):
    df = df.withColumnRenamed(old_name, new_name)

df.show()


+-----+--------+--------+
| name|user_age|user_qty|
+-----+--------+--------+
|Alice|       1|      30|
|  Bob|       2|      35|
+-----+--------+--------+



32. How to check if a dataframe has any missing values and count of missing values in each column?

In [157]:
from pyspark.sql.functions import col, sum
df = spark.createDataFrame([
("A", 1, None),
("B", None, "123" ),
("B", 3, "456"),
("D", None, None),
], ["Name", "Value", "id"])
missing = df.select(*(sum(col(c).isNull().cast("int")).alias(c) for c in df.columns))
has_missing = any(row.asDict().values() for row in missing.collect())

missing_count = missing.collect()[0].asDict()
print(missing_count)

df.show()

{'Name': 0, 'Value': 2, 'id': 2}
+----+-----+----+
|Name|Value|  id|
+----+-----+----+
|   A|    1|NULL|
|   B| NULL| 123|
|   B|    3| 456|
|   D| NULL|NULL|
+----+-----+----+



33 How to replace missing values of multiple numeric columns with the mean?

In [159]:
from pyspark.sql import functions as F

df = spark.createDataFrame([
    ("A", 1, None),
    ("B", None, 123),
    ("B", 3, 456),
    ("D", 6, None),
], ["Name", "var1", "var2"])

mean_values = df.select([F.mean(column).alias(column) for column in ["var1", "var2"]]).collect()[0]

df2 = df.na.fill({
    "var1": mean_values["var1"],
    "var2": mean_values["var2"]
})

df2.show()


+----+----+----+
|Name|var1|var2|
+----+----+----+
|   A|   1| 289|
|   B|   3| 123|
|   B|   3| 456|
|   D|   6| 289|
+----+----+----+



34. How to change the order of columns of a dataframe?

In [160]:

data = [("John", "Doe", 30), ("Jane", "Doe", 25), ("Alice", "Smith", 22)]
df = spark.createDataFrame(data, ["First_Name", "Last_Name", "Age"])
new_order = ["Age", "First_Name", "Last_Name"]
df = df.select(*new_order)
df.show()

+---+----------+---------+
|Age|First_Name|Last_Name|
+---+----------+---------+
| 30|      John|      Doe|
| 25|      Jane|      Doe|
| 22|     Alice|    Smith|
+---+----------+---------+



35. How to format or suppress scientific notations in a PySpark DataFrame?

In [161]:
df = spark.createDataFrame([(1, 0.000000123), (2, 0.000023456), (3, 0.000345678)], ["id", "your_column"])
from pyspark.sql.functions import format_number

# Determine the number of decimal places you want
decimal_places = 10

df = df.withColumn("your_column", format_number("your_column", decimal_places))
df.show()


+---+------------+
| id| your_column|
+---+------------+
|  1|0.0000001230|
|  2|0.0000234560|
|  3|0.0003456780|
+---+------------+

