https://www.machinelearningplus.com/pyspark/pyspark-exercises-101-pyspark-exercises-for-data-analysis/

### 1. How to import PySpark and check the version?

In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName('Ex').getOrCreate()

In [4]:
print(spark.version)

3.5.1


### 57. How to View PySpark Cluster Details?

In [5]:
print(spark.sparkContext.uiWebUrl)

http://LAPTOP-3V2ROQ70:4040


In [6]:
from pyspark.sql.types import *

### Create empty dataframe

In [7]:
# Create an empty RDD by using emptyRDD() 
emptyRDD = spark.sparkContext.emptyRDD()
print(emptyRDD.collect()) 

[]


In [8]:
#Creates Empty RDD using parallelize
rdd22 = spark.sparkContext.parallelize([])
rdd22.collect()

[]

In [9]:
# Create Empty DataFrame without Schema (no columns) 

df = spark.createDataFrame([], StructType([]))
df.show()

++
||
++
++



In [10]:
# Create Empty DataFrame with Schema (StructType)

from pyspark.sql.types import StructType, StructField, StringType 
schema = StructType([
    StructField('firstname', StringType(), True),
    StructField('middlename', StringType(), True),
    StructField('lastname', StringType(), True)
])
df = spark.createDataFrame(emptyRDD, schema)
df.show()

+---------+----------+--------+
|firstname|middlename|lastname|
+---------+----------+--------+
+---------+----------+--------+



In [11]:
# 3. Convert Empty RDD to DataFrame

df1 = emptyRDD.toDF(schema)
df1.show()

+---------+----------+--------+
|firstname|middlename|lastname|
+---------+----------+--------+
+---------+----------+--------+



In [12]:
# 4. Create Empty DataFrame with Schema. 
df2 = spark.createDataFrame([], schema)
df.show()

+---------+----------+--------+
|firstname|middlename|lastname|
+---------+----------+--------+
+---------+----------+--------+



### 2. How to convert the index of a PySpark DataFrame into a column?

In [13]:
df = spark.createDataFrame([("Alice", 1), ("Bob", 2), ("Charlie", 3)],
                            ["Name", "Value"])
df.show()

+-------+-----+
|   Name|Value|
+-------+-----+
|  Alice|    1|
|    Bob|    2|
|Charlie|    3|
+-------+-----+



In [14]:
from pyspark.sql.window import Window
from pyspark.sql.functions import monotonically_increasing_id, row_number

window = Window.orderBy(monotonically_increasing_id())

df.withColumn('index', row_number().over(window) - 1).show()

+-------+-----+-----+
|   Name|Value|index|
+-------+-----+-----+
|  Alice|    1|    0|
|    Bob|    2|    1|
|Charlie|    3|    2|
+-------+-----+-----+



### 3. How to combine many lists to form a PySpark DataFrame?

In [None]:
rdd = spark.sparkContext.parallelize(list(zip(list1, list2)))
df = rdd.toDF(['col1', 'col2'])
df.show() 

In [15]:
list1 = ["a", "b", "c", "d"]
list2 = [1, 2, 3, 4]

rdd = spark.sparkContext.parallelize(list(zip(list1, list2)))
df = rdd.toDF(['col1','col2'])
df.show()

+----+----+
|col1|col2|
+----+----+
|   a|   1|
|   b|   2|
|   c|   3|
|   d|   4|
+----+----+



### 4. How to get the items of list A not present in list B?

In [16]:
list_A = [1, 2, 3, 4, 5]
list_B = [4, 5, 6, 7, 8]

sc = spark.sparkContext

rdd1 = sc.parallelize(list_A)
rdd2 = sc.parallelize(list_B)

rdd = rdd1.subtract(rdd2)

print(rdd.collect())

[1, 2, 3]


### 5. How to get the items not common to both list A and list B?

In [17]:
list_A = [1, 2, 3, 4, 5]
list_B = [4, 5, 6, 7, 8]

sc = spark.sparkContext
rdd1 = sc.parallelize(list_A)
rdd2 = sc.parallelize(list_B)

r1 = rdd1.subtract(rdd2)
r2 = rdd2.subtract(rdd1)

r = r1.union(r2)
rdd = r.collect()
print(rdd)

[1, 2, 3, 6, 7, 8]


### 6. How to get the minimum, 25th percentile, median, 75th, and max of a numeric column?

In [18]:
data = [("A", 10), ("B", 20), ("C", 30), ("D", 40), ("E", 50), ("F", 15), ("G", 28), ("H", 54), ("I", 41), ("J", 86)]
df = spark.createDataFrame(data, ['Name', 'Age'])
df.show()

+----+---+
|Name|Age|
+----+---+
|   A| 10|
|   B| 20|
|   C| 30|
|   D| 40|
|   E| 50|
|   F| 15|
|   G| 28|
|   H| 54|
|   I| 41|
|   J| 86|
+----+---+



In [19]:
quantiles = df.approxQuantile('Age', [0.0, 0.25, 0.5, 0.75, 1.0], 0.01)
quantiles

[10.0, 20.0, 30.0, 50.0, 86.0]

In [20]:
print("Min: ", quantiles[0])
print("25th percentile: ", quantiles[1])
print("Median: ", quantiles[2])
print("75th percentile: ", quantiles[3])
print("Max: ", quantiles[4])

Min:  10.0
25th percentile:  20.0
Median:  30.0
75th percentile:  50.0
Max:  86.0


### 7. How to get frequency counts of unique items of a column?

In [21]:
from pyspark.sql import Row

data = [
    Row(name='John', job='Engineer'),
    Row(name='John', job='Engineer'),
    Row(name='Mary', job='Scientist'),
    Row(name='Bob', job='Engineer'),
    Row(name='Bob', job='Engineer'),
    Row(name='Bob', job='Scientist'),
    Row(name='Sam', job='Doctor')
]
df= spark.createDataFrame(data)
df.show()

+----+---------+
|name|      job|
+----+---------+
|John| Engineer|
|John| Engineer|
|Mary|Scientist|
| Bob| Engineer|
| Bob| Engineer|
| Bob|Scientist|
| Sam|   Doctor|
+----+---------+



In [22]:
df.groupBy('job').count().show()

+---------+-----+
|      job|count|
+---------+-----+
| Engineer|    4|
|Scientist|    2|
|   Doctor|    1|
+---------+-----+



### 8. How to keep only top 2 most frequent values as it is and replace everything else as ‘Other’?

In [23]:
from pyspark.sql.functions import *

In [24]:
t1 = df.groupBy('job').count().orderBy('count', ascending= False).limit(2)
t = t1.select('job').rdd.flatMap(lambda x:x).collect()
t

['Engineer', 'Scientist']

In [25]:
df = df.withColumn('job', when(col('job').isin(t), col('job')).otherwise('Other'))
df.show()

+----+---------+
|name|      job|
+----+---------+
|John| Engineer|
|John| Engineer|
|Mary|Scientist|
| Bob| Engineer|
| Bob| Engineer|
| Bob|Scientist|
| Sam|    Other|
+----+---------+



### 9. How to Drop rows with NA values specific to a particular column?

In [26]:
df = spark.createDataFrame([
("A", 1, None),
("B", None, "123" ),
("B", 3, "456"),
("D", None, None),
], ["Name", "Value", "id"])

df.show()

+----+-----+----+
|Name|Value|  id|
+----+-----+----+
|   A|    1|NULL|
|   B| NULL| 123|
|   B|    3| 456|
|   D| NULL|NULL|
+----+-----+----+



In [27]:
df.dropna(subset=['Value']).show()

+----+-----+----+
|Name|Value|  id|
+----+-----+----+
|   A|    1|NULL|
|   B|    3| 456|
+----+-----+----+



### 10. How to rename columns of a PySpark DataFrame using two lists – one containing the old column names and the other containing the new column names?

In [28]:
df = spark.createDataFrame([(1, 2, 3), (4, 5, 6)], ["col1", "col2", "col3"])

old_names = ["col1", "col2", "col3"]
new_names = ["new_col1", "new_col2", "new_col3"]

df.show()

+----+----+----+
|col1|col2|col3|
+----+----+----+
|   1|   2|   3|
|   4|   5|   6|
+----+----+----+



In [29]:
df.toDF(*new_names).show()

+--------+--------+--------+
|new_col1|new_col2|new_col3|
+--------+--------+--------+
|       1|       2|       3|
|       4|       5|       6|
+--------+--------+--------+



In [30]:
for old_names, new_names in zip(old_names, new_names):
    df = df.withColumnRenamed(old_names, new_names)
df.show()

+--------+--------+--------+
|new_col1|new_col2|new_col3|
+--------+--------+--------+
|       1|       2|       3|
|       4|       5|       6|
+--------+--------+--------+



### 11. print random number

In [31]:
df = spark.range(10)
df.show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
+---+



In [32]:
nitem=10
df = spark.range(nitem).select(rand().alias('values'))
df.show()

+--------------------+
|              values|
+--------------------+
|  0.6215324217275521|
|  0.9620160229965483|
|  0.5762688869088491|
|2.594845203433005E-4|
|  0.5690733699843449|
|   0.531704931092798|
|  0.2281446039677244|
|  0.6206951757453757|
|  0.8652795522695643|
| 0.12055456066418213|
+--------------------+



### 13. How to find the numbers that are multiples of 3 from a column?

In [33]:
df = spark.range(10)
df.printSchema()
df = df.withColumn('random', ((rand(42)*10)+1).cast('int'))
df.show()
df.printSchema()

root
 |-- id: long (nullable = false)

+---+------+
| id|random|
+---+------+
|  0|     7|
|  1|     9|
|  2|     8|
|  3|     8|
|  4|     3|
|  5|     1|
|  6|     7|
|  7|     4|
|  8|     5|
|  9|     1|
+---+------+

root
 |-- id: long (nullable = false)
 |-- random: integer (nullable = true)



In [34]:
df.withColumn('is_multiple_of_3', when(col('random') %3 == 0, "yes").otherwise('No')).show()

+---+------+----------------+
| id|random|is_multiple_of_3|
+---+------+----------------+
|  0|     7|              No|
|  1|     9|             yes|
|  2|     8|              No|
|  3|     8|              No|
|  4|     3|             yes|
|  5|     1|              No|
|  6|     7|              No|
|  7|     4|              No|
|  8|     5|              No|
|  9|     1|              No|
+---+------+----------------+



### 14. How to extract items at given positions from a column?

In [35]:
pos = [0, 4, 8, 5] 

df = spark.range(10)
df = df.withColumn('random', ((rand(42)*10)+1).cast('int'))

# add index
w = Window.orderBy(monotonically_increasing_id())
df = df.withColumn('index', row_number().over(w)-1)

# Filter the DF based on the specified positions
df_f = df.filter(col('index').isin(pos))
df_f.show()

+---+------+-----+
| id|random|index|
+---+------+-----+
|  0|     7|    0|
|  4|     3|    4|
|  5|     1|    5|
|  8|     5|    8|
+---+------+-----+



### 15. How to stack two DataFrames vertically ?

In [36]:
df_A = spark.createDataFrame([("apple", 3, 5), ("banana", 1, 10), ("orange", 2, 8)], ["Name", "Col_1", "Col_2"])
df_A.show()

df_B = spark.createDataFrame([("apple", 3, 5), ("banana", 1, 15), ("grape", 4, 6)], ["Name", "Col_1", "Col_3"])
df_B.show()

+------+-----+-----+
|  Name|Col_1|Col_2|
+------+-----+-----+
| apple|    3|    5|
|banana|    1|   10|
|orange|    2|    8|
+------+-----+-----+

+------+-----+-----+
|  Name|Col_1|Col_3|
+------+-----+-----+
| apple|    3|    5|
|banana|    1|   15|
| grape|    4|    6|
+------+-----+-----+



In [37]:
df_A.union(df_B).show()

+------+-----+-----+
|  Name|Col_1|Col_2|
+------+-----+-----+
| apple|    3|    5|
|banana|    1|   10|
|orange|    2|    8|
| apple|    3|    5|
|banana|    1|   15|
| grape|    4|    6|
+------+-----+-----+



### 17. How to convert the first character of each element in a series to uppercase?

In [38]:
data = [("john",), ("alice",), ("bob",)]
df = spark.createDataFrame(data, ["name"])
df.show()

+-----+
| name|
+-----+
| john|
|alice|
|  bob|
+-----+



In [39]:
df.withColumn('name', upper(col('name'))).show()

# Convert the first character to uppercase
df.withColumn('name', initcap(col('name'))).show()

+-----+
| name|
+-----+
| JOHN|
|ALICE|
|  BOB|
+-----+

+-----+
| name|
+-----+
| John|
|Alice|
|  Bob|
+-----+



### 18. How to compute summary statistics for all columns in a dataframe

In [40]:
data = [('James', 34, 55000),
('Michael', 30, 70000),
('Robert', 37, 60000),
('Maria', 29, 80000),
('Jen', 32, 65000)]
df = spark.createDataFrame(data, ["name", "age" , "salary"])
df.show()

+-------+---+------+
|   name|age|salary|
+-------+---+------+
|  James| 34| 55000|
|Michael| 30| 70000|
| Robert| 37| 60000|
|  Maria| 29| 80000|
|    Jen| 32| 65000|
+-------+---+------+



In [41]:
df.describe().show()

+-------+------+-----------------+-----------------+
|summary|  name|              age|           salary|
+-------+------+-----------------+-----------------+
|  count|     5|                5|                5|
|   mean|  NULL|             32.4|          66000.0|
| stddev|  NULL|3.209361307176242|9617.692030835675|
|    min| James|               29|            55000|
|    max|Robert|               37|            80000|
+-------+------+-----------------+-----------------+



In [42]:
df.summary().show()

+-------+------+-----------------+-----------------+
|summary|  name|              age|           salary|
+-------+------+-----------------+-----------------+
|  count|     5|                5|                5|
|   mean|  NULL|             32.4|          66000.0|
| stddev|  NULL|3.209361307176242|9617.692030835675|
|    min| James|               29|            55000|
|    25%|  NULL|               30|            60000|
|    50%|  NULL|               32|            65000|
|    75%|  NULL|               34|            70000|
|    max|Robert|               37|            80000|
+-------+------+-----------------+-----------------+



### 19. How to calculate the number of characters in each word in a column?

In [43]:
data = [("john",), ("alice",), ("bob",)]
df = spark.createDataFrame(data, ["name"])

df.show()

+-----+
| name|
+-----+
| john|
|alice|
|  bob|
+-----+



In [44]:
df.withColumn('word_length', length(df.name)).show()

+-----+-----------+
| name|word_length|
+-----+-----------+
| john|          4|
|alice|          5|
|  bob|          3|
+-----+-----------+



### 20 How to compute difference of differences between consecutive numbers of a column?

In [45]:
data = [('James', 34, 55000),
('Michael', 30, 70000),
('Robert', 37, 60000),
('Maria', 29, 80000),
('Jen', 32, 65000)]

df = spark.createDataFrame(data, ["name", "age" , "salary"])

df.show()

+-------+---+------+
|   name|age|salary|
+-------+---+------+
|  James| 34| 55000|
|Michael| 30| 70000|
| Robert| 37| 60000|
|  Maria| 29| 80000|
|    Jen| 32| 65000|
+-------+---+------+



In [46]:
df = df.withColumn('id', monotonically_increasing_id())
window = Window.orderBy('id')

# Generate the lag of the variable 
df = df.withColumn('prev_value', lag(df.salary).over(window))
df.show()

+-------+---+------+-----------+----------+
|   name|age|salary|         id|prev_value|
+-------+---+------+-----------+----------+
|  James| 34| 55000| 8589934592|      NULL|
|Michael| 30| 70000|25769803776|     55000|
| Robert| 37| 60000|34359738368|     70000|
|  Maria| 29| 80000|51539607552|     60000|
|    Jen| 32| 65000|60129542144|     80000|
+-------+---+------+-----------+----------+



In [47]:
# Compute the difference with lag 
df.withColumn('diff', when(isnull(df.salary - df.prev_value), 0).otherwise(df.salary-df.prev_value)).drop('id').show()

+-------+---+------+----------+------+
|   name|age|salary|prev_value|  diff|
+-------+---+------+----------+------+
|  James| 34| 55000|      NULL|     0|
|Michael| 30| 70000|     55000| 15000|
| Robert| 37| 60000|     70000|-10000|
|  Maria| 29| 80000|     60000| 20000|
|    Jen| 32| 65000|     80000|-15000|
+-------+---+------+----------+------+



In [48]:
df.select(df.salary - df.prev_value).show()

+---------------------+
|(salary - prev_value)|
+---------------------+
|                 NULL|
|                15000|
|               -10000|
|                20000|
|               -15000|
+---------------------+



### 21. How to get the 
### day of month,  week number, day of year and  day of week from a date strings?

In [49]:
data = [("2023-05-18","01 Jan 2010",), ("2023-12-31", "01 Jan 2010",)]
df = spark.createDataFrame(data, ["date_str_1", "date_str_2"])

df.show()

+----------+-----------+
|date_str_1| date_str_2|
+----------+-----------+
|2023-05-18|01 Jan 2010|
|2023-12-31|01 Jan 2010|
+----------+-----------+



In [50]:
# Convert date string to date format
df = df.withColumn('date_1', to_date(df.date_str_1, 'yyyy-MM-dd'))
df = df.withColumn('date_2', to_date(df.date_str_2, 'dd MMM yyyy'))
df.show()

+----------+-----------+----------+----------+
|date_str_1| date_str_2|    date_1|    date_2|
+----------+-----------+----------+----------+
|2023-05-18|01 Jan 2010|2023-05-18|2010-01-01|
|2023-12-31|01 Jan 2010|2023-12-31|2010-01-01|
+----------+-----------+----------+----------+



In [51]:
df = df.withColumn("day_of_month", dayofmonth(df.date_1))\
.withColumn("week_number", weekofyear(df.date_1))\
.withColumn("day_of_year", dayofyear(df.date_1))\
.withColumn("day_of_week", dayofweek(df.date_1))

df.show()

+----------+-----------+----------+----------+------------+-----------+-----------+-----------+
|date_str_1| date_str_2|    date_1|    date_2|day_of_month|week_number|day_of_year|day_of_week|
+----------+-----------+----------+----------+------------+-----------+-----------+-----------+
|2023-05-18|01 Jan 2010|2023-05-18|2010-01-01|          18|         20|        138|          5|
|2023-12-31|01 Jan 2010|2023-12-31|2010-01-01|          31|         52|        365|          1|
+----------+-----------+----------+----------+------------+-----------+-----------+-----------+



### 22. How to convert year-month string to dates corresponding to the 4th day of the month?

In [52]:
df = spark.createDataFrame([('Jan 2010',), ('Feb 2011',), ('Mar 2012',)], ['MonthYear'])
df.show()

+---------+
|MonthYear|
+---------+
| Jan 2010|
| Feb 2011|
| Mar 2012|
+---------+



In [53]:
df.withColumn('Date', to_date(df.MonthYear, 'MMM yyyy')).show()

+---------+----------+
|MonthYear|      Date|
+---------+----------+
| Jan 2010|2010-01-01|
| Feb 2011|2011-02-01|
| Mar 2012|2012-03-01|
+---------+----------+



### 23 How to filter words that contain atleast 2 vowels from a series?

In [54]:
df = spark.createDataFrame([('Apple',), ('Orange',), ('Plan',) , ('Python',) , ('Money',)], ['Word'])
df.show()

+------+
|  Word|
+------+
| Apple|
|Orange|
|  Plan|
|Python|
| Money|
+------+



In [55]:
df.where((length(col('Word')) - length(translate(col('Word'), 'AEIOUaeiou', ''))) >= 2).show()

+------+
|  Word|
+------+
| Apple|
|Orange|
| Money|
+------+



### 30. How to get the nrows, ncolumns, datatype of a dataframe?

In [56]:
print(df.count())
print(len(df.columns))
print(df.dtypes)

5
1
[('Word', 'string')]


### 32. How to check if a dataframe has any missing values and count of missing values in each column?

In [57]:
df = spark.createDataFrame([
("A", 1, None),
("B", None, "123" ),
("B", 3, "456"),
("D", None, None),
], ["Name", "Value", "id"])

df.show()

+----+-----+----+
|Name|Value|  id|
+----+-----+----+
|   A|    1|NULL|
|   B| NULL| 123|
|   B|    3| 456|
|   D| NULL|NULL|
+----+-----+----+



In [58]:
df.where(df.id.isNull()).count()

2

In [59]:
df1 = df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns])
df1.show()

+----+-----+---+
|Name|Value| id|
+----+-----+---+
|   0|    2|  2|
+----+-----+---+



In [60]:
df1.collect()[0].asDict()

{'Name': 0, 'Value': 2, 'id': 2}

### 33 How to replace missing values of multiple numeric columns with the mean?

In [61]:
df = spark.createDataFrame([
("A", 1, None),
("B", None, 123 ),
("B", 3, 456),
("D", 6, None),
], ["Name", "var1", "var2"])

df.show()

+----+----+----+
|Name|var1|var2|
+----+----+----+
|   A|   1|NULL|
|   B|NULL| 123|
|   B|   3| 456|
|   D|   6|NULL|
+----+----+----+



In [62]:
from pyspark.ml.feature import Imputer

In [63]:
column_names = ["var1", "var2"]

imputer = Imputer(inputCols= column_names, outputCols= column_names, strategy="mean")
model = imputer.fit(df)

imputed_df = model.transform(df)
imputed_df.show(5)

+----+----+----+
|Name|var1|var2|
+----+----+----+
|   A|   1| 289|
|   B|   3| 123|
|   B|   3| 456|
|   D|   6| 289|
+----+----+----+



In [64]:
df.na.fill(value=0, subset=['var1']).show()

+----+----+----+
|Name|var1|var2|
+----+----+----+
|   A|   1|NULL|
|   B|   0| 123|
|   B|   3| 456|
|   D|   6|NULL|
+----+----+----+



### 34. How to change the order of columns of a dataframe?

In [65]:
data = [("John", "Doe", 30), ("Jane", "Doe", 25), ("Alice", "Smith", 22)]
df = spark.createDataFrame(data, ["First_Name", "Last_Name", "Age"])
df.show()

+----------+---------+---+
|First_Name|Last_Name|Age|
+----------+---------+---+
|      John|      Doe| 30|
|      Jane|      Doe| 25|
|     Alice|    Smith| 22|
+----------+---------+---+



In [66]:
new_order = ["Age", "First_Name", "Last_Name"]
df.select(*new_order).show()

+---+----------+---------+
|Age|First_Name|Last_Name|
+---+----------+---------+
| 30|      John|      Doe|
| 25|      Jane|      Doe|
| 22|     Alice|    Smith|
+---+----------+---------+



### 44. How to create lags and leads of a column by group in a dataframe?

In [67]:
data = [("2023-01-01", "Store1", 100),
("2023-01-02", "Store1", 150),
("2023-01-03", "Store1", 200),
("2023-01-04", "Store1", 250),
("2023-01-05", "Store1", 300),
("2023-01-01", "Store2", 50),
("2023-01-02", "Store2", 60),
("2023-01-03", "Store2", 80),
("2023-01-04", "Store2", 90),
("2023-01-05", "Store2", 120)]

df = spark.createDataFrame(data, ["Date", "Store", "Sales"])
df.show()
df.printSchema()

+----------+------+-----+
|      Date| Store|Sales|
+----------+------+-----+
|2023-01-01|Store1|  100|
|2023-01-02|Store1|  150|
|2023-01-03|Store1|  200|
|2023-01-04|Store1|  250|
|2023-01-05|Store1|  300|
|2023-01-01|Store2|   50|
|2023-01-02|Store2|   60|
|2023-01-03|Store2|   80|
|2023-01-04|Store2|   90|
|2023-01-05|Store2|  120|
+----------+------+-----+

root
 |-- Date: string (nullable = true)
 |-- Store: string (nullable = true)
 |-- Sales: long (nullable = true)



In [68]:
# Convert the date from string to date type 
df = df.withColumn('Date', to_date(df.Date, 'yyyy-MM-dd'))

In [69]:
# Create a Window partitioned by store, ordered by Date 
window = Window.partitionBy('Store').orderBy('Date')

In [70]:
# Create lag variables
df = df.withColumn('Lag_Sales', lag(df.Sales).over(window))
df.show()

+----------+------+-----+---------+
|      Date| Store|Sales|Lag_Sales|
+----------+------+-----+---------+
|2023-01-01|Store1|  100|     NULL|
|2023-01-02|Store1|  150|      100|
|2023-01-03|Store1|  200|      150|
|2023-01-04|Store1|  250|      200|
|2023-01-05|Store1|  300|      250|
|2023-01-01|Store2|   50|     NULL|
|2023-01-02|Store2|   60|       50|
|2023-01-03|Store2|   80|       60|
|2023-01-04|Store2|   90|       80|
|2023-01-05|Store2|  120|       90|
+----------+------+-----+---------+



In [71]:
# Create lead variables 
df = df.withColumn('Lead_Sales', lead(df.Sales).over(window))
df.show()

+----------+------+-----+---------+----------+
|      Date| Store|Sales|Lag_Sales|Lead_Sales|
+----------+------+-----+---------+----------+
|2023-01-01|Store1|  100|     NULL|       150|
|2023-01-02|Store1|  150|      100|       200|
|2023-01-03|Store1|  200|      150|       250|
|2023-01-04|Store1|  250|      200|       300|
|2023-01-05|Store1|  300|      250|      NULL|
|2023-01-01|Store2|   50|     NULL|        60|
|2023-01-02|Store2|   60|       50|        80|
|2023-01-03|Store2|   80|       60|        90|
|2023-01-04|Store2|   90|       80|       120|
|2023-01-05|Store2|  120|       90|      NULL|
+----------+------+-----+---------+----------+



### 49. How to Pivot the dataframe (converting rows into columns) ?

In [72]:
data = [
(2021, 1, "US", 5000),
(2021, 1, "EU", 4000),
(2021, 2, "US", 5500),
(2021, 2, "EU", 4500),
(2021, 3, "US", 6000),
(2021, 3, "EU", 5000),
(2021, 4, "US", 7000),
(2021, 4, "EU", 6000),
]

# Create DataFrame
columns = ["year", "quarter", "region", "revenue"]
df = spark.createDataFrame(data, columns)
df.show()

+----+-------+------+-------+
|year|quarter|region|revenue|
+----+-------+------+-------+
|2021|      1|    US|   5000|
|2021|      1|    EU|   4000|
|2021|      2|    US|   5500|
|2021|      2|    EU|   4500|
|2021|      3|    US|   6000|
|2021|      3|    EU|   5000|
|2021|      4|    US|   7000|
|2021|      4|    EU|   6000|
+----+-------+------+-------+



In [73]:
pivot_df = df.groupBy('year', 'quarter').pivot('region').sum('revenue')
pivot_df.show()

+----+-------+----+----+
|year|quarter|  EU|  US|
+----+-------+----+----+
|2021|      2|4500|5500|
|2021|      1|4000|5000|
|2021|      3|5000|6000|
|2021|      4|6000|7000|
+----+-------+----+----+



### 50. How to UnPivot the dataframe (converting columns into rows) ?

In [94]:
data = [(2021, 2, 4500, 5500),
(2021, 1, 4000, 5000),
(2021, 3, 5000, 6000),
(2021, 4, 6000, 7000)]

columns = ["year", "quarter", "EU", "US"]
df = spark.createDataFrame(data, columns)

df.show()

+----+-------+----+----+
|year|quarter|  EU|  US|
+----+-------+----+----+
|2021|      2|4500|5500|
|2021|      1|4000|5000|
|2021|      3|5000|6000|
|2021|      4|6000|7000|
+----+-------+----+----+



In [98]:
unpivotExpr = "stack(2, 'EU', EU, 'US', US) as (region, revenue)"
unPivotDF = df.select('year', 'quarter', expr(unpivotExpr)).where("revenue is not null")
unPivotDF.show()

+----+-------+------+-------+
|year|quarter|region|revenue|
+----+-------+------+-------+
|2021|      2|    EU|   4500|
|2021|      2|    US|   5500|
|2021|      1|    EU|   4000|
|2021|      1|    US|   5000|
|2021|      3|    EU|   5000|
|2021|      3|    US|   6000|
|2021|      4|    EU|   6000|
|2021|      4|    US|   7000|
+----+-------+------+-------+



### 51. How to impute missing values with Zero?

In [76]:
df = spark.createDataFrame([(1, None), (None, 2), (3, 4), (5, None)], ["a", "b"])
df.show()

+----+----+
|   a|   b|
+----+----+
|   1|NULL|
|NULL|   2|
|   3|   4|
|   5|NULL|
+----+----+



In [77]:
df.fillna(0).show()

+---+---+
|  a|  b|
+---+---+
|  1|  0|
|  0|  2|
|  3|  4|
|  5|  0|
+---+---+



### 55. How to convert a column to lower case using UDF?

In [78]:
data = [('John Doe', 'NEW YORK'),
('Jane Doe', 'LOS ANGELES'),
('Mike Johnson', 'CHICAGO'),
('Sara Smith', 'SAN FRANCISCO')]

df = spark.createDataFrame(data, ['Name', 'City'])
df.show()

+------------+-------------+
|        Name|         City|
+------------+-------------+
|    John Doe|     NEW YORK|
|    Jane Doe|  LOS ANGELES|
|Mike Johnson|      CHICAGO|
|  Sara Smith|SAN FRANCISCO|
+------------+-------------+



In [79]:
def to_lower(s):
    if s is not None:
        return s.lower()

In [80]:
udf_to_lower = udf(to_lower, StringType())
df = df.withColumn('City_lower', udf_to_lower(df.City))
df.show()

+------------+-------------+-------------+
|        Name|         City|   City_lower|
+------------+-------------+-------------+
|    John Doe|     NEW YORK|     new york|
|    Jane Doe|  LOS ANGELES|  los angeles|
|Mike Johnson|      CHICAGO|      chicago|
|  Sara Smith|SAN FRANCISCO|san francisco|
+------------+-------------+-------------+



### 59. How to restrict the PySpark to use the number of cores in the system?

### 60. How to cache PySpark DataFrame or objects and delete cache?

In [81]:
df.cache()
df.unpersist()

DataFrame[Name: string, City: string, City_lower: string]

In [82]:
df.show()

+------------+-------------+-------------+
|        Name|         City|   City_lower|
+------------+-------------+-------------+
|    John Doe|     NEW YORK|     new york|
|    Jane Doe|  LOS ANGELES|  los angeles|
|Mike Johnson|      CHICAGO|      chicago|
|  Sara Smith|SAN FRANCISCO|san francisco|
+------------+-------------+-------------+



### 61. How to Divide a PySpark DataFrame randomly in a given ratio (0.8, 0.2)?

In [83]:
a, b = df.randomSplit([0.8, 0.2])

In [84]:
a.show()
b.show()

+------------+-------------+-------------+
|        Name|         City|   City_lower|
+------------+-------------+-------------+
|    Jane Doe|  LOS ANGELES|  los angeles|
|Mike Johnson|      CHICAGO|      chicago|
|  Sara Smith|SAN FRANCISCO|san francisco|
+------------+-------------+-------------+

+--------+--------+----------+
|    Name|    City|City_lower|
+--------+--------+----------+
|John Doe|NEW YORK|  new york|
+--------+--------+----------+



### 69. How to calculate missing value percentage in each column?

In [85]:
data = [("John", "Doe", None),
(None, "Smith", "New York"),
("Mike", "Smith", None),
("Anna", "Smith", "Boston"),
(None, None, None)]

df = spark.createDataFrame(data, ["FirstName", "LastName", "City"])
df.show()

+---------+--------+--------+
|FirstName|LastName|    City|
+---------+--------+--------+
|     John|     Doe|    NULL|
|     NULL|   Smith|New York|
|     Mike|   Smith|    NULL|
|     Anna|   Smith|  Boston|
|     NULL|    NULL|    NULL|
+---------+--------+--------+



In [86]:
total_rows = df.count()
print(total_rows)

5


In [87]:
for column in df.columns:
    null_values = df.filter(df[column].isNull()).count()
    missing_percentage = (null_values / total_rows) *100 
    print(f"Missing values in {column}: {missing_percentage}%")

Missing values in FirstName: 40.0%
Missing values in LastName: 20.0%
Missing values in City: 60.0%


In [88]:
spark.stop()