In [0]:
# How to convert the index of a PySpark DataFrame into a column?
# +-------+-----+
# | Name|Value|
# +-------+-----+
# | Alice| 1|
# | Bob| 2|
# |Charlie| 3|
# +-------+-----+

# Expected output:
# +-------+-----+-----+
# | Name|Value|index|
# +-------+-----+-----+
# | Alice| 1| 0|
# | Bob| 2| 1|
# |Charlie| 3| 2|
# +-------+-----+-----+

from pyspark.sql.functions import row_number, monotonically_increasing_id
from pyspark.sql.window import Window

# Three-row sample frame used by both indexing approaches below.
people = [("Alice", 1), ("Bob", 2), ("Charlie", 3)]
df = spark.createDataFrame(people, ["Name", "Value"])

df.show()

# Window spanning the whole frame, ordered by the generated row ids.
w = Window.orderBy(monotonically_increasing_id())
print(w)

# Approach 1: row_number() counts from 1, so shift it down to get a
# 0-based, consecutive index.
zero_based_index = row_number().over(w) - 1
df_window_monotonically = df.withColumn("index", zero_based_index)
df_window_monotonically.show()

# Approach 2: use the generated ids directly as the index column.
# NOTE(review): per the PySpark docs these ids are increasing and unique but
# not necessarily consecutive, so this may not reproduce 0, 1, 2.
df_monotonically = df.withColumn("index", monotonically_increasing_id())
df_monotonically.show()

In [0]:
# How to combine many lists to form a PySpark DataFrame
# One Python list per output column.
list1 = ["a", "b", "c", "d"]
list2 = [1, 2, 3, 4]

# zip() pairs the i-th element of each list, producing one tuple per row.
data = zip(list1, list2)
df = spark.createDataFrame(data, schema=["col_list1", "col_list2"])
df.show()

In [0]:
# How to get the items of list A not present in list B
list_A = [1, 2, 3, 4, 5]
list_B = [4, 5, 6, 7, 8]

# Membership test: keep each element of list_A that does not occur in list_B,
# preserving list_A's order.
filtered_list = []
for item in list_A:
    if item not in list_B:
        filtered_list.append(item)
print(filtered_list)

# exceptAll method with DataFrames
# A list of column names does not let Spark infer a schema from bare ints,
# so wrap every value in a one-element tuple (one tuple == one row).
df_A = spark.createDataFrame([(value,) for value in list_A], ["value"])
df_B = spark.createDataFrame([(value,) for value in list_B], ["value"])

# Rows of df_A that do not appear in df_B.
result_df = df_A.exceptAll(df_B)
result_df.show()

In [0]:
# How to get the items not common to both list A and list B
list_A = [1, 2, 3, 4, 5]
list_B = [4, 5, 6, 7, 8]

# Approach 1: two list scans (each `not in` walks the other list), then
# concatenate the two one-sided differences and de-duplicate.
filtered_list_A = [a for a in list_A if a not in list_B]
filtered_list_B = [b for b in list_B if b not in list_A]
filtered_list = [*filtered_list_A, *filtered_list_B]
print(list(set(filtered_list)))

# Approach 2: let the set type compute the symmetric difference directly.
filtered_list = set(list_A).symmetric_difference(list_B)
print(list(filtered_list))

# Approach 3: everything in either list, minus what is in both.
set_A_B = set(list_A).union(list_B)
intersect_A_B = set(list_A).intersection(list_B)
filtered_list = set_A_B.difference(intersect_A_B)
print(filtered_list)

In [0]:
# How to get the minimum, 25th percentile, median, 75th, and max of a numeric column?

data = [("A", 10), ("B", 20), ("C", 30), ("D", 40), ("E", 50), ("F", 15), ("G", 28), ("H", 54), ("I", 41), ("J", 86)]
df = spark.createDataFrame(data, ["Name", "Age"])

df.show()

# Approach 1: approxQuantile() with a relative error of 0.01.
# Probabilities 0.0 and 1.0 ask for the minimum and the maximum.
probabilities = [0.0, 0.25, 0.5, 0.75, 1.0]
quantiles = df.approxQuantile("Age", probabilities, 0.01)
print(quantiles)

# Print one labelled line per requested probability, in the same order.
labels = ["Min: ", "25th percentile: ", "Median: ", "75th percentile: ", "Max: "]
for label, value in zip(labels, quantiles):
    print(label, value)

In [0]:
# How to get frequency counts of unique items of a column?
from pyspark.sql import Row

# Sample data
# Sample data: (name, job) pairs; duplicates are intentional.
name_job_pairs = [
    ('John', 'Engineer'),
    ('John', 'Engineer'),
    ('Mary', 'Scientist'),
    ('Bob', 'Engineer'),
    ('Bob', 'Engineer'),
    ('Bob', 'Scientist'),
    ('Sam', 'Doctor'),
]
data = [Row(name=person, job=occupation) for person, occupation in name_job_pairs]

df = spark.createDataFrame(data)
df.show()

# One output row per distinct job, with the number of rows holding it.
jobs_grouped = df.groupBy("job")
distinct_job = jobs_grouped.count()
distinct_job.show()

In [0]:
# How to keep only top 2 most frequent values as it is and replace everything else as ‘Other’?
from pyspark.sql import Row
from pyspark.sql import functions as F

# Sample data
# Sample data: (name, job) pairs; duplicates are intentional.
name_job_pairs = [
    ('John', 'Engineer'),
    ('John', 'Engineer'),
    ('Mary', 'Scientist'),
    ('Bob', 'Engineer'),
    ('Bob', 'Engineer'),
    ('Bob', 'Scientist'),
    ('Sam', 'Doctor'),
]
data = [Row(name=person, job=occupation) for person, occupation in name_job_pairs]

df = spark.createDataFrame(data)
df.show()

# Count rows per job, then keep the names of the two most frequent jobs.
job_counts = df.groupBy("job").count()
df_distinct = job_counts.orderBy(F.col("count").desc()).limit(2).select("job")
distinct_job_list = [row.job for row in df_distinct.collect()]
print(distinct_job_list)

# Leave the top-2 jobs untouched and relabel every other job as "Other".
is_top_job = F.col("job").isin(distinct_job_list)
relabelled_job = F.when(is_top_job, F.col("job")).otherwise("Other")
df_with_replacement = df.withColumn("job", relabelled_job)
df_with_replacement.show()

In [0]:
# Get frequency counts of unique items of a column for the top 2 and combine the remaining as "Other"
from pyspark.sql import Row

# Sample data
# Sample data
data = [
Row(name='John', job='Engineer'),
Row(name='John', job='Engineer'),
Row(name='Mary', job='Scientist'),
Row(name='Bob', job='Engineer'),
Row(name='Bob', job='Engineer'),
Row(name='Bob', job='Scientist'),
Row(name='Sam', job='Doctor'),
Row(name='Sam', job='Teacher'),
]

df = spark.createDataFrame(data)
df.show()

# Rank jobs by frequency BEFORE taking two: limit(2) on an unordered
# groupBy result returns two arbitrary jobs, not the two most frequent.
df_distinct = (
    df.groupBy("job")
    .count()
    .orderBy(F.col("count").desc())
    .limit(2)
    .select("job")
)
distinct_job_list = [row["job"] for row in df_distinct.collect()]
print(distinct_job_list)

# Relabel every job outside the top 2 as "Other", then count per label.
df_distinct_count = df.withColumn("job", F.when(F.col("job").isin(distinct_job_list), F.col("job")).otherwise("Other"))
df_distinct_count = df_distinct_count.groupBy("job").count()
df_distinct_count.show()


In [0]:
# How to Drop rows with NA values specific to a particular column

# Sample frame with nulls in both the "Value" and the "id" column.
rows = [
    ("A", 1, None),
    ("B", None, "123"),
    ("B", 3, "456"),
    ("D", None, None),
]
df = spark.createDataFrame(rows, ["Name", "Value", "id"])

df.show()

# Drop only the rows whose "Value" is null; nulls in "id" are kept.
df = df.na.drop(subset=["Value"])
df.show()

In [0]:
# How to rename columns of a PySpark DataFrame using two lists – one containing the old column names and the other containing the new column names

# suppose you have the following DataFrame
df = spark.createDataFrame([(1, 2, 3), (4, 5, 6)], ["col1", "col2", "col3"])

# old column names
old_names = ["col1", "col2", "col3"]

# new column names (same order as old_names)
new_names = ["new_col1", "new_col2", "new_col3"]

df.show()

# Approach 1: walk both lists by position.
df_renamed_by_index = df
for i in range(len(old_names)):
    df_renamed_by_index = df_renamed_by_index.withColumnRenamed(old_names[i], new_names[i])
df_renamed_by_index.show()

# Approach 2: pair the names with zip(). Start again from the ORIGINAL frame:
# on an already-renamed frame withColumnRenamed finds no old names and does
# nothing, so the demonstration would be vacuous.
df_renamed_by_zip = df
for old_name, new_name in zip(old_names, new_names):
    df_renamed_by_zip = df_renamed_by_zip.withColumnRenamed(old_name, new_name)
df_renamed_by_zip.show()

# Keep `df` bound to the renamed frame, as before.
df = df_renamed_by_zip

In [0]:
# How to bin a numeric list to 10 groups of equal size
from pyspark.sql.functions import rand
from pyspark.ml.feature import Bucketizer

# Create a DataFrame with a single column "values" filled with random numbers
# Create a DataFrame with a single column "values" filled with random numbers
num_items = 100
df = spark.range(num_items).select(rand(seed=42).alias("values"))

df.show(5)

# Decile boundaries: num_buckets + 1 cut points split the data into
# num_buckets groups of (approximately) equal size.
num_buckets = 10
quantiles = df.stat.approxQuantile("values", [i/num_buckets for i in range(num_buckets+1)], 0.01)
print(quantiles)

# Turn the inner cut points into Bucketizer splits. The outer edges are opened
# to +/-infinity so no value can fall outside the range, and duplicates are
# removed because the splits must be strictly increasing.
splits = [float("-inf")] + sorted(set(quantiles[1:-1])) + [float("inf")]
bucketizer = Bucketizer(splits=splits, inputCol="values", outputCol="bucket")
df_binned = bucketizer.transform(df)

# Show how many values landed in each bucket.
df_binned.groupBy("bucket").count().orderBy("bucket").show()

In [0]:
# How to create a contingency table?
data = [("A", "X"), ("A", "Y"), ("A", "X"), ("B", "Y"), ("B", "X"), ("C", "X"), ("C", "X"), ("C", "Y")]
df = spark.createDataFrame(data, ["category1", "category2"])
df.show()

# Frequency: cube on "category1" and count the rows of every grouping it
# produces.
category1_cube = df.cube("category1")
category1_cube.count().show()

# Contingency table: pairwise counts of category1 against category2.
contingency = df.stat.crosstab('category1', 'category2')
contingency.show()

In [0]:
# How to find the numbers that are multiples of 3 from a column?

from pyspark.sql.functions import rand, col
from pyspark.sql import functions as F

# Generate a DataFrame with a single column "id" with 10 rows
# Ten rows with a single "id" column.
df = spark.range(10)

# rand() is scaled by 10, shifted by 1 and cast to int; the fixed seed is
# passed so the column can be regenerated.
random_int = ((rand(seed=42) * 10) + 1).cast("int")
df = df.withColumn("random", random_int)

df.show()

# Flag with 1 the rows whose random value is divisible by 3, 0 otherwise.
divisible_by_3 = (F.col("random") % 3) == 0
df = df.withColumn("is_multiple_of_3", F.when(divisible_by_3, 1).otherwise(0))
df.show()

In [0]:
# How to extract items at given positions from a column?

from pyspark.sql.functions import rand, monotonically_increasing_id, row_number
from pyspark.sql.window import Window
from pyspark.sql import functions as F

# Generate a DataFrame with a single column "id" with 10 rows
# Ten rows with a single "id" column.
df = spark.range(10)

# rand() is scaled by 10, shifted by 1 and cast to int; the fixed seed is
# passed so the column can be regenerated.
random_int = ((rand(seed=42) * 10) + 1).cast("int")
df = df.withColumn("random", random_int)

df.show()

# 0-based row positions to extract.
pos = [0, 4, 8, 5]

# Number the rows over a whole-frame window; row_number() is 1-based, so
# subtract 1 to line up with the positions above.
window = Window.orderBy(monotonically_increasing_id())
zero_based_index = row_number().over(window) - 1

df = df.withColumn("index", zero_based_index)
df.show()

# Keep only the rows sitting at the requested positions.
at_requested_position = F.col("index").isin(pos)
df = df.filter(at_requested_position)
df.show()

In [0]:
# How to stack two DataFrames vertically ?

# Create DataFrame for region A
# DataFrame for region A
rows_A = [("apple", 3, 5), ("banana", 1, 10), ("orange", 2, 8)]
df_A = spark.createDataFrame(rows_A, ["Name", "Col_1", "Col_2"])
df_A.show()

# DataFrame for region B (note the third column is named Col_3 here)
rows_B = [("apple", 3, 5), ("banana", 1, 15), ("grape", 4, 6)]
df_B = spark.createDataFrame(rows_B, ["Name", "Col_1", "Col_3"])
df_B.show()

# Stack B's rows under A's rows.
# NOTE(review): unionAll appears to match columns by position, so the result
# would carry df_A's column names - confirm if Col_3 must be kept separate.
df = df_A.unionAll(df_B)
df.show()

In [0]:
# How to compute the mean squared error on a truth and predicted columns

data = [(1, 1), (2, 4), (3, 9), (4, 16), (5, 25)]
df = spark.createDataFrame(data, ["actual", "predicted"])

df.show()

# Per-row squared error: (actual - predicted) ** 2
error = col("actual") - col("predicted")
df = df.withColumn("squared_error", error ** 2)
df.show()

# Variant 1: dictionary-style aggregation; the result is a single row.
mse = df.agg({"squared_error": "avg"}).first()[0]
print(f"Mean Squared Error (MSE) = {mse}")

# Variant 2: the same average through the functions API.
avg_squared_error = F.avg(F.col("squared_error"))
mse = df.select(avg_squared_error).collect()[0][0]
print(f"Mean Squared Error (MSE) = {mse}")


In [0]:
# How to compute difference of differences between consecutive numbers of a column

from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.window import Window

data = [
    ('James', 34, 55000),
    ('Michael', 30, 70000),
    ('Robert', 37, 60000),
    ('Maria', 29, 80000),
    ('Jen', 32, 65000),
]

df = spark.createDataFrame(data, ["name", "age", "salary"])

df.show()

# Whole-frame window ordered by the generated row ids; gives a 0-based "id".
window = Window.orderBy(monotonically_increasing_id())
df = df.withColumn("id", row_number().over(window) - 1)

# Salary of the preceding row (null on the first row).
previous_salary = F.lag(F.col("salary")).over(window)
df = df.withColumn("prev_salary", previous_salary)

# Difference to the preceding row; the first row has no predecessor and gets 0.
has_previous = F.col("prev_salary").isNotNull()
salary_delta = F.col("salary") - F.col("prev_salary")
df = df.withColumn("salary_difference", F.when(has_previous, salary_delta).otherwise(0))

df.show()

In [0]:
# How to get the day of month, week number, day of year and day of week from a date strings

data = [("2023-05-18", "01 Jan 2010",), ("2023-12-31", "01 Jan 2010",)]
df = spark.createDataFrame(data, ["date_str_1", "date_str_2"])

df.show()

# date_str_1 is already in yyyy-MM-dd form, so a plain cast is enough.
df = df.withColumn("date_1", F.col("date_str_1").cast("date"))

# date_str_2 uses a custom layout and needs an explicit pattern.
df = df.withColumn("date_2", F.to_date(F.col("date_str_2"), 'dd MMM yyyy'))

# All calendar parts below are derived from date_1.
date_1 = F.col("date_1")
df = df.withColumn("day_of_month", F.dayofmonth(date_1))
df = df.withColumn("week_of_year", F.weekofyear(date_1))
df = df.withColumn("day_of_year", F.dayofyear(date_1))
df = df.withColumn("day_of_week", F.dayofweek(date_1))

df.show()

In [0]:
import datetime

# strptime layout: 4-digit year, month, day, underscore, hour, minute, second.
HUMAN_READABLE_TIMESTAMP_FORMAT = "%Y%m%d_%H%M%S"

def human_readable_to_timestamp(yyyyMMdd_HHmmss: str) -> datetime.datetime:
    """Parse a ``YYYYmmdd_HHMMSS`` string into a UTC-tagged datetime.

    Args:
        yyyyMMdd_HHmmss: Timestamp text such as ``"20230915_123456"``.

    Returns:
        The parsed datetime with ``tzinfo`` set to UTC. The wall-clock fields
        are taken as-is; no timezone conversion is applied.

    Raises:
        ValueError: If the text does not match the expected layout.
    """
    naive = datetime.datetime.strptime(yyyyMMdd_HHmmss, HUMAN_READABLE_TIMESTAMP_FORMAT)
    return naive.replace(tzinfo=datetime.timezone.utc)

# Pure-Python parsing of two sample timestamps.
for stamp in ('20230915_123456', '20240522_080000'):
    print(human_readable_to_timestamp(stamp))

# The same parsing done by Spark, with the equivalent Spark datetime pattern.
timestamp_format = "yyyyMMdd_HHmmss"
data = [("20230915_123456",), ("20240522_080000",)]
df = spark.createDataFrame(data, ["human_readable_timestamp"])

# NOTE(review): the column is named "datetime_utc", but to_timestamp is given
# no timezone here - confirm the session timezone is UTC.
parsed = F.to_timestamp(F.col("human_readable_timestamp"), timestamp_format)
df = df.withColumn("datetime_utc", parsed)
df.show()

In [0]:
# How to convert year-month string to dates corresponding to the 4th day of the month

month_years = [('Jan 2010',), ('Feb 2011',), ('Mar 2012',)]
df = spark.createDataFrame(month_years, ['MonthYear'])

df.show()

# Parse "MMM yyyy" into a date, then add 3 days to it.
converted = F.to_date(F.col("MonthYear"), 'MMM yyyy')
df = df.withColumn("converted_date", converted)

shifted = F.date_add(F.col("converted_date"), 3)
df = df.withColumn("final_date", shifted)
df.show()

In [0]:
# How to filter words that contain at least 2 vowels from a column?

words = [('Apple',), ('Orange',), ('Plan',), ('Python',), ('Money',)]
df = spark.createDataFrame(words, ['Word'])
df.show()

# translate() with an empty replacement deletes every listed character, so
# "translated" is the word with its vowels removed.
vowels = 'AEIOUaeiou'
df_translated = df.withColumn("translated", F.translate(F.col('Word'), vowels, ''))
df_translated.show()


# Method 1: using translate
# vowel count = full length - length after the vowels are removed
vowel_count = F.length(col('Word')) - F.length(F.translate(col('Word'), vowels, ''))
df_filtered = df.where(vowel_count >= 2)
# or the same condition through filter()
df_filtered = df.filter(vowel_count >= 2)
df_filtered.show()


# Method 2: using regexp_replace, either as a function or inside expr()
# Remove every non-vowel; what is left has one character per vowel.
only_vowels = F.regexp_replace(F.col("Word"), "[^AEIOUaeiou]", "")
df_filtered = df.filter(F.length(only_vowels) >= 2)
df_filtered.show()

sql_condition = F.expr("length(regexp_replace(Word, '[^AEIOUaeiou]', '')) >= 2")
df_filtered = df.filter(sql_condition)
df_filtered.show()

In [0]:
# How to filter valid emails from a list?

import re
from pyspark.sql.types import BooleanType
from pyspark.sql.functions import udf

# local-part "@" domain "." TLD, where the TLD is 2-7 ASCII letters.
# The TLD class is [A-Za-z]: inside a character class "|" is a literal pipe,
# not an alternation, so "[A-Z|a-z]" would accept "|" as a TLD character.
regex = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,7}\b'

def check(email):
    """Return True when the whole of ``email`` matches ``regex``.

    Args:
        email: Candidate address. Anything that is not a ``str`` (for
            example ``None``) is reported as invalid instead of raising.

    Returns:
        bool: True if the entire string matches the pattern, else False.
    """
    if not isinstance(email, str):
        return False

    # fullmatch() anchors the pattern to both ends of the string.
    return re.fullmatch(regex, email) is not None

# Quick calls of the plain Python validator (results are not displayed).
check("ankitrai326@gmail.com")
check("my.ownsite@ourearth.org")
check("ankitrai326.com")

# Wrap the validator so it can be applied to a DataFrame column.
check_mail = udf(check, BooleanType())

data = ['buying books at amazom.com', 'rameses@egypt.com', 'matt@t.co', 'narendra@modi.com']

# Convert the list to DataFrame; with a plain "string" schema the single
# column is referenced as "value" below.
df = spark.createDataFrame(data, "string")
df.show(truncate =False)

df = df.withColumn("is_valid_email", check_mail(F.col("value")))
df.show()

# Reuse the flag computed above instead of applying the UDF a second time, and
# select the column by its exact name ("value", not "Value") so the lookup
# does not depend on case-insensitive column resolution.
df = df.filter(F.col("is_valid_email")).select("value")
df.show()

In [0]:
# How to Pivot PySpark DataFrame?

# Sample data
# Sample data: (year, quarter, region, revenue)
data = [
    (2021, quarter, region, revenue)
    for quarter, region, revenue in [
        (1, "US", 5000),
        (1, "EU", 4000),
        (2, "US", 5500),
        (2, "EU", 4500),
        (3, "US", 6000),
        (3, "EU", 5000),
        (4, "US", 7000),
        (4, "EU", 6000),
    ]
]

# Create DataFrame
columns = ["year", "quarter", "region", "revenue"]
df = spark.createDataFrame(data, columns)
df.show()

# One row per (year, quarter); each distinct region becomes a column holding
# the summed revenue.
by_year_quarter = df.groupBy("year", "quarter")
df = by_year_quarter.pivot("region").sum("revenue")
df.show()

In [0]:
# How to get the mean of a variable grouped by another variable?

# Sample data
# Sample data: (order id, product, price)
data = [
    ("1001", "Laptop", 1000),
    ("1002", "Mouse", 50),
    ("1003", "Laptop", 1200),
    ("1004", "Mouse", 30),
    ("1005", "Smartphone", 700),
]

# Create DataFrame
columns = ["OrderID", "Product", "Price"]
df = spark.createDataFrame(data, columns)

df.show()

# Average price per product.
mean_price = F.mean(F.col("Price"))
df = df.groupBy("Product").agg(mean_price)
df.show()

In [0]:
# How to replace missing spaces in a string with the least frequent character?

from collections import Counter

# Single-row frame holding the string to process.
sample_rows = [('dbcc debabedgade',)]
df = spark.createDataFrame(sample_rows, ["string"])
df.show()

def check_least_frequent_char(string_value):
    counter = Counter(string_value.replace(' ', ''))
    least_frequent_character = min(counter, key=counter.get)
    return string_value.replace(" ", least_frequent_character)

# Import the names this cell needs: StringType is not imported anywhere else
# in the notebook, so the udf() call would otherwise fail with a NameError.
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Wrap the Python helper as a UDF that returns a string column.
udf_check_least_frequent_char = udf(check_least_frequent_char, StringType())

df = df.withColumn("modified_string", udf_check_least_frequent_char(F.col("string")))
df.show()

In [0]:
from pyspark.sql.functions import array_contains, array_sort, array_union, array_intersect
from pyspark.sql import SparkSession
# Reuse the running session if there is one, otherwise create it.
spark = SparkSession.builder.appName("array_functions_example").getOrCreate()

# Sample data: a name and an array of numbers per row
data = [
    ("Alice", [2, 4, 6]),
    ("Bob", [1, 2, 3]),
    ("Charlie", [4, 5, 6]),
]

df = spark.createDataFrame(data, ["Name", "Numbers"])
df.show(truncate=False)

# array_contains: keep the rows whose array holds the value 4
holds_four = array_contains(df.Numbers, 4)
df_after = df.filter(holds_four)
df_after.show()

# array_sort: return each row's array in sorted order
sorted_numbers = array_sort(F.col("Numbers")).alias("sorted_numbers")
df_after = df.select("Name", sorted_numbers)
df_after.show()

# array_union: combine each row's array with the one-element array [9]
union_numbers = array_union(F.col("Numbers"), F.array(F.lit(9))).alias("union_numbers")
df.select("Name", union_numbers).show(truncate=False)

    

In [0]:
# How to create a TimeSeries starting ‘2000-01-01’ and 10 weekends (saturdays) after that having random numbers as values?

from pyspark.sql.functions import expr, explode, sequence, rand

# Start date and end date (start + 10 weekends)
# Date range: the end date was worked out by hand so the range holds ten
# Saturdays.
start_date = '2000-01-01'
end_date = '2000-03-04'

# Build every calendar day in the range as an array, then explode it into
# one row per day.
daily_dates = sequence(
    expr(f"date'{start_date}'"),
    expr(f"date('{end_date}')"),
    expr("interval 1 day"),
)
df = spark.range(1).select(explode(daily_dates).alias("date"))

# Keep only the days where dayofweek() returns 7 (the value this cell uses
# for Saturday).
is_saturday = F.dayofweek(F.col("date")) == 7
df = df.filter(is_saturday)

# Attach a seeded random integer to every remaining date.
random_int = ((rand(seed=42)*10) +1).cast("int")
df = df.withColumn("random_num", random_int)

df.show()

In [0]:
from pyspark.sql.functions import Column, lit
from pyspark.sql.types import IntegerType

def integration_succeed(http_status: Column) -> Column:
    """Build a boolean column that is true for HTTP 2xx status codes.

    Args:
        http_status: Column holding the status code; it is cast to integer
            before the comparison.

    Returns:
        Column: true when 200 <= status < 300 and false otherwise. A null
        status (or one that does not cast to an integer) yields null.
    """
    status_code = http_status.cast(IntegerType())
    # Success range is half-open, [200, 300). Column.between() is inclusive
    # on both ends and would wrongly accept 300.
    return (status_code >= lit(200)) & (status_code < lit(300))

# Sample DataFrame with HTTP status codes
# Status codes as strings, with the outcome each one is expected to produce.
status_codes = [
    "200",  # Success case
    "201",  # Success case
    "299",  # Success case
    "300",  # Out of range
    "404",  # Out of range (failure)
    "500",  # Out of range (failure)
    None,   # Missing value
]
test_data = [(code,) for code in status_codes]

# Single-column frame holding the codes
df = spark.createDataFrame(test_data, ["http_status"])

# Evaluate `integration_succeed` for every row
success_flag = integration_succeed(col("http_status"))
df = df.withColumn("integration_success", success_flag)

# Show the results
df.show(truncate=False)

In [0]:
# How to check if a dataframe has any missing values and count of missing values in each column?
# Assuming df is your DataFrame
df = spark.createDataFrame([
("A", 1, None),
("B", None, "123" ),
("B", 3, "456"),
("D", None, None),
], ["Name", "Value", "id"])

df.show()

# NOTE: this import shadows Python's built-in sum() for the rest of the
# notebook session.
from pyspark.sql.functions import col, sum

# One aggregate per column: isNull() cast to 0/1 and summed gives the number
# of nulls in that column. The result is a single row.
missing = df.select(*(sum(col(c).isNull().cast("int")).alias(c) for c in df.columns))
missing_count = missing.collect()[0].asDict()

# Test the per-column COUNTS. A non-empty dict view is always truthy, so
# any(row.asDict().values() ...) reported True even with no nulls at all.
# `or 0` covers an empty frame, where the aggregate itself is null.
has_missing = any((count or 0) > 0 for count in missing_count.values())
print(has_missing)

print(missing_count)

In [0]:
rows = [
    ("A", 1, None),
    ("B", None, "123"),
    ("B", 3, "456"),
    ("D", None, None),
]
df = spark.createDataFrame(rows, ["Name", "Value", "id"])

df.show()

from pyspark.sql.functions import col, sum

# Variant 1: null count per column, built as a list and passed to select().
null_sums = []
for c in df.columns:
    null_sums.append(sum(col(c).isNull().cast("int")).alias(c))
missing = df.select(null_sums)
missing.show()

# The aggregate rows as a list of plain dicts
result = []
for row in missing.collect():
    result.append(row.asDict())
print(result)

# Variant 2: count() skips nulls, and when() without otherwise() is null
# wherever the condition is false, so this counts the null cells per column.
null_counts = [F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in df.columns]
missing_count = df.select(null_counts).collect()[0].asDict()
print(missing_count)

# True when at least one column has a non-zero null count
has_missing = any(v > 0 for v in missing_count.values())
print(has_missing)

# Variant 3: the expressions of variant 1, built with a list comprehension.
column_expressions = [sum(col(c).isNull().cast("int")).alias(c) for c in df.columns]

print("column_expressions", column_expressions)
# select() takes the list itself; no * unpacking is used here.
missing = df.select(column_expressions)

print("wthout *")
missing.show()

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum

# Sample DataFrame
# Sample DataFrame
rows = [
    ("A", 1, None),
    ("B", None, "123"),
    ("C", 3, None),
]
df = spark.createDataFrame(rows, ["Name", "Value", "id"])

# One null-counting aggregate per column: isNull() cast to 0/1, then summed.
column_expressions = []
for c in df.columns:
    column_expressions.append(sum(col(c).isNull().cast("int")).alias(c))

# Unpack the list into separate select() arguments.
missing = df.select(*column_expressions)
missing.show()

In [0]:
# How to replace missing values of multiple numeric columns with the mean

rows = [
    ("A", 1, None),
    ("B", None, 123),
    ("B", 3, 456),
    ("D", 6, None),
]
df = spark.createDataFrame(rows, ["Name", "var1", "var2"])

df.show()

# Columns whose nulls should be replaced by the column mean.
fill_na_list = ["var1", "var2"]

# Variant 1: build the mean expressions with a comprehension.
column_expressions = [F.mean(F.col(c)).alias(c) for c in fill_na_list]

mean_values_row = df.select(*column_expressions).collect()[0]
mean_values = {name: mean_values_row[name] for name in fill_na_list}
print(mean_values)

# Variant 2: the same aggregation, with the expressions built in a loop.
mean_expressions = []
for c in fill_na_list:
    mean_expressions.append(F.mean(F.col(c)).alias(c))
mean_values_row = df.select(*mean_expressions).collect()[0]
print(mean_values_row)

mean_values = {name: mean_values_row[name] for name in fill_na_list}
print(mean_values)

# fillna() accepts a {column: replacement} mapping.
# NOTE(review): var1/var2 are built from Python ints while the means are
# floats - check whether the filled values keep their fractional part.
df_filled = df.na.fill(mean_values)

df_filled.show()

In [0]:
# How to format or suppress scientific notations in a PySpark DataFrame?
small_values = [(1, 0.000000123), (2, 0.000023456), (3, 0.000345678)]
df = spark.createDataFrame(small_values, ["id", "your_column"])

df.show()

from pyspark.sql.functions import format_number

# Number of digits to keep after the decimal point.
decimal_point = 5

# format_number() is given the column and the number of decimal places.
formatted = format_number(F.col("your_column"), decimal_point)
df = df.withColumn("formatted_number", formatted)
df.show()

In [0]:
# How to format all the values in a dataframe as percentages?

data = [(0.1, .08), (0.2, .06), (0.33, .02)]
df = spark.createDataFrame(data, ["numbers_1", "numbers_2"])

df.show()

# Format EVERY column, not just two hard-coded ones. The product is rounded
# to 2 decimals before the cast so binary floating-point noise (0.1 * 100 is
# not exactly 10) does not leak into the text.
for column_name in df.columns:
    percentage = F.round(F.col(column_name) * 100, 2)
    df = df.withColumn(column_name, F.concat(percentage.cast("string"), F.lit('%')))
df.show()

In [0]:
# How to filter every nth row in a dataframe

names = ["Alice", "Bob", "Charlie", "Dave", "Eve",
         "Frank", "Grace", "Hannah", "Igor", "Jack"]
# Pair every name with its 1-based position.
data = [(name, number) for number, name in enumerate(names, start=1)]

# Create DataFrame
df = spark.createDataFrame(data, ["Name", "Number"])

df.show()

# Keep every 5th row.
nth_row = 5

# 1-based row number over a whole-frame window.
window = Window.orderBy(monotonically_increasing_id())
df = df.withColumn("index", row_number().over(window))

# A row is kept when its 1-based index is a multiple of nth_row.
is_nth = (F.col("index") % nth_row) == 0
df = df.filter(is_nth)
df.show()


In [0]:
# How to calculate missing value percentage in each column?

# Create a sample dataframe
# Create a sample dataframe
data = [("John", "Doe", None),
(None, "Smith", "New York"),
("Mike", "Smith", None),
("Anna", "Smith", "Boston"),
(None, None, None)]

df = spark.createDataFrame(data, ["FirstName", "LastName", "City"])

df.show()

# Denominator for the percentages.
total_row = df.count()

for c in df.columns:
    # Number of rows where this column is null.
    null_value = df.filter(F.col(c).isNull()).count()
    # f-string so the column name is interpolated; without the prefix the
    # text "{c}" was printed literally for every column.
    print(f"Missing values in column {c}: ", null_value/total_row * 100, "%")

In [0]:
# How to get the last n rows of a dataframe with row sum > 100

# Sample data
# Sample data
data = [
    (10, 25, 70),
    (40, 5, 20),
    (70, 80, 100),
    (10, 2, 60),
    (40, 50, 20),
]

# Create DataFrame
df = spark.createDataFrame(data, ["col1", "col2", "col3"])

# Display original DataFrame
df.show()

from functools import reduce

# Row-wise sum: add the columns left to right into one expression.
row_total = F.col(df.columns[0])
for column_name in df.columns[1:]:
    row_total = row_total + F.col(column_name)
df = df.withColumn("accumulative_sum", row_total)
df.show()

# Keep the rows whose row sum exceeds 100.
df = df.filter(F.col("accumulative_sum") > 100)
df.show()

# Tag the surviving rows with increasing ids, sort newest-first and take two:
# these are the last two rows that passed the filter.
df = df.withColumn("id", monotonically_increasing_id())
df = df.orderBy(F.col("id").desc()).limit(2)
df.show()

In [0]:
# How to create a column that contains the penultimate value in each row?

data = [(10, 20, 30),
(40, 60, 50),
(80, 70, 90)]

df = spark.createDataFrame(data, ["Column1", "Column2", "Column3"])

df.show()

from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, IntegerType

# UDF that sorts an array in DESCENDING order. reverse=True is required: a
# plain sorted() is ascending, which makes index 1 the second-smallest value
# and only matches the second-largest when a row has exactly three columns.
sort_array_desc = F.udf(lambda arr: sorted(arr, reverse=True), ArrayType(IntegerType()))

# Pack the row's columns into an array and sort it largest-first.
df = df.withColumn("row_as_array", sort_array_desc(F.array(df.columns)))
df.show()

# Index 1 of a descending array is the second-largest (penultimate) value.
df = df.withColumn("Penultimate", df['row_as_array'].getItem(1))
df.show()

# The helper array is no longer needed.
df = df.drop('row_as_array')

df.show()

In [0]:
# How to create a column that contains the penultimate value in each row?

data = [
    (10, 20, 30),
    (40, 60, 50),
    (80, 70, 90),
]

df = spark.createDataFrame(data, ["Column1", "Column2", "Column3"])

df.show()

# Pack each row's values into a single array column.
df = df.withColumn("row_array", F.array(*df.columns))
df.show()

# Sort the array largest-first with the built-in sort_array (no UDF needed).
descending = F.sort_array(F.col("row_array"), asc=False)
df = df.withColumn("reversed_row_array", descending)
df.show()

# Index 1 of the descending array is the second-largest value.
df = df.withColumn("penultimate", F.col("reversed_row_array").getItem(1))
df.select("Column1", "Column2", "Column3", "penultimate").show()

In [0]:
# How to create lags and leads of a column by group in a dataframe?

from pyspark.sql.window import Window

# Create a sample DataFrame
# Create a sample DataFrame: five daily sales figures for each of two stores
dates = ["2023-01-01", "2023-01-02", "2023-01-03", "2023-01-04", "2023-01-05"]
sales_by_store = {
    "Store1": [100, 150, 200, 250, 300],
    "Store2": [50, 60, 80, 90, 120],
}
data = [
    (date, store, sales)
    for store, store_sales in sales_by_store.items()
    for date, sales in zip(dates, store_sales)
]

df = spark.createDataFrame(data, ["Date", "Store", "Sales"])

df.show()

# Convert the date strings to real dates for the ordering below.
df = df.withColumn("Date", F.to_date(F.col("Date")))

# One window per store, ordered chronologically inside the store.
windows = Window.partitionBy(F.col("Store")).orderBy(F.col("Date"))

# lag = the preceding row's sales, lead = the following row's sales,
# both within the same store.
previous_sales = F.lag(F.col("Sales")).over(windows)
next_sales = F.lead(F.col("Sales")).over(windows)
df = df.withColumn("lag_col", previous_sales)
df = df.withColumn("lead_col", next_sales)

df.show()

In [0]:
# How to get the frequency of unique values in the entire dataframe?
# Create a numeric DataFrame
# Create a numeric DataFrame
data = [
    (1, 2, 3),
    (2, 3, 4),
    (1, 2, 3),
    (4, 5, 6),
    (2, 3, 4),
]
df = spark.createDataFrame(data, ["Column1", "Column2", "Column3"])

# Print DataFrame
df.show()

# Split the frame into three single-column frames.
df_col1, df_col2, df_col3 = (
    df.select(name) for name in ["Column1", "Column2", "Column3"]
)

# Stack the three frames on top of each other; the result is grouped by
# "Column1" below.
stacked = df_col1.unionAll(df_col2)
df = stacked.unionAll(df_col3)

# Count how often each value occurs across the stacked rows.
df = df.groupBy("Column1").count()
df.show()

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Create sample DataFrame
data = [
    (1, 2, 3),
    (2, 3, 4),
    (1, 2, 3),
    (4, 5, 6),
    (2, 3, 4),
]
df = spark.createDataFrame(data, ["Column1", "Column2", "Column3"])

print("Original DataFrame:")
df.show()

# Step 1: gather every column of a row into one array column
all_columns = []
for c in df.columns:
    all_columns.append(F.col(c))
df_array = df.withColumn("values_array", F.array(*all_columns))
print("DataFrame With Array Column:")
df_array.show()

# Step 2: explode the array so that each element becomes its own row
exploded_value = F.explode(F.col("values_array")).alias("value")
df_exploded = df_array.select(exploded_value)
print("Exploded DataFrame:")
df_exploded.show()

# Step 3: count how many times each distinct value occurs
values_grouped = df_exploded.groupBy("value")
df_frequency = values_grouped.count()
print("Frequency of Unique Values Across Entire DataFrame:")
df_frequency.show()

In [0]:
# How to replace both the diagonals of dataframe with 0

from pyspark.sql.functions import monotonically_increasing_id

# Create a numeric DataFrame
# Local imports so this cell does not depend on earlier cells having run.
from pyspark.sql.functions import row_number
from pyspark.sql.window import Window

# Create a numeric DataFrame
data = [(1, 2, 3, 4),
(2, 3, 4, 5),
(1, 2, 3, 4),
(4, 5, 6, 7)]

df = spark.createDataFrame(data, ["col_1", "col_2", "col_3", "col_4"])

# Print DataFrame
df.show()

column_list = df.columns
max_col = len(column_list)

# Consecutive 0-based row index. monotonically_increasing_id() alone is
# unique and increasing but not consecutive, so comparing it with 0, 1, 2, ...
# can miss rows; row_number() over it gives a dense index.
row_order = Window.orderBy(monotonically_increasing_id())
df = df.withColumn("id", row_number().over(row_order) - 1)
df.show()

# Zero BOTH full diagonals, not only the four corner cells:
#   main diagonal -> row == column
#   anti-diagonal -> row == (max_col - 1) - column
for col_index, col_name in enumerate(column_list):
    on_main_diagonal = F.col("id") == col_index
    on_anti_diagonal = F.col("id") == (max_col - 1 - col_index)
    df = df.withColumn(
        col_name,
        F.when(on_main_diagonal | on_anti_diagonal, 0).otherwise(F.col(col_name)),
    )

df.show()

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id, col, lit, when
import pyspark.sql.functions as F

# Create a 4x4 DataFrame
# Local imports: the dense row index below needs row_number and Window.
from pyspark.sql.functions import row_number
from pyspark.sql.window import Window

data = [(1, 2, 3, 4),
        (2, 3, 4, 5),
        (1, 2, 3, 4),
        (4, 5, 6, 7)]
df = spark.createDataFrame(data, ["col_1", "col_2", "col_3", "col_4"])

print("Original DataFrame:")
df.show()

# Step 1: Add a consecutive 0-based row index column (id).
# monotonically_increasing_id() alone is unique and increasing but not
# consecutive, so the equality tests below could silently miss rows.
# Numbering the rows over it yields 0, 1, 2, ...
row_order = Window.orderBy(monotonically_increasing_id())
df = df.withColumn("id", row_number().over(row_order) - 1)

# Step 2: Work out which columns to process
column_list = df.columns[:-1]  # Exclude the 'id' column from columns to process
n = len(column_list)           # Number of numeric columns

print(column_list)

# Replace diagonals
for i, col_name in enumerate(column_list):
    print(i, col_name)
    main_diagonal_condition = F.col("id") == i                        # Main diagonal (row == col)
    anti_diagonal_condition = F.col("id") + i == n - 1                # Anti-diagonal (row + col == n-1)

    # A cell on either diagonal becomes 0; every other cell keeps its value
    df = df.withColumn(
        col_name,
        when(main_diagonal_condition | anti_diagonal_condition, lit(0)).otherwise(F.col(col_name))
    )

# The helper index is not part of the result
df = df.drop("id")

print("DataFrame with Diagonals Replaced with 0:")
df.show()

In [0]:
# How to reverse the rows of a dataframe

# Create a numeric DataFrame
# Create a numeric DataFrame
data = [
    (1, 2, 3, 4),
    (2, 3, 4, 5),
    (3, 4, 5, 6),
    (4, 5, 6, 7),
]

df = spark.createDataFrame(data, ["col_1", "col_2", "col_3", "col_4"])

# Print DataFrame
df.show()

# Tag each row with an increasing id; only the relative order of the ids is
# used below.
df = df.withColumn("id", monotonically_increasing_id())
df.show()

# Sorting by the id in descending order reverses the rows.
df = df.sort(F.col("id").desc())
df.show()

In [0]:
# How to UnPivot the dataframe (converting columns into rows) 
# UnPivot EU, US columns and create region, revenue Columns

# Sample data
# Sample data: (year, quarter, EU revenue, US revenue)
data = [
    (2021, 2, 4500, 5500),
    (2021, 1, 4000, 5000),
    (2021, 3, 5000, 6000),
    (2021, 4, 6000, 7000),
]

# Create DataFrame
columns = ["year", "quarter", "EU", "US"]
df = spark.createDataFrame(data, columns)

df.show()

# Keep year/quarter as identifiers and unpivot the EU and US columns into
# (region, revenue) pairs.
id_columns = ["year", "quarter"]
region_columns = ["EU", "US"]
df = df.unpivot(
    ids=id_columns,
    values=region_columns,
    variableColumnName="region",
    valueColumnName="revenue",
)
df.show()