# Hello Spark Sandbox: assorted PySpark snippets

In [1]:
# Prerequisites
from pyspark.sql import SparkSession

In [3]:
# Create a SparkSession with a local master (or reuse one that already exists)
# and report the Spark version.
spark = (
    SparkSession.builder
    .master("local")
    .getOrCreate()
)
print("Spark Version: ", spark.version)

Spark Version:  3.4.1


#### Loop over columns

In [6]:
from pyspark.sql.functions import col, isnan, when, count

# Sample DataFrame: 'name' and 'age' each contain two missing (None) values.
data = [
    (1, "Alice", 25),
    (2, None, 30),
    (3, "Charlie", None),
    (4, None, None)
]
columns = ["id", "name", "age"]
df_simple = spark.createDataFrame(data, columns)

# Show the DataFrame
df_simple.show()

# Loop over columns and count missing values, one Spark job per column.
# NaN only exists for floating-point types, so isnan() is applied to float/double
# columns only. Applying it to every column relies on an implicit cast to double,
# which fails for date/timestamp columns and can raise on non-numeric strings
# when ANSI mode (spark.sql.ansi.enabled) is on.
for column, dtype in df_simple.dtypes:
    is_missing = col(column).isNull()
    if dtype in ("float", "double"):
        is_missing = is_missing | isnan(col(column))

    # when() without otherwise() yields NULL for non-matching rows, and count()
    # skips NULLs, so only the rows flagged as missing are counted.
    missing_count = df_simple.select(
        count(when(is_missing, column)).alias("missing_count")
    ).collect()[0]["missing_count"]
    print(f"Column '{column}' has {missing_count} missing value(s).")


+---+-------+----+
| id|   name| age|
+---+-------+----+
|  1|  Alice|  25|
|  2|   null|  30|
|  3|Charlie|null|
|  4|   null|null|
+---+-------+----+

Column 'id' has 0 missing value(s).
Column 'name' has 2 missing value(s).
Column 'age' has 2 missing value(s).


In [7]:
from pyspark.sql.functions import col, isnan, when, count


# Sample DataFrame mixing None and NaN: 'age' is a double column holding one NaN
# and three None values; 'id' and 'name' hold None values only.
data = [
    (1, "Alice", 25.0),
    (2, None, float('NaN')),
    (3, "Charlie", None),
    (4, None, None),
    (None, "Bob", None)
]
columns = ["id", "name", "age"]
df = spark.createDataFrame(data, columns)

# Show the DataFrame
df.show()

# Count NULL or NaN values for all columns in a single select.
# NaN only exists for floating-point types, so isnan() is applied to float/double
# columns only. Applying it to every column relies on an implicit cast to double,
# which fails for date/timestamp columns and can raise on non-numeric strings
# when ANSI mode (spark.sql.ansi.enabled) is on.
nan_capable_types = ("float", "double")
missing_counts = df.select([
    count(
        when(
            (col(c).isNull() | isnan(col(c))) if dtype in nan_capable_types else col(c).isNull(),
            c,
        )
    ).alias(c)
    for c, dtype in df.dtypes
])

# Show the counts of missing values
missing_counts.show()


+----+-------+----+
|  id|   name| age|
+----+-------+----+
|   1|  Alice|25.0|
|   2|   null| NaN|
|   3|Charlie|null|
|   4|   null|null|
|null|    Bob|null|
+----+-------+----+

+---+----+---+
| id|name|age|
+---+----+---+
|  1|   2|  4|
+---+----+---+



#### Drop Duplicate Rows

In [12]:

# Five sample people; ids 1/3 and ids 2/4 share the same (name, age) pair.
columns = ["id", "name", "age"]
data = [
    (1, "Alice", 25), (2, "Bob", 30),
    (3, "Alice", 25), (4, "Bob", 30),
    (5, "Charlie", 35),
]
df = spark.createDataFrame(data, schema=columns)

# Display the input as-is.
print("Original DataFrame:")
df.show()

# Keep a single row for every distinct (name, age) combination.
# drop_duplicates() is an alias of dropDuplicates().
dedup_keys = ["name", "age"]
df_no_duplicates = df.drop_duplicates(subset=dedup_keys)

# Display the de-duplicated result.
print("DataFrame after removing duplicates based on 'name' and 'age':")
df_no_duplicates.show()


Original DataFrame:
+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 25|
|  2|    Bob| 30|
|  3|  Alice| 25|
|  4|    Bob| 30|
|  5|Charlie| 35|
+---+-------+---+

DataFrame after removing duplicates based on 'name' and 'age':
+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 25|
|  5|Charlie| 35|
|  2|    Bob| 30|
+---+-------+---+



#### Simple Correlation

In [10]:

# Two numeric columns, 'x' and 'y', with five sample points.
columns = ["x", "y"]
data = [(1, 2.0), (2, 3.5), (3, 5.0), (4, 7.0), (5, 8.5)]
df = spark.createDataFrame(data, schema=columns)

# Display the input.
df.show()

# Correlation of the column pair (x, y).
# DataFrame.corr() and DataFrame.stat.corr() are aliases of each other.
correlation = df.stat.corr("x", "y")
print(f"Correlation between x and y: {correlation}")


+---+---+
|  x|  y|
+---+---+
|  1|2.0|
|  2|3.5|
|  3|5.0|
|  4|7.0|
|  5|8.5|
+---+---+

Correlation between x and y: 0.9986254289035241


`DataFrame.corr()` computes the correlation for a single pair of columns per call. To correlate several columns you can either loop over every column pair, as in the next cell, or use `pyspark.ml.stat.Correlation`, which returns the full correlation matrix from one call (see the Advanced Correlation section below).

In [14]:

# Three numeric columns with five sample rows.
data = [
    (1, 2.0, 3.0), (2, 4.5, 5.0), (3, 6.0, 7.5),
    (4, 8.0, 9.0), (5, 10.5, 12.0),
]
columns = ["col1", "col2", "col3"]

df = spark.createDataFrame(data, schema=columns)

# Display the input.
df.show()

# Column names as reported by the DataFrame itself.
columns = df.columns

# One (left, right, correlation) triple per unordered pair of distinct columns.
# Each column is paired only with the columns that come after it, so every pair
# is computed once; each df.corr() call is evaluated in that order.
correlations = [
    (left, right, df.corr(left, right))
    for pos, left in enumerate(columns)
    for right in columns[pos + 1:]
]

# Report every pair.
for col1, col2, corr_value in correlations:
    print(f"Correlation between {col1} and {col2}: {corr_value}")


+----+----+----+
|col1|col2|col3|
+----+----+----+
|   1| 2.0| 3.0|
|   2| 4.5| 5.0|
|   3| 6.0| 7.5|
|   4| 8.0| 9.0|
|   5|10.5|12.0|
+----+----+----+

Correlation between col1 and col2: 0.9967441085689461
Correlation between col1 and col3: 0.9958932064677039
Correlation between col2 and col3: 0.9948516856149789


### Advanced Correlation

In [13]:
from pyspark.ml.stat import Correlation
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col

# Four single-column rows, each holding one dense 3-element feature vector.
data = [
    (Vectors.dense(values),)
    for values in (
        [1.0, 2.0, 3.0],
        [2.0, 3.5, 5.0],
        [3.0, 5.0, 7.0],
        [4.0, 7.0, 8.5],
    )
]

df = spark.createDataFrame(data, schema=["features"])
df.show()

# Correlation.corr() returns a DataFrame; the matrix is the first cell of its first row.
pearson_row = Correlation.corr(df, "features", "pearson").head()
correlation_matrix = pearson_row[0]
print("Correlation matrix:\n", correlation_matrix.toArray())


+-------------+
|     features|
+-------------+
|[1.0,2.0,3.0]|
|[2.0,3.5,5.0]|
|[3.0,5.0,7.0]|
|[4.0,7.0,8.5]|
+-------------+

Correlation matrix:
 [[1.         0.99725651 0.9978158 ]
 [0.99725651 1.         0.99018848]
 [0.9978158  0.99018848 1.        ]]


In [2]:
import numpy as np

# Example: generate a random, symmetric 50x50 matrix with a unit diagonal to stand
# in for a correlation matrix. Entries are uniform in [0, 1); this is demo data,
# not a correlation matrix computed from real columns.
np.random.seed(42)
correlation_matrix = np.random.rand(50, 50)
correlation_matrix = (correlation_matrix + correlation_matrix.T) / 2  # Make it symmetric
np.fill_diagonal(correlation_matrix, 1)  # Fill diagonal with 1s

# Step 1: Extract the upper triangular part of the matrix, excluding the diagonal,
# so each unordered column pair appears exactly once.
row_indices, col_indices = np.triu_indices_from(correlation_matrix, k=1)
upper_triangular_values = correlation_matrix[row_indices, col_indices]

# Step 2: argsort is ascending, so the last 10 positions are the 10 largest
# values (smallest of them first, largest last).
sorted_indices = np.argsort(upper_triangular_values)[-10:]
top_10_correlations = upper_triangular_values[sorted_indices]

# Step 3: Map back to (row, column, value) triples, in the same ascending order.
top_10_pairs = [
    (row_indices[idx], col_indices[idx], upper_triangular_values[idx])
    for idx in sorted_indices
]

# Step 4: Find the highest correlation and its column pair
highest_correlation_index = np.argmax(upper_triangular_values)
highest_pair = (row_indices[highest_correlation_index], col_indices[highest_correlation_index])
highest_correlation = upper_triangular_values[highest_correlation_index]

# Display results, strongest pair first.
# The loop variables deliberately avoid the name `col`: notebook cells share one
# namespace, and rebinding `col` here would clobber the `col` function imported
# from pyspark.sql.functions by other cells.
print("Top 10 Correlations:")
ranked_pairs = sorted(top_10_pairs, key=lambda pair: pair[2], reverse=True)
for rank, (row_idx, col_idx, value) in enumerate(ranked_pairs, start=1):
    print(f"{rank}. Columns ({row_idx}, {col_idx}) - Correlation: {value:.4f}")

print(f"\nHighest Correlation is between Columns {highest_pair[0]} and {highest_pair[1]} with a value of {highest_correlation:.4f}")


Top 10 Correlations:
1. Columns (24, 28) - Correlation: 0.9853
2. Columns (0, 34) - Correlation: 0.9657
3. Columns (0, 1) - Correlation: 0.9601
4. Columns (5, 6) - Correlation: 0.9560
5. Columns (20, 24) - Correlation: 0.9476
6. Columns (23, 35) - Correlation: 0.9463
7. Columns (7, 16) - Correlation: 0.9434
8. Columns (3, 4) - Correlation: 0.9421
9. Columns (10, 32) - Correlation: 0.9384
10. Columns (2, 40) - Correlation: 0.9344

Highest Correlation is between Columns 24 and 28 with a value of 0.9853


### Outliers

In [5]:
from pyspark.sql.functions import col, expr


# Ten sample rows; ids 5 and 8 carry extreme values in column3.
columns = ["id", "column2", "column3"]
data = [
    (1, 100, 15.0), (2, 200, 20.0), (3, 300, 25.0), (4, 400, 30.0),
    (5, 500, 1000.0),  # Outlier in column3
    (6, 600, 35.0), (7, 700, 40.0),
    (8, 800, -500.0),  # Outlier in column3
    (9, 900, 45.0), (10, 1000, 50.0),
]
df = spark.createDataFrame(data, schema=columns)

# Column whose distribution is inspected.
numeric_column = "column3"

# First and third quartile, computed approximately (relative error 0.05).
quantiles = df.approxQuantile(
    col=numeric_column,
    probabilities=[0.25, 0.75],
    relativeError=0.05,
)
Q1, Q3 = quantiles

# Interquartile range and the 1.5 * IQR fences on either side of the quartiles.
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

print(f"Q1: {Q1}, Q3: {Q3}, IQR: {IQR}")
print(f"Lower Bound: {lower_bound}, Upper Bound: {upper_bound}")

# Rows strictly outside the fences are outliers; rows inside (bounds included)
# are non-outliers. between() is inclusive on both ends.
value = col(numeric_column)
outliers = df.filter((value > upper_bound) | (value < lower_bound))
non_outliers = df.filter(value.between(lower_bound, upper_bound))

# Display both partitions.
print("Outliers:")
outliers.show()

print("Non-Outliers:")
non_outliers.show()


Q1: 20.0, Q3: 45.0, IQR: 25.0
Lower Bound: -17.5, Upper Bound: 82.5
Outliers:
+---+-------+-------+
| id|column2|column3|
+---+-------+-------+
|  5|    500| 1000.0|
|  8|    800| -500.0|
+---+-------+-------+

Non-Outliers:
+---+-------+-------+
| id|column2|column3|
+---+-------+-------+
|  1|    100|   15.0|
|  2|    200|   20.0|
|  3|    300|   25.0|
|  4|    400|   30.0|
|  6|    600|   35.0|
|  7|    700|   40.0|
|  9|    900|   45.0|
| 10|   1000|   50.0|
+---+-------+-------+



----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 34288)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.11/socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.11/socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.11/socketserver.py", line 755, in __init__
    self.handle()
  File "/usr/local/spark/python/pyspark/accumulators.py", line 281, in handle
    poll(accum_updates)
  File "/usr/local/spark/python/pyspark/accumulators.py", line 253, in poll
    if func():
       ^^^^^^
  File "/usr/local/spark/python/pyspark/accumulators.py", line 257, in accum_updates
    num_updates = read_int(self.rfile)
                  