<a href="https://colab.research.google.com/github/jcims123/spark_in_colab/blob/main/20250802_spark_colab_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# ===== CORRECTED PYSPARK SETUP FOR GOOGLE COLAB =====

# Step 1: Install Java (required for Spark)
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# Step 2: Download Apache Spark 3.5.0 (stable version that works reliably)
!wget -q https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz
!tar xf spark-3.5.0-bin-hadoop3.tgz

# Step 3: Install findspark
!pip install findspark

# Step 4: Set environment variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.0-bin-hadoop3"

# Step 5: Initialize Spark
import findspark
findspark.init()

print("✅ PySpark 3.5.0 installed successfully!")

✅ PySpark 3.5.0 installed successfully!


In [4]:
from pyspark.sql import SparkSession
# NOTE(review): the two wildcard imports below bring every public name from
# pyspark.sql.functions / pyspark.sql.types into the notebook namespace, which
# shadows Python built-ins such as sum, min, max, abs and round. They are kept
# because other cells may use the unqualified names; prefer the `F.` alias in
# new code.
from pyspark.sql.functions import *
from pyspark.sql.types import *
import pyspark.sql.functions as F

# Build (or reuse) a local Spark session for interview practice.
# The whole builder chain is wrapped in parentheses so no line needs a
# trailing backslash.
spark = (
    SparkSession.builder
    .master("local[*]")                       # local mode, all available cores
    .appName("InterviewPrep")
    .config("spark.driver.memory", "8g")
    .config("spark.driver.maxResultSize", "2g")
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    .getOrCreate()
)

# Report the running Spark version and the default parallelism.
print(f"✅ Spark {spark.version} session created!")
print(f"🔧 Using {spark.sparkContext.defaultParallelism} cores")

✅ Spark 3.5.0 session created!
🔧 Using 2 cores


In [5]:
# Sample data for interviews
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

# Employee records, one tuple per row:
# (id, name, department, salary, hire_date as an ISO-formatted string)
employees_data = [
    (1, "John",  "Engineering", 75000, "2020-01-15"),
    (2, "Alice", "Marketing",   65000, "2019-03-20"),
    (3, "Bob",   "Engineering", 80000, "2021-06-10"),
    (4, "Carol", "Sales",       70000, "2020-11-05"),
    (5, "David", "Engineering", 85000, "2018-08-12"),
]

# Explicit schema: every column is nullable; hire_date is kept as a string.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("id", IntegerType()),
        ("name", StringType()),
        ("department", StringType()),
        ("salary", IntegerType()),
        ("hire_date", StringType()),
    )
])

# Materialise the DataFrame, then show its rows and its schema.
df = spark.createDataFrame(employees_data, schema)
df.show()
df.printSchema()

# Common interview operations: row count and column listing
print(f"Total employees: {df.count()}")
print(f"Columns: {df.columns}")

+---+-----+-----------+------+----------+
| id| name| department|salary| hire_date|
+---+-----+-----------+------+----------+
|  1| John|Engineering| 75000|2020-01-15|
|  2|Alice|  Marketing| 65000|2019-03-20|
|  3|  Bob|Engineering| 80000|2021-06-10|
|  4|Carol|      Sales| 70000|2020-11-05|
|  5|David|Engineering| 85000|2018-08-12|
+---+-----+-----------+------+----------+

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- hire_date: string (nullable = true)

Total employees: 5
Columns: ['id', 'name', 'department', 'salary', 'hire_date']
