<a href="https://colab.research.google.com/github/jugalpanchal/bd-chef/blob/main/spark_koalas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Follow the steps to install the dependencies:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null # install java
!wget -q https://downloads.apache.org/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz # spark package download
!tar xf spark-3.1.2-bin-hadoop3.2.tgz # unzip spark package
!pip install -q findspark # install spark

# Set the location of Java and Spark:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop3.2"

import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Reuse the active session if one exists; otherwise start a local one.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Spark_App1")
    .getOrCreate()
)

# From Spark 2.x on, the SparkContext is obtained through the SparkSession.
sc = spark.sparkContext

In [3]:
!pip install pandas



In [5]:
# Small demo DataFrame; keep the value types consistent within each column.
gen_df = spark.createDataFrame(
    [(1, "foo"), (2, "bar")],
    schema=["id", "label"],  # column names, in tuple order
)

# Print the rows as an ASCII table.
gen_df.show()

+---+-----+
| id|label|
+---+-----+
|  1|  foo|
|  2|  bar|
+---+-----+



In [6]:
# Pandas is the de facto standard (single-node) DataFrame implementation in Python,
# while the Spark DataFrame is the de facto standard for big data processing.
# RDD and DataFrame data are distributed, but once converted to Pandas
# everything lives on a single machine/node.
# This operation (toPandas) is very costly in terms of memory and compute.
gen_df_pandas = gen_df.toPandas()
gen_df_pandas

# Koalas (the pandas DataFrame API on top of Apache Spark) was created to address this (big data) issue.
# https://github.com/databricks/koalas

Unnamed: 0,id,label
0,1,foo
1,2,bar
