## Content

- Spark Installation
- Import libraries
- Load Data into a Spark DataFrame
- Data Transformation with Spark (e.g., total hours logged per project)

### Spark Installation

In [1]:
!pip install pyspark -q

### Import libraries

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum

### Load Data into a Spark DataFrame

In [5]:
# create spark session state
def spark_session():
    """Return the SparkSession used throughout this notebook.

    The session is requested through ``SparkSession.builder`` with the
    application name "spark transformation"; ``getOrCreate()`` hands back an
    already-active session when one exists instead of building a new one.

    :return: the object produced by ``SparkSession.builder...getOrCreate()``
    """
    builder = SparkSession.builder.appName("spark transformation")
    return builder.getOrCreate()


# create function to load data conveniently into spark
def load_clickup_data(spark, file_path, file_format):
    """Load a comma-delimited dataset into Spark.

    The reader is configured with ``header=True``, ``inferSchema=True`` and a
    ``','`` delimiter before ``load`` is called on ``file_path``.

    :param spark: session running spark process (its ``.read`` attribute is used)
    :param file_path: directory path to dataset
    :param file_format: format of data to transform eg. parquet, csv etc.
    :return: the object returned by ``spark.read.format(...).options(...).load(file_path)``
    :raises Exception: any error raised while reading is printed and then
        re-raised unchanged so the caller sees the real cause
    """
    print("load_clickup data Started ...")
    try:
        df = (
            spark.read
            .format(file_format)
            .options(header=True, inferSchema=True, delimiter=',')
            .load(file_path)
        )
    except Exception as e:
        print("Error in the method - load_clickup_data. Please check the Stack Trace. " + str(e))
        # Re-raise the original error. Previously execution fell through to
        # `return df` with `df` never assigned, so the caller only saw an
        # UnboundLocalError that hid the real failure.
        raise

    print(f"The input File {file_path} is loaded to the data frame successfully.")
    return df

### Data Transformation with Spark

In [6]:
# Obtain the Spark session through the spark_session() helper defined earlier
# in this notebook (it creates a session or reuses an active one).
spark = spark_session()

# Location of the ClickUp CSV file to load.
# NOTE(review): hard-coded absolute path (looks like a Colab sample_data
# directory) — adjust it when running the notebook in another environment.
clickup_data_path = "/content/sample_data/ClickUp.csv"

In [7]:
# Read the ClickUp CSV through the load_clickup_data() helper.
# NOTE(review): despite the `_rdd` suffix, the loaded object is used as a
# DataFrame (groupBy/agg) in the transformation cell, not as an RDD.

clickup_rdd = load_clickup_data(
    spark=spark,
    file_path=clickup_data_path,
    file_format='csv',
)

load_clickup data Started ...


In [8]:
# Total the "Hours" column for each distinct "Project" value.
# NOTE: `sum` here is pyspark.sql.functions.sum (imported at the top of the
# notebook), which shadows Python's built-in sum.
project_hours_df = (
    clickup_rdd
    .groupBy("Project")
    .agg(sum("Hours").alias("Total_Hours"))
)

# Print the aggregated rows as a text table.
project_hours_df.show()

+--------------------+-----------+
|             Project|Total_Hours|
+--------------------+-----------+
|Book Localization...|      466.0|
| Website Development|      477.0|
|     Brand Guideline|      174.5|
+--------------------+-----------+

