In [None]:
# Initialization
import os
import sys

# Location of the local Spark installation and of its bundled Python libraries.
spark_home = "/home/talentum/spark"
py_lib = spark_home + "/python/lib"

# Export everything PySpark reads from the environment in one go.
# For Python 2, point both interpreter entries at /usr/bin/python2.7 instead.
# NOTE(review): worker and driver interpreters are spelled differently
# (python3.6 vs python3) -- confirm both resolve to the same Python version.
os.environ.update({
    "SPARK_HOME": spark_home,
    "PYLIB": py_lib,
    "PYSPARK_PYTHON": "/usr/bin/python3.6",
    "PYSPARK_DRIVER_PYTHON": "/usr/bin/python3",
})

# Put the bundled archives at the front of the import path:
# pyspark.zip first, the py4j sources right after it.
sys.path[:0] = [
    py_lib + "/pyspark.zip",
    py_lib + "/py4j-0.10.7-src.zip",
]

# Extra JVM packages handed to spark-submit; list every package needed here.
# Currently active: spark-xml 0.6.0 together with spark-avro 2.4.3.
# Alternatives that were used before:
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0 pyspark-shell'
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-avro_2.11:2.4.0 pyspark-shell'
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.0 pyspark-shell'
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--packages '
    'com.databricks:spark-xml_2.11:0.6.0,'
    'org.apache.spark:spark-avro_2.11:2.4.3 '
    'pyspark-shell'
)

In [None]:
#Entrypoint 2.x
from pyspark.sql import SparkSession
# Build the Hive-enabled SparkSession used by the rest of the notebook
# (getOrCreate hands back the builder's session).
spark = (
    SparkSession.builder
    .appName("TimeSeriesPreprocessing_holtwintermodel")
    .enableHiveSupport()
    .getOrCreate()
)

# To run on YARN, add .master("yarn") to the builder chain, e.g.:
# spark = SparkSession.builder.appName("Spark SQL basic example").enableHiveSupport().master("yarn").getOrCreate()

# SparkContext that belongs to the session above.
sc = spark.sparkContext

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp
import pandas as pd
import matplotlib.pyplot as plt 


# Load the Pune weather CSV, clean it in Spark, collect it to pandas, plot the
# series and split it chronologically into train and test sets.
# Relies on `spark` (the SparkSession) created in an earlier cell.
# Leaves `data_spark` (pandas frame indexed by 'ds'), `train_size`, `train`
# and `test` in the notebook namespace.
file_path = 'file:///home/talentum/pune.csv'
TRAIN_FRACTION = 0.8  # leading share of the time-ordered rows used for training

# Spark-side cleaning lives under its own name so that `data_spark` is only
# ever bound to the final pandas frame, never to a Spark DataFrame.
pune_sdf = (
    spark.read.csv(file_path, header=True, inferSchema=True)
    # Convert 'date_time' to a timestamp and drop rows where it is missing.
    .withColumn('date_time', to_timestamp(col('date_time')))
    .dropna(subset=['date_time'])
    # Rename to the 'ds' (timestamp) / 'y' (value) column names used downstream.
    .withColumnRenamed('date_time', 'ds')
    .withColumnRenamed('tempC', 'y')
    # Force 'y' to double and drop rows where the value is missing.
    .withColumn('y', col('y').cast('double'))
    .dropna(subset=['y'])
)

# Collect to pandas (indexing, plotting and slicing below are pandas
# operations), index by timestamp and sort chronologically.
data_spark = pune_sdf.toPandas().set_index('ds')
data_spark.index = pd.to_datetime(data_spark.index)
data_spark = data_spark.sort_index()

# The two dropna steps above remove rows silently; if nothing is left, fail
# here with a clear message instead of plotting nothing and producing empty
# train/test frames.
if data_spark.empty:
    raise ValueError(
        "No rows left in %s after dropping missing 'date_time'/'tempC' values; "
        "check the column names and formats in the CSV." % file_path
    )

# Plot the data to understand trends and seasonality.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(data_spark.index, data_spark['y'], label='Observed Data')
ax.set(title='Temperature Data', xlabel='Date', ylabel='Temperature (C)')
ax.legend()
plt.show()

# Chronological split: the first TRAIN_FRACTION of the rows train the model,
# the remainder is held out. `.iloc` makes the positional slicing explicit.
train_size = int(len(data_spark) * TRAIN_FRACTION)
train, test = data_spark.iloc[:train_size], data_spark.iloc[train_size:]
