# Clustering the daily load profiles

## Install and import necessary libraries

In [1]:
# Install PySpark into the notebook runtime; the pyspark imports in the cells below depend on it.
!pip install pyspark



In [2]:
# Mount Google Drive at /content/drive so the input and output folders
# under 'My Drive' (used by the training cell) are reachable as local paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, date_format, min, max, hour, avg, to_timestamp
from pyspark.sql.types import StructType
from pyspark.ml.clustering import KMeans
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.ml.functions import vector_to_array
from pyspark.ml.feature import VectorAssembler, MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns
import os

## Define necessary functions

In [4]:
def process_csv(spark, data_path):
    """Load one power-flow CSV and return the mean scaled flow per power line and hour.

    The ``PowerFlowValue`` column is cast to float, min-max scaled with a
    ``MinMaxScaler`` fitted on the whole file, and then averaged for every
    (``PowerLineID``, hour-of-``Timestamp``) pair.

    Args:
        spark: Active SparkSession used to read the file.
        data_path: Path of the CSV file (read with a header row).

    Returns:
        A DataFrame with columns ``PowerLineID``, ``Hour`` and
        ``AveragePowerFlow``, sorted by ``PowerLineID`` then ``Hour``.
    """
    # Load the file, then give the two working columns proper types
    raw_df = spark.read.csv(data_path, header=True)
    typed_df = (
        raw_df
        .withColumn('PowerFlowValue', col('PowerFlowValue').cast('float'))
        .withColumn('Timestamp', to_timestamp(col('Timestamp'), 'yyyy-MM-dd HH:mm:ss'))
    )

    # MinMaxScaler only accepts vector columns, so wrap the value in a one-element vector
    vectorised_df = VectorAssembler(
        inputCols=["PowerFlowValue"], outputCol="features"
    ).transform(typed_df)

    # Fit the scaler on this file's values and apply it
    min_max_model = MinMaxScaler(
        inputCol="features", outputCol="NormalizedPowerFlowValue"
    ).fit(vectorised_df)
    scaled_df = min_max_model.transform(vectorised_df)

    # Unwrap the one-element vector back into a plain numeric column,
    # going through a temporary array column that is dropped afterwards
    scaled_df = (
        scaled_df
        .withColumn("NormalizedPowerFlowArray", vector_to_array("NormalizedPowerFlowValue"))
        .withColumn("NormalizedPowerFlowValue", col("NormalizedPowerFlowArray")[0])
        .drop("NormalizedPowerFlowArray")
    )

    # Average the scaled value per power line and hour of day, then order the rows
    with_hour_df = scaled_df.withColumn("Hour", hour("Timestamp"))
    mean_flow = avg("NormalizedPowerFlowValue").alias("AveragePowerFlow")
    return (
        with_hour_df
        .groupBy("PowerLineID", "Hour")
        .agg(mean_flow)
        .orderBy('PowerLineID', 'Hour')
    )

In [5]:
def process_and_train_model(data_path, save_path, num_clusters=50, seed=1):
    """Fit a KMeans model on the hourly average power flow of one CSV file and save it.

    Args:
        data_path: Path of the CSV file; it is preprocessed by ``process_csv``.
        save_path: Destination path for the fitted model. A model already
            stored at this path is overwritten, so the notebook can be re-run
            without first deleting earlier results.
        num_clusters: Number of clusters ``k`` for KMeans (default 50).
        seed: Random seed passed to KMeans (default 1).

    Returns:
        None. The fitted model is persisted at ``save_path``.
    """
    # Reuse the running Spark session if there is one, otherwise start it
    spark = SparkSession.builder.appName("Clustering").getOrCreate()

    # Hourly averages of the scaled power flow, one row per (PowerLineID, Hour)
    processed_df = process_csv(spark, data_path)

    # NOTE(review): each row carries a single scalar feature, so KMeans groups
    # individual hourly averages rather than 24-value daily profiles per power
    # line -- confirm this is the intended clustering input.
    feature_columns = ['AveragePowerFlow']

    # KMeans expects a single vector column holding the features
    vector_assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
    feature_df = vector_assembler.transform(processed_df)

    kmeans = KMeans(featuresCol="features", predictionCol="cluster", k=num_clusters, seed=seed)
    model = kmeans.fit(feature_df)

    # Plain model.save() raises if the path already exists; overwrite instead
    # so repeated runs of the notebook do not fail on previously saved models.
    model.write().overwrite().save(save_path)

## Train and save models

In [6]:
# Process and train models for all CSV files
data_folder = '/content/drive/My Drive/Power_flow_data'
models_save_folder = '/content/drive/My Drive/kmeans_models'

# os.listdir returns entries in arbitrary order; sorting makes the run
# deterministic and keeps the progress log in file-name order.
for csv_file in sorted(os.listdir(data_folder)):
    if not csv_file.endswith(".csv"):
        continue

    # Strip only the trailing extension: str.replace would also remove any
    # other ".csv" occurring inside the file name.
    model_name = f"model_{os.path.splitext(csv_file)[0]}"

    # Create full path for CSV and model save
    csv_path = os.path.join(data_folder, csv_file)
    model_save_path = os.path.join(models_save_folder, model_name)

    # Process and train model
    process_and_train_model(csv_path, model_save_path)
    print("Model for " + str(csv_file) + " trained and saved!")

Model for 2010-07-02.csv trained and saved!
Model for 2010-07-01.csv trained and saved!
Model for 2010-07-03.csv trained and saved!
Model for 2010-07-04.csv trained and saved!
Model for 2010-07-05.csv trained and saved!
Model for 2010-07-06.csv trained and saved!
Model for 2010-07-07.csv trained and saved!
Model for 2010-07-08.csv trained and saved!
Model for 2010-07-09.csv trained and saved!
Model for 2010-07-10.csv trained and saved!
Model for 2010-07-11.csv trained and saved!
Model for 2010-07-12.csv trained and saved!
Model for 2010-07-13.csv trained and saved!
Model for 2010-07-14.csv trained and saved!
Model for 2010-07-15.csv trained and saved!
Model for 2010-07-16.csv trained and saved!
Model for 2010-07-17.csv trained and saved!
Model for 2010-07-18.csv trained and saved!
Model for 2010-07-19.csv trained and saved!
Model for 2010-07-20.csv trained and saved!
Model for 2010-07-21.csv trained and saved!
Model for 2010-07-22.csv trained and saved!
Model for 2010-07-23.csv trained