In [None]:
%run ~/skatt-naering/src/settings.py
%run ~/skatt-naering/production/naeringsspesifikasjon/config_naeringsspesifikasjon.py

In [None]:
import time

In [None]:
import gcsfs
import pandas as pd
import pyarrow.parquet as pq
import pyspark.pandas as ps
from dapla.auth import AuthClient
from pyspark.sql import SparkSession

In [None]:
from nst import functions

In [None]:
# Project helper from `nst.functions`, called before any SparkSession is built.
# NOTE(review): presumably makes PySpark use the notebook's virtualenv -- this is
# inferred from the helper's name only; confirm against `nst.functions`.
functions.use_virtualenv_in_pyspark()

# With GCS connector configurations

In [None]:
# Benchmark: average time of `spark.read.parquet` for the 'bredt' and 'langt'
# test files, using a session built with explicit GCS connector settings.
#
# NOTE: `spark.read.parquet` is lazy. It resolves the file listing and schema
# but does not scan row data, so the numbers printed below do not include a
# full read of the datasets.
# NOTE(review): the GCS connector documentation pairs
# `fs.AbstractFileSystem.gs.impl` with `...fs.gcs.GoogleHadoopFS` and
# `fs.gs.impl` with `...fs.gcs.GoogleHadoopFileSystem`; confirm that the class
# configured below is the intended one.
spark = (
    SparkSession.builder.appName("ParquetPerformanceTest")
    .config(
        "spark.hadoop.fs.AbstractFileSystem.gs.impl",
        "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem",
    )
    .config("spark.hadoop.google.cloud.auth.service.account.enable", "true")
    .getOrCreate()
)

try:
    # Dataset locations. TEMP_PATH is not defined in this cell; it must already
    # exist in the kernel namespace (presumably from the `%run` config scripts).
    bredt_path = f"{TEMP_PATH}/resultregnskap_balanseregnskap_testfiler/resultregnskap_balanseregnskap_testfil_bredt"
    langt_path = f"{TEMP_PATH}/resultregnskap_balanseregnskap_testfiler/resultregnskap_balanseregnskap_testfil_langt"
    # The 'hovedtema' partitioned dataset is currently excluded from this run:
    # hovedtema_path = f"{TEMP_PATH}/resultregnskap_balanseregnskap_testfiler/resultregnskap_balanseregnskap_partitioned_data/hovedtema"

    # Accumulated elapsed seconds per dataset across all iterations.
    total_time_bredt = 0
    total_time_langt = 0

    iterations = 10

    for _ in range(iterations):
        # 'bredt' dataset. perf_counter() is monotonic, so the measured interval
        # cannot be distorted by system clock adjustments.
        start_time = time.perf_counter()
        bredt_df = spark.read.parquet(bredt_path)
        total_time_bredt += time.perf_counter() - start_time

        # 'langt' dataset
        start_time = time.perf_counter()
        langt_df = spark.read.parquet(langt_path)
        total_time_langt += time.perf_counter() - start_time

    # Compute the average time for each dataset
    avg_time_bredt = total_time_bredt / iterations
    avg_time_langt = total_time_langt / iterations

    print(f"Average time taken to read 'bredt' dataset: {avg_time_bredt:.2f} seconds")
    print(f"Average time taken to read 'langt' dataset: {avg_time_langt:.2f} seconds")
finally:
    # Always release the session, even when a read fails part-way through.
    spark.stop()

# Without GCS connector configurations

In [None]:
# Benchmark: average time of `spark.read.parquet` for the 'bredt' and 'langt'
# test files, using a session built without any explicit `.config(...)` calls.
#
# NOTE: `spark.read.parquet` is lazy. It resolves the file listing and schema
# but does not scan row data, so the numbers printed below do not include a
# full read of the datasets.
# NOTE(review): depending on the PySpark version, `SparkSession.builder` may be
# a shared object that retains options set by earlier cells in the same kernel.
# Confirm (e.g. via `spark.sparkContext.getConf().getAll()`) that this session
# really runs without them, or restart the kernel before running this cell.
spark = SparkSession.builder.getOrCreate()

try:
    # Dataset locations. TEMP_PATH is not defined in this cell; it must already
    # exist in the kernel namespace (presumably from the `%run` config scripts).
    bredt_path = f"{TEMP_PATH}/resultregnskap_balanseregnskap_testfiler/resultregnskap_balanseregnskap_testfil_bredt"
    langt_path = f"{TEMP_PATH}/resultregnskap_balanseregnskap_testfiler/resultregnskap_balanseregnskap_testfil_langt"
    # The 'hovedtema' partitioned dataset is currently excluded from this run:
    # hovedtema_path = f"{TEMP_PATH}/resultregnskap_balanseregnskap_testfiler/resultregnskap_balanseregnskap_partitioned_data/hovedtema"

    # Accumulated elapsed seconds per dataset across all iterations.
    total_time_bredt = 0
    total_time_langt = 0

    iterations = 10

    for _ in range(iterations):
        # 'bredt' dataset. perf_counter() is monotonic, so the measured interval
        # cannot be distorted by system clock adjustments.
        start_time = time.perf_counter()
        bredt_df = spark.read.parquet(bredt_path)
        total_time_bredt += time.perf_counter() - start_time

        # 'langt' dataset
        start_time = time.perf_counter()
        langt_df = spark.read.parquet(langt_path)
        total_time_langt += time.perf_counter() - start_time

    # Compute the average time for each dataset
    avg_time_bredt = total_time_bredt / iterations
    avg_time_langt = total_time_langt / iterations

    print(
        f"Average time taken to read 'bredt' dataset using PySpark: {avg_time_bredt:.2f} seconds"
    )
    print(
        f"Average time taken to read 'langt' dataset using PySpark: {avg_time_langt:.2f} seconds"
    )
finally:
    # Always release the session, even when a read fails part-way through.
    spark.stop()

# Read in and convert to a pandas DataFrame

In [None]:
# Benchmark: average time to read the 'bredt' test file with Spark and
# materialise it on the driver as a pandas DataFrame via `toPandas()`.
# Unlike a bare `spark.read.parquet`, `toPandas()` is an action, so this timing
# does include scanning the data.
functions.use_virtualenv_in_pyspark()
spark = SparkSession.builder.getOrCreate()

try:
    # Dataset locations. TEMP_PATH is not defined in this cell; it must already
    # exist in the kernel namespace (presumably from the `%run` config scripts).
    bredt_path = f"{TEMP_PATH}/resultregnskap_balanseregnskap_testfiler/resultregnskap_balanseregnskap_testfil_bredt"
    langt_path = f"{TEMP_PATH}/resultregnskap_balanseregnskap_testfiler/resultregnskap_balanseregnskap_testfil_langt"
    # The 'hovedtema' partitioned dataset is currently excluded from this run:
    # hovedtema_path = f"{TEMP_PATH}/resultregnskap_balanseregnskap_testfiler/resultregnskap_balanseregnskap_partitioned_data/hovedtema"

    # Accumulated elapsed seconds per dataset across all iterations.
    # total_time_langt stays at 0: the 'langt' case is not executed (see below).
    total_time_bredt = 0
    total_time_langt = 0

    iterations = 3

    for _ in range(iterations):
        # 'bredt' dataset: read and convert in one expression so `bredt_df` is
        # only ever bound to the pandas result. perf_counter() is monotonic, so
        # the interval cannot be distorted by system clock adjustments.
        start_time = time.perf_counter()
        bredt_df = spark.read.parquet(bredt_path).toPandas()
        total_time_bredt += time.perf_counter() - start_time

        # The 'langt' dataset is deliberately skipped here: converting it with
        # `spark.read.parquet(langt_path).toPandas()` ran out of memory, which
        # is what the hard-coded message printed below records.

    # Compute the average time for each dataset
    avg_time_bredt = total_time_bredt / iterations

    print(
        f"Average time taken to read 'bredt' dataset using PySpark: {avg_time_bredt:.2f} seconds"
    )
    # Plain string: there is nothing to interpolate in this message.
    print(
        "Average time taken to read 'langt' dataset using PySpark: Failure. Not enough memory"
    )
finally:
    # Always release the session, even when the conversion fails (e.g. OOM).
    spark.stop()

import time

import pyspark.sql.functions as F

In [None]:
from pyspark.sql import SparkSession


# Benchmark: average time to filter one identifier and sum its "p3*" values in
# three test datasets: the wide file ('bredt'), the long file ('langt') and a
# sub-directory of the partitioned long-format data ('partitioned').
#
# Every branch ends with `.collect()` so that each timing covers the actual
# Spark job. Without an action, only the lazy query plan would be built and
# the comparison between the three layouts would be meaningless.
spark = SparkSession.builder.appName("PerformanceTesting").getOrCreate()

try:
    # Dataset locations. TEMP_PATH is not defined in this cell; it must already
    # exist in the kernel namespace (presumably from the `%run` config scripts).
    bredt_path = f"{TEMP_PATH}/resultregnskap_balanseregnskap_testfiler/resultregnskap_balanseregnskap_testfil_bredt"
    langt_path = f"{TEMP_PATH}/resultregnskap_balanseregnskap_testfiler/resultregnskap_balanseregnskap_testfil_langt"
    partitioned_path = f"{TEMP_PATH}/resultregnskap_balanseregnskap_testfiler/partitioned_langt_data/hovedtema=resultatregnskap/undertema=driftsinntekt"

    # Filter values shared by all three branches.
    # NOTE(review): hard-coded identifier -- confirm it is synthetic test data
    # before sharing or committing this notebook.
    target_id = "00002047889"
    field_prefix = "p3"

    # Accumulated elapsed seconds per dataset across all iterations.
    total_time_bredt = 0
    total_time_langt = 0
    total_time_partitioned = 0

    iterations = 5

    for _ in range(iterations):
        # 'bredt' dataset: one column per field, so sum every "p3*" column.
        start_time = time.perf_counter()

        bredt_df = spark.read.parquet(bredt_path)
        filtered_bredt_df = bredt_df.filter(F.col("norskIdentifikator") == target_id)
        aggregation_columns = [
            F.sum(F.col(c)).alias(c)
            for c in filtered_bredt_df.columns
            if c.startswith(field_prefix)
        ]
        # Despite the name, this holds per-column sums (F.sum), not averages.
        avg_p3_columns = filtered_bredt_df.agg(*aggregation_columns)
        # Run the job, as the other two branches do with their .collect().
        bredt_p3_totals = avg_p3_columns.collect()[0]

        total_time_bredt += time.perf_counter() - start_time

        # 'langt' dataset: one row per field, so filter on felt_id and sum
        # felt_verdi after casting it to double.
        start_time = time.perf_counter()

        langt_df = spark.read.parquet(langt_path)
        langt_filtered_df = langt_df.filter(
            (F.col("norskIdentifikator") == target_id)
            & F.col("felt_id").startswith(field_prefix)
        )
        langt_filtered_df = langt_filtered_df.withColumn(
            "felt_verdi", F.col("felt_verdi").cast("double")
        )
        # Despite the name, this holds a sum (F.sum), not an average.
        avg_p3_value = langt_filtered_df.agg(
            F.sum("felt_verdi").alias("total")
        ).collect()[0]["total"]

        total_time_langt += time.perf_counter() - start_time

        # 'partitioned' dataset: same query as 'langt', read from the
        # partition sub-directory.
        start_time = time.perf_counter()

        partitioned_df = spark.read.parquet(partitioned_path)
        partitioned_filtered_df = partitioned_df.filter(
            (F.col("norskIdentifikator") == target_id)
            & F.col("felt_id").startswith(field_prefix)
        )
        partitioned_filtered_df = partitioned_filtered_df.withColumn(
            "felt_verdi", F.col("felt_verdi").cast("double")
        )
        avg_p3_partitioned_value = partitioned_filtered_df.agg(
            F.sum("felt_verdi").alias("total")
        ).collect()[0]["total"]

        total_time_partitioned += time.perf_counter() - start_time

    # Compute the average time for each dataset
    avg_time_bredt = total_time_bredt / iterations
    avg_time_langt = total_time_langt / iterations
    avg_time_partitioned = total_time_partitioned / iterations

    print(
        f"Average time taken using PySpark for the 'bredt' dataset: {avg_time_bredt:.2f} seconds"
    )
    print(
        f"Average time taken using PySpark for the 'langt' dataset: {avg_time_langt:.2f} seconds"
    )
    print(
        f"Average time taken using PySpark for the 'partitioned' dataset: {avg_time_partitioned:.2f} seconds"
    )
finally:
    # Always release the session, even when one of the queries fails.
    spark.stop()