In [1]:
from pyspark.sql import SparkSession

# Build the Spark session used by the rest of the notebook
# (getOrCreate reuses an already-running session if there is one).
spark = (
    SparkSession.builder
    .appName("Recap-retro")
    # 4 GB of memory per executor
    .config("spark.executor.memory", "4g")
    # Dynamic executor allocation, capped at 30 executors
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.dynamicAllocation.maxExecutors", "30")
    # Kryo instead of the default serializer
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .getOrCreate()
)


In [2]:
# Load the data

# Glob pattern: four wildcard directory levels under the "parseable" prefix
s3_path = "s3a://inesh/parseable/*/*/*/*/"

# Read every Parquet file matched by the glob into a single DataFrame
parquet_reader = spark.read
df = parquet_reader.parquet(s3_path)

# Number of rows (shown as the cell output)
df.count()


                                                                                

3621

In [3]:
# Keep only the rows whose "namespace" column equals the namespace of interest
namespace = "user-inesh"

in_namespace = df["namespace"] == namespace
df = df.filter(in_namespace)


In [4]:
from pyspark.sql.functions import col, when, sum as _sum

# Each entry pairs a row-level condition with the name of the counter it feeds.
# NOTE(review): the packageName / catalogId counters below are computed over
# every row of `df`, not only the rows where type == "service.install", while
# the printed labels say "installés" — confirm whether these columns are only
# populated on install events or whether an extra filter is needed.
conditions = [
    (col("type") == "service.install", "df_install"),
    (col("type") == "service.uninstall", "df_uninstall"),
    (col("packageName") == "vscode-python", "vscode_python"),
    (col("packageName") == "jupyter-python", "jupyter_python"),
    (col("packageName") == "rstudio", "rstudio"),
    (col("catalogId") == "automation", "automation"),
    (col("catalogId") == "dataviz", "dataviz"),
    (col("catalogId") == "databases", "databases"),
    (col("catalogId") == "divers", "divers"),
    (col("catalogId") == "ide", "ide"),
    (col("catalogId") == "inseefrlab-helm-charts-trainings", "trainings"),
    (col("catalogId") == "inseefrlab-helm-charts-datascience", "datascience"),
]

# One 0/1 indicator column per condition (1 when the row matches, else 0)
df_with_conditions = df.select(
    *[
        when(cond, 1).otherwise(0).alias(name)
        for cond, name in conditions
    ]
)

# Summing an indicator column gives the number of matching rows; the global
# aggregation returns exactly one Row, with one field per counter name.
result = df_with_conditions.agg(
    *[_sum(col(name)).alias(name) for _, name in conditions]
).collect()[0]

# sum() over zero rows yields NULL (None in Python), e.g. when the namespace
# filter matched nothing; report 0 in that case instead of printing "None".
# Values are looked up by field name so the extraction does not depend on the
# order of `conditions`.
counts = {name: (result[name] or 0) for _, name in conditions}

# Expose each counter under its own variable name
df_install = counts["df_install"]
df_uninstall = counts["df_uninstall"]
vscode_python = counts["vscode_python"]
jupyter_python = counts["jupyter_python"]
rstudio = counts["rstudio"]
automation = counts["automation"]
dataviz = counts["dataviz"]
databases = counts["databases"]
divers = counts["divers"]
ide = counts["ide"]
trainings = counts["trainings"]
datascience = counts["datascience"]

# Print the results
print(f"nb services installés : {df_install}")
print(f"nb services désinstallés : {df_uninstall}")
print(f"nb de services du catalogue ide installés : {ide} dont {vscode_python} vscode-python, {jupyter_python} jupyter-python et {rstudio} rstudio.")
print(f"nb de services du catalogue automation installés : {automation}")
print(f"nb de services du catalogue dataviz installés : {dataviz}")
print(f"nb de services du catalogue databases installés : {databases}")
print(f"nb de services du catalogue divers installés : {divers}")
print(f"nb de services du catalogue de training installés : {trainings}")
# Label fixed: this counter is the datascience catalogue, not "ide datascience"
print(f"nb de services du catalogue datascience installés : {datascience}")




nb services installés : 41
nb services désinstallés : 38
nb de services du catalogue ide installés : 33 dont 23 vscode-python, 7 jupyter-python et 1 rstudio.
nb de services du catalogue automation installés : 0
nb de services du catalogue dataviz installés : 1
nb de services du catalogue databases installés : 7
nb de services du catalogue divers installés : 0
nb de services du catalogue de training installés : 0
nb de services du catalogue ide datascience : 2


                                                                                