In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as _sum, when

# Reuse the active SparkSession if one exists; otherwise start one with default settings.
spark = SparkSession.builder.getOrCreate()

In [None]:
import pyspark.sql.functions as f

In [None]:
# Read both JSON Lines files into a single DataFrame (no schema is passed, so
# Spark infers it), then count the distinct steam_appid values as a baseline
# before any deduplication.
df = spark.read.json([
    "/home/jovyan/data/test.jsonl",
    "/home/jovyan/data/test2.jsonl"
])
df.select('steam_appid').distinct().count()

In [None]:
# Drop rows that are exact duplicates across every column. Removing such rows
# cannot remove an id, so the distinct steam_appid count below is expected to
# equal the one taken right after loading.
df = df.distinct()
df.select('steam_appid').distinct().count()

In [None]:
# Preview the first five rows.
df.show(5)

In [None]:
# Print the schema (column names and types) of the loaded DataFrame.
df.printSchema()

In [None]:
# Show basic summary statistics for the DataFrame's columns.
df.describe().show()


# Remove duplicates

In [None]:
# List every steam_appid that appears on more than one row, with its row count.
# groupBy().count() names its output column "count", so the filter does not
# depend on the label Spark auto-generates for an un-aliased aggregate.
df.groupBy("steam_appid").count().filter(col("count") > 1).show()

In [None]:
def find_differences_by_id(df, id_col):
    """Report, per duplicated id, which columns hold conflicting values.

    Rows whose ``id_col`` value occurs more than once are grouped by that id
    and the distinct values of every other column are collected. A column is
    reported for an id when more than one distinct value was collected.

    Args:
        df: Spark DataFrame to inspect.
        id_col: Name of the column that identifies an entity.

    Returns:
        A DataFrame with the columns ``id_col``, ``column`` and
        ``has_difference``: one row per (id, column) pair whose values
        disagree. ``has_difference`` is true on every returned row.

    Note:
        ``collect_set`` skips nulls, so a null next to a single non-null value
        is not reported as a difference.
    """
    # Ids that occur on more than one row, and the rows that carry them.
    dup_ids = df.groupBy(id_col).count().filter(col("count") > 1).select(id_col)
    dup_df = df.join(dup_ids, on=id_col, how="inner")

    cols_to_check = [c for c in df.columns if c != id_col]

    # Backtick-quoted identifiers: without the quotes a name containing a
    # space, hyphen, leading digit or SQL keyword breaks the stack() expression,
    # and a dotted name would be read as a nested-field access.
    quoted = {c: f"`{c}`" for c in cols_to_check}

    # Distinct values of each column per id.
    diff_exprs = [
        f.collect_set(col(quoted[c])).alias(c) for c in cols_to_check
    ]
    grouped = dup_df.groupBy(id_col).agg(*diff_exprs)

    # A column differs for an id when its set holds more than one value.
    mismatch_exprs = [
        (f.size(col(quoted[c])) > 1).alias(c) for c in cols_to_check
    ]
    mismatches = grouped.select(id_col, *mismatch_exprs)

    # Unpivot the wide boolean row into (column, has_difference) pairs and keep
    # only the pairs that actually differ.
    stack_args = ", ".join(f"'{c}', {quoted[c]}" for c in cols_to_check)
    mismatch_long = mismatches.selectExpr(
        f"`{id_col}`",
        f"stack({len(cols_to_check)}, {stack_args}) as (column, has_difference)",
    ).filter("has_difference = true")

    return mismatch_long


In [None]:
# For each duplicated steam_appid, list the columns whose values differ between
# its rows (show() is capped at 100 result rows).
result = find_differences_by_id(df, "steam_appid")
result.show(100)

In [None]:
# Register the DataFrame as the temp view "df" so it can be queried by name
# through spark.sql.
df.createOrReplaceTempView("df")


In [None]:
# Spot check on app 34330: number its rows by recommendations, highest first,
# to see how its duplicate rows would be ranked.
spark.sql("""
SELECT recommendations, ROW_NUMBER() OVER (PARTITION BY steam_appid ORDER BY recommendations DESC) AS rank FROM df WHERE steam_appid = '34330'
""").show()

In [None]:
# Keep one row per steam_appid: the row numbered 1 when its rows are ordered by
# recommendations descending. The helper column "rank" is dropped afterwards.
# The query reads the temp view "df", not the Python variable, so the view has
# to be registered before this cell runs.
# NOTE(review): ROW_NUMBER orders rows tied on recommendations arbitrarily, so
# which tied row is kept is not pinned down by this query - confirm that is
# acceptable.
df = spark.sql("""
SELECT *
FROM
(SELECT *, ROW_NUMBER() OVER (PARTITION BY steam_appid ORDER BY recommendations DESC) AS rank FROM df)
WHERE rank = 1
""").drop("rank")

In [None]:
# Re-run the difference report on the deduplicated data; with a single row per
# steam_appid it is expected to come back empty.
result = find_differences_by_id(df, "steam_appid")
result.show(5)

In [None]:
# Row count after deduplication.
df.count()

# Clean columns

In [None]:
def get_null_counts(df):
    """Count the null values in each column and return them in long format.

    Args:
        df: Spark DataFrame to profile.

    Returns:
        A DataFrame with one row per column of ``df`` and the columns
        ``column_name`` and ``null_count``, sorted by ``null_count`` descending.
    """
    column_names = df.columns

    # One aggregate per column: add 1 for every null cell and 0 otherwise.
    per_column_nulls = []
    for name in column_names:
        null_flag = f.when(col(name).isNull(), 1).otherwise(0)
        per_column_nulls.append(_sum(null_flag).alias(name))
    wide_counts = df.select(per_column_nulls)

    # Unpivot the single wide row into (column_name, null_count) pairs.
    label_value_pairs = ", ".join(
        "'{0}', `{0}`".format(name) for name in column_names
    )
    stack_expr = "stack({0}, {1}) as (column_name, null_count)".format(
        len(column_names), label_value_pairs
    )
    long_counts = wide_counts.selectExpr(stack_expr)

    return long_counts.orderBy(f.desc("null_count"))


In [None]:
# Null count per column plus its share of all rows. Despite its name,
# "percent" holds a fraction (null_count divided by the row count).
null_summary = get_null_counts(df).withColumn(
    "percent", col("null_count") / df.count()
)
null_summary.show(n=100)

In [None]:
# Columns whose "percent" value is at least 0.8.
null_summary.filter('percent >=0.8').select('column_name').show()

In [None]:
# Print the query plan behind df in Spark's "formatted" layout.
df.explain(mode="formatted")