# Cleaning XL-Sum

In [1]:
from pathlib import Path

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import rand
from pyspark.sql.types import *


In [2]:
# Create (or reuse) the Spark session for this notebook.
# SparkSession is already imported in the import cell, so it is not re-imported here.
# The `spark.hadoop.` prefix forwards `io.native.lib.available=false` to the
# Hadoop configuration.
# NOTE(review): presumably set to work around missing native Hadoop libraries
# on the authoring machine — confirm it is still needed.
spark = (
    SparkSession.builder
    .config("spark.hadoop.io.native.lib.available", "false")
    .getOrCreate()
)

# Print the session object to confirm it was created.
print(spark)

<pyspark.sql.session.SparkSession object at 0x000002251D646120>


In [3]:
# Explicit schema: only the "text" and "summary" string fields are read from
# each JSON line.
# NOTE(review): nullable=False may not be enforced by file-based readers —
# confirm before relying on it to reject missing values.
schema = StructType(
    [StructField(field_name, StringType(), False) for field_name in ("text", "summary")]
)

# Load the raw XL-Sum English training split (JSON Lines).
df_xlsum = spark.read.json("../datasets/raw/english_train.jsonl", schema=schema)

In [4]:
# Add character-length columns for both fields, then keep only the rows where
# the summary and the text are each between 20 and 2000 characters (inclusive).
df_xlsum = (
    df_xlsum
    .withColumn("summary_len", F.length("summary"))
    .withColumn("text_len", F.length("text"))
    .filter(
        F.col("summary_len").between(20, 2000)
        & F.col("text_len").between(20, 2000)
    )
)

In [5]:
# Draw a fixed-size pseudo-random sample from the filtered rows.
SAMPLE_SIZE = 100
SAMPLE_SEED = 42

# 1. Add a temporary column with a seeded random number.
# NOTE: the seed makes the keys repeatable only while the input is read with
# the same partitioning. Spark documents rand() as non-deterministic in the
# general case, so a different cluster or file-split configuration can select
# different rows.
df_shuffled = df_xlsum.withColumn("rand_sort_key", rand(seed=SAMPLE_SEED))

# 2. Sort the DataFrame by the random key
df_sorted = df_shuffled.orderBy("rand_sort_key")

# 3. Take at most SAMPLE_SIZE rows and drop the helper columns.
# The result is cached so the actions below and in later cells can reuse the
# same materialized rows instead of re-reading and re-sorting the whole file
# each time.
df_100_reproducible = (
    df_sorted
    .limit(SAMPLE_SIZE)
    .drop("rand_sort_key", "summary_len", "text_len")
    .cache()
)

# 4. Verification: limit() only caps the row count, so fewer rows come back
# when fewer than SAMPLE_SIZE survive the length filter. Check it explicitly.
sample_count = df_100_reproducible.count()
print(f"Final count: {sample_count}")
assert sample_count == SAMPLE_SIZE, (
    f"expected {SAMPLE_SIZE} sampled rows, got {sample_count}"
)

Final count: 100


In [6]:
# Preview the sample: first 20 rows, long strings truncated (the show() defaults).
df_100_reproducible.show(n=20, truncate=True)

+--------------------+--------------------+
|                text|             summary|
+--------------------+--------------------+
|Defence Estates, ...|About 2,500 Minis...|
|By Danny ShawHome...|A suspected tripl...|
|Loves flying and ...|21-year-old bird ...|
|Everton Nurseries...|A garden centre h...|
|Visitors have com...|Bosses at a Leice...|
|The far-right lea...|Cuba is pulling t...|
|Residents of Mose...|People living in ...|
|Sergio Cabral ser...|The former govern...|
|By John CampbellB...|Northern Ireland'...|
|Wiltshire Police ...|An employee has d...|
|Under proposals t...|The Black Country...|
|A community inter...|Tens of thousands...|
|May, Clarkson's c...|Presenter James M...|
|The 13 ft-high (4...|A memorial to the...|
|Members of the Ra...|Thousands of Tube...|
|Middlesbrough Cou...|Department store ...|
|The Northern Irel...|The Northern Irel...|
|Carr, who plays f...|Silent Witness ac...|
|He said additiona...|The hospital bill...|
|Built in 1797, th...|The latest

In [7]:
# Convert the sampled Spark DataFrame into a pandas DataFrame for export.
pdf = df_100_reproducible.toPandas()

In [8]:
# Write the sample as CSV without the pandas index column.
# The output folder is created first because DataFrame.to_csv does not create
# missing parent directories and would raise OSError on a fresh checkout.
output_path = Path("../datasets/cleaned/cleaned_xlsum.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
pdf.to_csv(output_path, index=False)

end