In [0]:
# Glob matching every CSV file in the processed drug-case export directory.
path = "/mnt/processed_data_criminal_case_analysis/drug_related_DrugTypeAmount_Penalty_Location_March_19/*.csv"

# Read all matching files into a single DataFrame. The first row of each file
# is treated as the header and column types are inferred from the data.
df = (
    spark.read.format("csv")
    .options(header="true", inferSchema="true")
    .load(path)
)

# Print the inferred schema so the column types can be checked before querying.
df.printSchema()

from pyspark.sql.functions import lit

def align_dataframe(df, common_columns):
    """Project ``df`` onto exactly ``common_columns``, in the given order.

    Columns named in ``common_columns`` that ``df`` already has are kept as-is;
    columns it lacks are added as null literal columns under the requested name.
    Columns of ``df`` that are not listed are dropped.

    Args:
        df: DataFrame to align.
        common_columns: Iterable of column names that the result must expose.
            It is materialised once, so a generator or tuple is accepted.

    Returns:
        A new DataFrame whose columns are ``common_columns``.
    """
    wanted = list(common_columns)
    present = set(df.columns)
    # Build the whole projection in one select instead of calling withColumn
    # once per missing column: each withColumn adds another projection to the
    # query plan, which gets expensive when many columns are missing.
    projection = [
        name if name in present else lit(None).alias(name)
        for name in wanted
    ]
    return df.select(projection)


In [0]:
# Register the DataFrame as a temp view so it can be queried with Spark SQL.
df.createOrReplaceTempView("cases_view")

# Find rows where TotalImprisonmentLengthforCriminalA holds a value that does
# not cast to INT (the cast result is NULL although the raw value is not).
#
# Every selected column must be separated by a comma: in Spark SQL `a b`
# without a comma means `a AS b`, which would rename one column and silently
# drop the other. TotalImprisonmentLengthforCriminalA is listed once, near the
# front, so the result has no duplicate column names.
#
# NOTE(review): this relies on CAST returning NULL for unparseable values; with
# spark.sql.ansi.enabled=true an invalid cast raises instead. Confirm the
# cluster setting.
non_integer_values_query = """
SELECT
    OriginalLink, CaseNumber, CaseName, Court, Location, CaseType, TrialProcedure,
    JudgmentDate, PublicationDate, TotalImprisonmentLengthforCriminalA,
    PartiesInvolved, CausesofAction, LegalBasis, FullText,
    drug_a, amount_a, drug_b, amount_b, ResponseText,
    Charge1forCriminalA, FineforCriminalA, SuspendedforCriminalA, Charge2forCriminalA,
    Charge1forCriminalB, Charge2forCriminalB, FineforCriminalB,
    TotalImprisonmentLengthforCriminalB, SuspendedforCriminalB,
    Province, City, District, CourtLevel, Adcode
FROM cases_view
WHERE CAST(TotalImprisonmentLengthforCriminalA AS INT) IS NULL
AND TotalImprisonmentLengthforCriminalA IS NOT NULL
"""

non_integer_values = spark.sql(non_integer_values_query)

# Show the results
non_integer_values.show()


In [0]:
from pyspark.sql.functions import col, isnan, when, count

# Same non-integer check as a DataFrame expression, here applied to
# TotalImprisonmentLengthforCriminalB. To check another sentence-length column,
# swap the column name in the three places it appears below.

# Add an integer-cast copy of the column next to the original value.
df_with_cast = df.withColumn(
    "TotalImprisonmentLengthInt",
    col("TotalImprisonmentLengthforCriminalB").cast("int"),
)

# Keep rows where the original value is present but the cast produced null,
# i.e. the value could not be converted to an integer.
non_integer_rows = df_with_cast.where(
    col("TotalImprisonmentLengthInt").isNull()
    & col("TotalImprisonmentLengthforCriminalB").isNotNull()
)

# Show up to 100 offending values without truncating long strings.
# To list every offending row with all columns instead, pass
# non_integer_rows.count() as the row limit to non_integer_rows.show().
non_integer_rows.select("TotalImprisonmentLengthforCriminalB").show(100, truncate=False)
