In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import Bucketizer

# Reuse the active Spark session when there is one; otherwise create it.
spark = (
    SparkSession.builder
    .appName("DrugCrimeAnalysis")
    .getOrCreate()
)

# Load the enriched lawyer-presence CSV export. The first line is treated as a
# header, and multiLine lets a single record span several physical lines.
df_infer = spark.read.csv(
    "/mnt/processed_data_criminal_case_analysis/drug_related_data_enrich_cleaning_Apr_15/2_lawyer_presence_data_enrich",
    header=True,
    multiLine=True,
)

# Inspect the column names/types, then preview the first rows.
df_infer.printSchema()
df_infer.show()




In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import Bucketizer

# Start Spark session (returns the already-running session if there is one)
spark = SparkSession.builder.appName("DrugCrimeAnalysis").getOrCreate()

from pyspark.sql.types import StructType, StructField, IntegerType, StringType, BooleanType

# Columns this analysis needs and the type each one is cast to.
schema = StructType([
    StructField("amount_a", IntegerType(), True),
    StructField("amount_b", IntegerType(), True),
    StructField("lawyer", BooleanType(), True),
    StructField("public_defender", BooleanType(), True),
    StructField("TotalImprisonmentLengthforCriminalA", IntegerType(), True)
])

# Load data.
# Passing `schema=` straight to the CSV reader applies it BY POSITION (the
# header is ignored while enforceSchema is at its default), so the five fields
# would be filled from the file's first five columns whatever they are called.
# Read with the header instead and pick the needed columns by name; a missing
# column then fails loudly instead of silently loading the wrong data.
# multiLine matches the option used when the same file is read elsewhere in
# this notebook, so records containing embedded newlines are not split.
raw_df = spark.read.csv(
    "/mnt/processed_data_criminal_case_analysis/drug_related_data_enrich_cleaning_Apr_15/2_lawyer_presence_data_enrich",
    header=True,
    multiLine=True,
)
df = raw_df.select(
    [col(field.name).cast(field.dataType).alias(field.name) for field in schema.fields]
)

# Preview a few typed rows (a bare df.head() in the middle of a cell is discarded).
df.show(5)

#DEBUG:

# Keep only the rows where TotalImprisonmentLengthforCriminalA is not NULL
df_filtered = df.filter(col("TotalImprisonmentLengthforCriminalA").isNotNull())

# Count the number of rows in the filtered DataFrame
row_count_filtered = df_filtered.count()

print(f"Total number of rows with non-NULL TotalImprisonmentLengthforCriminalA: {row_count_filtered}")


# Show the first few rows of the column
df.select("TotalImprisonmentLengthforCriminalA").show()

# Get descriptive statistics for the column
df.describe("TotalImprisonmentLengthforCriminalA").show()


# Check the output of the aggregation (None when every value is NULL)
max_length = df.agg({"TotalImprisonmentLengthforCriminalA": "max"}).collect()[0][0]
print("Maximum Imprisonment Length:", max_length)

# Stop Spark session
# NOTE(review): this invalidates every DataFrame created so far in the notebook,
# and on a shared/managed cluster it stops the session for other users of it --
# confirm this is intended before keeping it.
spark.stop()


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import Bucketizer

# Start Spark session (returns the already-running session if there is one)
spark = SparkSession.builder.appName("DrugCrimeAnalysis").getOrCreate()

from pyspark.sql.types import StructType, StructField, IntegerType, StringType, BooleanType
import matplotlib.pyplot as plt

# Columns this analysis needs and the type each one is cast to.
schema = StructType([
    StructField("amount_a", IntegerType(), True),
    StructField("amount_b", IntegerType(), True),
    StructField("lawyer", BooleanType(), True),
    StructField("public_defender", BooleanType(), True),
    StructField("TotalImprisonmentLengthforCriminalA", IntegerType(), True)
])

# Load data.
# Passing `schema=` straight to the CSV reader applies it BY POSITION (the
# header is ignored while enforceSchema is at its default), so the five fields
# would be filled from the file's first five columns whatever they are called.
# Read with the header instead and pick the needed columns by name; a missing
# column then fails loudly instead of silently loading the wrong data.
# multiLine matches the option used when the same file is read elsewhere in
# this notebook, so records containing embedded newlines are not split.
raw_df = spark.read.csv(
    "/mnt/processed_data_criminal_case_analysis/drug_related_data_enrich_cleaning_Apr_15/2_lawyer_presence_data_enrich",
    header=True,
    multiLine=True,
)
df = raw_df.select(
    [col(field.name).cast(field.dataType).alias(field.name) for field in schema.fields]
)

# Preview a few typed rows (a bare df.head() in the middle of a cell is discarded).
df.show(5)

#DEBUG:

# Keep only the rows where TotalImprisonmentLengthforCriminalA is not NULL
df_filtered = df.filter(col("TotalImprisonmentLengthforCriminalA").isNotNull())

# Count the number of rows in the filtered DataFrame
row_count_filtered = df_filtered.count()

print(f"Total number of rows with non-NULL TotalImprisonmentLengthforCriminalA: {row_count_filtered}")


# Show the first few rows of the column
df.select("TotalImprisonmentLengthforCriminalA").show()

# Get descriptive statistics for the column
df.describe("TotalImprisonmentLengthforCriminalA").show()


# Check the output of the aggregation (None when every value is NULL)
max_length = df.agg({"TotalImprisonmentLengthforCriminalA": "max"}).collect()[0][0]
print("Maximum Imprisonment Length:", max_length)

if max_length is None:
    raise ValueError(
        "TotalImprisonmentLengthforCriminalA has no non-NULL values; nothing to bucketize"
    )

# Define the bucketizer: buckets of width 3 starting at 0. The last split is
# an inclusive upper bound, so the maximum value always falls inside a bucket.
# The max computed above is reused instead of running the aggregation again.
bucket_width = 3
splits = list(range(0, int(max_length) + bucket_width, bucket_width))
# Bucketizer needs at least three split points (two buckets); pad for a tiny max.
while len(splits) < 3:
    splits.append(splits[-1] + bucket_width)
bucketizer = Bucketizer(splits=splits, inputCol="TotalImprisonmentLengthforCriminalA", outputCol="ImprisonmentLengthBucket")

# Transform the data. Rows with a NULL length cannot be assigned a bucket, so
# bucketize the filtered frame rather than the raw one.
# NOTE(review): negative lengths are below the first split and will make the
# Bucketizer raise -- presumably they do not occur; verify against the data.
df_binned = bucketizer.transform(df_filtered)

# Group by lawyer status and imprisonment length bucket, then count
result_df = df_binned.groupBy("lawyer", "ImprisonmentLengthBucket").count()

# Convert the result to Pandas DataFrame for plotting (small: one row per lawyer/bucket pair)
result_pandas = result_df.toPandas()

# Build a bucket x lawyer table so every bar sits at its real bucket position.
# Plotting each lawyer group separately draws bars at positions 0..k-1 in
# whatever row order Spark returned (and on top of each other), so the bucket
# tick labels would not match the bars.
bucket_labels = [f"{splits[i]}-{splits[i+1]}" for i in range(len(splits)-1)]
plot_data = result_pandas.assign(
    bucket=result_pandas["ImprisonmentLengthBucket"].astype(int),
    # Rows with a NULL lawyer flag get their own series instead of being dropped.
    lawyer_label=result_pandas["lawyer"].map({True: "True", False: "False"}).fillna("Unknown"),
)
counts_by_bucket = (
    plot_data
    .pivot_table(index="bucket", columns="lawyer_label", values="count", aggfunc="sum", fill_value=0)
    # Include empty buckets too, in ascending order.
    .reindex(range(len(bucket_labels)), fill_value=0)
)
counts_by_bucket.index = bucket_labels

# Plot using matplotlib: one grouped bar chart, one bar per lawyer status in each bucket
fig, ax = plt.subplots()
counts_by_bucket.plot(kind='bar', ax=ax)
ax.set_title('Histogram of Imprisonment Length by Lawyer Presence')
ax.set_xlabel('Imprisonment Length (months)')
ax.set_ylabel('Number of Cases')
ax.legend(title='Lawyer Present')
plt.show()

# Stop Spark session
# NOTE(review): on a shared/managed cluster this stops the session for other
# users of it -- confirm this is intended before keeping it.
spark.stop()
