# In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import Bucketizer

# Create (or reuse) the Spark session for this analysis.
session_builder = SparkSession.builder.appName("DrugCrimeAnalysis")
spark = session_builder.getOrCreate()

# Read the CSV, treating the first row as column names and letting Spark
# infer each column's type from the data.
csv_reader = spark.read.option("header", True).option("inferSchema", True)
df = csv_reader.csv("/path/to/your/csvfile.csv")

# Define the bucketizer: 3-month-wide bins starting at 0 and reaching at
# least as far as the longest sentence in the data.
import math

bucket_width = 3
max_length = df.agg({"TotalImprisonmentLengthforCriminalA": "max"}).collect()[0][0]
if max_length is None:
    # max() is NULL when the column has no non-null rows; fail with a clear
    # message instead of an opaque TypeError from int(None).
    raise ValueError(
        "Column 'TotalImprisonmentLengthforCriminalA' has no non-null values to bin"
    )

# Round the maximum UP: truncating (e.g. 6.5 -> 6) would leave the largest
# value above the last split, and Bucketizer raises on out-of-range values.
# Bucketizer also needs at least three split points, so never stop before
# 2 * bucket_width even when every sentence is very short.
splits_stop = max(math.ceil(max_length) + bucket_width, 2 * bucket_width + 1)
splits = list(range(0, splits_stop, bucket_width))
bucketizer = Bucketizer(
    splits=splits,
    inputCol="TotalImprisonmentLengthforCriminalA",
    outputCol="ImprisonmentLengthBucket",
)

# Tag every row with the index of the imprisonment-length bucket it falls in.
df_binned = bucketizer.transform(df)

# Count the cases in each (lawyer, bucket) combination.
group_keys = ["lawyer", "ImprisonmentLengthBucket"]
result_df = df_binned.groupBy(*group_keys).count()

# Bring the aggregated counts to the driver as a pandas DataFrame for
# plotting (only appropriate when the aggregated result is small).
result_pandas = result_df.toPandas()

# Plot using matplotlib
import matplotlib.pyplot as plt

# One label per bucket, in bucket-index order: bucket i spans
# splits[i]..splits[i + 1].
bucket_labels = [f"{low}-{high}" for low, high in zip(splits, splits[1:])]

fig, ax = plt.subplots()

# Rows without a lawyer flag or without a bucket cannot be placed on the
# chart, so they are left out.
plot_rows = result_pandas.dropna(subset=["lawyer", "ImprisonmentLengthBucket"])
if not plot_rows.empty:
    # Build a bucket x lawyer table of counts. Plotting each lawyer group
    # separately would draw every group's bars at positions 0..k-1 in
    # arbitrary row order, so the groups would hide each other and would not
    # line up with the bucket tick labels. Reindexing over every bucket keeps
    # empty buckets as zero-height bars, so bar position == bucket index.
    counts_by_bucket = (
        plot_rows
        .assign(
            ImprisonmentLengthBucket=plot_rows["ImprisonmentLengthBucket"].astype(int)
        )
        .pivot_table(
            index="ImprisonmentLengthBucket",
            columns="lawyer",
            values="count",
            aggfunc="sum",
            fill_value=0,
        )
        .reindex(range(len(bucket_labels)), fill_value=0)
    )
    # A single grouped bar chart: one cluster per bucket, one bar per
    # lawyer value inside each cluster.
    counts_by_bucket.plot(kind="bar", ax=ax)
    plt.xticks(ticks=range(len(bucket_labels)), labels=bucket_labels)
    plt.legend(title='Lawyer Present')

plt.title('Histogram of Imprisonment Length by Lawyer Presence')
plt.xlabel('Imprisonment Length (months)')
plt.ylabel('Number of Cases')
plt.show()

# Stop Spark session
# The plot above reads only the pandas copy of the counts (result_pandas),
# so the Spark session is no longer needed for it.
spark.stop()
