# In [None]:
#1.Set Up Environment
#Start by installing necessary libraries, including PySpark and any visualization libraries like Matplotlib or Seaborn

#2.Load Data
#python

from pyspark.sql import SparkSession

# Obtain the Spark session for this analysis (getOrCreate reuses an active
# session if one already exists instead of starting a second one).
spark = (
    SparkSession.builder
    .appName("Credit Card Transactions Analysis")
    .getOrCreate()
)

# Read the raw transactions; the placeholder path must point at the real
# dataset (swap .json for .csv if the file is CSV).
df = spark.read.json("path/to/your/dataset.json")

# 3. Data Exploration
# Print a sample of rows, then the column names/types Spark inferred.
df.show()
df.printSchema()

#4.Data Cleaning and Transformation
#Handling PII Data:Identify and discuss which columns contain PII (e.g., names, credit card numbers, etc.)
#Data Quality Assurance:

# Check for null values
# The column helpers used below were never imported in this script, so bring
# them into scope here (the file already imports per-section).
from pyspark.sql.functions import col, count, isnan, when

# NaN is only a possible value for float/double columns, and isnan() fails to
# resolve on types that cannot be cast to double (e.g. timestamp, boolean,
# struct), so it is applied only where it is meaningful.
float_columns = {name for name, dtype in df.dtypes if dtype in ("float", "double")}

# One aggregate per column: count the rows whose value is null (or NaN for
# floating-point columns). The result is a single row of per-column counts.
null_counts = [
    count(
        when(
            (isnan(col(c)) | col(c).isNull()) if c in float_columns else col(c).isNull(),
            c,
        )
    ).alias(c)
    for c in df.columns
]
df.select(null_counts).show()

# Example of dropping rows with null values
# (na.drop() with no arguments removes a row if ANY column is missing).
df = df.na.drop()

#JSON Flattening:Flatten any JSON data if applicable, using functions like explode() if you encounter nested structures

# Flatten the JSON structure (if needed)
# df = df.withColumn("new_column", explode(df.json_column))

#Timestamp Conversion:

from pyspark.sql.functions import from_utc_timestamp, to_timestamp

# Parse the raw value into a timestamp, then interpret it as UTC and render it
# in the target zone, overwriting the original column in place.
# NOTE(review): "UTC+8" is not one of the zone-ID formats listed in the Spark
# docs (region IDs such as "Asia/Singapore" or offsets such as "+08:00") —
# confirm it resolves as intended on the Spark version in use.
df = df.withColumn(
    "trans_date_trans_time",
    from_utc_timestamp(
        to_timestamp("trans_date_trans_time"),
        "UTC+8",
    ),
)
# Apply the same conversion to any other timestamp columns as needed.


#Name Derivation:

#Derive first and last names from the person_name column, handling dirty values (e.g. stray whitespace around the name parts)

from pyspark.sql.functions import split, trim

# Split person_name on "," and keep the first two pieces, trimmed of
# surrounding whitespace. A name without a comma yields a null "last".
# NOTE(review): assumes the value is ordered "first, last" — confirm against
# the actual data, since the order is not visible from this script.
df = (
    df
    .withColumn("first", trim(split("person_name", ",")[0]))
    .withColumn("last", trim(split("person_name", ",")[1]))
)


#5.Visualizations and Analysis

# `import` must be lowercase; the capitalised form is a SyntaxError that
# prevents the whole script from being parsed.
import matplotlib.pyplot as plt
import pandas as pd

# Convert to Pandas for visualization if necessary.
# toPandas() collects every row onto the driver, so this is only suitable
# when the cleaned dataset fits in driver memory.
pdf = df.toPandas()

# Example visualization: Total transactions by category
# value_counts() returns one count per distinct category, largest first.
transaction_counts = pdf['category'].value_counts()
transaction_counts.plot(kind='bar')
plt.title('Transaction Counts by Category')
plt.xlabel('Category')
plt.ylabel('Counts')
plt.show()
### Data Cleaning Process
#The dataset contained various null values and inconsistencies, particularly within the `person_name` field. We separated this into first and last names using a split approach, ensuring values are trimmed to remove excess whitespace.

