In [None]:
import os
# Find the latest version of spark 3.x  from https://www.mongodb.com/try/download/database-tools and enter as the spark version
# For example:
# spark_version = 'spark-3.5.1'
spark_version = 'spark-3.5.3'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

In [None]:
 # Import packages
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import StructType,StructField,StringType, DateType, IntegerType

# Build (or reuse) the notebook-wide SparkSession under the application name "SparkSQL"
session_builder = SparkSession.builder.appName("SparkSQL")
spark = session_builder.getOrCreate()

In [None]:
import kagglehub

# Pull down the most recent version of the dataset and report where it landed
path = kagglehub.dataset_download("kartik2112/fraud-detection")
print("Path to dataset files:", path)

# Enumerate everything inside the download directory
files = os.listdir(path)
print("Files in the dataset:", files)


In [None]:
# Where the training file lives and how it is encoded
file_location = path + "/fraudTrain.csv"
file_type = "csv"

# Reader settings: infer column types, treat the first row as a header, comma-separated
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# These options apply to CSV input; other formats ignore them.
df = (
    spark.read.format(file_type)
    .options(inferSchema=infer_schema, header=first_row_is_header, sep=delimiter)
    .load(file_location)
)

display(df)

In [None]:
# Register the DataFrame as a temporary SQL view so it can be queried by name
temp_table_name = "fraudTrain"
df.createOrReplaceTempView(name=temp_table_name)

In [None]:
# Preview the rows of the fraudTrain view without truncating wide columns
preview_df = spark.sql("""select *
            from fraudTrain""")
preview_df.show(truncate=False)

In [None]:
# Persist the DataFrame as Parquet under "fraud_train", partitioned by the is_fraud column;
# any existing output at that path is replaced.
parquet_writer = df.write.mode("overwrite").partitionBy("is_fraud")
parquet_writer.parquet("fraud_train")

In [None]:
# Load the Parquet copy back in and expose it to SQL as p_fraudTrain
p_df = spark.read.parquet("fraud_train")
p_df.createOrReplaceTempView(name="p_fraudTrain")

In [None]:
import time

# Time an aggregate over the Parquet-backed view: per is_fraud value, show the
# average, minimum, maximum and count of amt.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
spark.sql("""select is_fraud,
                    round(avg(amt),2),
                    round(min(amt),2),
                    round(max(amt),2),
                    round(count(amt),2)
            from p_fraudTrain
            group by is_fraud""").show(truncate=False)
print("--- %s seconds ---" % (time.perf_counter() - start_time))

In [None]:
 # Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import pandas as pd
import tensorflow as tf

In [None]:
# Select the model inputs and the is_fraud label from the Parquet-backed view
query = "select cc_num, amt, zip, lat, long, city_pop, unix_time, merch_lat, merch_long, is_fraud from p_fraudTrain"
spark_df = spark.sql(sqlQuery=query)

# Materialise the Spark result on the driver as a Pandas DataFrame
pandas_df = spark_df.toPandas()

# Show the first rows as the cell output
pandas_df.head()

In [None]:
# Extract the 'is_fraud' column as a list
y = p_df.select("is_fraud").rdd.flatMap(lambda x: x).collect()

# Drop the "is_fraud" column and convert the features to a Pandas DataFrame or NumPy array
X = pandas_df.drop("is_fraud", axis=1)

# Use train_test_split from Scikit-learn
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [None]:
# Standardise the features before they are fed to the neural network

# Fit the scaler on the training split only, then reuse it for both splits
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to the training and the test features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [None]:
import numpy as np

# Convert both label collections to NumPy arrays
y_train, y_test = np.array(y_train), np.array(y_test)

# Sanity check: both printed types should be <class 'numpy.ndarray'>
for checked in (X_train_scaled, y_train):
    print(type(checked))


In [None]:
 # Define the deep learning model
# Seed Python, NumPy and TensorFlow so weight initialisation and batch shuffling
# are repeatable from one run of the notebook to the next
tf.keras.utils.set_random_seed(42)

# One input per scaled feature column
n_features = X_train_scaled.shape[1]

# Two ReLU hidden layers (18 and 9 units) feeding a single sigmoid output for the binary label
nn_model = tf.keras.models.Sequential()
nn_model.add(tf.keras.layers.Input(shape=(n_features,)))
nn_model.add(tf.keras.layers.Dense(units=18, activation="relu"))
nn_model.add(tf.keras.layers.Dense(units=9, activation="relu"))
nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the Sequential model together and customize metrics
# NOTE(review): if is_fraud is heavily imbalanced, accuracy alone can look high for a
# model that never predicts fraud — consider checking precision/recall as well.
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model; the returned History object is kept in fit_model
fit_model = nn_model.fit(X_train_scaled, y_train, epochs=5)

In [None]:
# Score the trained network on the held-out split and report loss and accuracy
evaluation = nn_model.evaluate(X_test_scaled, y_test, verbose=2)
model_loss, model_accuracy = evaluation
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")