In [None]:
import os
# Find the latest version of spark and enter as the spark version
# spark_version = 'spark-3.5.1'
spark_version = 'spark-3.5.3'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

In [None]:
# Import pyspark packages
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import StructType,StructField,StringType, DateType, IntegerType

# Build the session (or reuse the one already running) under the app name
# "SparkSQL"; later cells use it through the name `spark`.
spark = (
    SparkSession.builder
    .appName("SparkSQL")
    .getOrCreate()
)

In [None]:
import kagglehub

# Fetch the Kaggle fraud-detection dataset; `path` is the directory returned
# by kagglehub for the downloaded files.
path = kagglehub.dataset_download("kartik2112/fraud-detection")
print(f"Path to dataset files: {path}")

# Show what the download directory contains.
files = os.listdir(path)
print(f"Files in the dataset: {files}")


In [None]:
# Input file and reader format
file_type = "csv"
file_location = f"{path}/fraudTrain.csv"

# CSV reader settings
delimiter = ","
first_row_is_header = "true"
infer_schema = "true"

# Configure the reader once with all CSV options, then load the file.
csv_reader = spark.read.format(file_type).options(
    inferSchema=infer_schema,
    header=first_row_is_header,
    sep=delimiter,
)
df = csv_reader.load(file_location)

df.show()

In [None]:
# Register the DataFrame as a temporary view so it can be queried with SQL.
temp_table_name = "fraudTrain"
df.createOrReplaceTempView(temp_table_name)

# Preview the view without truncating long column values.
view_preview = spark.sql("""select * from fraudTrain""")
view_preview.show(truncate=False)

In [None]:
# Check, row by row, whether unix_time (after a fixed shift) agrees with
# trans_date_trans_time. The shift of 220924800 seconds is exactly 2557 days;
# presumably it lines the dataset's unix_time up with the recorded transaction
# dates -- TODO confirm against the dataset documentation.
unix_time_check_sql = """
SELECT
    cc_num,
    unix_time,
    trans_date_trans_time,
    from_unixtime(unix_time + 220924800) unix_convert,
    is_fraud,
    CASE
        WHEN from_unixtime(unix_time + 220924800) = trans_date_trans_time THEN 'Match'
        ELSE 'Mismatch'
    END as comparison_result
FROM fraudTrain
order by cc_num,trans_date_trans_time, is_fraud
"""
spark.sql(unix_time_check_sql).show()

In [None]:
# Persist the DataFrame as Parquet under "fraud_train", one partition directory
# per is_fraud value, replacing any previous output.
(
    df.write
    .mode("overwrite")
    .partitionBy("is_fraud")
    .parquet("fraud_train")
)

In [None]:
# Load the Parquet copy back and register it as the view `p_fraudTrain`,
# which the SQL in the following cells queries.
p_df = spark.read.parquet("fraud_train")
p_df.createOrReplaceTempView("p_fraudTrain")

In [None]:
import time

# Time a per-class summary of transaction amounts: average, minimum, maximum
# and row count, grouped by is_fraud.
start_time = time.time()
amount_stats_sql = """select is_fraud,
                    round(avg(amt),2),
                    round(min(amt),2),
                    round(max(amt),2),
                    round(count(amt),2)
            from p_fraudTrain
            group by is_fraud"""
spark.sql(amount_stats_sql).show(truncate=False)
print(f"--- {time.time() - start_time} seconds ---")

In [None]:
import folium
from geopy.distance import geodesic  # To calculate distance
import ipywidgets as widgets
from IPython.display import display
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Select only the rows flagged as fraud, with the columns the map cells use.
fraud_query = (
    "select cc_num, amt, trans_date_trans_time, lat, long, merchant, merch_lat, merch_long "
    "from p_fraudTrain "
    "where is_fraud == 1"
)
spark_fraud_df = spark.sql(fraud_query)

# Bring the result into pandas for the plotting and widget cells.
fraudulent_transactions_df = spark_fraud_df.toPandas()

# Preview the first rows.
fraudulent_transactions_df.head()

In [None]:
def calculate_distance(row):
    """Return the geodesic distance, in kilometres, between a row's two points.

    The first point is (row["lat"], row["long"]) and the second is
    (row["merch_lat"], row["merch_long"]).
    """
    return geodesic(
        (row["lat"], row["long"]),
        (row["merch_lat"], row["merch_long"]),
    ).km

# Compute the distance for every row, then store it as the distance_km column.
distance_km = fraudulent_transactions_df.apply(calculate_distance, axis=1)
fraudulent_transactions_df["distance_km"] = distance_km

fraudulent_transactions_df.head()


In [None]:
# One dropdown entry per distinct card number in the fraud rows; the first
# entry is selected initially.
cc_nums = fraudulent_transactions_df["cc_num"].unique()
dropdown = widgets.Dropdown(
    description="Credit Card:",
    options=cc_nums,
    value=cc_nums[0],
)

# Function to create the map for a selected credit card number
def create_map(cc_num):
    """Build a folium map of the fraud rows recorded for one credit card.

    For every matching row in ``fraudulent_transactions_df`` the map gets a
    blue marker at (lat, long), a green marker at (merch_lat, merch_long) whose
    popup shows merchant, distance in miles and amount, and a red line joining
    the two. The map is centred on the first matching row.

    Returns a zoomed-out map centred on [0, 0] when no row matches ``cc_num``.
    """
    card_rows = fraudulent_transactions_df[fraudulent_transactions_df["cc_num"] == cc_num]

    # Nothing to plot for this card: fall back to a world view.
    if card_rows.empty:
        return folium.Map(location=[0, 0], zoom_start=2)

    first_row = card_rows.iloc[0]
    fraud_map = folium.Map(location=[first_row["lat"], first_row["long"]], zoom_start=10)

    for _, txn in card_rows.iterrows():
        # Blue marker: the (lat, long) point of the row.
        customer_marker = folium.Marker(
            location=[txn["lat"], txn["long"]],
            popup="Customer",
            icon=folium.Icon(color="blue"),
        )
        customer_marker.add_to(fraud_map)

        # Green marker: the merchant point. 0.62137 converts km to miles.
        merchant_popup = f"Merchant: {txn['merchant']}<br>Distance: {0.62137 * txn['distance_km']:.2f} miles<br>Amount: ${txn['amt']:,.2f}"
        merchant_marker = folium.Marker(
            location=[txn["merch_lat"], txn["merch_long"]],
            popup=merchant_popup,
            icon=folium.Icon(color="green"),
        )
        merchant_marker.add_to(fraud_map)

        # Red line joining the two points of this row.
        connector = folium.PolyLine(
            locations=[(txn["lat"], txn["long"]), (txn["merch_lat"], txn["merch_long"])],
            color="red",
            weight=2,
        )
        connector.add_to(fraud_map)

    return fraud_map

# Function to update the map when the dropdown changes
def update_map(change):
    """Redraw the map for the value carried in a widget change notification.

    ``change["new"]`` is read as the selected card number; the ``map_display``
    output area is cleared and the freshly built map is displayed inside it.
    """
    new_cc_num = change["new"]
    map_display.clear_output()
    with map_display:
        display(create_map(new_cc_num))

# Output area for the map; render the first card's map into it straight away.
map_display = widgets.Output()
with map_display:
    initial_map = create_map(cc_nums[0])
    display(initial_map)

# Redraw whenever the dropdown's value changes.
dropdown.observe(update_map, names="value")

# Show the selector followed by the map area.
display(dropdown, map_display)


In [None]:
# Derive hour-of-day and weekday name from the transaction timestamp.
fraudulent_transactions_df["hour"] = fraudulent_transactions_df["trans_date_trans_time"].dt.hour
fraudulent_transactions_df["weekday"] = fraudulent_transactions_df["trans_date_trans_time"].dt.day_name()

# Row order for the heat map (calendar order rather than alphabetical).
weekday_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

# Count fraud rows per (weekday, hour) pair.
heatmap_data = fraudulent_transactions_df.groupby(["weekday", "hour"]).size().reset_index(name="count")

# Pivot to a weekday x hour grid. Reindex BEFORE filling: reindexing after
# fillna(0) would bring back NaN for any weekday that has no rows. The columns
# are reindexed to all 24 hours too, so an hour with no fraud shows as 0
# instead of being dropped from the axis.
heatmap_pivot = (
    heatmap_data
    .pivot(index="weekday", columns="hour", values="count")
    .reindex(index=weekday_order, columns=range(24))
    .fillna(0)
)

# Plot the heat map
plt.figure(figsize=(12, 8))
sns.heatmap(heatmap_pivot, annot=True, fmt=".0f", cmap="YlGnBu", cbar_kws={'label': 'Fraud Count'})
plt.title("Heat Map of Fraudulent Transactions by Weekday and Hour")
plt.xlabel("Hour of Day")
plt.ylabel("Weekday")
plt.show()


In [None]:
 # Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import tensorflow as tf

In [None]:
# Numeric columns plus the is_fraud label, read from the Parquet-backed view.
query = (
    "select cc_num, amt, zip, lat, long, city_pop, unix_time, merch_lat, merch_long, is_fraud "
    "from p_fraudTrain"
)
spark_df = spark.sql(query)

# Bring the result into pandas for the scikit-learn / Keras cells.
pandas_df = spark_df.toPandas()

# Preview the first rows.
pandas_df.head()

In [None]:
# Take the labels from the SAME pandas frame as the features. Collecting them
# through a separate Spark action on p_df does not guarantee the same row order
# as pandas_df, which would silently pair features with the wrong labels.
# .tolist() keeps y a plain Python list, as it was before.
y = pandas_df["is_fraud"].tolist()

# Feature matrix: every selected column except the label.
X = pandas_df.drop("is_fraud", axis=1)

# Stratified split so both parts keep the same fraud / non-fraud proportions;
# random_state pins the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [None]:
# Standardise the features for the neural network.

# Fit the scaler on the training split only, so its statistics are computed
# without the test rows.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both splits.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [None]:
import numpy as np

# Convert both label sets to NumPy arrays.
y_train, y_test = np.array(y_train), np.array(y_test)

# Sanity check: both of these should print <class 'numpy.ndarray'>.
print(type(X_train_scaled))
print(type(y_train))


In [None]:
 # Define the deep learning model
# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling repeat across runs (42 matches the random_state of the train/test
# split). Some GPU kernels may still be non-deterministic.
tf.keras.utils.set_random_seed(42)

# One input per scaled feature column.
n_features = X_train_scaled.shape[1]

# Two ReLU hidden layers (18 and 9 units) followed by a single sigmoid unit
# that outputs the fraud probability.
nn_model = tf.keras.models.Sequential()
nn_model.add(tf.keras.layers.Input(shape=(n_features,)))
nn_model.add(tf.keras.layers.Dense(units=18, activation="relu"))
nn_model.add(tf.keras.layers.Dense(units=9, activation="relu"))
nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the Sequential model together and customize metrics
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model; fit_model holds the returned training history.
fit_model = nn_model.fit(X_train_scaled, y_train, epochs=5)

In [None]:
# Score the trained model on the held-out split; evaluate() returns the loss
# followed by the compiled metric (accuracy).
evaluation = nn_model.evaluate(X_test_scaled, y_test, verbose=2)
model_loss, model_accuracy = evaluation
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")