In [None]:
import os
# Find the latest version of spark and enter as the spark version
# spark_version = 'spark-3.5.1'
spark_version = 'spark-3.5.3'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

In [None]:
# Import pyspark packages
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import StructType,StructField,StringType, DateType, IntegerType

# Build (or reuse) the notebook's SparkSession under the app name "SparkSQL"
session_builder = SparkSession.builder.appName("SparkSQL")
spark = session_builder.getOrCreate()

In [None]:
import kagglehub

# Fetch the newest copy of the Kaggle fraud-detection dataset
path = kagglehub.dataset_download("kartik2112/fraud-detection")
print(f"Path to dataset files: {path}")

# Show what the download directory contains
files = os.listdir(path)
print(f"Files in the dataset: {files}")


In [None]:
# Where the training CSV lives and which reader format to use
file_location = f"{path}/fraudTrain.csv"
file_type = "csv"

# Parser settings (CSV-specific; other formats ignore them)
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# Apply each reader option in turn, then load the file
csv_reader = spark.read.format(file_type)
for option_name, option_value in (
    ("inferSchema", infer_schema),
    ("header", first_row_is_header),
    ("sep", delimiter),
):
    csv_reader = csv_reader.option(option_name, option_value)
df = csv_reader.load(file_location)

df.show()

In [None]:
# Register the raw DataFrame under a SQL-visible name
temp_table_name = "fraudTrain"
df.createOrReplaceTempView(temp_table_name)

# Preview the view through SQL without truncating wide values
preview_df = spark.sql("""select * from fraudTrain""")
preview_df.show(truncate=False)

In [None]:
# Convert unix_time to timestamp to see if it is the same as trans_date_trans_time
# The query adds a fixed 220924800-second offset to unix_time before calling
# from_unixtime, then labels each row 'Match' or 'Mismatch' depending on whether
# the converted value equals trans_date_trans_time.
# NOTE(review): the origin of the 220924800 offset is not documented here, and
# from_unixtime presumably renders in the Spark session time zone, so results
# may differ outside a UTC environment — confirm.
spark.sql("""
SELECT
    cc_num,
    unix_time,
    trans_date_trans_time,
    from_unixtime(unix_time + 220924800) unix_convert,
    is_fraud,
    CASE
        WHEN from_unixtime(unix_time + 220924800) = trans_date_trans_time THEN 'Match'
        ELSE 'Mismatch'
    END as comparison_result
FROM fraudTrain
order by cc_num,trans_date_trans_time, is_fraud
""").show()

In [None]:
# Persist the raw data as Parquet, partitioned on 'is_fraud'; an existing
# "fraud_train" output is overwritten.
parquet_writer = df.write.mode("overwrite").partitionBy("is_fraud")
parquet_writer.parquet("fraud_train")

In [None]:
# Load the partitioned Parquet data back and expose it to SQL as 'p_fraudTrain'
p_df = spark.read.format("parquet").load('fraud_train')
p_df.createOrReplaceTempView('p_fraudTrain')

In [None]:
import time

# Per-class average, minimum, maximum and count of transaction amounts,
# with the wall-clock time the query took printed afterwards.
summary_sql = """select is_fraud,
                    round(avg(amt),2),
                    round(min(amt),2),
                    round(max(amt),2),
                    round(count(amt),2)
            from p_fraudTrain
            group by is_fraud"""

start_time = time.time()
spark.sql(summary_sql).show(truncate=False)
elapsed_seconds = time.time() - start_time
print("--- %s seconds ---" % elapsed_seconds)

In [None]:
import folium
from geopy.distance import geodesic  # To calculate distance
import ipywidgets as widgets
from IPython.display import display
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Select only the fraudulent transactions, with customer and merchant coordinates
fraud_query = (
    "select cc_num, amt, trans_date_trans_time, lat, long, merchant, merch_lat, merch_long "
    "from p_fraudTrain where is_fraud == 1"
)
spark_fraud_df = spark.sql(fraud_query)

# Bring the result into pandas for the plotting and mapping cells
fraudulent_transactions_df = spark_fraud_df.toPandas()

# Preview the first rows
fraudulent_transactions_df.head()

In [None]:
def calculate_distance(row):
    """Return the geodesic distance in kilometres between a row's two points.

    The first point is (row["lat"], row["long"]) and the second is
    (row["merch_lat"], row["merch_long"]).
    """
    return geodesic(
        (row["lat"], row["long"]),
        (row["merch_lat"], row["merch_long"]),
    ).km

# Compute the customer-to-merchant distance row by row and store it as a new column
distance_km = fraudulent_transactions_df.apply(calculate_distance, axis=1)
fraudulent_transactions_df["distance_km"] = distance_km

fraudulent_transactions_df.head()


In [None]:
# Dropdown listing every distinct card number found in the fraud rows
cc_nums = fraudulent_transactions_df["cc_num"].unique()
dropdown = widgets.Dropdown(
    description="Credit Card:",
    options=cc_nums,
    value=cc_nums[0],  # start on the first card in the list
)

# Function to create the map for a selected credit card number
def create_map(cc_num):
    """Build a folium map of the fraudulent transactions for one card number.

    For every matching row, a blue marker is placed at (lat, long), a green
    marker at (merch_lat, merch_long) with a popup showing merchant, distance
    in miles and amount, and a red line joins the two points. The map is
    centred on the first matching row. If no row matches, a zoomed-out map
    centred on (0, 0) is returned instead.
    """
    card_rows = fraudulent_transactions_df[fraudulent_transactions_df["cc_num"] == cc_num]

    # Nothing to plot for this card: fall back to a world view
    if card_rows.empty:
        return folium.Map(location=[0, 0], zoom_start=2)

    first_row = card_rows.iloc[0]
    fraud_map = folium.Map(location=[first_row["lat"], first_row["long"]], zoom_start=10)

    for _, txn in card_rows.iterrows():
        customer_point = (txn["lat"], txn["long"])
        merchant_point = (txn["merch_lat"], txn["merch_long"])

        # Customer location
        customer_marker = folium.Marker(
            location=list(customer_point),
            popup="Customer",
            icon=folium.Icon(color="blue"),
        )
        customer_marker.add_to(fraud_map)

        # Merchant location; 0.62137 converts the kilometre distance to miles
        merchant_popup = f"Merchant: {txn['merchant']}<br>Distance: {0.62137 * txn['distance_km']:.2f} miles<br>Amount: ${txn['amt']:,.2f}"
        merchant_marker = folium.Marker(
            location=list(merchant_point),
            popup=merchant_popup,
            icon=folium.Icon(color="green"),
        )
        merchant_marker.add_to(fraud_map)

        # Line joining the customer and merchant points
        connector = folium.PolyLine(
            locations=[customer_point, merchant_point],
            color="red",
            weight=2,
        )
        connector.add_to(fraud_map)

    return fraud_map

# Function to update the map when the dropdown changes
def update_map(change):
    """Redraw the map for the value carried in ``change["new"]``.

    Registered as the dropdown's observer: clears the output area and renders
    the map of the newly selected card number inside it.
    """
    new_cc_num = change["new"]
    map_display.clear_output()
    with map_display:
        display(create_map(new_cc_num))

# Output area that holds the rendered map; it starts with the first card's map
map_display = widgets.Output()
initial_map = create_map(cc_nums[0])
with map_display:
    display(initial_map)

# Re-render whenever the dropdown's selected value changes
dropdown.observe(update_map, names="value")

# Show the selector followed by the map
display(dropdown, map_display)


In [None]:
# Extract hour and weekday. pd.to_datetime leaves an existing datetime column
# unchanged and parses it if the CSV schema inference produced strings.
trans_time = pd.to_datetime(fraudulent_transactions_df["trans_date_trans_time"])
fraudulent_transactions_df["hour"] = trans_time.dt.hour
fraudulent_transactions_df["weekday"] = trans_time.dt.day_name()

# Define the order of weekdays
weekday_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

# Aggregate the count of frauds by weekday and hour
heatmap_data = fraudulent_transactions_df.groupby(["weekday", "hour"]).size().reset_index(name="count")

# Pivot for heatmap format.
# Reindex BEFORE filling: reindexing introduces NaN rows/columns for weekdays
# or hours with no fraud at all, so fillna(0) must run last to turn them into
# zero counts. Using all 24 hours keeps the x-axis complete.
heatmap_pivot = (
    heatmap_data.pivot(index="weekday", columns="hour", values="count")
    .reindex(index=weekday_order, columns=range(24))
    .fillna(0)
)

# Plot the heat map on an explicit Axes
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(heatmap_pivot, annot=True, fmt=".0f", cmap="YlGnBu", cbar_kws={'label': 'Fraud Count'}, ax=ax)
ax.set_title("Heat Map of Fraudulent Transactions by Weekday and Hour")
ax.set_xlabel("Hour of Day")
ax.set_ylabel("Weekday")
plt.show()


In [None]:
# Select the columns used for modelling (features plus the is_fraud label)
# from the Parquet-backed view.
query = """select cc_num, amt, zip, lat, long, city_pop, trans_date_trans_time,
                  unix_time, merch_lat, merch_long, category, gender, job, city,
                  state, is_fraud
           from p_fraudTrain"""
spark_df = spark.sql(query)

# Convert Spark DataFrame to Pandas DataFrame
# NOTE(review): toPandas() collects the whole table into driver memory.
data_cleaned = spark_df.toPandas()

# Display the Pandas DataFrame
data_cleaned

In [None]:
#import Dependencies
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [None]:
# Convert 'trans_date_trans_time' to Unix seconds, exactly once.
# Guard: if this cell is re-run on the already-converted integer column,
# pd.to_datetime would read the seconds as nanoseconds and the floor division
# would collapse every value to about 1, silently corrupting the feature.
if data_cleaned['trans_date_trans_time'].dtype.kind not in "iu":
    data_cleaned['trans_date_trans_time'] = (
        pd.to_datetime(data_cleaned['trans_date_trans_time']).astype('int64') // 10**9
    )


In [None]:

# Replace each text column (including city and state) with integer codes,
# keeping the fitted encoder per column so codes can be decoded later.
non_numeric_columns = ['category', 'gender', 'job', 'city', 'state']
label_encoders = {col: LabelEncoder() for col in non_numeric_columns}
for col, encoder in label_encoders.items():
    # Cast to str first so every value is encoded as text
    data_cleaned[col] = encoder.fit_transform(data_cleaned[col].astype(str))


In [None]:

# Features are every column except the label; the label is 'is_fraud'
y = data_cleaned['is_fraud']
X = data_cleaned.drop(columns='is_fraud')


In [None]:

# Inspect the feature dtypes to confirm that everything is numeric
feature_dtypes = X.dtypes
print("Data types after encoding:", feature_dtypes)


In [None]:

# Standardise every feature column.
# NOTE(review): the scaler is fitted on all of X before the train/test split
# in the next cell, so test rows influence the scaling statistics.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)


In [None]:

# Hold out 20% of the rows for testing.
# Stratify on the label: fraud is a tiny fraction of the data, so an
# unstratified split can give train and test different fraud rates and make
# the fraud-class metrics noisier than necessary.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:

# Peek at the first five scaled feature rows and their labels
first_features = X_train[:5]
first_labels = y_train[:5]
print(first_features, first_labels)


## **Training a Random Forest Model**

In [None]:

# Fit a default random forest on the training split, then score the held-out split
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

baseline_accuracy = accuracy_score(y_test, y_pred)
baseline_report = classification_report(y_test, y_pred)
print("Accuracy:", baseline_accuracy)
print("Classification Report:\n", baseline_report)


# Performance Analysis of Random Forest Model

The Random Forest model performed exceptionally well in terms of overall accuracy, achieving **99.83% accuracy**. However, let's dive deeper into the results:


## **Performance Analysis**

### **Class 0 (Non-Fraudulent Transactions)**:
- **Precision**: 1.00 (Perfect precision; no false positives)
- **Recall**: 1.00 (Perfect recall; no false negatives)
- **F1-Score**: 1.00 (Excellent balance between precision and recall)

### **Class 1 (Fraudulent Transactions)**:
- **Precision**: 0.92 (Few false positives)
- **Recall**: 0.68 (Moderate recall; missed some fraudulent transactions)
- **F1-Score**: 0.78 (Good overall performance for fraud detection, but room for improvement)

### **Class Imbalance**
- Only **7,506 fraudulent transactions** vs. **1,289,169 non-fraudulent transactions**.
- This significant imbalance impacts the recall for fraud detection.



## **Adjusting the model's class weights to penalize misclassification of fraudulent transactions.**

In [None]:
# Random forest that weights fraud (class 1) ten times as heavily as class 0
fraud_class_weights = {0: 1, 1: 10}
model = RandomForestClassifier(random_state=42, class_weight=fraud_class_weights)

# Fit on the training split and predict the held-out split
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report accuracy and the per-class metrics
weighted_accuracy = accuracy_score(y_test, y_pred)
weighted_report = classification_report(y_test, y_pred)
print("Accuracy:", weighted_accuracy)
print("Classification Report:\n", weighted_report)


# Performance Summary

### Class 0 (Non-Fraudulent Transactions):
- **Precision**: 1.00 (No false positives; perfect identification of non-fraudulent transactions).
- **Recall**: 1.00 (All non-fraudulent transactions correctly identified).

### Class 1 (Fraudulent Transactions):
- **Precision**: 0.94 (Slightly fewer false positives than the previous model's 0.92).
- **Recall**: 0.67 (Essentially unchanged from the previous model's 0.68; class weighting alone did not help, and some fraudulent transactions are still missed).
- **F1-Score**: 0.78 (Balanced performance for fraud detection).

### Overall Accuracy:
- **99.84%**: Excellent overall performance.

### Macro and Weighted Averages:
- **Macro Avg Recall**: 0.84 (Reflects the imbalanced dataset).
- **Weighted Avg Recall**: 1.00 (Dominated by the majority class).

## Observations:
- The weighted averages show near-perfect results due to the dominant majority class (non-fraudulent transactions).


## **Manually oversampling the minority class to balance the dataset**

In [None]:
# Balance the TRAINING data by random oversampling, while keeping a clean test set.
#
# The hold-out split must happen BEFORE oversampling. Oversampling duplicates
# fraud rows; if the split came afterwards, copies of the same row would land
# in both train and test, and the model would be scored on rows it has already
# memorised (inflated, near-perfect metrics). The test set therefore keeps the
# original class distribution and is never resampled.
train_df, test_df = train_test_split(
    data_cleaned, test_size=0.2, random_state=42, stratify=data_cleaned['is_fraud']
)

# Separate the majority and minority classes (training rows only)
minority_class = train_df[train_df['is_fraud'] == 1]
majority_class = train_df[train_df['is_fraud'] == 0]

# Oversample the minority class, with replacement, up to the majority size
oversampled_minority_class = minority_class.sample(n=len(majority_class), replace=True, random_state=42)

# Combine the majority class with the oversampled minority class, then shuffle
balanced_data = pd.concat([majority_class, oversampled_minority_class])
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Split features and target variable
X_balanced = balanced_data.drop('is_fraud', axis=1)
y_balanced = balanced_data['is_fraud']

# Fit the scaler on training features only, then apply the same fitted
# transform to the untouched test features.
scaler = StandardScaler()
X_balanced_scaled = scaler.fit_transform(X_balanced)

X_train, y_train = X_balanced_scaled, y_balanced
X_test = scaler.transform(test_df.drop('is_fraud', axis=1))
y_test = test_df['is_fraud']

# Verify the class distribution: balanced for training, original for testing
print("Class distribution in y_train:\n", y_train.value_counts())
print("Class distribution in y_test:\n", y_test.value_counts())


## **Random Forest Classifier on Balanced Dataset**

In [None]:
# Fit a default random forest on the current training split
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict the current test split
y_pred = model.predict(X_test)

# Report accuracy and the per-class metrics
balanced_accuracy = accuracy_score(y_test, y_pred)
balanced_report = classification_report(y_test, y_pred)
print("Accuracy:", balanced_accuracy)
print("Classification Report:\n", balanced_report)


# Performance Summary

## **Overall Accuracy**
- **99.97%**: Almost perfect accuracy on the balanced dataset.



## **Class 0 (Non-Fraudulent Transactions)**
- **Precision**: 1.00 (No false positives).
- **Recall**: 1.00 (All non-fraudulent transactions correctly identified).
- **F1-Score**: 1.00 (Perfect balance between precision and recall).



## **Class 1 (Fraudulent Transactions)**
- **Precision**: 1.00 (Almost no false positives).
- **Recall**: 1.00 (All fraudulent transactions correctly identified).
- **F1-Score**: 1.00 (Perfect fraud detection).


## **Macro and Weighted Averages**
- **Precision, Recall, F1-Score**: All metrics are perfect due to the balanced dataset and model sensitivity.



## **Observations**
1. **Balanced Data**:
   - Balancing the dataset allowed the model to perform equally well for both classes.

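2. **Overfitting caveat**:
   - Random oversampling duplicates fraud rows, so these scores only demonstrate generalisation if the test set was held out *before* oversampling. When duplicated rows are shared between the training and test sets, near-perfect metrics mostly reflect memorised duplicates rather than an absence of overfitting.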



In [None]:
# Random Forests in sklearn will automatically calculate feature importance
importances = model.feature_importances_

# The importances are positional, so the feature names must line up one-to-one
# with the columns the model was trained on (zip would silently truncate).
assert len(X.columns) == model.n_features_in_, "feature names do not match the fitted model"

# One row per feature, most important first; built without in-place mutation
# so the cell gives the same result on every re-run.
importances_df = pd.DataFrame(
    {'Feature Importances': importances},
    index=pd.Index(X.columns, name='Feature'),
).sort_values(by='Feature Importances', ascending=False)

# Visualize the features by importance (ascending, so the largest bar is drawn on top)
importances_sorted = importances_df.sort_values(by='Feature Importances')
importances_sorted.plot(kind='barh', color='lightgreen', title= 'Features Importances', legend=False)

In [None]:
import numpy as np
from datetime import datetime, timezone

# Offset in seconds between the dataset's `unix_time` column and the epoch
# seconds of `trans_date_trans_time`; the same constant appears in the SQL
# comparison cell earlier in the notebook.
UNIX_TIME_OFFSET = 220924800

# Define the features in the correct order based on training
input_features = [
    "cc_num", "amt", "zip", "lat", "long", "city_pop",
    "trans_date_trans_time", "unix_time", "merch_lat", "merch_long",
    "category", "gender", "job", "city", "state"
]

# Collect user input for only essential features
print("Please enter the values for the required features:")
amt = float(input("Enter value for amt: "))
category = float(input("Enter transaction category (0-13): "))
# Stored as *_str so the `time` module imported earlier is not shadowed.
date_str = input("Enter transaction date (YYYY-MM-DD): ")
time_str = input("Enter transaction time (HH:MM): ")

# Combine date and time into a datetime object.
# It is pinned to UTC because the training column was produced with
# pd.to_datetime(...).astype('int64'), which treats naive timestamps as UTC;
# a naive .timestamp() would use the machine's local zone instead.
dt_object = datetime.strptime(f"{date_str} {time_str}", "%Y-%m-%d %H:%M").replace(tzinfo=timezone.utc)

# Convert datetime to Unix timestamp
unix_time = int(dt_object.timestamp())
trans_date_trans_time = unix_time

# Default values for features including those provided by the user
default_values = {
    "cc_num": 3524574586339330, # Default credit card number
    "amt": amt,
    "zip": 32960,             # Default zip code
    "lat": 27.6330,           # Example latitude
    "long": -80.4031,        # Example longitude
    "city_pop": 105638,         # Example city population
    "trans_date_trans_time": trans_date_trans_time,
    "unix_time": unix_time - UNIX_TIME_OFFSET,
    "merch_lat": 26.888686,   # Example merchant latitude
    "merch_long": -80.834389, # Example merchant longitude
    "category": category,
    "gender": 0,              # Example gender encoding
    "job": 271,                 # Placeholder for job (encoded as numeric)
    "city": 829,                # Placeholder for city (encoded as numeric)
    "state": 9                # Placeholder for state (encoded as numeric)
}

# Lay the values out in the exact column order used for training
user_inputs = [default_values[feature] for feature in input_features]

print (f"trans_date_trans_time: {trans_date_trans_time}\n unix_time: {unix_time - UNIX_TIME_OFFSET}")

# Convert inputs to a model-compatible format
input_array = np.array([user_inputs])  # Shape (1, n_features)

# Debugging: Check if input matches expected shape
print(f"Input shape: {input_array.shape}")
print(f"Model expects: {model.n_features_in_} features")

# The model was trained on StandardScaler output, so raw values must go
# through the same fitted scaler first; otherwise the prediction is made on
# values far outside the range the trees were split on.
input_scaled = scaler.transform(pd.DataFrame(input_array, columns=input_features))

# Make prediction
prediction = model.predict(input_scaled)

# Output the prediction
if prediction[0] == 1:
    print("Prediction: Fraud")
else:
    print("Prediction: Not Fraud")

In [None]:
# Feature names in training order, and the (rows, columns) of the feature frame
training_columns = list(X.columns)
print(training_columns)
print(f"Training data shape: {X.shape}")

In [None]:
# Sanity check: the prediction input must have as many columns as the model expects
supplied_shape = input_array.shape
expected_features = model.n_features_in_
print(f"Input shape: {supplied_shape}")
print(f"Model expects: {expected_features} features")

In [None]:
# Show the epoch-seconds value derived from the user-entered date and time
print(trans_date_trans_time)

In [None]:
# Translate the integer codes 0 and 1 back into the original gender labels
gender_encoder = label_encoders['gender']
gender = gender_encoder.inverse_transform([0, 1])
print(gender)

In [None]:
# Row count per value of the (label-encoded) 'category' column
data_cleaned['category'].value_counts()

In [None]:
# Look up the fitted LabelEncoder for the 'state' column
category_encoder = label_encoders['state']

# Integer code -> original label, in the encoder's class order
categories_decoded = dict(enumerate(category_encoder.classes_))

# List every code with its label
print("Decoded Categories:")
for code, label in categories_decoded.items():
    print(f"{code}: {label}")

In [None]:
# Look up the fitted LabelEncoder for the 'job' column
category_encoder = label_encoders['job']

# Integer code -> original label, in the encoder's class order
categories_decoded = dict(enumerate(category_encoder.classes_))

# List every code with its label
print("Decoded Categories:")
for code, label in categories_decoded.items():
    print(f"{code}: {label}")

In [None]:
# Look up the fitted LabelEncoder for the 'city' column
category_encoder = label_encoders['city']

# Integer code -> original label, in the encoder's class order
categories_decoded = dict(enumerate(category_encoder.classes_))

# List every code with its label
print("Decoded Categories:")
for code, label in categories_decoded.items():
    print(f"{code}: {label}")

In [None]:
# Look up the fitted LabelEncoder for the 'category' column
category_encoder = label_encoders['category']

# Integer code -> original label, in the encoder's class order
categories_decoded = dict(enumerate(category_encoder.classes_))

# List every code with its label
print("Decoded Categories:")
for code, label in categories_decoded.items():
    print(f"{code}: {label}")