In [None]:
pip install streamlit



In [None]:
!pip install -q kagglehub pandas scikit-learn imbalanced-learn tensorflow networkx pyngrok

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from collections import Counter
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
import networkx as nx
import plotly.express as px
import streamlit as st
import random
import time
from pyngrok import ngrok
import threading
print("Starting fraud detection project...")

Starting fraud detection project...


In [None]:
import kagglehub
import os

# Preferred source: the Kaggle credit-card fraud dataset, read from the
# directory that kagglehub downloads it into.
try:
    path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
    file_path = os.path.join(path, "creditcard.csv")
    df = pd.read_csv(file_path)
    print("Dataset loaded successfully.")
except Exception as e:
    print(f"Error loading dataset from Kaggle: {e}")
    # Fallback: a seeded random frame with the V1..V28 + Amount columns and a
    # ~1% positive 'Class' label, so the rest of the notebook can still run.
    np.random.seed(42)
    dummy_columns = [f'V{i}' for i in range(1, 29)]
    dummy_columns.append('Amount')
    dummy_values = np.random.rand(10000, 29)
    df = pd.DataFrame(dummy_values, columns=dummy_columns)
    df['Class'] = np.random.choice([0, 1], size=10000, p=[0.99, 0.01])
    print("Using dummy dataset for demonstration.")

Dataset loaded successfully.


In [None]:
df.columns

Index(['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11',
       'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21',
       'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount', 'Class',
       'anomaly_score'],
      dtype='object')

In [None]:
# Remove columns that must not be used as model features:
#   * 'Time' is not part of the feature list used at prediction time
#     (V1..V28 + Amount), so it has to be dropped before training.
#   * 'anomaly_score' is written into `df` by the Isolation Forest cell further
#     down, so it is only present when this cell is re-run in a live kernel.
# errors='ignore' lets the cell run on a fresh kernel, on a re-run, and on the
# dummy fallback dataset (which has neither column).
df = df.drop(columns=['Time', 'anomaly_score'], errors='ignore')

# NOTE(review): 'Amount' is standardized in place, so re-running this cell
# without reloading `df` fits the scaler on already-scaled values.
scaler = StandardScaler()
df['Amount'] = scaler.fit_transform(df[['Amount']])

# Feature matrix and binary target (1 = fraud).
X = df.drop('Class', axis=1)
y = df['Class']
print("DataFrame columns after dropping 'Time':")
print(X.columns)

DataFrame columns after dropping 'Time':
Index(['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11',
       'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21',
       'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount'],
      dtype='object')


In [None]:
print("\nTraining Supervised Model (Random Forest)...")

# Split BEFORE oversampling. SMOTE synthesizes minority samples by interpolating
# between neighbours; if it runs on the full dataset, synthetic points derived
# from test rows end up in the training set and the evaluation is inflated.
# stratify=y keeps the (rare) fraud class represented in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
print(f"Training class counts before SMOTE: {Counter(y_train)}")

# Oversample the training partition only; the test set keeps the real,
# imbalanced class distribution.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
print(f"Training class counts after SMOTE: {Counter(y_resampled)}")

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_resampled, y_resampled)

# Evaluate on untouched, non-synthetic transactions.
y_pred = rf_model.predict(X_test)
print("Random Forest Model Evaluation:")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Training Supervised Model (Random Forest)...
Random Forest Model Evaluation:
Confusion Matrix:
[[85128    21]
 [    0 85440]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85149
           1       1.00      1.00      1.00     85440

    accuracy                           1.00    170589
   macro avg       1.00      1.00      1.00    170589
weighted avg       1.00      1.00      1.00    170589



Unnamed: 0,Class
0,0
1,0
2,0
3,0
4,0
...,...
284802,0
284803,0
284804,0
284805,0


In [None]:
print("\nTraining Unsupervised Model (Isolation Forest)...")
iso_forest_model = IsolationForest(contamination=0.01, random_state=42)

# Fit on the full feature matrix, then label every row with the model's
# prediction; rows labelled -1 are the ones collected as anomalies below.
df['anomaly_score'] = iso_forest_model.fit(X).predict(X)

is_outlier = df['anomaly_score'] == -1
anomalies = df[is_outlier]
print(f"Detected {len(anomalies)} anomalies using Isolation Forest.")


Training Unsupervised Model (Isolation Forest)...
Detected 2849 anomalies using Isolation Forest.


In [None]:
print("\nTraining Deep Learning Model (Autoencoder)...")
# Seed TensorFlow so weight initialisation and shuffling are repeatable.
tf.random.set_seed(42)

input_dim = X.shape[1]
latent_dim = 14  # bottleneck width

input_layer = Input(shape=(input_dim,))
encoded = Dense(latent_dim, activation='relu')(input_layer)
# Linear output: the reconstruction targets are standardized features (zero
# mean, unit variance, including negative values). A sigmoid output is bounded
# to (0, 1) and cannot reproduce them.
decoded = Dense(input_dim, activation='linear')(encoded)
autoencoder = Model(inputs=input_layer, outputs=decoded)
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

# Train only on non-fraud rows (Class == 0), each reconstructed from itself.
normal_transactions = X[y == 0]
scaler_auto = StandardScaler()
normal_scaled = scaler_auto.fit_transform(normal_transactions)

# Keep the History object in a variable so its repr is not emitted as cell output.
autoencoder_history = autoencoder.fit(
    normal_scaled, normal_scaled,
    epochs=5, batch_size=32, shuffle=True, verbose=0,
)


Training Deep Learning Model (Autoencoder)...


<keras.src.callbacks.history.History at 0x7bb0f009b320>

In [None]:
# joblib is not imported anywhere else in the notebook kernel (only inside the
# generated dashboard.py), so import it here to avoid a NameError on a fresh run.
import joblib

print("\nStep 4: Saving models and data for dashboard...")
# Artifacts read back by dashboard.py: the Random Forest and the Amount scaler.
joblib.dump(rf_model, 'rf_model.pkl')
joblib.dump(scaler, 'scaler.pkl')
# Save the complete DataFrame (df), not just the features (X), so the
# dashboard also has the 'Class' column it needs for its KPIs.
df.to_csv('preprocessed_data.csv', index=False)
print("Models and data saved to files.")


Step 4: Saving models and data for dashboard...
Models and data saved to files.


In [None]:
def send_alert(transaction_details):
    """Print a framed fraud-alert banner describing one flagged transaction.

    Args:
        transaction_details: Any object; it is interpolated into the alert
            message with its default string formatting.
    """
    divider = "------------------------------------------------------------------"
    banner_lines = (
        divider,
        "!!! FRAUD ALERT !!!",
        f"High-risk transaction detected: {transaction_details}",
        divider,
    )
    for banner_line in banner_lines:
        print(banner_line)

def predict_new_transaction(model, scaler, new_transaction_data):
    """Score a single transaction with the given classifier.

    Args:
        model: Object exposing ``predict`` and ``predict_proba``.
        scaler: Object exposing ``transform``; applied to the 'Amount' column.
        new_transaction_data: One transaction record, turned into a one-row
            DataFrame. Features it lacks are filled with 0; columns outside
            V1..V28 + Amount are discarded.

    Returns:
        Tuple ``(predicted_class, probability)`` where ``probability`` is the
        second entry of the first ``predict_proba`` row.
    """
    feature_order = [f'V{idx}' for idx in range(1, 29)]
    feature_order.append('Amount')

    frame = pd.DataFrame([new_transaction_data])

    # Default every feature the record did not supply to zero.
    absent = [name for name in feature_order if name not in frame.columns]
    for name in absent:
        frame[name] = 0

    # Keep only the model's features, in the fixed V1..V28, Amount order.
    frame = frame[feature_order]

    frame['Amount'] = scaler.transform(frame[['Amount']])

    predicted_label = model.predict(frame)[0]
    positive_probability = model.predict_proba(frame)[0][1]
    return predicted_label, positive_probability

def real_time_stream_processor(model, scaler, new_transactions_data):
    """Score each transaction in turn and raise an alert for high-risk ones.

    A transaction is alerted when it is predicted as class 1 with a
    probability above 0.7. The alert carries the 1-based position of the
    transaction, its 'Amount' and the probability formatted to two decimals.
    After each alert the loop pauses for half a second.

    Args:
        model: Classifier passed through to ``predict_new_transaction``.
        scaler: Scaler passed through to ``predict_new_transaction``.
        new_transactions_data: Iterable of transaction records, each
            providing an 'Amount' key.
    """
    print("\nStarting real-time fraud monitoring simulation...")
    for position, record in enumerate(new_transactions_data, start=1):
        label, risk = predict_new_transaction(model, scaler, record)
        if not (label == 1 and risk > 0.7):
            continue
        alert_payload = {
            'id': position,
            'amount': record['Amount'],
            'risk_score': f"{risk:.2f}",
        }
        send_alert(alert_payload)
        time.sleep(0.5)

# Three hand-made transactions forming the ring userA -> userB -> userC -> userA.
# Each row of the table below is (value for every V1..V28, Amount, sender, receiver).
simulated_transactions = [
    {
        **dict.fromkeys([f'V{i}' for i in range(1, 29)], v_value),
        'Amount': amount,
        'source_user': sender,
        'target_user': receiver,
    }
    for v_value, amount, sender, receiver in (
        (0.1, 500, 'userA', 'userB'),
        (-1.2, 9999, 'userB', 'userC'),
        (0.5, 100, 'userC', 'userA'),
    )
]

def build_transaction_graph(transactions_df):
    """Build a directed graph with one weighted edge per transaction.

    Args:
        transactions_df: Iterable of records indexable by 'source_user',
            'target_user' and 'Amount' (despite the name, it is iterated
            element by element rather than used as a DataFrame).

    Returns:
        ``nx.DiGraph`` whose edges run from source to target user, with the
        transaction amount stored as the edge ``weight``.
    """
    graph = nx.DiGraph()
    for record in transactions_df:
        sender, receiver = record['source_user'], record['target_user']
        graph.add_edge(sender, receiver, weight=record['Amount'])
    return graph

def detect_fraudulent_patterns(graph):
    """Print every simple cycle found in ``graph`` as a potential fraud ring.

    Args:
        graph: Graph handed to ``nx.simple_cycles``.
    """
    print("\n--- Graph-Based Analysis ---")
    rings = list(nx.simple_cycles(graph))
    if not rings:
        print("No fraudulent cycles detected.")
        return
    print(f"Detected {len(rings)} potential fraud rings (cycles):")
    for ring in rings:
        print(ring)


# Feed the hand-made transactions through the monitor, using the trained
# Random Forest and the scaler that was fitted on the 'Amount' column.
real_time_stream_processor(rf_model, scaler, simulated_transactions)


Starting real-time fraud monitoring simulation...


In [None]:
def build_transaction_graph(transactions_df=None):
    """Build a directed, weighted graph from transaction data.

    Args:
        transactions_df: Transactions to turn into edges. May be a
            ``pd.DataFrame`` or an iterable of dict-like records. It must
            provide 'source_user' and 'target_user' plus an amount column
            named 'amount' or 'Amount'. When ``None`` (the default), a small
            hard-coded demo data set containing one cycle is used.

    Returns:
        ``nx.DiGraph`` with one edge per transaction (source -> target) and
        the amount stored as the edge ``weight``. Empty input yields an
        empty graph.

    Raises:
        KeyError: If a non-empty input lacks one of the required columns.
    """
    if transactions_df is None:
        # Demo data: userA -> userB -> userC -> userA forms a ring.
        transactions_df = pd.DataFrame({
            'source_user': ['userA', 'userB', 'userC', 'userD', 'userA'],
            'target_user': ['userB', 'userC', 'userA', 'userB', 'userE'],
            'amount': [100, 200, 300, 50, 150]
        })
    elif not isinstance(transactions_df, pd.DataFrame):
        # Accept a plain list of transaction dicts as well.
        transactions_df = pd.DataFrame(list(transactions_df))

    G = nx.DiGraph()
    if transactions_df.empty:
        return G

    # Both spellings occur in this notebook: 'amount' in the demo data and
    # 'Amount' in the simulated transaction records.
    amount_col = 'amount' if 'amount' in transactions_df.columns else 'Amount'

    edges = zip(
        transactions_df['source_user'],
        transactions_df['target_user'],
        transactions_df[amount_col],
    )
    for source, target, amount in edges:
        G.add_edge(source, target, weight=amount)
    return G

def detect_fraudulent_patterns(graph):
    """Report the simple cycles of ``graph`` as potential fraud rings.

    Prints a header, then either the number of cycles followed by each cycle
    on its own line, or a message saying none were found.

    Args:
        graph: Graph handed to ``nx.simple_cycles``.
    """
    print("\n--- Graph-Based Analysis ---")
    found_cycles = [cycle for cycle in nx.simple_cycles(graph)]
    if len(found_cycles) == 0:
        print("No fraudulent cycles detected.")
        return
    print(f"Detected {len(found_cycles)} potential fraud rings (cycles):")
    for cycle in found_cycles:
        print(cycle)

# Passing None yields the graph built from the hard-coded demo edges inside
# build_transaction_graph; its cycles are then printed as potential fraud rings.
transaction_graph = build_transaction_graph(None)
detect_fraudulent_patterns(transaction_graph)


--- Graph-Based Analysis ---
Detected 1 potential fraud rings (cycles):
['userA', 'userB', 'userC']


In [None]:
%%writefile dashboard.py
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import joblib


st.set_page_config(layout="wide")

try:
    model = joblib.load('rf_model.pkl')
    scaler = joblib.load('scaler.pkl')
    df_display = pd.read_csv('preprocessed_data.csv').sample(5000, random_state=42)
    st.success("Models and data loaded successfully!")
except FileNotFoundError:
    st.error("Model or data file not found. Please run the main notebook first.")
    st.stop()

st.title("Financial Fraud Detection Dashboard 💰")
st.markdown("### Real-time Insights and Anomaly Visualization")

# KPI's
total_transactions = df_display.shape[0]
fraudulent_transactions = df_display[df_display['Class'] == 1].shape[0]
fraud_rate = (fraudulent_transactions / total_transactions) * 100

col1, col2, col3 = st.columns(3)
with col1:
    st.metric("Total Transactions", total_transactions)
with col2:
    st.metric("Fraudulent Transactions", fraudulent_transactions)
with col3:
    st.metric("Fraud Rate", f"{fraud_rate:.2f}%")

# Plotly visualization
st.header("Transaction Amount Distribution")
fig = px.histogram(df_display, x="Amount", color="Class",
                   title="Transaction Amount Distribution (0=Normal, 1=Fraud)",
                   marginal="box",
                   hover_data=df_display.columns)
st.plotly_chart(fig, use_container_width=True)

st.header("Real-time Prediction Demo")
with st.form("my_form"):
    st.write("Enter new transaction details to get a fraud prediction:")
    V_features = {f'V{i}': st.slider(f'V{i}', -30.0, 30.0, 0.0) for i in range(1, 10)}
    amount = st.slider("Amount", 0, 2000, 100)
    submitted = st.form_submit_button("Get Prediction")

if submitted:
    new_transaction = {**V_features, 'Amount': amount}
    expected_features = [f'V{i}' for i in range(1, 29)] + ['Amount']
    transaction_df = pd.DataFrame([new_transaction])
    for col in expected_features:
        if col not in transaction_df.columns:
            transaction_df[col] = 0.0
    transaction_df = transaction_df[expected_features]
    transaction_df['Amount'] = scaler.transform(transaction_df[['Amount']])
    prediction = model.predict(transaction_df)[0]
    if prediction == 1:
        st.error("🔴 This transaction is likely FRAUDULENT!")
    else:
        st.success("🟢 This transaction appears to be LEGITIMATE.")

Overwriting dashboard.py


In [None]:
import getpass
import os
import subprocess
import sys
import time

from pyngrok import ngrok
from pyngrok.exception import PyngrokError

# Port shared by Streamlit and the ngrok tunnel. Streamlit is pinned to it
# explicitly; otherwise it may pick another free port and the tunnel would
# point at the wrong one.
STREAMLIT_PORT = 8501

# SECURITY: never hard-code the ngrok authtoken in the notebook. Read it from
# the environment, or prompt for it without echoing. The token that was
# previously committed in this cell should be revoked in the ngrok dashboard.
ngrok_token = os.environ.get("NGROK_AUTHTOKEN") or getpass.getpass("ngrok authtoken: ")
ngrok.set_auth_token(ngrok_token)

# Close any tunnel left over from a previous run of this cell.
ngrok.kill()

# Start Streamlit as a background process (instead of a `!` shell magic inside
# a thread) and give it a few seconds to come up before exposing it.
streamlit_process = subprocess.Popen([
    sys.executable, "-m", "streamlit", "run", "dashboard.py",
    "--server.port", str(STREAMLIT_PORT),
    "--server.headless", "true",
])
time.sleep(5)

try:
    public_url = ngrok.connect(addr=str(STREAMLIT_PORT))
except PyngrokError as exc:
    # Do not leave an unreachable Streamlit server running.
    streamlit_process.terminate()
    print("Failed to start ngrok tunnel. Please check your ngrok auth token.")
    print(f"ngrok error: {exc}")
else:
    print("Streamlit app running at:", public_url)

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml
Streamlit app running at: NgrokTunnel: "https://537dff64dd48.ngrok-free.app" -> "http://localhost:8501"

Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8502[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8502[0m
[34m  External URL: [0m[1mhttp://34.80.187.185:8502[0m
[0m
