# Challenge

Another approach to identifying fraudulent transactions is to look for outliers in the data. Standard deviation or quartiles are often used to detect outliers. Using this starter notebook, code two Python functions:

* One that uses standard deviation to identify anomalies for any cardholder.

* Another that uses interquartile range to identify anomalies for any cardholder.

## Identifying Outliers using Standard Deviation

In [60]:
# Initial imports
import pandas as pd
import numpy as np
import random
from sqlalchemy import create_engine, MetaData, Table, Column, Integer, String, Float, BigInteger, DateTime, ForeignKey, text


In [61]:
# Create a connection to the database
engine = create_engine("postgresql://postgres:postgres@localhost:5432/fraud_detection")



In [62]:
# Define table structures
metadata = MetaData()

merchant_category = Table('merchant_category', metadata,
                          Column('merchant_category_id', Integer, primary_key=True),
                          Column('name', String(50))
                         )

merchant = Table('merchant', metadata,
                 Column('merchant_id', Integer, primary_key=True),
                 Column('name', String(100)),
                 Column('merchant_category_id', Integer, ForeignKey('merchant_category.merchant_category_id'))
                )

card_holder = Table('card_holder', metadata,
                    Column('card_holder_id', Integer, primary_key=True),
                    Column('name', String(100))
                   )

credit_card = Table('credit_card', metadata,
                    Column('card_number', BigInteger, primary_key=True),
                    Column('card_holder_id', Integer, ForeignKey('card_holder.card_holder_id'))
                   )

transaction = Table('transaction', metadata,
                    Column('transaction_id', Integer, primary_key=True),
                    Column('date', DateTime),
                    Column('amount', Float),
                    Column('card_number', BigInteger, ForeignKey('credit_card.card_number')),
                    Column('merchant_id', Integer, ForeignKey('merchant.merchant_id'))
                   )

# Drop tables if they exist
metadata.drop_all(engine, checkfirst=True)

# Create tables
metadata.create_all(engine)

In [63]:
# Path to the SQL file
sql_file_path = 'Data/all_tables_seed.sql'

# Read the SQL file and execute statements
with engine.begin() as connection:
    with open(sql_file_path, 'r') as file:
        # Temporary storage for current insert statement being processed
        current_statement = ""
        for line in file:
            if not line.strip():  # Skip empty lines
                continue
            if line.startswith("INSERT INTO"):
                if current_statement:
                    # Execute the previous statement
                    connection.execute(text(current_statement))
                # Start a new statement
                current_statement = line
            else:
                # Continue building the current statement
                current_statement += line
        # Execute the last statement if not empty
        if current_statement:
            connection.execute(text(current_statement))

In [64]:
import pandas as pd
from sqlalchemy import create_engine

# Create a connection to the database
engine = create_engine("postgresql://postgres:postgres@localhost:5432/fraud_detection")

# Define a query to select all rows from the transaction table
query = "SELECT * FROM transaction;"

# Load data into a DataFrame using the read_sql() method from pandas
transaction_df = pd.read_sql(query, engine)

# Show the data of the new DataFrame
print(transaction_df)



      transaction_id                date  amount          card_number  \
0                222 2018-01-01 21:35:10    6.22     3561954487988605   
1               2045 2018-01-01 21:43:12    3.83     5135837688671496   
2                395 2018-01-01 22:41:21    9.61      213193946980303   
3               3309 2018-01-01 23:13:30   19.03     4263694062533017   
4                567 2018-01-01 23:15:10    2.95        4498002758300   
...              ...                 ...     ...                  ...   
3495            1979 2018-12-31 01:24:15    4.84  4723783028106084756   
3496            2342 2018-12-31 03:33:28    3.26  4165305432349489280   
3497             948 2018-12-31 05:53:58   10.73     5361779664174555   
3498            1168 2018-12-31 08:22:17   11.87     4188164051171486   
3499            2476 2018-12-31 09:50:25   19.75  4723783028106084756   

      merchant_id  
0              69  
1              85  
2              82  
3               5  
4              64  
...

In [65]:
import pandas as pd
from sqlalchemy import create_engine

# Create a connection to the database
engine = create_engine("postgresql://postgres:postgres@localhost:5432/fraud_detection")

# Define a query to select all rows from the transaction table
query = "SELECT * FROM transaction;"

# Load data into a DataFrame using the read_sql() method from pandas
transaction_df = pd.read_sql(query, engine)

# Calculate mean and standard deviation of transaction amounts
mean_amount = transaction_df['amount'].mean()
std_dev = transaction_df['amount'].std()

# Define threshold for identifying outliers
threshold = 3  # You can adjust this value as needed
lower_threshold = mean_amount - threshold * std_dev
upper_threshold = mean_amount + threshold * std_dev

# Identify outliers
outliers = transaction_df[(transaction_df['amount'] < lower_threshold) | (transaction_df['amount'] > upper_threshold)]

# Print outliers
print("Outliers found:")
print(outliers)





Outliers found:
      transaction_id                date  amount          card_number  \
15                99 2018-01-02 23:27:46  1031.0         501879657465   
27              2650 2018-01-04 03:05:18  1685.0     3516952396080247   
62              1291 2018-01-08 02:34:32  1029.0     3581345943543942   
212             1442 2018-01-22 08:07:03  1131.0     5570600642865857   
219             2667 2018-01-23 06:29:37  1678.0         501879657465   
...              ...                 ...     ...                  ...   
3389            2696 2018-12-19 16:10:03  1724.0       30181963913340   
3405            2461 2018-12-21 09:56:32  1301.0       30142966699187   
3429            2520 2018-12-24 15:55:06  1634.0     5570600642865857   
3433            1119 2018-12-25 19:10:42  1035.0       30142966699187   
3492            1293 2018-12-30 23:23:09  1033.0  4761049645711555811   

      merchant_id  
15             95  
27             80  
62            145  
212           144  
219    

In [66]:
import pandas as pd
from sqlalchemy import create_engine

# Create a connection to the database
engine = create_engine("postgresql://postgres:postgres@localhost:5432/fraud_detection")

# Define a query to select 3 random card holders
query_card_holders = """
SELECT * FROM card_holder
ORDER BY RANDOM()
LIMIT 3;
"""

# Load data into a DataFrame using the read_sql() method from pandas
card_holders_df = pd.read_sql(query_card_holders, engine)

# Print the randomly selected card holders
print("Randomly selected card holders:")
print(card_holders_df)

# Define a query to find anomalous transactions for the selected card holders
query_anomalous_transactions = """
SELECT * FROM transaction
WHERE card_number IN (
    SELECT card_number FROM credit_card
    WHERE card_holder_id IN ({})
);
""".format(", ".join(str(id) for id in card_holders_df['card_holder_id']))

# Load data into a DataFrame using the read_sql() method from pandas
anomalous_transactions_df = pd.read_sql(query_anomalous_transactions, engine)

# Print the anomalous transactions
print("\nAnomalous transactions for the selected card holders:")
print(anomalous_transactions_df)


Randomly selected card holders:
   card_holder_id             name
0              21  Dana Washington
1               8    Michael Floyd
2              22   Austin Johnson

Anomalous transactions for the selected card holders:
     transaction_id                date  amount       card_number  merchant_id
0              1077 2018-01-03 18:16:55   10.27      501809222273           84
1              2922 2018-01-04 03:00:19   17.59      501809222273          100
2              1995 2018-01-05 01:10:27    5.09      501809222273           46
3               447 2018-01-07 07:33:17    7.07  4834483169177062           14
4              1340 2018-01-08 04:29:20   10.64      501809222273          116
..              ...                 ...     ...               ...          ...
248            1619 2018-12-24 18:01:29   10.80    30063281385429           38
249            2251 2018-12-26 18:02:58    1.20  4834483169177062           65
250            1702 2018-12-26 19:29:43   11.18  4834483169177

## Identifying Outliers Using Interquartile Range

In [67]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

# Create a connection to the database
engine = create_engine("postgresql://postgres:postgres@localhost:5432/fraud_detection")

# Query the transaction table to fetch the transaction amounts
query = "SELECT amount FROM transaction;"
transaction_df = pd.read_sql(query, engine)

# Extract the transaction amounts
amounts = transaction_df['amount']

# Calculate first quartile (Q1) and third quartile (Q3)
q1 = np.percentile(amounts, 25)
q3 = np.percentile(amounts, 75)

# Calculate interquartile range (IQR)
iqr = q3 - q1

# Define lower and upper bounds for outliers
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

# Identify outliers
outliers = transaction_df[(transaction_df['amount'] < lower_bound) | (transaction_df['amount'] > upper_bound)]

# Print outliers
print("Outliers found:")
print(outliers)



Outliers found:
      amount
15    1031.0
27    1685.0
53     175.0
62    1029.0
67     333.0
...      ...
3405  1301.0
3429  1634.0
3433  1035.0
3472   313.0
3492  1033.0

[110 rows x 1 columns]


In [68]:
# Find anomalous transactions for 3 random card holders
from sqlalchemy import create_engine, MetaData, Table, Column, Integer, String, DateTime, Float, BigInteger, ForeignKey
import pandas as pd

# Define table structures
metadata = MetaData()

merchant_category = Table('merchant_category', metadata,
                          Column('merchant_category_id', Integer, primary_key=True),
                          Column('name', String(50))
                         )

merchant = Table('merchant', metadata,
                 Column('merchant_id', Integer, primary_key=True),
                 Column('name', String(100)),
                 Column('merchant_category_id', Integer, ForeignKey('merchant_category.merchant_category_id'))
                )

card_holder = Table('card_holder', metadata,
                    Column('card_holder_id', Integer, primary_key=True),
                    Column('name', String(100))
                   )

credit_card = Table('credit_card', metadata,
                    Column('card_number', BigInteger, primary_key=True),
                    Column('card_holder_id', Integer, ForeignKey('card_holder.card_holder_id'))
                   )

transaction = Table('transaction', metadata,
                    Column('transaction_id', Integer, primary_key=True),
                    Column('date', DateTime),
                    Column('amount', Float),
                    Column('card_number', BigInteger, ForeignKey('credit_card.card_number')),
                    Column('merchant_id', Integer, ForeignKey('merchant.merchant_id'))
                   )

# Create a connection to the database
engine = create_engine("postgresql://postgres:postgres@localhost:5432/fraud_detection")

# Generate a list of 3 random card holders
query_random_card_holders = "SELECT card_holder_id FROM card_holder ORDER BY RANDOM() LIMIT 3;"
random_card_holders = pd.read_sql(query_random_card_holders, engine)['card_holder_id'].tolist()

# Find anomalous transactions for the 3 random card holders
anomalous_transactions = pd.DataFrame()
for card_holder_id in random_card_holders:
    query = f"SELECT amount FROM transaction WHERE card_number IN (SELECT card_number FROM credit_card WHERE card_holder_id = {card_holder_id});"
    transactions = pd.read_sql(query, engine)
    anomalous_transactions = pd.concat([anomalous_transactions, transactions])

print(anomalous_transactions)



     amount
0     18.05
1      6.52
2     15.97
3     10.85
4     11.25
..      ...
134   10.12
135    2.51
136   13.54
137    3.04
138    3.85

[305 rows x 1 columns]
