# Finding fraud patterns with FP-growth

# Data Collection and Investigation

In [1]:
import pandas as pd

# Location of the benchmark itemsets file.
# NOTE(review): this is an absolute, machine-specific path; adjust it to your environment.
csv_path = 'D:/Python Project/Credit Card Fraud Detection/benchmark dataset/Test FP-Growth.csv'

# NOTE(review): read_csv is called with its defaults, so the first line of the file
# ends up as the column labels rather than as a data row - confirm this is intended.
df = pd.read_csv(csv_path)

# show the first 5 rows for a quick look at the data
df.head()


Unnamed: 0,M,O,N,K,E,Y
0,D,O,N,K,E,Y
1,M,A,K,E,,
2,M,U,C,K,Y,
3,C,O,O,K,I,E


## Execute FP-growth algorithm

## Spark

In [2]:
# Make the local Spark installation importable from this kernel
import glob
import os
import sys

spark_path = r"D:\apache-spark" # spark installed folder
os.environ['SPARK_HOME'] = spark_path

# The py4j archive name carries a version number that differs between Spark releases,
# so pick up whichever archive this installation bundles instead of assuming 0.10.7.
# If none is found, fall back to the previously hard-coded name.
py4j_archives = sorted(glob.glob(os.path.join(spark_path, "python", "lib", "py4j-*-src.zip")))
py4j_zip = py4j_archives[-1] if py4j_archives else spark_path + "/python/lib/py4j-0.10.7-src.zip"

# Every entry is inserted at position 0, so the last one listed is searched first.
for path_entry in (
    spark_path + "/bin",
    spark_path + "/python/pyspark/",
    spark_path + "/python/lib/pyspark.zip",
    py4j_zip,
):
    sys.path.insert(0, path_entry)


In [8]:
# Dump the frame as space-separated text without the row index.
# The column labels are written as the first line, and missing cells become empty fields.
df.to_csv(
    'processed_itemsets.txt',
    sep=' ',
    index=False,
    mode='w+',
)


In [9]:
import csv


def make_items_unique(items):
    """Return a copy of ``items`` in which repeated values are made distinct.

    The first occurrence of a value is kept unchanged; its n-th occurrence
    (n >= 2) is renamed to ``<value>__(n)``. Occurrences are counted per value
    and per call, so every transaction is numbered independently.

    Example: ``['C', 'O', 'O', 'K']`` -> ``['C', 'O', 'O__(2)', 'K']``.
    """
    occurrences = {}
    unique_items = []
    for item in items:
        seen = occurrences.get(item, 0) + 1
        occurrences[item] = seen
        if seen == 1:
            unique_items.append(item)
        else:
            unique_items.append(item + "__(" + str(seen) + ")")
    return unique_items


# de-duplicated transactions, in file order
new_itemsets_list = []

# Tag repeated items so that every item within a transaction is distinct
# before the data is handed to FP-growth.
with open("processed_itemsets.txt", 'r', newline='') as fp:
    itemsets_list = csv.reader(fp, delimiter=' ', skipinitialspace=True)
    for itemsets in itemsets_list:
        print(itemsets)
        new_itemsets_list.append(make_items_unique(itemsets))

        

['M', 'O', 'N', 'K', 'E', 'Y']
['D', 'O', 'N', 'K', 'E', 'Y']
['M', 'A', 'K', 'E', '']
['M', 'U', 'C', 'K', 'Y', '']
['C', 'O', 'O', 'K', 'I', 'E']


In [10]:
# Overwrite the text file with the de-duplicated transactions:
# one transaction per line, every item followed by a single space.
with open('processed_itemsets.txt', 'w+') as out_file:
    for items in new_itemsets_list:
        out_file.writelines("{} ".format(item) for item in items)
        out_file.write("\n")


In [11]:
from pyspark import SparkContext
from pyspark.mllib.fpm import FPGrowth

# Obtain the SparkContext via getOrCreate() rather than constructing one directly,
# so the same handle `sc` is used by the cells below.
sc = SparkContext.getOrCreate()

In [12]:
def split_into_items(line):
    """Strip surrounding whitespace from one text line and split it on single spaces."""
    return line.strip().split(' ')


# Read the itemsets file as an RDD of lines (cached), then turn each line into a list of items
data = sc.textFile('processed_itemsets.txt').cache()
transactions = data.map(split_into_items)


__minSupport__: The minimum support for an itemset to be identified as frequent. <br>
For example, if an item appears in 3 out of 5 transactions, it has a support of 3/5 = 0.6.

__minConfidence__: The minimum confidence for generating association rules. Confidence indicates how often an association rule has been found to be true. For example, if itemset X appears in 4 transactions and X and Y co-occur in only 2 of them, the confidence of the rule X => Y is 2/4 = 0.5.

__numPartitions__: The number of partitions used to distribute the work. By default the parameter is not set, and the number of partitions of the input dataset is used.

In [14]:
# Thresholds for the mining run (see the parameter notes above)
min_support = 0.6
num_partitions = 10

# Train the FP-growth model, then bring the frequent itemsets back to the driver
model = FPGrowth.train(transactions, minSupport=min_support, numPartitions=num_partitions)
frequent_itemsets_rdd = model.freqItemsets()
result = frequent_itemsets_rdd.collect()


In [15]:
# List every frequent itemset next to its `freq` value
print("Frequent Itemsets : Item Support")
print("====================================")
for frequent_itemset in result:
    print("{} : {}".format(frequent_itemset.items, frequent_itemset.freq))


Frequent Itemsets : Item Support
['K'] : 5
['E'] : 4
['E', 'K'] : 4
['M'] : 3
['M', 'K'] : 3
['O'] : 3
['O', 'E'] : 3
['O', 'E', 'K'] : 3
['O', 'K'] : 3
['Y'] : 3
['Y', 'K'] : 3


In [16]:
# Minimum confidence a rule must reach to be generated
min_confidence = 0.8

# NOTE(review): `_java_model` is a private attribute of the trained model; the rules are
# generated through it and collected, then ordered from highest to lowest confidence.
collected_rules = model._java_model.generateAssociationRules(min_confidence).collect()
rules = list(collected_rules)
rules.sort(key=lambda rule: rule.confidence(), reverse=True)


In [17]:
# Upper bound on how many rules are printed
max_rules_shown = 200

print("Antecedent => Consequent : Min Confidence")
print("========================================")
top_rules = rules[:max_rules_shown]
for rule in top_rules:
    print(rule)


Antecedent => Consequent : Min Confidence
{O} => {E}: 1.0
{O} => {K}: 1.0
{E} => {K}: 1.0
{Y} => {K}: 1.0
{M} => {K}: 1.0
{O,E} => {K}: 1.0
{O,K} => {E}: 1.0
{K} => {E}: 0.8


In [18]:
# Shut down the SparkContext (`sc`) now that the analysis is finished
sc.stop()