# Finding fraud patterns with FP-growth

# Data Collection and Investigation

In [None]:
import pandas as pd

# Load the fraud transactions from the Excel workbook into a DataFrame.
excel_path = 'D:/Python Project/Credit Card Fraud Detection/cardzone dataset/fraud_set.xlsx'
df = pd.read_excel(excel_path)

# Preview the first 5 rows to get a feel for the data.
df.head()


## Execute FP-growth algorithm
Candidate implementations: Spark, Orange, pyfpgrowth

## Spark

In [None]:
# import environment path to pyspark
import os
import sys

# Folder where Spark is installed; exported as SPARK_HOME for pyspark.
spark_path = r"D:\apache-spark"
os.environ['SPARK_HOME'] = spark_path

# Make Spark's Python sources importable. Each entry is pushed to the front
# of sys.path in turn, so the last one listed ends up being searched first.
for relative_path in (
    "/bin",
    "/python/pyspark/",
    "/python/lib/pyspark.zip",
    "/python/lib/py4j-0.10.7-src.zip",
):
    sys.path.insert(0, spark_path + relative_path)


In [None]:
# Dump the DataFrame as space-separated text, one row per line.
# header=None / index=None leave out the column-name row and the row index.
df.to_csv(
    'processed_itemsets.txt',
    sep=' ',
    header=None,
    index=None,
    mode='w+',
)


In [None]:
import csv

def _make_items_unique(itemsets):
    """Return a copy of *itemsets* in which repeated items are made distinct.

    The first occurrence of an item is kept unchanged; the k-th occurrence
    (k >= 2) is renamed to ``item__(k)``. Occurrences are counted per item
    and per transaction, so the same transaction is always encoded the same
    way regardless of where it sits in the file.

    Args:
        itemsets: List of item strings forming one transaction.

    Returns:
        A new list of the same length with no repeated entries (assuming no
        input item already carries a ``__(k)`` suffix).
    """
    occurrences = {}
    unique_itemsets = []
    for item in itemsets:
        nth = occurrences.get(item, 0) + 1
        occurrences[item] = nth
        if nth == 1:
            unique_itemsets.append(item)
        else:
            unique_itemsets.append(item + "__(" + str(nth) + ")")
    return unique_itemsets


# Read every transaction (one space-separated line each) and give repeated
# items within a transaction a distinguishing suffix.
# newline='' lets the csv module handle line endings itself.
with open("processed_itemsets.txt", 'r', newline='') as fp:
    itemsets_list = csv.reader(fp, delimiter=' ', skipinitialspace=True)
    new_itemsets_list = [_make_items_unique(itemsets) for itemsets in itemsets_list]

        

In [None]:
# Remove every space character from each item, keeping the row structure.
new_final_list = [
    [entry.replace(' ', '') for entry in row]
    for row in new_itemsets_list
]


In [None]:
# Overwrite the file with the cleaned itemsets: one transaction per line,
# every item followed by a single space.
with open('processed_itemsets.txt', 'w+') as out_file:
    for items in new_final_list:
        out_file.write("".join("{} ".format(item) for item in items))
        out_file.write("\n")
        

In [None]:
from pyspark import SparkContext
from pyspark.mllib.fpm import FPGrowth

# Obtain the SparkContext: getOrCreate() reuses an already-running context
# if there is one, otherwise it starts a new one.
sc = SparkContext.getOrCreate()

In [None]:
# Load the itemsets file as an RDD of lines and mark it for caching.
raw_lines = sc.textFile('processed_itemsets.txt')
data = raw_lines.cache()

# Turn each line into a list of items: trim surrounding whitespace
# (including the trailing space written after the last item), then split
# on single spaces.
transactions = data.map(lambda row: row.strip().split(' '))


__minSupport__: The minimum support for an itemset to be identified as frequent. <br>
For example, if an item appears in 3 out of 5 transactions, it has a support of 3/5 = 0.6.

__minConfidence__: The minimum confidence for generating association rules. Confidence indicates how often an association rule has been found to be true. For example, if itemset X appears in 4 transactions and X and Y co-occur in only 2 of them, the confidence of the rule X => Y is 2/4 = 0.5.

__numPartitions__: The number of partitions used to distribute the work. By default this parameter is not set, and the number of partitions of the input dataset is used.

In [None]:
# Mine frequent itemsets: keep those with support of at least 0.5,
# spreading the work over 10 partitions.
model = FPGrowth.train(
    transactions,
    minSupport=0.5,
    numPartitions=10,
)

# Bring the frequent itemsets back to the driver as a plain Python list.
frequent_itemsets_rdd = model.freqItemsets()
result = frequent_itemsets_rdd.collect()


In [None]:
print("Frequent Itemsets : Item Support")
print("====================================")
# Print each frequent itemset (`items`) next to its frequency (`freq`).
# To shorten the output, iterate over a slice such as result[:10] instead.
for entry in result:
    print("{} : {}".format(entry.items, entry.freq))


In [None]:
# Association rules with confidence of at least 0.8, generated through the
# model's underlying Java object (note: `_java_model` is a private attribute).
java_rules = model._java_model.generateAssociationRules(0.8).collect()

# Order the rules from highest to lowest confidence.
rules = sorted(java_rules, key=lambda rule: rule.confidence(), reverse=True)


In [None]:
print("Antecedent => Consequent : Min Confidence")
print("========================================")
# Show at most the first 200 rules.
for rule in rules[:200]:
    print(rule)

In [None]:
# Stop the SparkContext (`sc`) created earlier.
sc.stop()