<a href="https://colab.research.google.com/github/kaushalkumawat77/CodSoft/blob/main/Credit_card_fraud_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
from matplotlib import pyplot as plt
from sklearn.model_selection import cross_val_score

from sklearn import metrics
from collections import Counter

In [None]:
# Load the training transactions; index_col=0 makes the first CSV column the row index.
# NOTE(review): the path is hard-coded to the /content directory (the notebook carries an
# "Open In Colab" badge) — adjust it when running anywhere else.
train_data = pd.read_csv("/content/fraudTrain.csv", index_col=0)

In [None]:
train_data.columns

Index(['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt',
       'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat',
       'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time', 'merch_lat',
       'merch_long', 'is_fraud'],
      dtype='object')

In [None]:
train_data.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [None]:
subset_data = train_data.copy()

In [None]:
from math import radians, cos, sin, asin, sqrt

def calculate_distance(row):
    """Return the great-circle (haversine) distance in kilometres.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the keys ``"lat"``, ``"long"``, ``"merch_lat"``
        and ``"merch_long"``, holding coordinates in degrees. A single row
        (as passed by ``DataFrame.apply(..., axis=1)``) yields a scalar;
        a whole DataFrame yields one distance per row, computed in a
        vectorised way, which is much faster than a row-wise ``apply``.

    Returns
    -------
    float or array-like
        Distance(s) between ``(lat, long)`` and ``(merch_lat, merch_long)``
        in kilometres.
    """
    lon1 = np.radians(row["long"])
    lon2 = np.radians(row["merch_long"])
    lat1 = np.radians(row["lat"])
    lat2 = np.radians(row["merch_lat"])

    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2

    # Floating-point rounding can push `a` marginally outside [0, 1]
    # (e.g. for near-antipodal points), where the arcsine is undefined.
    a = np.minimum(1.0, np.maximum(0.0, a))

    c = 2 * np.arcsin(np.sqrt(a))

    # Radius of earth in kilometers. Use 3956 for miles
    r = 6371

    return c * r


In [None]:
# axis=1 hands each row to calculate_distance, so the function is called once per transaction.
# The result is the distance in kilometres between (lat, long) and (merch_lat, merch_long).
subset_data["distance"] = subset_data.apply(calculate_distance, axis=1)

In [None]:
subset_data.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,distance
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,...,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0,78.597568
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,...,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0,30.212176
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,...,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0,108.206083
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,...,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0,95.673231
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,...,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0,77.556744


In [None]:
import datetime
# Keep only the columns needed for feature engineering. The explicit .copy() makes
# subset_data an independent frame, so the columns assigned to it in later cells do
# not trigger pandas' SettingWithCopyWarning.
subset_data = subset_data[["trans_date_trans_time", "dob", "amt", "city_pop", "distance", "is_fraud"]].copy()
subset_data.head()

Unnamed: 0,trans_date_trans_time,dob,amt,city_pop,distance,is_fraud
0,2019-01-01 00:00:18,1988-03-09,4.97,3495,78.597568,0
1,2019-01-01 00:00:44,1978-06-21,107.23,149,30.212176,0
2,2019-01-01 00:00:51,1962-01-19,220.11,4154,108.206083,0
3,2019-01-01 00:01:16,1967-01-12,45.0,1939,95.673231,0
4,2019-01-01 00:03:06,1986-03-28,41.96,99,77.556744,0


In [None]:
subset_data.dtypes

trans_date_trans_time     object
dob                       object
amt                      float64
city_pop                   int64
distance                 float64
is_fraud                   int64
dtype: object

In [None]:
# Parse the transaction timestamp once and truncate it to midnight with .dt.normalize().
# This gives the same datetime64 dates as converting to Python date objects and parsing
# them a second time, without the per-row object round-trip.
subset_data["trans_date"] = pd.to_datetime(subset_data["trans_date_trans_time"], format="%Y-%m-%d %H:%M:%S").dt.normalize()
subset_data["dob_date"] = pd.to_datetime(subset_data["dob"], format="%Y-%m-%d")
subset_data.head()

Unnamed: 0,trans_date_trans_time,dob,amt,city_pop,distance,is_fraud,trans_date,dob_date
0,2019-01-01 00:00:18,1988-03-09,4.97,3495,78.597568,0,2019-01-01,1988-03-09
1,2019-01-01 00:00:44,1978-06-21,107.23,149,30.212176,0,2019-01-01,1978-06-21
2,2019-01-01 00:00:51,1962-01-19,220.11,4154,108.206083,0,2019-01-01,1962-01-19
3,2019-01-01 00:01:16,1967-01-12,45.0,1939,95.673231,0,2019-01-01,1967-01-12
4,2019-01-01 00:03:06,1986-03-28,41.96,99,77.556744,0,2019-01-01,1986-03-28


In [None]:
subset_data.dtypes

trans_date_trans_time            object
dob                              object
amt                             float64
city_pop                          int64
distance                        float64
is_fraud                          int64
trans_date               datetime64[ns]
dob_date                 datetime64[ns]
dtype: object

In [None]:
# Age in years on the transaction date. The day difference is divided by 365.25 (the
# average year length including leap days); dividing by a flat 365 overstates every age
# by roughly one day per four years lived.
subset_data["age"] = (subset_data["trans_date"] - subset_data["dob_date"]) / np.timedelta64(1, "D") / 365.25
subset_data.head()

Unnamed: 0,trans_date_trans_time,dob,amt,city_pop,distance,is_fraud,trans_date,dob_date,age
0,2019-01-01 00:00:18,1988-03-09,4.97,3495,78.597568,0,2019-01-01,1988-03-09,30.835616
1,2019-01-01 00:00:44,1978-06-21,107.23,149,30.212176,0,2019-01-01,1978-06-21,40.558904
2,2019-01-01 00:00:51,1962-01-19,220.11,4154,108.206083,0,2019-01-01,1962-01-19,56.989041
3,2019-01-01 00:01:16,1967-01-12,45.0,1939,95.673231,0,2019-01-01,1967-01-12,52.005479
4,2019-01-01 00:03:06,1986-03-28,41.96,99,77.556744,0,2019-01-01,1986-03-28,32.786301


In [None]:
import numpy as np
import pandas as pd
import math

def f1_score(data, y, mask, total_frd, min_recall, fraud_col_name, amt_col_name):
	"""Amount-weighted F1 score of the rows selected by ``mask``.

	Parameters
	----------
	data : DataFrame holding at least ``fraud_col_name`` and ``amt_col_name``.
	y : unused; kept so existing callers do not need to change.
	mask : boolean indexer applied to ``data`` to pick the "hit" rows.
	total_frd : unused; the recall denominator is recomputed from ``data``.
	min_recall : minimum capture rate required for a non-zero score, or
		``None`` to disable the check.
	fraud_col_name : column holding the fraud amount per row.
	amt_col_name : column holding the transaction amount per row.

	Returns
	-------
	The harmonic mean of the hit rate (fraud amount / transaction amount among
	the selected rows) and the capture rate (selected fraud amount / fraud
	amount in ``data``). Returns 0 when both rates are zero or when the
	capture rate is below ``min_recall``.
	"""
	hit_data = data[mask]

	capture = hit_data[fraud_col_name].sum()
	temp_tot_frd = data[fraud_col_name].sum()
	trx_amt_sum = hit_data[amt_col_name].sum()

	# Guard both denominators: an empty selection or a fraud-free frame scores 0.
	hit_rate = capture / trx_amt_sum if trx_amt_sum > 0 else 0
	capture_rate = capture / temp_tot_frd if temp_tot_frd > 0 else 0

	if hit_rate + capture_rate == 0:
		return 0

	# min_recall is optional (train_tree defaults it to None); comparing a
	# number with None would raise TypeError, so only check when it is set.
	if min_recall is not None and capture_rate < min_recall:
		return 0

	return (2 * hit_rate * capture_rate) / (hit_rate + capture_rate)

def max_f1_score_split(data, x, y, total_frd, subset_x, min_recall, fraud_col_name, amt_col_name):
	"""Find the threshold on ``x`` whose one-sided split maximises ``f1_score``.

	Parameters
	----------
	data : DataFrame that is scored.
	x : Series (indexed like ``data``) holding the feature being split on.
	y : forwarded unchanged to ``f1_score``.
	total_frd : forwarded unchanged to ``f1_score``.
	subset_x : Series from which candidate thresholds are drawn (the caller in
		this file passes the feature values of the fraud rows).
	min_recall : forwarded to ``f1_score``.
	fraud_col_name, amt_col_name : column names forwarded to ``f1_score``.

	Returns
	-------
	``(best_f1, best_split, best_ineq, True)`` where ``best_ineq`` is 1 when
	the rows with ``x < best_split`` score strictly higher and 0 when the rows
	with ``x > best_split`` score at least as high; or
	``(None, None, None, False)`` when there is no candidate threshold.
	"""
	print("Checking threshold for {}".format(x.name))
	options = subset_x.sort_values().unique()[1:]
	print("Old size was {}".format(len(options)))
	subset_x = subset_x[subset_x >= 0]

	# Candidate thresholds: the unique non-negative values, skipping the smallest.
	options = subset_x.sort_values().unique()[1:]

	# With many distinct values, fall back to the 0th..99th percentiles so the
	# number of full-frame evaluations stays bounded.
	if len(options) > 100:
		final_options = np.percentile(options, np.arange(0, 100))
	else:
		final_options = options
	print("Current size is {}".format(len(final_options)))

	# Rows at or below -9998 (presumably a missing-value sentinel — confirm) are
	# dropped once, up front. The feature is filtered with the same mask so the
	# boolean indexers below are aligned with the filtered frame.
	valid = x > -9998
	nonull_data = data[valid]
	nonull_x = x[valid]

	split_value = []
	f1 = []
	le_gr = []

	# Score both sides of every candidate threshold and keep the better side.
	# The reduced candidate list is what gets evaluated; iterating the full
	# `options` would make the percentile reduction above pointless.
	for val in final_options:
		val_f1 = f1_score(nonull_data, y, nonull_x < val, total_frd, min_recall, fraud_col_name, amt_col_name)
		val_new_f1 = f1_score(nonull_data, y, nonull_x > val, total_frd, min_recall, fraud_col_name, amt_col_name)
		if val_new_f1 < val_f1:
			le_gr.append(1)
			f1.append(val_f1)
		else:
			le_gr.append(0)
			f1.append(val_new_f1)
		split_value.append(val)

	# No candidate threshold at all: signal failure to the caller.
	if len(f1) == 0:
		return (None, None, None, False)

	# Pick the first threshold that reaches the highest F1.
	best_f1 = max(f1)
	best_f1_index = f1.index(best_f1)
	best_split = split_value[best_f1_index]
	best_ineq = le_gr[best_f1_index]
	return (best_f1, best_split, best_ineq, True)

def get_best_split(y, data, x_vars, total_frd, min_recall, fraud_col_name, amt_col_name):
	"""Evaluate every feature in ``x_vars`` and return the best-scoring split.

	For each feature, rows with a missing value in that feature are dropped,
	the rows where ``data[y] == 1`` supply the candidate thresholds, and
	``max_f1_score_split`` picks the best threshold. A feature for which no
	score is returned is recorded with a score of 0.

	Returns ``(best_var, best_split, best_f1, best_ineq)`` for the first
	feature that reaches the highest score.
	"""
	scores = []
	thresholds = []
	directions = []

	for feature in x_vars:
		usable_rows = data.dropna(axis=0, subset=[feature])
		fraud_rows = usable_rows[usable_rows[y] == 1]
		score, threshold, direction, _ = max_f1_score_split(
			usable_rows,
			usable_rows[feature],
			usable_rows[y],
			total_frd,
			fraud_rows[feature],
			min_recall,
			fraud_col_name,
			amt_col_name,
		)
		if score is None:
			print("Found None")
			score = 0
		scores.append(score)
		thresholds.append(threshold)
		directions.append(direction)

	top_score = max(scores)
	winner = scores.index(top_score)
	return (x_vars[winner], thresholds[winner], top_score, directions[winner])

def make_split(variable, value, data, ineq):
	"""Return the rows of ``data`` on one side of a threshold.

	Prints the variable name and threshold, then keeps the rows where
	``data[variable] < value`` if ``ineq == 1`` and the rows where
	``data[variable] > value`` otherwise.
	"""
	print(variable)
	print(value)
	column = data[variable]
	keep = column < value if ineq == 1 else column > value
	return data[keep]

def calc_metrics(data, total_frd, fraud_col_name, amt_col_name):
	"""Amount-weighted hit rate and capture rate of ``data``.

	Parameters
	----------
	data : DataFrame of the rows currently selected.
	total_frd : total fraud amount used as the capture-rate denominator.
	fraud_col_name : column holding the fraud amount per row.
	amt_col_name : column holding the transaction amount per row.

	Returns
	-------
	``(hit_rate, capture_rate)`` where ``hit_rate`` is the fraud amount divided
	by the transaction amount in ``data`` and ``capture_rate`` is the fraud
	amount divided by ``total_frd``. A rate whose denominator is not positive
	is reported as 0, matching the guards in ``f1_score``, instead of
	producing nan/inf (for example when a split leaves no rows).
	"""
	capture = data[fraud_col_name].sum()
	amt_sum = data[amt_col_name].sum()

	hit_rate = capture / amt_sum if amt_sum > 0 else 0
	capture_rate = capture / total_frd if total_frd > 0 else 0

	return hit_rate, capture_rate

def train_tree(data,y, total_frd, x_vars, fraud_col_name="fraud_amount", amt_col_name="transaction_amount", max_depth = None, min_samples_split = None, min_recall = None, min_recall_overall = None, min_precision = 0.1, counter = 0):
	"""Recursively narrow ``data`` with one threshold rule per level.

	At each level the best split over ``x_vars`` is chosen with
	``get_best_split``, the matching rows are kept with ``make_split``, and
	the function recurses on those rows.

	Parameters
	----------
	data : DataFrame of the rows still selected at this level.
	y : name of the label column passed to ``get_best_split``.
	total_frd : total fraud amount, the recall denominator in ``calc_metrics``.
	x_vars : list of candidate feature column names.
	fraud_col_name, amt_col_name : fraud-amount and transaction-amount columns.
	max_depth : stop once ``counter`` reaches this value (``None`` = no limit).
	min_samples_split : stop unless ``data`` has more rows than this
		(``None`` = no limit).
	min_recall : per-split minimum recall forwarded to ``get_best_split``.
	min_recall_overall : stop unless the recall of ``data`` exceeds this
		(``None`` = no limit).
	min_precision : accepted and passed down the recursion, but not used.
	counter : current depth; printed at every level.

	Returns
	-------
	A list of rule strings (``"<var> <op> <value> Precision:..., Recall:..."``),
	one per level, or ``None`` when no rule was added at this level.
	"""
	print(counter)
	_precision, recall = calc_metrics(data, total_frd, fraud_col_name, amt_col_name)

	# Each stopping criterion is disabled when its parameter is None.
	depth_cond = max_depth is None or counter < max_depth
	sample_cond = min_samples_split is None or data.shape[0] > min_samples_split
	recall_cond = min_recall_overall is None or min_recall_overall < recall

	if not (depth_cond and sample_cond and recall_cond):
		print("Run Complete")
		return None

	var, val, f1, ineq = get_best_split(y, data, x_vars, total_frd, min_recall, fraud_col_name, amt_col_name)

	# get_best_split reports a None threshold when the chosen feature had no
	# candidate split; comparing a column against None would raise, so stop.
	if val is None:
		print("Run Complete")
		return None

	counter += 1
	new_data = make_split(var, val, data, ineq)

	# Describe the rule together with the metrics of the rows it keeps.
	split_type = "<" if ineq == 1 else ">"
	new_precision, new_recall = calc_metrics(new_data, total_frd, fraud_col_name, amt_col_name)
	question = "{} {} {}".format(var, split_type, val)
	question = question + " Precision:{}, Recall:{}".format(new_precision, new_recall)
	print(question)
	path = [question]

	# Recurse on the kept rows and append any deeper rules.
	next_path = train_tree(new_data, y, total_frd, x_vars, fraud_col_name, amt_col_name, max_depth, min_samples_split, min_recall, min_recall_overall, min_precision, counter)
	if next_path is not None:
		path.extend(next_path)

	return path

In [None]:
# fraud_amount is the transaction amount on fraudulent rows and 0 everywhere else.
fraud_amount = subset_data["amt"].where(subset_data["is_fraud"] == 1, 0)

# Keep only the model inputs, the fraud amount and the label.
algo_data = subset_data.assign(fraud_amount=fraud_amount)[
    ["amt", "city_pop", "age", "fraud_amount", "distance", "is_fraud"]
]
algo_data.head()

Unnamed: 0,amt,city_pop,age,fraud_amount,distance,is_fraud
0,4.97,3495,30.835616,0.0,78.597568,0
1,107.23,149,40.558904,0.0,30.212176,0
2,220.11,4154,56.989041,0.0,108.206083,0
3,45.0,1939,52.005479,0.0,95.673231,0
4,41.96,99,32.786301,0.0,77.556744,0


In [None]:
# Sum of the fraud amount over all rows; train_tree passes it to calc_metrics,
# where it is the denominator of the capture rate (recall).
total_frd = algo_data["fraud_amount"].sum()
print(total_frd)

3988088.6100000003


In [None]:
algo_data.shape

(1296675, 6)

In [None]:
# Candidate features for the rule search.
rf_vars = ["amt", "city_pop", "age", "distance"]

# Build the rule path: up to 50 levels, each split must keep at least 90% recall
# within its node, and the search stops once overall recall is no longer above 30%.
decisions = train_tree(
    algo_data,
    "is_fraud",
    total_frd,
    rf_vars,
    "fraud_amount",
    "amt",
    max_depth=50,
    min_samples_split=None,
    min_recall=0.9,
    min_recall_overall=0.3,
    min_precision=None,
    counter=0,
)

0
Checking threshold for amt
Old size was 6638
Current size is 100
Checking threshold for city_pop
Old size was 690
Current size is 100
Checking threshold for age
Old size was 1448
Current size is 100
Checking threshold for distance
Old size was 7505
Current size is 100
amt
288.3
amt > 288.3 Precision:0.17930064373603957, Recall:0.955395998585899
1
Checking threshold for amt
Old size was 4945
Current size is 100
Checking threshold for city_pop
Old size was 686
Current size is 100
Checking threshold for age
Old size was 1396
Current size is 100
Checking threshold for distance
Old size was 5207
Current size is 100
amt
1206.74
amt < 1206.74 Precision:0.230919172467846, Recall:0.9412035080133287
2
Checking threshold for amt
Old size was 4900
Current size is 100
Checking threshold for city_pop
Old size was 686
Current size is 100
Checking threshold for age
Old size was 1395
Current size is 100
Checking threshold for distance
Old size was 5162
Current size is 100
amt
296.02
amt > 296.02 Prec