# Preparing the probability column

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
!pip install geopy





In [3]:
from geopy.distance import geodesic

In [4]:
# Load the training transactions from a CSV file in the working directory.
# Later cells read the 'amt', 'category', 'trans_date_trans_time', 'lat',
# 'long', 'merch_lat' and 'merch_long' columns of this frame.
df = pd.read_csv('fraudTrain.csv')

In [5]:
# Preview the first rows of the training data to check the columns loaded as expected
df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2449,1/2/2019 1:06,4613310000000.0,fraud_Rutherford-Mertz,grocery_pos,281.06,Jason,Murphy,M,542 Steve Curve Suite 011,...,35.9946,-81.7266,885,Soil scientist,9/15/1988,e8a81877ae9a0a7f883e15cb39dc4022,1325466397,36.430124,-81.179483,1
1,2472,1/2/2019 1:47,340187000000000.0,"fraud_Jenkins, Hauck and Friesen",gas_transport,11.52,Misty,Hart,F,27954 Hall Mill Suite 575,...,29.44,-98.459,1595797,Horticultural consultant,10/28/1960,bc7d41c41103877b03232f03f1f8d3f5,1325468849,29.819364,-99.142791,1
2,2523,1/2/2019 3:05,340187000000000.0,fraud_Goodwin-Nitzsche,grocery_pos,276.31,Misty,Hart,F,27954 Hall Mill Suite 575,...,29.44,-98.459,1595797,Horticultural consultant,10/28/1960,b98f12f4168391b2203238813df5aa8c,1325473523,29.273085,-98.83636,1
3,2546,1/2/2019 3:38,4613310000000.0,fraud_Erdman-Kertzmann,gas_transport,7.03,Jason,Murphy,M,542 Steve Curve Suite 011,...,35.9946,-81.7266,885,Soil scientist,9/15/1988,397894a5c4c02e3c61c784001f0f14e4,1325475483,35.909292,-82.09101,1
4,2553,1/2/2019 3:55,340187000000000.0,fraud_Koepp-Parker,grocery_pos,275.73,Misty,Hart,F,27954 Hall Mill Suite 575,...,29.44,-98.459,1595797,Horticultural consultant,10/28/1960,7863235a750d73a244c07f1fb7f0185a,1325476547,29.786426,-98.68341,1


In [6]:
# Weights applied to the four component risk scores in the weighted sum.
# They add up to 1.0 (0.3 + 0.2 + 0.25 + 0.25).

AMOUNT_WEIGHT = 0.3  # weight of the transaction-amount score
CATEGORY_WEIGHT = 0.2  # weight of the merchant-category score
TIME_WEIGHT = 0.25  # weight of the hour-of-day score
DISTANCE_WEIGHT = 0.25  # weight of the customer-to-merchant distance score

### **Calculate distance between customer and merchant**

In [7]:
def calculate_distance(row):
    """Return the geodesic distance, in kilometres, between customer and merchant.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'lat' and 'long' for the customer and 'merch_lat' and
        'merch_long' for the merchant.

    Returns
    -------
    float
        Distance in kilometres as reported by geopy's ``geodesic``.
    """
    origin = (row['lat'], row['long'])
    destination = (row['merch_lat'], row['merch_long'])
    separation = geodesic(origin, destination)
    return separation.kilometers

### **Calculate time risk**

In [8]:
def calculate_time_risk(trans_time):
    """Score a transaction by the hour of day at which it took place.

    Parameters
    ----------
    trans_time : str or datetime.datetime
        Either a timestamp string in ``'%m/%d/%Y %H:%M'`` form, such as
        ``'1/2/2019 1:06'``, or an already-parsed datetime object (which
        includes ``pandas.Timestamp``, a ``datetime`` subclass).

    Returns
    -------
    int or float
        1   for 22:00-05:59 (late night / early morning, high risk),
        0.5 for 06:00-08:59 and 18:00-21:59 (moderate risk),
        0.1 for 09:00-17:59 (low risk).

    Raises
    ------
    ValueError
        If a string is given that does not match the expected format.
    """
    if isinstance(trans_time, datetime):
        # Already parsed: use the hour directly instead of round-tripping via text.
        hour = trans_time.hour
    else:
        hour = datetime.strptime(trans_time, '%m/%d/%Y %H:%M').hour

    # hour is always within 0..23, so only the upper/lower edge needs checking.
    if hour < 6 or hour >= 22:
        return 1  # High risk
    if hour < 9 or hour >= 18:
        return 0.5  # Moderate risk
    return 0.1  # Low risk

### **Calculate category risk**

In [9]:
def calculate_category_risk(category):
    """Return the risk score for a merchant category.

    Categories 'grocery_pos', 'shopping_net', 'misc_net' and 'shopping_pos'
    are treated as risky and score 1; every other value scores 0.2.
    """
    high_risk = ('grocery_pos', 'shopping_net', 'misc_net', 'shopping_pos')
    return 1 if category in high_risk else 0.2

### **Calculate amount risk**

In [10]:
def calculate_amount_risk(amount):
    """Return the risk score for a transaction amount.

    Amounts strictly above 1000 score 1, amounts strictly above 500 score
    0.7, and everything else scores 0.3.
    """
    # Thresholds are checked from highest to lowest; the first one exceeded wins.
    tiers = ((1000, 1), (500, 0.7))
    for threshold, score in tiers:
        if amount > threshold:
            return score
    return 0.3

### **Calculate distance risk**

In [11]:
def calculate_distance_risk(row):
    """Return the distance-based risk score for one row.

    The customer-to-merchant distance from ``calculate_distance`` is divided
    by 100 and the result is capped at 1.0.
    """
    scaled = calculate_distance(row) / 100
    # Equivalent to min(1.0, scaled): keep the scaled value only while it is below the cap.
    return scaled if scaled < 1.0 else 1.0

### **Fraud probability calculation function**

In [12]:
def calculate_fraud_probability(row):
    """Combine the four component risk scores of a row into one weighted score.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'amt', 'category', 'trans_date_trans_time' and the
        coordinate columns read by ``calculate_distance_risk``.

    Returns
    -------
    float or int
        The weighted sum of the amount, category, time and distance scores,
        capped at 1.
    """
    # Component scores are evaluated in the order amount, category, time, distance.
    weighted_amount = calculate_amount_risk(row['amt']) * AMOUNT_WEIGHT
    weighted_category = calculate_category_risk(row['category']) * CATEGORY_WEIGHT
    weighted_time = calculate_time_risk(row['trans_date_trans_time']) * TIME_WEIGHT
    weighted_distance = calculate_distance_risk(row) * DISTANCE_WEIGHT

    score = weighted_amount + weighted_category + weighted_time + weighted_distance

    # Cap the result so it never exceeds 1
    return min(score, 1)

In [13]:
# Score every training row (axis=1 passes one row at a time to the function)
# and store the result in a new 'risk_probability' column on df.
df['risk_probability'] = df.apply(calculate_fraud_probability, axis=1)

In [14]:
# Write the training data, including the new 'risk_probability' column, to CSV.
# index=False keeps the DataFrame index out of the file.
df.to_csv('Train_dataset_with_probability.csv', index=False)

In [15]:
# Check the first rows now that 'risk_probability' has been added
df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,risk_probability
0,2449,1/2/2019 1:06,4613310000000.0,fraud_Rutherford-Mertz,grocery_pos,281.06,Jason,Murphy,M,542 Steve Curve Suite 011,...,-81.7266,885,Soil scientist,9/15/1988,e8a81877ae9a0a7f883e15cb39dc4022,1325466397,36.430124,-81.179483,1,0.712407
1,2472,1/2/2019 1:47,340187000000000.0,"fraud_Jenkins, Hauck and Friesen",gas_transport,11.52,Misty,Hart,F,27954 Hall Mill Suite 575,...,-98.459,1595797,Horticultural consultant,10/28/1960,bc7d41c41103877b03232f03f1f8d3f5,1325468849,29.819364,-99.142791,1,0.576108
2,2523,1/2/2019 3:05,340187000000000.0,fraud_Goodwin-Nitzsche,grocery_pos,276.31,Misty,Hart,F,27954 Hall Mill Suite 575,...,-98.459,1595797,Horticultural consultant,10/28/1960,b98f12f4168391b2203238813df5aa8c,1325473523,29.273085,-98.83636,1,0.642621
3,2546,1/2/2019 3:38,4613310000000.0,fraud_Erdman-Kertzmann,gas_transport,7.03,Jason,Murphy,M,542 Steve Curve Suite 011,...,-81.7266,885,Soil scientist,9/15/1988,397894a5c4c02e3c61c784001f0f14e4,1325475483,35.909292,-82.09101,1,0.46553
4,2553,1/2/2019 3:55,340187000000000.0,fraud_Koepp-Parker,grocery_pos,275.73,Misty,Hart,F,27954 Hall Mill Suite 575,...,-98.459,1595797,Horticultural consultant,10/28/1960,7863235a750d73a244c07f1fb7f0185a,1325476547,29.786426,-98.68341,1,0.650312


In [16]:
# Check the last rows as well, to confirm scoring reached the end of the frame
df.tail()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,risk_probability
6001,1047089,3/10/2020 3:59,3589290000000000.0,fraud_Kris-Weimann,misc_net,690.49,Paula,Estrada,F,350 Stacy Glens,...,-97.5936,343,"Development worker, international aid",3/5/1972,fb1ddd251bbec9b84c9755e856d51723,1362887989,43.254214,-98.267759,1,0.854881
6002,1047157,3/10/2020 4:31,3546670000000000.0,"fraud_Casper, Hand and Zulauf",grocery_pos,324.74,Jordan,May,M,1626 Susan Course,...,-97.6039,13602,Optometrist,7/5/1984,4dca0549e43b7e265cae7fd8a7e563b4,1362889904,33.607221,-97.996506,1,0.79
6003,1047208,3/10/2020 4:59,3589290000000000.0,fraud_Kiehn Inc,grocery_pos,331.33,Paula,Estrada,F,350 Stacy Glens,...,-97.5936,343,"Development worker, international aid",3/5/1972,d18c55035998e461aa9040e254b74925,1362891561,44.228731,-98.33052,1,0.737749
6004,1047521,3/10/2020 8:22,3589290000000000.0,fraud_Rau and Sons,grocery_pos,356.2,Paula,Estrada,F,350 Stacy Glens,...,-97.5936,343,"Development worker, international aid",3/5/1972,bdaeb5e3413a408d7e6c3720a35337d5,1362903771,43.988931,-97.989985,1,0.517672
6005,1047918,3/10/2020 12:09,3589290000000000.0,fraud_O'Connell-Ullrich,home,249.56,Paula,Estrada,F,350 Stacy Glens,...,-97.5936,343,"Development worker, international aid",3/5/1972,8f0bac74e340483b44babb0d6d07b85b,1362917373,42.868322,-98.537668,1,0.405


In [17]:
# Load the test transactions from a CSV file in the working directory.
# The scoring functions read the same columns from it as from the training frame.
df_test = pd.read_csv('fraudTest.csv')

In [18]:
# Preview the first rows of the test data to check the columns loaded as expected
df_test.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,1685,6/21/2020 22:06,3560730000000000.0,fraud_Hamill-D'Amore,health_fitness,24.84,Brooke,Smith,F,63542 Luna Brook Apt. 012,...,31.8599,-102.7413,23,Cytogeneticist,9/15/1969,16bf2e46c54369a8eab2214649506425,1371852399,32.575873,-102.60429,1
1,1767,6/21/2020 22:32,6564460000000000.0,"fraud_Rodriguez, Yost and Jenkins",misc_net,780.52,Douglas,Willis,M,619 Jeremy Garden Apt. 681,...,42.5545,-90.3508,1306,Public relations officer,9/10/1958,ab4b379d2c0c9c667d46508d4e126d72,1371853942,42.461127,-91.147148,1
2,1781,6/21/2020 22:37,6564460000000000.0,fraud_Nienow PLC,entertainment,620.33,Douglas,Willis,M,619 Jeremy Garden Apt. 681,...,42.5545,-90.3508,1306,Public relations officer,9/10/1958,47a9987ae81d99f7832a54b29a77bf4b,1371854247,42.771834,-90.158365,1
3,1784,6/21/2020 22:38,4005680000000000.0,"fraud_Heathcote, Yost and Kertzmann",shopping_net,1077.69,William,Perry,M,458 Phillips Island Apt. 768,...,30.459,-90.9027,71335,Herbalist,5/31/1994,fe956c7e4a253c437c18918bf96f7b62,1371854335,31.204974,-90.261595,1
4,1857,6/21/2020 23:02,3560730000000000.0,fraud_Hermann and Sons,shopping_pos,842.65,Brooke,Smith,F,63542 Luna Brook Apt. 012,...,31.8599,-102.7413,23,Cytogeneticist,9/15/1969,f6838c01f5d2262006e6b71d33ba7c6d,1371855736,31.315782,-102.73639,1


In [19]:
# Score every test row (axis=1 passes one row at a time to the function)
# and store the result in a new 'risk_probability' column on df_test.
df_test['risk_probability'] = df_test.apply(calculate_fraud_probability, axis=1)

# Write the scored test data to CSV; index=False keeps the DataFrame index out of the file.
df_test.to_csv('Test_dataset_with_probability.csv', index=False)

In [20]:
# Check the first rows now that 'risk_probability' has been added
df_test.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,risk_probability
0,1685,6/21/2020 22:06,3560730000000000.0,fraud_Hamill-D'Amore,health_fitness,24.84,Brooke,Smith,F,63542 Luna Brook Apt. 012,...,-102.7413,23,Cytogeneticist,9/15/1969,16bf2e46c54369a8eab2214649506425,1371852399,32.575873,-102.60429,1,0.581096
1,1767,6/21/2020 22:32,6564460000000000.0,"fraud_Rodriguez, Yost and Jenkins",misc_net,780.52,Douglas,Willis,M,619 Jeremy Garden Apt. 681,...,-90.3508,1306,Public relations officer,9/10/1958,ab4b379d2c0c9c667d46508d4e126d72,1371853942,42.461127,-91.147148,1,0.825668
2,1781,6/21/2020 22:37,6564460000000000.0,fraud_Nienow PLC,entertainment,620.33,Douglas,Willis,M,619 Jeremy Garden Apt. 681,...,-90.3508,1306,Public relations officer,9/10/1958,47a9987ae81d99f7832a54b29a77bf4b,1371854247,42.771834,-90.158365,1,0.572102
3,1784,6/21/2020 22:38,4005680000000000.0,"fraud_Heathcote, Yost and Kertzmann",shopping_net,1077.69,William,Perry,M,458 Phillips Island Apt. 768,...,-90.9027,71335,Herbalist,5/31/1994,fe956c7e4a253c437c18918bf96f7b62,1371854335,31.204974,-90.261595,1,1.0
4,1857,6/21/2020 23:02,3560730000000000.0,fraud_Hermann and Sons,shopping_pos,842.65,Brooke,Smith,F,63542 Luna Brook Apt. 012,...,-102.7413,23,Cytogeneticist,9/15/1969,f6838c01f5d2262006e6b71d33ba7c6d,1371855736,31.315782,-102.73639,1,0.810834
