## EDA

In [None]:
import pandas as pd

# Locations of the raw train/test splits of the fraud dataset.
train_csv_path = r"C:\Users\kevin\Documents\Projects\Fraudvestigate\Dataset\fraudTrain.csv"
test_csv_path = r"C:\Users\kevin\Documents\Projects\Fraudvestigate\Dataset\fraudTest.csv"

df_train = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)

print(f"Train Shape: {df_train.shape}")
print(f"Test Shape:  {df_test.shape}")

# Compare the dtypes of the key columns across both splits. A column that is
# float in one split but object in the other would not line up cleanly once
# the two frames are combined, so surface any mismatch here first.
key_columns = ["trans_date_trans_time", "amt", "is_fraud"]
for heading, frame in (
    ("Training Data Types:", df_train),
    ("\nTest Data Types:", df_test),
):
    print(heading)
    print(frame.dtypes[key_columns])

# Class imbalance: share of rows flagged as fraud in each split, as a
# percentage, to see whether the test split resembles the train split.
train_fraud_rate = df_train["is_fraud"].mean() * 100
test_fraud_rate = df_test["is_fraud"].mean() * 100

print(f"Fraud Rate in Train: {train_fraud_rate:.2f}%")
print(f"Fraud Rate in Test:  {test_fraud_rate:.2f}%")

Train Shape: (1296675, 23)
Test Shape:  (555719, 23)
Training Data Types:
trans_date_trans_time     object
amt                      float64
is_fraud                   int64
dtype: object

Test Data Types:
trans_date_trans_time     object
amt                      float64
is_fraud                   int64
dtype: object
Fraud Rate in Train: 0.58%
Fraud Rate in Test:  0.39%


## Merge (Preserving data integrity)

In [None]:
# Tag every row with the split it came from so the original train/test
# partition can still be recovered after the frames are combined.
df_train["dataset_source"] = "train"
df_test["dataset_source"] = "test"

# pd.concat aligns columns by name and fills any column missing from one
# frame with NaN, so verify both splits expose the same columns up front
# instead of silently producing half-empty columns.
mismatched_columns = set(df_train.columns) ^ set(df_test.columns)
assert not mismatched_columns, (
    f"Train/test columns differ: {sorted(map(str, mismatched_columns))}"
)

# Merge vertically; ignore_index gives the combined frame a fresh 0..n-1 index.
df_merged = pd.concat([df_train, df_test], ignore_index=True)
print(f"Combined Shape: {df_merged.shape}")

# Check that the merge neither lost nor duplicated rows: the combined row
# count must equal the sum of the two inputs.
expected_rows = len(df_train) + len(df_test)
actual_rows = len(df_merged)
print(f"Row Count Match: {expected_rows == actual_rows}")

# Fail loudly rather than only printing, so a broken merge cannot flow into
# the export cells unnoticed.
assert actual_rows == expected_rows, (
    f"Row count mismatch after merge: expected {expected_rows}, got {actual_rows}"
)

Combined Shape: (1852394, 24)
Row Count Match: True


## Export to CSV

In [None]:
# Map the terse source column names to more descriptive ones.
column_renames = {
    "trans_num": "transaction_id",
    "amt": "transaction_amount",
    "merch_lat": "merchant_latitude",
    "merch_long": "merchant_longitude",
    "cc_num": "credit_card_number",
}
# Rename in place so df_merged keeps referring to the same frame object.
df_merged.rename(columns=column_renames, inplace=True)

# Write the merged, renamed frame to disk without the row index.
merged_csv_path = r"C:\Users\kevin\Documents\Projects\Fraudvestigate\Dataset\cleaned_transactions_merged.csv"
df_merged.to_csv(merged_csv_path, index=False)

In [6]:
# Preview the first rows after the rename to inspect the resulting columns.
df_merged.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,credit_card_number,merchant,category,transaction_amount,first,last,gender,street,...,long,city_pop,job,dob,transaction_id,unix_time,merchant_latitude,merchant_longitude,is_fraud,dataset_source
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0,train
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0,train
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0,train
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0,train
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0,train


In [None]:
# Remove the leftover 'Unnamed: 0' column when it is present;
# errors="ignore" makes this a no-op if the column is already gone.
df_merged.drop(columns=['Unnamed: 0'], errors="ignore", inplace=True)

# Write the final frame to disk without the row index.
final_csv_path = r"C:\Users\kevin\Documents\Projects\Fraudvestigate\Dataset\final_transactions.csv"
df_merged.to_csv(final_csv_path, index=False)

# Show the final column layout.
final_columns = df_merged.columns.tolist()
print(final_columns)

['trans_date_trans_time', 'credit_card_number', 'merchant', 'category', 'transaction_amount', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'job', 'dob', 'transaction_id', 'unix_time', 'merchant_latitude', 'merchant_longitude', 'is_fraud', 'dataset_source']


In [9]:
# Preview the first rows again to confirm the column layout of the final frame.
df_merged.head()

Unnamed: 0,trans_date_trans_time,credit_card_number,merchant,category,transaction_amount,first,last,gender,street,city,...,long,city_pop,job,dob,transaction_id,unix_time,merchant_latitude,merchant_longitude,is_fraud,dataset_source
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,...,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0,train
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,...,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0,train
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,...,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0,train
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,...,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0,train
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,...,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0,train
