In [1]:
import pandas as pd

# Load CSV
df = pd.read_csv("mock_fraud_transactions_raw.csv")


df.head()
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Transaction_ID      10000 non-null  object 
 1   Customer_ID         10000 non-null  object 
 2   Transaction_Amount  10000 non-null  float64
 3   Payment_Method      10000 non-null  object 
 4   Customer_Type       6418 non-null   object 
 5   Email               9705 non-null   object 
 6   Region              10000 non-null  object 
 7   Currency            10000 non-null  object 
 8   Transaction_Date    10000 non-null  object 
dtypes: float64(1), object(8)
memory usage: 703.3+ KB


In [2]:
# Column-wise null count
df.isnull().sum()

# Check unique values and sample
for col in df.columns:
    print(f"{col} ➜ {df[col].unique()[:10]}")


Transaction_ID ➜ ['TXN109442' 'TXN107838' 'TXN108311' 'TXN107658' 'TXN105544' 'TXN100851'
 'TXN108036' 'TXN102121' 'TXN108745' 'TXN101192']
Customer_ID ➜ ['CUST10442' 'CUST8838' 'CUST9311' 'CUST8658' 'CUST6544' 'CUST1851'
 'CUST9036' 'CUST3121' 'CUST9745' 'CUST2192']
Transaction_Amount ➜ [18962.94   738.95 53842.35 17819.17 13950.38 35212.14 11518.87 33022.73
  1213.62 15178.2 ]
Payment_Method ➜ ['Unknown' 'Credit Card' 'Debit Card' 'Crypto' 'Net Banking' 'UPI']
Customer_Type ➜ ['Individual' 'VIP' 'Business' nan 'Unknown' ' ']
Email ➜ ['user9442@outlook.com' 'user7838' ' ' 'user7658@fraudmail.com'
 'user5544@tempmail.org' 'user851@gmail.com' 'user8036'
 'user2121@yahoo.com' 'user8745@fraudmail.com' 'user1192@yahoo.com']
Region ➜ ['UK' 'India' 'USA' 'Thailand' 'Germany']
Currency ➜ ['EUR' 'THB' 'USD' 'INR' 'GBP']
Transaction_Date ➜ ['2024-06-09 05:00:00' '2024-01-11 06:00:00' '2024-12-08 07:00:00'
 '2024-11-05 20:00:00' '2024-09-22 20:00:00' '2024-11-15 11:00:00'
 '2024-11-28 05:00:00' 

In [3]:
# Strip all string columns
df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

# Replace empty strings with NaN
df.replace('', pd.NA, inplace=True)


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


In [4]:
# Fix datetime
df['Transaction_Date'] = pd.to_datetime(df['Transaction_Date'], dayfirst=True, errors='coerce')

# Check currency + amount
df['Transaction_Amount'] = pd.to_numeric(df['Transaction_Amount'], errors='coerce')


In [5]:
# Numerical overview
df.describe()

# By region / payment method
df['Region'].value_counts()
df['Payment_Method'].value_counts()
df['Customer_Type'].value_counts(dropna=False)


Customer_Type
NaN           3582
Business      1572
Individual    1570
VIP           1541
Unknown       1535
<NA>           200
Name: count, dtype: int64

In [36]:


# Step 1: Clean Customer_Type
df['Customer_Type'] = df['Customer_Type'].fillna('Missing')

# Step 2: Convert Transaction_Date to datetime
df['Transaction_Date'] = pd.to_datetime(df['Transaction_Date'], dayfirst=True)

# Step 3: Flag high transaction amounts (> 75,000)
df['High_Amount'] = df['Transaction_Amount'] > 75000

# Step 4: Flag suspicious payment methods
suspicious_methods = ['Unknown', 'Crypto']
df['Suspicious_Method'] = df['Payment_Method'].isin(suspicious_methods)

# Step 5: Flag unknown or missing customer type
df['Unknown_Customer_Type'] = df['Customer_Type'].isin(['Unknown', 'Missing'])

# Step 6: Create a combined anomaly/fraud column based on your logic
df['Is_Anomaly'] = df[['High_Amount', 'Suspicious_Method', 'Unknown_Customer_Type']].all(axis=1)


# View anomalies
anomalies = df[df['Is_Anomaly']]


print(df['Is_Anomaly'].value_counts())



Is_Anomaly
False    9959
True       41
Name: count, dtype: int64


In [21]:
df['Is_Fraud'] = (
    ((df['Transaction_Amount'] > 75000) & df['Payment_Method'].isin(['Crypto', 'Unknown'])) |
    ((df['Customer_Type'].isin(['Unknown', 'Missing'])) & (df['Transaction_Amount'] > 75000)) |
    (df[['High_Amount', 'Suspicious_Method', 'Unknown_Customer_Type']].sum(axis=1) >= 2)
)


In [28]:

print(df['Is_Fraud'].value_counts())

Is_Fraud
False    8141
True     1859
Name: count, dtype: int64


In [29]:
df

Unnamed: 0,Transaction_ID,Customer_ID,Transaction_Amount,Payment_Method,Customer_Type,Email,Region,Currency,Transaction_Date,High_Amount,Suspicious_Method,Unknown_Customer_Type,Is_Anomaly,Is_Fraud
0,TXN109442,CUST10442,18962.94,Unknown,Individual,user9442@outlook.com,UK,EUR,2024-09-06 05:00:00,False,True,False,True,False
1,TXN107838,CUST8838,738.95,Credit Card,VIP,user7838,India,THB,2024-11-01 06:00:00,False,False,False,False,False
2,TXN108311,CUST9311,53842.35,Debit Card,Business,,UK,USD,2024-08-12 07:00:00,False,False,False,False,False
3,TXN107658,CUST8658,17819.17,Crypto,VIP,user7658@fraudmail.com,UK,INR,2024-05-11 20:00:00,False,True,False,True,False
4,TXN105544,CUST6544,13950.38,Crypto,Missing,user5544@tempmail.org,UK,THB,NaT,False,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,TXN107086,CUST8086,6612.77,Debit Card,Unknown,user7086@gmail.com,UK,GBP,NaT,False,False,True,True,False
9996,TXN105093,CUST6093,4650.68,Crypto,VIP,user5093@gmail.com,Thailand,THB,2024-09-01 00:00:00,False,True,False,True,False
9997,TXN102719,CUST3719,5822.96,Credit Card,Business,user2719@tempmail.org,Thailand,THB,NaT,False,False,False,False,False
9998,TXN105673,CUST6673,15795.57,Crypto,Business,user5673@tempmail.org,Germany,USD,NaT,False,True,False,True,False


In [34]:
df[df['Is_Fraud']].groupby('Region').size().sort_values(ascending=False)


Region
India       398
USA         387
Thailand    372
UK          369
Germany     333
dtype: int64

In [35]:
df.to_csv('cleaned_transaction_data.csv', index=False)
