Load data

In [2]:
import pandas as pd

In [4]:
# Load CSV files
transactions = pd.read_csv("Fraud_Data.csv")
IpAddress = pd.read_csv("IpAddress_to_Country.csv")

Merge datasets

In [6]:
# Ensure numeric columns
transactions['ip_address'] = transactions['ip_address'].astype(float)
IpAddress['lower_bound_ip_address'] = IpAddress['lower_bound_ip_address'].astype(float)
IpAddress['upper_bound_ip_address'] = IpAddress['upper_bound_ip_address'].astype(float)

# Perform range join using a loop (slower but functional for small data)
def map_country(ip_address):
    match = IpAddress[
        (ip_address >= IpAddress['lower_bound_ip_address']) &
        (ip_address <= IpAddress['upper_bound_ip_address'])
    ]
    return match['country'].iloc[0] if not match.empty else None

# Map country for each user
transactions['country'] = transactions['ip_address'].apply(map_country)

# Save merged data
transactions.to_csv("transactionsMerged.csv", index=False)

# Display a sample of the merged data
print(transactions.head())

   user_id          signup_time        purchase_time  purchase_value  \
0    22058  2015-02-24 22:55:49  2015-04-18 02:47:11              34   
1   333320  2015-06-07 20:39:50  2015-06-08 01:38:54              16   
2     1359  2015-01-01 18:52:44  2015-01-01 18:52:45              15   
3   150084  2015-04-28 21:13:25  2015-05-04 13:54:50              44   
4   221365  2015-07-21 07:09:52  2015-09-09 18:40:53              39   

       device_id source browser sex  age    ip_address  class        country  
0  QVPSPJUOCKZAR    SEO  Chrome   M   39  7.327584e+08      0          Japan  
1  EOGFQPIZPYXFZ    Ads  Chrome   F   53  3.503114e+08      0  United States  
2  YSSKYOSJHPPLJ    SEO   Opera   M   53  2.621474e+09      1  United States  
3  ATGTXKYKUDUQN    SEO  Safari   M   41  3.840542e+09      0           None  
4  NAUITBZFJKHWW    Ads  Safari   M   45  4.155831e+08      0  United States  


Investigation data summary

In [8]:
# Check general info
print(transactions.info())

# Check for missing values
print(transactions.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151112 entries, 0 to 151111
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   user_id         151112 non-null  int64  
 1   signup_time     151112 non-null  object 
 2   purchase_time   151112 non-null  object 
 3   purchase_value  151112 non-null  int64  
 4   device_id       151112 non-null  object 
 5   source          151112 non-null  object 
 6   browser         151112 non-null  object 
 7   sex             151112 non-null  object 
 8   age             151112 non-null  int64  
 9   ip_address      151112 non-null  float64
 10  class           151112 non-null  int64  
 11  country         129146 non-null  object 
dtypes: float64(1), int64(4), object(7)
memory usage: 13.8+ MB
None
user_id               0
signup_time           0
purchase_time         0
purchase_value        0
device_id             0
source                0
browser               0
sex      

In [9]:
transactions['country'].fillna("Unknown", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  transactions['country'].fillna("Unknown", inplace=True)
