# Feature Engineering

In [7]:
import pandas as pd

# Load the processed transaction table and the IP-range -> country lookup table.
df = pd.read_csv("../data/processed_fraud_data.csv")
ip_map = pd.read_csv("../data/processed_IpAddress_to_Country.csv")


# IP conversion: cast to an explicit 64-bit integer. A bare `int` maps to the
# platform C long, which is 32-bit on Windows with NumPy < 2; the numeric IPs in
# this dataset exceed 2**31 - 1 (the describe() output reports values above
# 3.2e9), so a 32-bit cast would overflow there.
df['ip_address'] = df['ip_address'].astype('int64')


In [8]:
# Structural overview of the loaded frame: dimensions first, then per-column
# dtypes / non-null counts, and finally summary statistics for every column
# (numeric and non-numeric alike) as the cell's displayed result.
frame_shape = df.shape
print("Shape:", frame_shape)
df.info()
df.describe(include='all')

Shape: (151112, 12)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151112 entries, 0 to 151111
Data columns (total 12 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   user_id            151112 non-null  int64  
 1   signup_time        151112 non-null  object 
 2   purchase_time      151112 non-null  object 
 3   purchase_value     151112 non-null  int64  
 4   device_id          151112 non-null  object 
 5   source             151112 non-null  object 
 6   browser            151112 non-null  object 
 7   sex                151112 non-null  object 
 8   age                151112 non-null  int64  
 9   ip_address         151112 non-null  int64  
 10  class              151112 non-null  int64  
 11  time_since_signup  151112 non-null  float64
dtypes: float64(1), int64(5), object(6)
memory usage: 13.8+ MB


Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,time_since_signup
count,151112.0,151112,151112,151112.0,151112,151112,151112,151112,151112.0,151112.0,151112.0,151112.0
unique,,151112,150679,,137956,3,5,2,,,,
top,,2015-02-24 22:55:49,2015-07-17 23:22:55,,ITUMJCKWEYNDD,SEO,Chrome,M,,,,
freq,,1,3,,20,60615,61432,88293,,,,
mean,200171.04097,,,36.935372,,,,,33.140704,2152145000.0,0.093646,1370.008125
std,115369.285024,,,18.322762,,,,,8.617733,1248497000.0,0.291336,868.406422
min,2.0,,,9.0,,,,,18.0,52093.0,0.0,0.000278
25%,100642.5,,,22.0,,,,,27.0,1085934000.0,0.0,607.431528
50%,199958.0,,,35.0,,,,,33.0,2154770000.0,0.0,1368.429306
75%,300054.0,,,49.0,,,,,39.0,3243258000.0,0.0,2123.479028


In [9]:
# Parse the two timestamp columns once and keep them as named intermediates so
# every derived feature below is computed from the same parsed values.
purchase_ts = pd.to_datetime(df['purchase_time'])
signup_ts = pd.to_datetime(df['signup_time'])
df['purchase_time'] = purchase_ts
df['signup_time'] = signup_ts

# Hours elapsed between signing up and purchasing (seconds / 3600).
signup_to_purchase = purchase_ts - signup_ts
df['time_since_signup'] = signup_to_purchase.dt.total_seconds() / 3600

# Calendar features taken from the purchase timestamp.
df['hour_of_day'] = purchase_ts.dt.hour
df['day_of_week'] = purchase_ts.dt.dayofweek

In [10]:
# Merge IPs
def find_country(ip, ranges=None):
    """Return the country whose IP range contains ``ip``.

    Parameters
    ----------
    ip : int or float
        Numeric IP address to look up.
    ranges : pandas.DataFrame, optional
        Lookup table with ``lower_bound_ip_address``, ``upper_bound_ip_address``
        and ``country`` columns. Defaults to the notebook-level ``ip_map``.

    Returns
    -------
    object
        The ``country`` value of the first row (in table order) whose inclusive
        bounds contain ``ip``, or ``'Unknown'`` when no row matches.
    """
    table = ip_map if ranges is None else ranges
    contains_ip = (table['lower_bound_ip_address'] <= ip) & (table['upper_bound_ip_address'] >= ip)
    match = table[contains_ip]
    return match['country'].values[0] if not match.empty else 'Unknown'


def _lookup_countries(ips, ranges):
    """Map a whole Series of numeric IPs to countries in one vectorised pass.

    Gives the same answer as calling ``find_country`` on each element, but
    sorts the range table once and binary-searches it instead of scanning the
    full table for every single IP.

    Parameters
    ----------
    ips : pandas.Series
        Numeric IP addresses.
    ranges : pandas.DataFrame
        Lookup table with ``lower_bound_ip_address``, ``upper_bound_ip_address``
        and ``country`` columns.

    Returns
    -------
    pandas.Series
        Country per IP, indexed like ``ips``; ``'Unknown'`` where no range
        contains the address.
    """
    if ranges.empty:
        # No ranges at all: nothing can match.
        return pd.Series('Unknown', index=ips.index, dtype=object)

    # Stable sort so the binary search sees ascending lower bounds.
    ordered = ranges.sort_values('lower_bound_ip_address', kind='stable')
    lower = ordered['lower_bound_ip_address'].to_numpy()
    upper = ordered['upper_bound_ip_address'].to_numpy()
    countries = ordered['country'].to_numpy()

    # The binary search is only valid when each range ends before the next one
    # starts. If any neighbouring ranges overlap, keep the exact (slow) row-wise
    # semantics: first matching row in table order wins.
    if (upper[:-1] >= lower[1:]).any():
        return ips.apply(lambda ip: find_country(ip, ranges))

    ip_values = ips.to_numpy()
    # Index of the last range whose lower bound is <= ip; -1 when the IP lies
    # below every range.
    pos = ordered['lower_bound_ip_address'].searchsorted(ip_values, side='right') - 1
    safe_pos = pos.clip(min=0)  # keep the fancy-indexing below in bounds
    inside = (pos >= 0) & (ip_values <= upper[safe_pos])
    return pd.Series(countries[safe_pos], index=ips.index).where(inside, 'Unknown')


df['country'] = _lookup_countries(df['ip_address'], ip_map)


In [11]:
# Persist the engineered frame to CSV (row index omitted), then display the
# first rows of the newly derived feature columns as the cell's result.
df.to_csv("../data/feature_engineering_Processed.csv", index=False)
engineered_cols = ['time_since_signup', 'hour_of_day', 'day_of_week', 'country']
df.loc[:, engineered_cols].head()


Unnamed: 0,time_since_signup,hour_of_day,day_of_week,country
0,1251.856111,2,5,Japan
1,4.984444,1,0,United States
2,0.000278,18,3,United States
3,136.690278,13,0,Unknown
4,1211.516944,18,2,United States
