In [1]:
import pandas as pd


In [3]:
# Load the fraud dataset from its CSV file into a DataFrame
data = pd.read_csv(filepath_or_buffer='./data/Fraud_Data.csv')

In [4]:
# Count the null entries in each column and report the totals
missing_values = data.isna().sum()
print("Missing values:\n", missing_values)

Missing values:
 user_id           0
signup_time       0
purchase_time     0
purchase_value    0
device_id         0
source            0
browser           0
sex               0
age               0
ip_address        0
class             0
dtype: int64


In [5]:
# Parse both timestamp columns into pandas datetime values
for time_col in ('signup_time', 'purchase_time'):
    data[time_col] = pd.to_datetime(data[time_col])

In [6]:
# Drop fully duplicated rows, keeping the first occurrence of each
data = data.drop_duplicates()

In [9]:
# Validate IP addresses
def validate_ip(ip):
    """Return True when ``ip`` is a valid IPv4 address, otherwise False.

    Two representations are accepted:

    * a number (int or float), interpreted as the 32-bit integer form of an
      IPv4 address; any fractional part is truncated and the result must lie
      in ``0 .. 2**32 - 1``;
    * a string, either dotted-quad text (``"a.b.c.d"`` with each octet in
      ``0 .. 255``) or the text of such a number.

    The previous implementation converted the value with ``str(int(ip))`` and
    then split on ``'.'``; an integer's text never contains a dot, so every
    numeric input was rejected and dotted strings raised ``ValueError``.

    Parameters
    ----------
    ip : int, float or str
        The address to check.

    Returns
    -------
    bool
        True if the value is a valid IPv4 address. Values that cannot be
        interpreted (NaN, infinity, None, arbitrary text) yield False.
    """
    max_ipv4 = 0xFFFFFFFF  # largest 32-bit address, 255.255.255.255

    if isinstance(ip, str):
        octets = ip.strip().split('.')
        if len(octets) == 4:
            # isascii() guards against unicode digits that int() cannot parse.
            return all(
                octet.isascii() and octet.isdigit() and int(octet) <= 255
                for octet in octets
            )

    try:
        # Strings that are not dotted-quad are treated as numeric text.
        as_int = int(float(ip)) if isinstance(ip, str) else int(ip)
    except (TypeError, ValueError, OverflowError):
        # NaN, +/-inf, None and non-numeric text end up here.
        return False

    return 0 <= as_int <= max_ipv4

# Flag each row with the result of validate_ip on its ip_address value
data['valid_ip'] = data['ip_address'].map(validate_ip)

In [11]:
# Handle outliers - purchase_value
# Upper cutoff: three standard deviations above the mean purchase value.
purchase_value_mean = data.purchase_value.mean()
purchase_value_std = data.purchase_value.std()
outlier_threshold = 3 * purchase_value_std + purchase_value_mean

In [12]:
# Keep only the rows whose purchase_value does not exceed the cutoff
data = data[data['purchase_value'].le(outlier_threshold)]


In [13]:
# Handle outliers - age
# Upper cutoff: three standard deviations above the mean age.
age_mean = data.age.mean()
age_std = data.age.std()
outlier_threshold = 3 * age_std + age_mean

In [14]:
# Keep only the rows whose age does not exceed the cutoff
data = data[data['age'].le(outlier_threshold)]

In [15]:

# Standardize categorical variables - lower-case source and browser
for category_col in ('source', 'browser'):
    data[category_col] = data[category_col].str.lower()

In [16]:
# Feature engineering
# Elapsed time between signup and purchase (purchase_time - signup_time).
data['signup_purchase_duration'] = data['purchase_time'] - data['signup_time']
# Age brackets use left-closed bins (right=False) so every label matches the
# range it names: '<18' -> [0, 18), '18-30' -> [18, 31), '31-40' -> [31, 41),
# '41-50' -> [41, 51), '50+' -> [51, 121). With pd.cut's default right-closed
# bins on [0, 18, 30, ...] an 18-year-old was labelled '<18'.
data['age_bracket'] = pd.cut(
    data['age'],
    bins=[0, 18, 31, 41, 51, 121],
    right=False,
    labels=['<18', '18-30', '31-40', '41-50', '50+'],
)
# First three characters of each device_id.
data['device_id_info'] = data['device_id'].str[:3]

In [18]:
# Write the cleaned frame to CSV without the row index, then confirm
data.to_csv(path_or_buf='Cleaned FraudData.csv', index=False)
print("Cleaned data saved as 'Cleaned FraudData.csv'")

Cleaned data saved as 'Cleaned FraudData.csv'


In [17]:
# Show the cleaned frame beneath a heading line
print("Modified data:\n", data, sep=" ")

Modified data:
         user_id         signup_time       purchase_time  purchase_value  \
0         22058 2015-02-24 22:55:49 2015-04-18 02:47:11              34   
1        333320 2015-06-07 20:39:50 2015-06-08 01:38:54              16   
2          1359 2015-01-01 18:52:44 2015-01-01 18:52:45              15   
3        150084 2015-04-28 21:13:25 2015-05-04 13:54:50              44   
4        221365 2015-07-21 07:09:52 2015-09-09 18:40:53              39   
...         ...                 ...                 ...             ...   
151107   345170 2015-01-27 03:03:34 2015-03-29 00:30:47              43   
151108   274471 2015-05-15 17:43:29 2015-05-26 12:24:39              35   
151109   368416 2015-03-03 23:07:31 2015-05-20 07:07:47              40   
151110   207709 2015-07-09 20:06:07 2015-09-07 09:34:46              46   
151111   138208 2015-06-10 07:02:20 2015-07-21 02:03:53              20   

            device_id  source browser sex  age    ip_address  class  valid_ip  \
0 