In [39]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample


In [40]:
# Load the raw CSV from disk and bind the resulting DataFrame to `data`
raw_csv_path = '../data/raw/Base.csv'
data = pd.read_csv(raw_csv_path)

Data Preprocessing

In [41]:
# Data Cleaning
# Collapse every negative intended_balcon_amount into the single value -1.
# Vectorised replacement for the row-wise lambda; NaN entries are left as-is
# because `NaN < 0` evaluates to False.
data['intended_balcon_amount'] = data['intended_balcon_amount'].mask(
    data['intended_balcon_amount'] < 0, -1
)

# Keep only rows where all three fields are non-negative. The explicit
# .copy() makes the filtered result an independent DataFrame, so the dtype
# conversion below does not operate on a slice of the original frame
# (which triggers SettingWithCopyWarning on pandas without copy-on-write).
valid_rows = (
    (data['current_address_months_count'] >= 0)
    & (data['session_length_in_minutes'] >= 0)
    & (data['device_distinct_emails_8w'] >= 0)
)
data = data.loc[valid_rows].copy()

# Convert the variables to the appropriate data types in a single pass:
# label-like columns become 'category', 0/1 flag columns become 'uint8'.
data = data.astype({
    'fraud_bool': 'category',
    'payment_type': 'category',
    'employment_status': 'category',
    'email_is_free': 'uint8',
    'housing_status': 'category',
    'phone_home_valid': 'uint8',
    'phone_mobile_valid': 'uint8',
    'has_other_cards': 'uint8',
    'foreign_request': 'uint8',
    'source': 'category',
    'device_os': 'category',
})

Dropping Redundant Columns

In [42]:
# Remove the device_fraud_count column. The result is rebound to `data`
# instead of dropping in place, so a frame that came out of a boolean filter
# is never mutated. errors='ignore' makes the cell safe to re-run: once the
# column is gone, a second execution is a no-op rather than a KeyError.
data = data.drop(columns=['device_fraud_count'], errors='ignore')

Undersampling to a 1:5 Fraud to Non-Fraud Ratio

In [43]:
# Separate fraud and non-fraud data
fraud_data = data[data['fraud_bool'] == 1]
non_fraud_data = data[data['fraud_bool'] == 0]

# Number of non-fraud rows to keep for every fraud row (1:5 class ratio).
NON_FRAUD_PER_FRAUD = 5

# Undersample non-fraud data to NON_FRAUD_PER_FRAUD times the size of the
# fraud data. Sampling is without replacement, so the request is capped at the
# number of non-fraud rows available; asking resample() for more than that
# with replace=False raises ValueError.
n_non_fraud_samples = min(NON_FRAUD_PER_FRAUD * len(fraud_data), len(non_fraud_data))
undersampled_non_fraud_data = resample(
    non_fraud_data,
    replace=False,
    n_samples=n_non_fraud_samples,
    random_state=42,  # fixed seed so the same rows are drawn on every run
)

# Combine fraud and undersampled non-fraud data (fraud rows first; the
# original row index labels are kept).
undersampled_data = pd.concat([fraud_data, undersampled_non_fraud_data])

In [44]:
# Write the combined sample to CSV, leaving the row index out of the file
processed_csv_path = '../data/processed/undersampled_data.csv'
undersampled_data.to_csv(processed_csv_path, index=False)