## Prepare data for modeling

In [1]:
# setup
import sys
import os
sys.path.append(os.path.abspath('../src'))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# from utils import 

In [2]:
# Load the cleaned transactions (with order ids and fees) from the intermediate
# data folder. index_col=False tells pandas not to use the first column as the index.
df = pd.read_csv(
    '../data/intermediate/clean_data_with_order_and_fees.csv',
    index_col=False,
)

In [3]:
# Preview the first rows of the loaded intermediate data.
df.head()

Unnamed: 0,tmsp,country,amount,success,psp,3d_secured,card,order_id,hour_of_day,day_of_week,fee
0,2019-01-01 00:01:11,Germany,89,0,UK_Card,0,Visa,1,0,1,1.0
1,2019-01-01 00:01:17,Germany,89,1,UK_Card,0,Visa,1,0,1,3.0
2,2019-01-01 00:02:49,Germany,238,0,UK_Card,1,Diners,2,0,1,1.0
3,2019-01-01 00:03:13,Germany,238,1,UK_Card,1,Diners,2,0,1,3.0
4,2019-01-01 00:04:33,Austria,124,0,Simplecard,0,Diners,3,0,1,0.5


In [7]:
# Prototype of the preprocessing steps, run directly on the raw PSP workbook.
# Column names are lowercased and the two 0/1 flags are stored as object dtype.
df = (
    pd.read_excel('../data/raw/psp_raw_data.xlsx', index_col=0)
    .rename(columns=str.lower)
    .astype({'success': 'object', '3d_secured': 'object'})
)

# Flag transactions whose timestamp hour lies in 13..17 (13:00 up to, not including, 18:00).
df = df.assign(
    is_peak_time=df['tmsp'].apply(lambda stamp: 1 if 13 <= stamp.hour < 18 else 0)
)

# Keep only the modeling features, with the target 'success' as the last column.
result = df.loc[:, ['country', 'card', '3d_secured', 'is_peak_time', 'amount', 'psp', 'success']]
result.head()

Unnamed: 0,country,card,3d_secured,is_peak_time,amount,psp,success
0,Germany,Visa,0,0,89,UK_Card,0
1,Germany,Visa,0,0,89,UK_Card,1
2,Germany,Diners,1,0,238,UK_Card,0
3,Germany,Diners,1,0,238,UK_Card,1
4,Austria,Diners,0,0,124,Simplecard,0


In [8]:
def preprocess_data(df, peak_start_hour=13, peak_end_hour=18):
    """
    Build the modeling table from the transaction data.

    The input DataFrame is never modified; all work happens on a new frame.

    Steps:
    - Lowercases all column names.
    - Adds 'is_peak_time': 1 when the hour of 'tmsp' lies in
      [peak_start_hour, peak_end_hour), else 0.
    - Casts the flag columns 'success', '3d_secured' and 'is_peak_time'
      to object dtype.
    - Keeps only the chosen features, ordered so the target 'success' is last.

    Parameters:
        df (pd.DataFrame): The input dataset. After lowercasing, it must
            contain 'tmsp', 'country', 'card', '3d_secured', 'amount',
            'psp' and 'success'. 'tmsp' may hold datetimes or strings
            that pd.to_datetime can parse.
        peak_start_hour (int): First hour of the peak window (inclusive).
        peak_end_hour (int): Hour at which the peak window ends (exclusive).

    Returns:
        pd.DataFrame: A new frame with the columns 'country', 'card',
        '3d_secured', 'is_peak_time', 'amount', 'psp', 'success'.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    # rename() returns a new frame, so the caller's column labels stay as they were.
    frame = df.rename(columns=str.lower)

    # Vectorized hour extraction; to_datetime is a no-op for datetime columns
    # and parses the values when the timestamps arrive as strings.
    hour = pd.to_datetime(frame['tmsp']).dt.hour
    in_peak = (hour >= peak_start_hour) & (hour < peak_end_hour)
    frame = frame.assign(is_peak_time=in_peak.astype('int64'))

    # Only grab the chosen features, reorder so target at end
    feature_order = ['country', 'card', '3d_secured', 'is_peak_time', 'amount', 'psp', 'success']

    # The 0/1 flags are stored as object dtype rather than as integers.
    flag_columns = ['success', '3d_secured', 'is_peak_time']
    return frame[feature_order].astype({name: 'object' for name in flag_columns})


In [10]:

# Reload the raw workbook so preprocessing starts from a freshly read frame.
df = pd.read_excel('../data/raw/psp_raw_data.xlsx', index_col=0)

# Run the frame through preprocess_data.
processed_data = df.pipe(preprocess_data)

# Preview the first rows of the result.
processed_data.head()

Unnamed: 0,country,card,3d_secured,is_peak_time,amount,psp,success
0,Germany,Visa,0,0,89,UK_Card,0
1,Germany,Visa,0,0,89,UK_Card,1
2,Germany,Diners,1,0,238,UK_Card,0
3,Germany,Diners,1,0,238,UK_Card,1
4,Austria,Diners,0,0,124,Simplecard,0


In [13]:
# Inspect the column dtypes of the preprocessed table.
# NOTE(review): preprocess_data casts 'is_peak_time' to object, yet the saved
# output of this cell lists it as int64 — the output looks stale; re-run the
# notebook top to bottom (Restart & Run All) to confirm.
processed_data.dtypes

country         object
card            object
3d_secured      object
is_peak_time     int64
amount           int64
psp             object
success         object
dtype: object