In [1]:
# setup 
import pandas as pd

# Initital analysis of the business problem
## Taking into account the actual orders supposed to multiple records per
## Adding onto the dataframe to be able to group by and analyse them
- If amount, country and card are the same AND between timestamp is <= 1 minute then it's the same order 

In [2]:
# retrieve intermediate data
df = pd.read_csv('../data/intermediate/clean_data.csv', index_col=False)
df.head()

Unnamed: 0,tmsp,country,amount,success,psp,3d_secured,card
0,2019-01-01 00:01:11,Germany,89,0,UK_Card,0,Visa
1,2019-01-01 00:01:17,Germany,89,1,UK_Card,0,Visa
2,2019-01-01 00:02:49,Germany,238,0,UK_Card,1,Diners
3,2019-01-01 00:03:13,Germany,238,1,UK_Card,1,Diners
4,2019-01-01 00:04:33,Austria,124,0,Simplecard,0,Diners


In [3]:
# Sort data by timestamp, looks sorted already, just to make sure
df['tmsp'] = pd.to_datetime(df['tmsp'])
df = df.sort_values(by='tmsp')

# Create a function to identify the same transaction
# aggregate by within one minute of previous where country, card, amount is the same 
def identify_transactions(df):
    # Shift the relevant columns to compare with the next row
    df['time_diff'] = df['tmsp'].diff().fillna(pd.Timedelta(seconds=0)) # To be able to be within the 1 minute limit
    df['same_transaction'] = ((df['country'] == df['country'].shift()) &
                               (df['amount'] == df['amount'].shift()) &
                               (df['card'] == df['card'].shift()) &
                               (df['time_diff'] <= pd.Timedelta(minutes=1)))

    # Create a transaction group identifier by cumulative sum
    df['order_id'] = (~df['same_transaction']).cumsum()

    return df

In [4]:
# Apply the function
df_transaction = identify_transactions(df)

# Show the result
display(df_transaction)

Unnamed: 0,tmsp,country,amount,success,psp,3d_secured,card,time_diff,same_transaction,order_id
0,2019-01-01 00:01:11,Germany,89,0,UK_Card,0,Visa,0 days 00:00:00,False,1
1,2019-01-01 00:01:17,Germany,89,1,UK_Card,0,Visa,0 days 00:00:06,True,1
2,2019-01-01 00:02:49,Germany,238,0,UK_Card,1,Diners,0 days 00:01:32,False,2
3,2019-01-01 00:03:13,Germany,238,1,UK_Card,1,Diners,0 days 00:00:24,True,2
4,2019-01-01 00:04:33,Austria,124,0,Simplecard,0,Diners,0 days 00:01:20,False,3
...,...,...,...,...,...,...,...,...,...,...
50405,2019-02-28 23:45:39,Switzerland,415,0,UK_Card,0,Visa,0 days 00:00:00,True,27337
50406,2019-02-28 23:46:48,Austria,91,0,UK_Card,0,Master,0 days 00:01:09,False,27338
50407,2019-02-28 23:47:04,Austria,91,0,UK_Card,0,Master,0 days 00:00:16,True,27338
50408,2019-02-28 23:47:36,Austria,91,0,UK_Card,0,Master,0 days 00:00:32,True,27338


- there are 27338 order_id's

In [14]:
# Group by order_id and count the number of entries for each order
order_counts = df.groupby('order_id').size()

# Filter the orders where there is only one entry (i.e., successful on the first try)
single_entry_orders = order_counts[order_counts == 1].index

# Filter the DataFrame for only these single-entry orders
single_entry_df = df[df['order_id'].isin(single_entry_orders)]

# Count the number of successful single-entry orders (where success = 1)
successful_single_entry_orders = single_entry_df[single_entry_df['success'] == 1].shape[0]

# Group by order_id and calculate the max success for each order
order_success = df.groupby('order_id')['success'].max()

# Filter orders where max(success) == 0 (all failed transactions)
failed_orders = order_success[order_success == 0].index

# Filter the original DataFrame for these failed orders
failed_orders_df = df[df['order_id'].isin(failed_orders)]

# Calculate the total number of failed orders
total_failed_orders = failed_orders_df['order_id'].nunique()

# Filter orders where max(success) == 0 (all failed transactions)
success_orders = order_success[order_success == 1].index

# Filter the original DataFrame for these failed orders
success_orders_df = df[df['order_id'].isin(success_orders)]

# Calculate the total number of failed orders
total_success_orders_df = success_orders_df['order_id'].nunique()

# Calculate the total number of orders
total_orders = df['order_id'].nunique()

# Calculate the proportion of failed orders
failed_proportion = total_failed_orders / total_orders
failed_proportion_percentage = failed_proportion * 100

# Calculate the total number of tries for the failed orders
failed_orders_tries_before_giving_up = failed_orders_df.groupby('order_id').size() - 1

# Calculate the min, max, and mean number of tries before giving up (excluding the final attempt)
min_tries_before_giving_up = failed_orders_tries_before_giving_up.min()
max_tries_before_giving_up = failed_orders_tries_before_giving_up.max()
mean_tries_before_giving_up = failed_orders_tries_before_giving_up.mean()

# Display the results for failed orders
print(f"Total number of orders: {total_orders}")
print(f"Total number of failed orders: {total_failed_orders}")
print(f"Total number of overall successful transactions: {total_success_orders_df}")
print(f"Proportion of failed orders: {failed_proportion:.4f} ({failed_proportion_percentage:.2f}%)")

print(f"Total number of successful orders with only one attempt: {successful_single_entry_orders}")
print(f"Minimum number of tries before giving up (excluding final try): {min_tries_before_giving_up}")
print(f"Maximum number of tries before giving up (excluding final try): {max_tries_before_giving_up}")
print(f"Mean number of tries before giving up (excluding final try): {mean_tries_before_giving_up:.2f}")

Total number of orders: 27338
Total number of failed orders: 17113
Total number of overall successful transactions: 10225
Proportion of failed orders: 0.6260 (62.60%)
Total number of successful orders with only one attempt: 5639
Minimum number of tries before giving up (excluding final try): 0
Maximum number of tries before giving up (excluding final try): 8
Mean number of tries before giving up (excluding final try): 0.84


In [15]:
# Calculate the percentage of transactions that work on the first try
percentage_first_try_success = (successful_single_entry_orders / total_orders) * 100

# Display the result
print(f"Percentage of transactions that work on the first try: {percentage_first_try_success:.2f}%")

Percentage of transactions that work on the first try: 20.63%


In [18]:
# Filter out orders where success occurred on the first try
non_first_try_successful_transactions = df[df['order_id'].isin(single_entry_orders) & (df['success'] == 1) == False]

# For these non-first-try successful orders, calculate the cumulative count of transactions before success
transactions_before_success_non_first_try = non_first_try_successful_transactions.groupby('order_id').cumcount() + 1

# Calculate the mean number of tries before success for orders that did not succeed on the first try
mean_tries_before_success_non_first_try = transactions_before_success_non_first_try.mean()

# Display the result
print(f"Mean number of tries before success (excluding first-try successes): {mean_tries_before_success_non_first_try:.2f}")

Mean number of tries before success (excluding first-try successes): 1.96


- Overall it's not looking so good
    - only 20.63% of orders work on the first try
        - that means that half approx half of the customers that manage to pay need more than one try
    -- 4 out of 5 are experiencing issues with the payment
    - 62.6% of orders don't go through and get abandoned 
        - high loss of revenue 
        - the transaction fees should not be the biggest concern here 
        - customers abandon the cart on average on the second try
        -- should not get to it -- we need a high chance of success 