## Import Libraries

In [125]:
# Importing necessary libraries
from collections import Counter

import numpy as np
import pandas as pd

## Loading the Dataset

In [126]:
# Read the raw transaction records from a CSV file.
# NOTE: the CSV cannot be downloaded from the GitHub repository; it has to be
# dropped into the project folder by hand before this cell is run.
TX_CSV_PATH = 'fake_transactional_data_24.csv'
tx_data = pd.read_csv(TX_CSV_PATH)

# Expose the parsed table under the name the rest of the notebook works with.
tx_df = pd.DataFrame(data=tx_data)

In [127]:
# Preview the first rows to check the column names and value formats.
tx_df.head()

Unnamed: 0,from_totally_fake_account,monopoly_money_amount,to_randomly_generated_account,not_happened_yet_date
0,10371.0,4.0,CINEMA,01/01/2025
1,88339.0,2.4,40544,01/01/2025
2,18555.0,2.4,85149,01/01/2025
3,18555.0,4.1,HIPSTER_COFFEE_SHOP,01/01/2025
4,80792.0,1.95,18555,01/01/2025


### Creating Senders and Receivers List

In [128]:
# Creating senders and receivers list
# The sort is done inside pandas rather than with Python's sorted(): it avoids a
# Python-level sort over every transaction row, and missing values (if any) are
# placed at the end instead of leaving the order undefined.
senders_list = tx_df['from_totally_fake_account'].sort_values().tolist()
receivers_list = tx_df['to_randomly_generated_account'].sort_values().tolist()

# Show the receiver column as the cell output (accounts are a mix of numeric
# user IDs and business names, all stored as text).
tx_df['to_randomly_generated_account']

0                        CINEMA
1                         40544
2                         85149
3           HIPSTER_COFFEE_SHOP
4                         18555
                   ...         
10148275            COFFEE_SHOP
10148276            COFFEE_SHOP
10148277               WINE_BAR
10148278                  57569
10148279    A_LOCAL_COFFEE_SHOP
Name: to_randomly_generated_account, Length: 10148280, dtype: object


## Business Data Extraction and Organization

In [129]:
# Creating business and users list
# A receiver whose text passes str.isnumeric() is treated as a user account;
# every other receiver is treated as a business name.
receivers_user_list = [receiver for receiver in receivers_list if receiver.isnumeric()]
receivers_biz_list = [receiver for receiver in receivers_list if not receiver.isnumeric()]

In [130]:
# De-duplicate each list through a set, then sort the distinct values ascending.
unique_senders_list = sorted(set(senders_list))
unique_receivers_list = sorted(set(receivers_list))
unique_receivers_biz_list = sorted(set(receivers_biz_list))
unique_receivers_user_list = sorted(set(receivers_user_list))

### Extracting Valuable Business Data

In [7]:
# Creating business table
# Keep only the transactions whose receiver is a business, ordered by business name.
is_biz_receiver = tx_df['to_randomly_generated_account'].isin(unique_receivers_biz_list)
biz_tx_df = tx_df.loc[is_biz_receiver].sort_values('to_randomly_generated_account')

# Business names in the order they appear in the sorted table.
biz_name_list = biz_tx_df['to_randomly_generated_account'].unique().tolist()

In [8]:
# Getting Descriptive Stats
# Transaction amounts grouped by receiving business; every statistic below is
# computed from this one grouping.
biz_amounts_by_receiver = biz_tx_df.groupby('to_randomly_generated_account')['monopoly_money_amount']

# Volume of transactions per business.
biz_tx_count = biz_amounts_by_receiver.count()
biz_tx_sum = biz_amounts_by_receiver.sum()

# Typical transaction size per business.
biz_tx_mean = biz_amounts_by_receiver.mean()
biz_tx_median = biz_amounts_by_receiver.median()

In [9]:
# Needed to count words
from collections import Counter

# Aggregation helper: all values of a group, in ascending order
def sorted_list_agg(input_list):
    """Return the items of ``input_list`` as a new list in ascending order.

    Duplicates are kept. ``input_list`` may be any iterable of mutually
    comparable values; it is not modified.
    """
    ordered_values = list(input_list)
    ordered_values.sort()
    return ordered_values

# Aggregation helper: distinct values of a group, in ascending order
def sorted_unique_list_agg(input_list):
    """Return the distinct values of ``input_list`` as an ascending list.

    ``input_list`` must provide a ``.unique()`` method (for example a pandas
    Series); a plain Python list is not accepted.
    """
    distinct_values = list(input_list.unique())
    distinct_values.sort()
    return distinct_values

def sorted_list_count_agg(input_list):
    """Return how often each distinct value occurs in ``input_list``.

    The counts are ordered by ascending value: the i-th count belongs to the
    i-th entry of the sorted distinct values. Only the counts are returned,
    not the values themselves.
    """
    # Counting an already-sorted sequence means each distinct value is first
    # seen in ascending order, and a Counter keeps keys in first-seen order.
    tally = Counter(sorted(input_list))
    return list(tally.values())

In [10]:
# Sender accounts grouped by the business they paid; the three aggregations
# below all work on this one grouping.
biz_customers_by_receiver = biz_tx_df.groupby('to_randomly_generated_account')['from_totally_fake_account']

# Per business: every paying account, duplicates kept, ascending.
biz_tx_customers_sorted = biz_customers_by_receiver.agg(sorted_list_agg)

# Per business: each paying account once, ascending.
biz_tx_customers_sorted_unique = biz_customers_by_receiver.agg(sorted_unique_list_agg)

# Per business: number of transactions per paying account, in the same
# ascending account order as the unique list above.
biz_tx_customers_sorted_unique_count = biz_customers_by_receiver.agg(sorted_list_count_agg)

### Building Business Dataframes

In [11]:
# Column data for the slim business table (summary statistics only).
# The columns are combined by position: the groupby results are ordered by
# business name, and biz_name_list was read from biz_tx_df after sorting on the
# same column, so row i of every list refers to the same business.
biz_slim_data = {
    'name': biz_name_list,
    'tx_count': biz_tx_count.tolist(),
    'tx_sum_gbp': biz_tx_sum.tolist(),
    'tx_mean_gbp': biz_tx_mean.tolist(),
    'tx_median_gbp': biz_tx_median.tolist(),
}

# Column data for the large business table: the slim columns plus the
# per-business customer lists and their transaction counts.
biz_large_data = {
    **biz_slim_data,
    'customers_sorted_unique': biz_tx_customers_sorted_unique.tolist(),
    'customers_tx_count': biz_tx_customers_sorted_unique_count.tolist(),
}

# Materialise both tables.
biz_slim_df = pd.DataFrame(biz_slim_data)
biz_large_df = pd.DataFrame(biz_large_data)

## User Data Extraction and Organization

In [147]:
# User List
# user_id

# total_tx_count
# gross_tx_sum_gbp
# gross_tx_mean
# gross_tx_med

# sent_tx_count
# gross_tx_sent_sum_gbp
# gross_tx_sent_mean
# gross_tx_sent_med

# receive_tx_count
# gross_tx_receive_sum_gbp
# gross_tx_receive_mean
# gross_tx_receive_med

# net_tx_count
# net_tx_sum_gbp
# net_tx_mean_gbp

# biz_tx_name
# biz_tx_count
# biz_tx_sum
# biz_tx_mean

### Create User ID List

In [148]:
# Pool every account that appears as a sender or as a user (non-business) receiver.
customer_list_combination = unique_senders_list + unique_receivers_user_list

# Normalise each ID to the text of its integer value so that a sender such as
# 10371.0 and a receiver such as '10371' become the same key, then de-duplicate
# with a set and sort.
# The IDs are strings, so the order is lexicographic ('100' sorts before '99').
customer_unique_list = sorted({str(int(cust)) for cust in customer_list_combination})

### Create Total Tx Count List

In [149]:
# One entry per transaction side that involves a customer: the sender of every
# transaction, plus the receiver whenever the receiver is a user account.
total_senders = tx_df['from_totally_fake_account'].tolist()
total_receivers = receivers_user_list

# Normalise every ID to the text of its integer value so both sides share keys.
total_customer_frequency = [str(int(cust)) for cust in total_senders + total_receivers]

# Transactions per customer, ordered by ascending (lexicographic) customer ID.
total_customer_tx_count = sorted_list_count_agg(total_customer_frequency)

### Create Send Tx Count List

In [150]:
# Sender ID of every transaction, normalised to the text of its integer value.
send_customer_frequency = list(map(str, map(int, total_senders)))

# Transactions sent per sender, ordered by ascending (lexicographic) sender ID.
# Accounts that never sent anything have no entry in this list.
send_customer_tx_count = sorted_list_count_agg(send_customer_frequency)

### Create Receive Tx Count List

In [151]:
# Receiver ID of every user-to-user transaction, normalised to the text of its integer value.
receive_customer_frequency = list(map(str, map(int, total_receivers)))

# Transactions received per user, ordered by ascending (lexicographic) receiver ID.
# Accounts that never received anything have no entry in this list.
receive_customer_tx_count = sorted_list_count_agg(receive_customer_frequency)

### Create Net Tx Count List

In [None]:
# Net Tx Count: transactions sent minus transactions received, per customer.
#
# The per-side count lists must not be zipped together position by position:
# each one is ordered by its own set of IDs, and a customer who only sends (or
# only receives) has no entry in the other list, so the positions would not
# refer to the same customer. Instead, count per customer ID and walk
# customer_unique_list, so entry i of the result belongs to customer_unique_list[i].
send_counts_by_customer = Counter(send_customer_frequency)
receive_counts_by_customer = Counter(receive_customer_frequency)

# A Counter yields 0 for an ID it has never seen, which covers customers that
# appear on one side only.
net_customer_tx_count = [
    send_counts_by_customer[customer_id] - receive_counts_by_customer[customer_id]
    for customer_id in customer_unique_list
]
