## Import Libraries

In [1]:
# Importing necessary libraries
import os

import numpy as np
import pandas as pd

## Loading the Dataset

In [2]:
# Read the raw transactions export.
# The CSV is not distributed through GitHub, so it has to be placed in this folder by hand.
tx_data = pd.read_csv(filepath_or_buffer='fake_transactional_data_24.csv')

# Working copy of the loaded table used by every later cell
tx_df = pd.DataFrame(data=tx_data)

In [3]:
# Preview the first five rows to check the columns that were loaded
tx_df.head(n=5)

Unnamed: 0,from_totally_fake_account,monopoly_money_amount,to_randomly_generated_account,not_happened_yet_date
0,10371.0,4.0,CINEMA,01/01/2025
1,88339.0,2.4,40544,01/01/2025
2,18555.0,2.4,85149,01/01/2025
3,18555.0,4.1,HIPSTER_COFFEE_SHOP,01/01/2025
4,80792.0,1.95,18555,01/01/2025


### Creating Senders and Receivers List

In [4]:
# Pull both account columns out as plain Python lists and put each in ascending order
senders_list = tx_df['from_totally_fake_account'].tolist()
senders_list.sort()

receivers_list = tx_df['to_randomly_generated_account'].tolist()
receivers_list.sort()

# Show the receiver column as loaded
print(tx_df['to_randomly_generated_account'])

0                        CINEMA
1                         40544
2                         85149
3           HIPSTER_COFFEE_SHOP
4                         18555
                   ...         
10148275            COFFEE_SHOP
10148276            COFFEE_SHOP
10148277               WINE_BAR
10148278                  57569
10148279    A_LOCAL_COFFEE_SHOP
Name: to_randomly_generated_account, Length: 10148280, dtype: object


## Business Data Extraction and Organization

In [5]:
# Split the receivers in a single pass:
# all-digit values are user account ids, everything else is a business name
receivers_biz_list, receivers_user_list = [], []

for receiver in receivers_list:
    bucket = receivers_user_list if receiver.isnumeric() else receivers_biz_list
    bucket.append(receiver)

In [6]:
# De-duplicate each list through a set, then restore ascending order
(
    unique_senders_list,
    unique_receivers_list,
    unique_receivers_biz_list,
    unique_receivers_user_list,
) = (
    sorted(set(values))
    for values in (senders_list, receivers_list, receivers_biz_list, receivers_user_list)
)

### Extracting Valuable Business Data

In [7]:
# Business table: keep only the rows whose receiver is a business,
# ordered by the receiver (business) name
biz_tx_df = (
    tx_df
    .loc[tx_df['to_randomly_generated_account'].isin(unique_receivers_biz_list)]
    .sort_values(by='to_randomly_generated_account')
)

# Distinct business names, in the order they appear in the sorted table
biz_name_list = biz_tx_df['to_randomly_generated_account'].unique().tolist()

In [8]:
# Group the transaction amounts by business once and reuse that grouping below
biz_amounts_by_name = biz_tx_df.groupby('to_randomly_generated_account')['monopoly_money_amount']

# Volume per business: number of transactions and total amount
biz_tx_count = biz_amounts_by_name.count()
biz_tx_sum = biz_amounts_by_name.sum()

# Typical transaction size per business
biz_tx_mean = biz_amounts_by_name.mean()
biz_tx_median = biz_amounts_by_name.median()

In [9]:
# Needed to count words
from collections import Counter

# Aggregation helper: every value of the group, in ascending order
def sorted_list_agg(input_list):
    """Return all values of ``input_list`` as a new list in ascending order.

    Duplicates are kept. ``input_list`` can be any iterable of mutually
    comparable values (for example the Series pandas passes to ``.agg``).
    """
    ordered_values = list(input_list)
    ordered_values.sort()
    return ordered_values

# Aggregation helper: the distinct values of the group, in ascending order
def sorted_unique_list_agg(input_list):
    """Return the distinct values of ``input_list`` as an ascending list.

    ``input_list`` must provide a ``.unique()`` method (such as a pandas
    Series); each value appears once in the result.
    """
    distinct_values = list(input_list.unique())
    distinct_values.sort()
    return distinct_values

def sorted_list_count_agg(input_list):
    """Return how often each distinct value occurs, ordered by ascending value.

    The i-th entry of the result is the number of occurrences of the i-th
    smallest distinct value in ``input_list``, so the output lines up
    position-by-position with the sorted list of distinct values.

    Parameters
    ----------
    input_list : iterable
        Mutually comparable, hashable values (for example the Series pandas
        passes to ``.agg``).

    Returns
    -------
    list of int
        One count per distinct value; empty when ``input_list`` is empty.
    """
    # Counter keeps its keys in first-seen order; feeding it the sorted values
    # therefore yields the counts in ascending-value order without a second pass.
    value_freqs = Counter(sorted(input_list))

    return list(value_freqs.values())

In [10]:
# Group the sender ids by business once; all three aggregations reuse this grouping
biz_customers_by_name = biz_tx_df.groupby('to_randomly_generated_account')['from_totally_fake_account']

# Every sender per business, in ascending order (repeat customers appear repeatedly)
biz_tx_customers_sorted = biz_customers_by_name.agg(sorted_list_agg)

# Distinct senders per business, in ascending order
biz_tx_customers_sorted_unique = biz_customers_by_name.agg(sorted_unique_list_agg)

# Number of transactions per distinct sender, in the same ascending sender order
biz_tx_customers_sorted_unique_count = biz_customers_by_name.agg(sorted_list_count_agg)

### Building Business Dataframes

In [11]:
# Columns of the slim business table: name plus the summary statistics
biz_slim_data = {
    'name': biz_name_list,
    'tx_count': list(biz_tx_count),
    'tx_sum_gbp': list(biz_tx_sum),
    'tx_mean_gbp': list(biz_tx_mean),
    'tx_median_gbp': list(biz_tx_median),
}

# The large business table has the slim columns followed by the customer details
biz_large_data = {
    **biz_slim_data,
    'customers_sorted_unique': list(biz_tx_customers_sorted_unique),
    'customers_tx_count': list(biz_tx_customers_sorted_unique_count),
}

# Materialise both tables
biz_slim_df = pd.DataFrame(data=biz_slim_data)
biz_large_df = pd.DataFrame(data=biz_large_data)

In [12]:
# Saves the dataframes to csv files, in the specific folder.
# to_csv does not create missing directories, so make sure the folder exists first;
# exist_ok=True keeps re-runs from failing when it is already there.
os.makedirs('SavedData', exist_ok=True)
biz_slim_df.to_csv('SavedData/biz_slim.csv', index=False)
biz_large_df.to_csv('SavedData/biz_large.csv', index=False)

## User Data Extraction and Organization

In [13]:
# User List
# user_id X

# total_tx_count X
# gross_tx_sum_gbp X

# sent_tx_count X
# gross_tx_sent_sum_gbp X
# gross_tx_sent_mean X
# gross_tx_sent_med X

# receive_tx_count X
# gross_tx_receive_sum_gbp X
# gross_tx_receive_mean X
# gross_tx_receive_med X

# net_tx_count X
# net_tx_sum_gbp X

### Create User ID List

In [14]:
# All candidate customer ids: unique senders followed by unique user receivers
customer_list_combination = [*unique_senders_list, *unique_receivers_user_list]

# Normalise every id to the string form of its integer value.
# The set drops ids that occur in both lists; because the ids are strings,
# sorted() orders them lexicographically rather than numerically.
customer_unique_list = sorted({str(int(cust)) for cust in customer_list_combination})

### Create Send Tx Count List

In [15]:
# Number of sent transactions per sender.
# Ids go through int first to drop the decimals, then to str to match customer_unique_list.
user_send_tx_count = (
    tx_df['from_totally_fake_account']
    .astype(int)
    .astype(str)
    .value_counts()
)

# One entry per known customer id; ids that never sent anything stay at 0
user_send_tx_count_dict = {
    user: user_send_tx_count.get(user, 0) for user in customer_unique_list
}

# Counts in the same order as customer_unique_list
user_send_tx_count_list = list(user_send_tx_count_dict.values())

In [16]:
# Spot check of the send tx count for a single user.
# user_send_tx_count is indexed by string ids, so the id is given as a string;
# an id that is not present yields 0.
user_id_tx_checker = '1000'
spec_user_send_tx_count = user_send_tx_count.get(user_id_tx_checker, default=0)
#print(spec_user_send_tx_count)

# Membership check against the unique senders list
# NOTE(review): unique_senders_list comes from the raw sender column, which the preview
# shows as floats, so a string id presumably never matches here - confirm before relying on it
#print(user_id_tx_checker in unique_senders_list)

### Create Receive Tx Count List

In [17]:
# Transactions whose receiver is not one of the business names, i.e. user-to-user payments
user_tx_df = tx_df.loc[~tx_df['to_randomly_generated_account'].isin(biz_name_list)]

# Number of received transactions per receiving user.
# Ids go through int and then str so they match the format of customer_unique_list.
user_receive_tx_count = (
    user_tx_df['to_randomly_generated_account']
    .astype(int)
    .astype(str)
    .value_counts()
)

# One entry per known customer id; ids that never received anything stay at 0
user_receive_tx_count_dict = dict.fromkeys(customer_unique_list, 0)
user_receive_tx_count_dict.update(
    (user, user_receive_tx_count[user])
    for user in customer_unique_list
    if user in user_receive_tx_count
)

# Counts in the same order as customer_unique_list
user_received_tx_count_list = list(user_receive_tx_count_dict.values())

In [18]:
# User id checker, for receive tx count.
# user_receive_tx_count is indexed by string ids (the ids were cast with astype(str)),
# so the lookup key has to be a string as well; an integer key never matches a label
# and would return the default (or an unrelated positional entry) instead of this user's count.
user_id_tx_checker = '100384'
spec_receive_send_tx_count = user_receive_tx_count.get(user_id_tx_checker, 0)
#print(spec_receive_send_tx_count)

# Ensuring value is or isn't in list
#print(user_id_tx_checker in unique_receivers_list)

### Create Total Tx Count List

In [19]:
# Total activity per user: transactions sent plus transactions received
user_total_tx_count_dict = {
    user: sent_count + user_receive_tx_count_dict[user]
    for user, sent_count in user_send_tx_count_dict.items()
}

# Totals in the same order as the send-count dictionary
user_total_tx_count_list = list(user_total_tx_count_dict.values())

### Create Net Tx Count List

In [20]:
# Net activity per user: transactions sent minus transactions received
user_net_tx_count_dict = {
    user: sent_count - user_receive_tx_count_dict[user]
    for user, sent_count in user_send_tx_count_dict.items()
}

# Net counts in the same order as the send-count dictionary
user_net_tx_count_list = list(user_net_tx_count_dict.values())

### Gross Tx Sent Sum GBP

In [21]:
# Total amount sent per sender: group every transaction by its sender and add up the amounts
grouped_user_id_sent = (
    tx_df
    .groupby('from_totally_fake_account')['monopoly_money_amount']
    .sum()
)

# Re-label the index as integer-formatted strings (drops the trailing ".0")
grouped_user_id_sent.index = grouped_user_id_sent.index.astype(int).astype(str)

# One entry per known customer id; ids that never sent anything stay at 0
user_sent_tx_sum_dict = {
    user: grouped_user_id_sent.get(user, 0) for user in customer_unique_list
}

# Sums in the same order as customer_unique_list
user_sent_tx_sum_list = list(user_sent_tx_sum_dict.values())

### Gross Tx Receive Sum GBP

In [22]:
# isin() marks the rows whose receiver is a business; '~' inverts that mask,
# leaving only the rows whose receiver is a user
user_tx_df = tx_df.loc[~tx_df['to_randomly_generated_account'].isin(biz_name_list)]

# Total amount received per receiving user
grouped_user_id_received = (
    user_tx_df
    .groupby('to_randomly_generated_account')['monopoly_money_amount']
    .sum()
)

# Re-label the index as integer-formatted strings so it matches customer_unique_list
grouped_user_id_received.index = grouped_user_id_received.index.astype(int).astype(str)

# One entry per known customer id; ids that never received anything stay at 0
user_received_tx_sum_dict = dict.fromkeys(customer_unique_list, 0)
user_received_tx_sum_dict.update(
    (user, grouped_user_id_received[user])
    for user in customer_unique_list
    if user in grouped_user_id_received
)

# Sums in the same order as customer_unique_list
user_received_tx_sum_list = list(user_received_tx_sum_dict.values())

### Gross Tx Sum GBP

In [23]:
# Gross amount per user: amount sent plus amount received
user_total_tx_sum_dict = {
    user: sent_sum + user_received_tx_sum_dict[user]
    for user, sent_sum in user_sent_tx_sum_dict.items()
}

# Gross sums in the same order as the sent-sum dictionary
user_total_tx_sum_list = list(user_total_tx_sum_dict.values())
#print(user_total_tx_sum_list)

### Net Tx Sum GBP

In [24]:
# Net amount per user: amount sent minus amount received
user_net_tx_sum_dict = {
    user: sent_sum - user_received_tx_sum_dict[user]
    for user, sent_sum in user_sent_tx_sum_dict.items()
}

# Net sums in the same order as the sent-sum dictionary
user_net_tx_sum_list = list(user_net_tx_sum_dict.values())
#print(user_net_tx_sum_list)

### Gross Tx Sent Mean GBP

In [25]:
# Mean amount sent per sender: group every transaction by its sender and average the amounts
grouped_user_id_sent_mean = (
    tx_df
    .groupby('from_totally_fake_account')['monopoly_money_amount']
    .mean()
)

# Re-label the index as integer-formatted strings (drops the trailing ".0")
grouped_user_id_sent_mean.index = grouped_user_id_sent_mean.index.astype(int).astype(str)

# One entry per known customer id; ids with no sent transactions keep the placeholder 0
user_sent_tx_mean_dict = {
    user: grouped_user_id_sent_mean.get(user, 0) for user in customer_unique_list
}

# Means in the same order as customer_unique_list
user_sent_tx_mean_list = list(user_sent_tx_mean_dict.values())

### Gross Tx Sent Median GBP

In [26]:
# Median amount sent per sender: group every transaction by its sender and take the median
grouped_user_id_sent_median = (
    tx_df
    .groupby('from_totally_fake_account')['monopoly_money_amount']
    .median()
)

# Re-label the index as integer-formatted strings (drops the trailing ".0")
grouped_user_id_sent_median.index = grouped_user_id_sent_median.index.astype(int).astype(str)

# One entry per known customer id; ids with no sent transactions keep the placeholder 0
user_sent_tx_median_dict = dict.fromkeys(customer_unique_list, 0)
user_sent_tx_median_dict.update(
    (user, grouped_user_id_sent_median[user])
    for user in customer_unique_list
    if user in grouped_user_id_sent_median
)

# Medians in the same order as customer_unique_list
user_sent_tx_median_list = list(user_sent_tx_median_dict.values())

### Gross Tx Received Mean GBP

In [27]:
# Mean amount received per user: group the user-to-user transactions by receiver
# and average the amounts
grouped_user_id_received_mean = (
    user_tx_df
    .groupby('to_randomly_generated_account')['monopoly_money_amount']
    .mean()
)

# Re-label the index as integer-formatted strings so it matches customer_unique_list
grouped_user_id_received_mean.index = grouped_user_id_received_mean.index.astype(int).astype(str)

# One entry per known customer id; ids with no received transactions keep the placeholder 0
user_received_tx_mean_dict = {
    user: grouped_user_id_received_mean.get(user, 0) for user in customer_unique_list
}

# Means in the same order as customer_unique_list
user_received_tx_mean_list = list(user_received_tx_mean_dict.values())

### Gross Tx Received Median GBP

In [28]:
# Median amount received per user: group the user-to-user transactions by receiver
# and take the median of the amounts
grouped_user_id_received_median = (
    user_tx_df
    .groupby('to_randomly_generated_account')['monopoly_money_amount']
    .median()
)

# Re-label the index as integer-formatted strings so it matches customer_unique_list
grouped_user_id_received_median.index = grouped_user_id_received_median.index.astype(int).astype(str)

# One entry per known customer id; ids with no received transactions keep the placeholder 0
user_received_tx_median_dict = dict.fromkeys(customer_unique_list, 0)
user_received_tx_median_dict.update(
    (user, grouped_user_id_received_median[user])
    for user in customer_unique_list
    if user in grouped_user_id_received_median
)

# Medians in the same order as customer_unique_list
user_received_tx_median_list = list(user_received_tx_median_dict.values())

## Building the User Dataframe

In [30]:
# Columns of the large user table: totals and nets first,
# then the sent breakdown, then the received breakdown
user_large_data = {
    'user_id': customer_unique_list,
    'total_tx_count': user_total_tx_count_list,
    'net_tx_count': user_net_tx_count_list,
    'total_tx_sum_gbp': user_total_tx_sum_list,
    'net_tx_sum_gbp': user_net_tx_sum_list,
    'sent_tx_count': user_send_tx_count_list,
    'sent_tx_sum_gbp': user_sent_tx_sum_list,
    'sent_tx_mean_gbp': user_sent_tx_mean_list,
    'sent_tx_median_gbp': user_sent_tx_median_list,
    'received_tx_count': user_received_tx_count_list,
    'received_tx_sum_gbp': user_received_tx_sum_list,
    'received_tx_mean_gbp': user_received_tx_mean_list,
    'received_tx_median_gbp': user_received_tx_median_list,
}

# The slim user table is a fixed subset of the columns above, in this order
slim_columns = (
    'user_id',
    'sent_tx_count',
    'sent_tx_sum_gbp',
    'sent_tx_median_gbp',
    'received_tx_count',
    'received_tx_sum_gbp',
    'received_tx_median_gbp',
)
user_slim_data = {column: user_large_data[column] for column in slim_columns}

# Materialise both tables
user_slim_df = pd.DataFrame(data=user_slim_data)
user_large_df = pd.DataFrame(data=user_large_data)

In [31]:
# Saves the dataframes to csv files, in the specific folder.
# to_csv does not create missing directories, so make sure the folder exists first;
# exist_ok=True keeps re-runs from failing when it is already there.
os.makedirs('SavedData', exist_ok=True)
user_slim_df.to_csv('SavedData/user_slim.csv', index=False)
user_large_df.to_csv('SavedData/user_large.csv', index=False)