In [1]:
import requests
import numpy as np
import pandas as pd
import random
import math

# User Data Generator
This is the file where we generate our mock user data. This includes mocking user
information and user transactions.

## User Information
To generate our mock user info, we use the `random-data-api.com` API. We create 5,000 users using the
API and also 2 _"test"_ users where we have known info for testing (`NUM_USERS = 5002` in total). For the `balance` field,
we simply sample from a normal distribution, centered around 4000, and lower-bounded by 0.

In [2]:
# --- random-data-api settings ---
URL_API = 'https://random-data-api.com/api/v2'
MAX_USERS_PER_REQUEST = 100  # upper bound on the batch size asked for per API call

# --- population size ---
NUM_TEST_USERS = 2  # hand-written users with known credentials
NUM_USERS = 5002    # total number of rows to build, test users included

# Maps the keys of an API user record to the column names of the users table;
# API keys that are not listed here are ignored.
FIELDS = dict(
    first_name='FirstName',
    last_name='LastName',
    email='Email',
    password='Password',
)

# --- parameters of the normal distribution the balances are drawn from ---
BALANCE_MEAN, BALANCE_STD_DEV = 4000, 1000

In [3]:
# Start the table with the deterministic "test" users, whose credentials are
# known in advance so they can be used for testing.
parsed_users = [
    {
        'FirstName': "John",
        'LastName': f"Doe{i}",
        'Email': f"test{i}@email.com",
        'Password': f'test{i}',
        'Balance': BALANCE_MEAN
    } for i in range(NUM_TEST_USERS)
]

# Fetch the remaining users from the API, at most MAX_USERS_PER_REQUEST per
# call, until NUM_USERS rows have been collected.
while len(parsed_users) < NUM_USERS:
    batch_size = min(MAX_USERS_PER_REQUEST, NUM_USERS - len(parsed_users))
    response = requests.get(
        f"{URL_API}/users",
        params={'size': batch_size},
        timeout=30,  # never hang the notebook on a stalled connection
    )
    response.raise_for_status()  # fail loudly instead of parsing an error body
    raw_users = response.json()

    # An empty payload would make this loop spin forever and a non-list payload
    # would crash further down with an opaque error, so stop with a clear message.
    if not isinstance(raw_users, list) or not raw_users:
        raise RuntimeError(f"Unexpected response from {response.url}: {raw_users!r}")

    print(f"{len(parsed_users)} parsed users...")

    for user in raw_users:
        # Normally distributed balance, truncated to an integer and floored at 0.
        balance = max(0, int(np.random.normal(BALANCE_MEAN, BALANCE_STD_DEV)))

        parsed_user = {'Balance': balance}
        # Keep only the API fields listed in FIELDS, renamed to our column names.
        parsed_user.update({
            FIELDS[field]: v for field, v in user.items() if field in FIELDS
        })
        parsed_users.append(parsed_user)

# Drop users that share an e-mail address, then renumber the rows *before*
# turning the index into the ID column. Without the renumbering the IDs keep
# the gaps left by the dropped rows, and code that treats
# range(len(df_users)) as the set of user IDs would reference missing users.
df_users = (
    pd.DataFrame(parsed_users)
    .drop_duplicates(subset=['Email'])
    .reset_index(drop=True)
    .rename_axis('ID')
    .reset_index()
)
df_users.shape


2 parsed users...
102 parsed users...
202 parsed users...
302 parsed users...
402 parsed users...
502 parsed users...
602 parsed users...
702 parsed users...
802 parsed users...
902 parsed users...
1002 parsed users...
1102 parsed users...
1202 parsed users...
1302 parsed users...
1402 parsed users...
1502 parsed users...
1602 parsed users...
1702 parsed users...
1802 parsed users...
1902 parsed users...
2002 parsed users...
2102 parsed users...
2202 parsed users...
2302 parsed users...
2402 parsed users...
2502 parsed users...
2602 parsed users...
2702 parsed users...
2802 parsed users...
2902 parsed users...
3002 parsed users...
3102 parsed users...
3202 parsed users...
3302 parsed users...
3402 parsed users...
3502 parsed users...
3602 parsed users...
3702 parsed users...
3802 parsed users...
3902 parsed users...
4002 parsed users...
4102 parsed users...
4202 parsed users...
4302 parsed users...
4402 parsed users...
4502 parsed users...
4602 parsed users...
4702 parsed users...
4802

(4995, 6)

## User Transactions
To generate our mock user transactions, we need to consider the two types of transactions: purchasing and selling.

### Purchasing
```
for each user u:
    pick a random number min_purch <= n <= max_purch of transactions
    pick n different rows in the SecurityPrices table
    for each picked row, pick a random number min_qty <= q <= max_qty of shares to buy
```


In [4]:
# Bounds on the number of purchase transactions generated per user.
MIN_PURCH, MAX_PURCH = 10, 30
# Bounds handed to the share-quantity sampler of each purchase.
MIN_QTY, MAX_QTY = 1, 10
PERCENT_ETF = .2 # To enforce that most transactions are stocks, since most functionality is for stocks

# Empty, typed frame spelling out the columns of the transactions table.
df_transactions = pd.DataFrame({
    column: pd.Series(dtype=dtype)
    for column, dtype in [
        ('UserID', int),
        ('Ticker', str),
        ('Date', object),
        ('Quantity', int),
        ('Type', str),
    ]
})

# Load the price history and attach the per-security attributes to every
# price row by joining on the ticker symbol.
df_prices = pd.read_csv('SecurityPrices.csv')
df_securities = pd.read_csv('Securities.csv')
df_prices = pd.merge(df_prices, df_securities, on='Ticker')

# Split the merged price rows by the `ETF` flag column. The explicit
# comparisons are kept (rather than `~`) so the same rows are selected
# whatever dtype the column was parsed as.
etf_flag = df_prices['ETF']
df_stocks = df_prices[etf_flag == False]
df_etf = df_prices[etf_flag == True]

In [5]:
# Build the BUY transactions: each user gets between MIN_PURCH and MAX_PURCH
# purchases, every one of them a randomly sampled (Ticker, Date) price row.
transaction_list = []

# Iterate over the IDs actually stored in df_users rather than
# range(len(df_users)), so every UserID written here refers to an existing
# user even when the ID column is not a contiguous 0..len-1 sequence.
for ID in df_users['ID']:
    n = random.randint(MIN_PURCH, MAX_PURCH)
    # Derive the stock count from the ETF count so the two always add up to n,
    # independent of floating-point rounding in n * PERCENT_ETF.
    n_etf = math.floor(n * PERCENT_ETF)
    n_stocks = n - n_etf

    prices = df_stocks.sample(n=n_stocks)
    if n_etf > 0:
        prices = pd.concat([prices, df_etf.sample(n=n_etf)])

    # .assign() returns a new frame, which avoids writing into a column slice
    # of the sampled rows (SettingWithCopyWarning).
    purchases = prices[['Ticker', 'Date']].assign(
        UserID=ID,
        # np.random.randint excludes its upper bound; +1 makes MAX_QTY reachable.
        Quantity=np.random.randint(MIN_QTY, MAX_QTY + 1, len(prices)),
        Type='BUY',
    )
    transaction_list.append(purchases)

# Single concat at the end instead of growing a DataFrame inside the loop.
df_transactions = pd.concat(transaction_list).reset_index(drop=True)
df_transactions

Unnamed: 0,Ticker,Date,UserID,Quantity,Type
0,DISCA,2013-11-22,0,9,BUY
1,NVDA,2010-08-23,0,3,BUY
2,CTSH,2019-11-15,0,3,BUY
3,CSCO,1992-03-06,0,2,BUY
4,AAPL,1984-05-22,0,3,BUY
...,...,...,...,...,...
99490,EBAY,2007-02-06,4994,9,BUY
99491,ALXN,2015-03-17,4994,9,BUY
99492,ALGN,2002-11-20,4994,5,BUY
99493,PIO,2008-03-17,4994,8,BUY


### Selling
For selling, we need to be more careful because we can't sell shares we don't own.
To simplify, we assume that the user will only sell at most the quantity of shares
that they bought in a single transaction. That is, a user won't sell 5 shares if they
bought 2 and 3 shares previously, but they could sell 3 shares.
This way we don't need to keep track of a rolling sum, which would slow down our data generation.
```
for each user u:
    pick a random number 0 <= n <= SELL% * n_purch
    pick n different rows in the purchasing table
    for each picked row, pick a random number 1 <= q <= qty_purchased of shares to sell at a later date
```

In [6]:
SELL_PERCENT = .5  # a user sells at most this fraction of their purchases

sell_transactions = []
dates_per_ticker = dict()  # cache: ticker -> Series of all dates with a price

# Only BUY rows can be sold against. Filtering here also keeps the cell
# idempotent: re-running it regenerates the SELL rows instead of sampling
# previously generated sells as if they were purchases.
df_buys = df_transactions[df_transactions['Type'] == 'BUY']

# Walk the purchases grouped by user in a single pass instead of re-filtering
# the whole table once per user. Grouping on the UserID values present in the
# table also guarantees each sell belongs to a user who actually bought.
for ID, df_purch in df_buys.groupby('UserID'):
    n = random.randint(0, int(SELL_PERCENT * len(df_purch)))
    if n == 0:
        continue

    # Sampling without replacement: each purchase is sold against at most once.
    for purchase in df_purch.sample(n=n).itertuples(index=False):
        if purchase.Ticker not in dates_per_ticker:
            ticker_rows = df_prices['Ticker'] == purchase.Ticker
            dates_per_ticker[purchase.Ticker] = df_prices.loc[ticker_rows, 'Date']
        dt = dates_per_ticker[purchase.Ticker]
        # The sell must happen strictly after the purchase date.
        possible_dates = dt[dt > purchase.Date]

        # Purchases made on the last available date cannot be sold; skip them.
        if len(possible_dates) > 0:
            sell_transactions.append({
                'UserID': ID,
                'Ticker': purchase.Ticker,
                'Date': possible_dates.sample().iloc[0],
                # Never sell more shares than this single purchase bought.
                'Quantity': random.randint(1, int(purchase.Quantity)),
                'Type': 'SELL',
            })

df_transactions = pd.concat(
    [df_buys, pd.DataFrame(sell_transactions)]
).reset_index(drop=True)
df_transactions

Unnamed: 0,Ticker,Date,UserID,Quantity,Type
0,DISCA,2013-11-22,0,9,BUY
1,NVDA,2010-08-23,0,3,BUY
2,CTSH,2019-11-15,0,3,BUY
3,CSCO,1992-03-06,0,2,BUY
4,AAPL,1984-05-22,0,3,BUY
...,...,...,...,...,...
123907,IWN,2017-11-02,4992,3,SELL
123908,CTSH,2008-02-29,4992,1,SELL
123909,MNST,2011-12-23,4992,1,SELL
123910,KRE,2011-12-19,4992,3,SELL


In [7]:
# Write both generated tables to disk as CSV, leaving out the DataFrame index.
for frame, csv_path in ((df_transactions, 'Transactions.csv'), (df_users, 'Users.csv')):
    frame.to_csv(csv_path, index=False)