# Republican Caucus Trader Analysis

This notebook does some analysis on trader behavior in the Republican Caucus market.

In [None]:
import os
import pickle
import re
import sys
from collections import defaultdict
from functools import lru_cache

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import Series, DataFrame, Panel

idx = pd.IndexSlice

from research_tools import storage

def _format_float(x):
    """Format a float for display: 4 decimals when |x| < 1, otherwise 2 (both with thousands separators)."""
    if abs(x) < 1:
        return '{:,.4f}'.format(x)
    return '{:,.2f}'.format(x)

pd.options.display.float_format = _format_float

# Load Data

First, load the data. All of the data is organized in nice, standardized formats.

In [None]:
# Move up one directory before loading (presumably to the project root - confirm).
# NOTE: this is not idempotent; re-running the cell moves up another level.
os.chdir('..')

basename = 'gop'

# Dataset name suffixes, in the same order as the names unpacked below.
dataset_suffixes = ['.orders', '.behavior_analysis', '.trader_analysis', '.reconstructed_quotes']

orders, behavior_analysis, trader_analysis, quotes_dict = storage.retrieve_all(
    [basename + suffix for suffix in dataset_suffixes])

# Trader Stats Calculations

Calculate statistics on the trading results of each trader.

We want to know what the spread profits are and the profits resulting from biased prices (prices that sum to more than $1).

To do this we first need to join the trade data with our reconstructed quotes.

## Join quote data with trade data

In [None]:
def add_contract_id(df, contract_id):
    """Return a new frame: ``df`` without its 'timestamp' column, plus a constant 'contract_id' column.

    The input frame is left unmodified.
    """
    without_timestamp = df.drop('timestamp', axis=1)
    return without_timestamp.assign(contract_id=contract_id)

# Stack the per-contract quote frames into a single frame tagged with contract_id.
# It is sorted by its index, which is what the as-of join below matches against trade 'seq'.
per_contract_quotes = [add_contract_id(contract_quotes, contract_id)
                       for contract_id, contract_quotes in quotes_dict.items()]
quotes = pd.concat(per_contract_quotes).sort_index()

# For every trade, attach the latest quote row of the same contract whose index is <= the trade's seq.
# merge_asof needs the left side sorted by the key; the original row order is restored afterwards.
trader_analysis = pd.merge_asof(
    trader_analysis.sort_values('seq'),
    quotes,
    by='contract_id',
    left_on='seq',
    right_index=True,
    allow_exact_matches=True,
)
trader_analysis = trader_analysis.sort_index()

# joining with bid/ask prices for Close trades doesn't make any sense
close_trades = trader_analysis['trade_type'] == 'Close'
trader_analysis.loc[close_trades, ['bid_price', 'ask_price']] = np.nan

# Row-wise mean skips NaN, so a one-sided quote yields that side's price as the mid.
trader_analysis['mid_price'] = trader_analysis[['bid_price', 'ask_price']].mean(axis=1)

## Calculate Spread Profits

Now that we know what the prevailing bid-ask prices are at the time of each trade, we can calculate the spread profits. Provide trades are earning a spread profit and take trades are paying the spread.

Close trades are a special trade with the exchange for \\$1 or \\$0.

In [None]:
# Signed spread capture per trade: side * quantity * (mid - traded price).
raw_spread_profit = trader_analysis.eval("corrected_side * quantity * (mid_price - corrected_price)")

# Close trades have no spread profits
not_close = trader_analysis['trade_type'] != 'Close'
trader_analysis['spread_profit'] = raw_spread_profit.where(not_close, 0)

## Calculate Bias Profits

To calculate the price biases we must first find the sum of the bid-ask prices. This is the same code used in the other notebooks analyzing the quotes.

In [None]:
def make_interval_quotes(df):
    """Sample one contract's bid/ask quotes onto a 1-minute grid.

    ``df`` needs 'timestamp', 'bid_price' and 'ask_price' columns (assumes rows are ordered by
    timestamp - TODO confirm). The grid runs from the first to the last timestamp, each floored
    to the minute. Every grid point takes the latest quote at or before it; grid points that
    precede the first quote are back-filled from the next available value.

    Returns a DataFrame indexed by the minute grid with columns ['bid_price', 'ask_price'].
    """
    by_time = df.set_index('timestamp')

    first_minute = by_time.index[0].floor('1Min')
    last_minute = by_time.index[-1].floor('1Min')
    minute_grid = pd.date_range(first_minute, last_minute, freq='1Min')

    def last_known(column):
        # Latest value at or before each grid point; NaN where no quote exists yet.
        return column.asof(minute_grid)

    sampled = by_time[['bid_price', 'ask_price']].apply(last_known)
    return sampled.bfill()

def make_quote_panel(quotes):
    """Build a Panel of 1-minute bid/ask quotes with every contract aligned to one shared time grid.

    ``quotes`` maps contract id -> raw quote frame (as accepted by ``make_interval_quotes``).
    Each contract is sampled to 1-minute intervals, re-indexed onto the grid spanning the
    earliest start and latest end over all contracts, and gaps are filled forward then backward.
    """
    per_contract = {c_id: make_interval_quotes(df) for c_id, df in quotes.items()}

    earliest = min(frame.index[0] for frame in per_contract.values())
    latest = max(frame.index[-1] for frame in per_contract.values())
    shared_grid = pd.date_range(earliest, latest, freq='1Min')

    aligned = {}
    for c_id, frame in per_contract.items():
        # Minutes outside this contract's own range come back as NaN and are filled from
        # the nearest known quote (forward first, then backward for the leading gap).
        aligned[c_id] = frame.loc[shared_grid].ffill().bfill()

    return Panel(aligned)

quotes_panel = make_quote_panel(quotes_dict)
# Sum over the panel's first axis (the contracts) to get, per minute, the total of all bids
# and the total of all asks.
quotes_sum = quotes_panel.sum(axis=0)
# Column order follows make_interval_quotes: bid first, then ask.
quotes_sum.columns = ['bid_price_sums', 'ask_price_sums']

Using `quotes_sum` we can calculate the mid and use it as a normalization factor.

Close trades are always at an unbiased price.

In [None]:
# Normalization factor = midpoint of the summed bids and the summed asks.
# The two sum columns are selected explicitly so that re-running this cell (when
# norm_factor / timestamp / seq already exist on quotes_sum) cannot fold them into the mean.
quotes_sum['norm_factor'] = quotes_sum[['bid_price_sums', 'ask_price_sums']].mean(axis=1)

quotes_sum['timestamp'] = quotes_sum.index
# Epoch nanoseconds -> integer milliseconds (presumably the unit of the trades' 'seq' - confirm).
# Integer floor division avoids the float round-trip of `int(ns / 1e6)`.
quotes_sum['seq'] = [ts.value // 10**6 for ts in quotes_sum.index]

price_bias_norm_factor = quotes_sum.set_index('seq')[['norm_factor']]

# Drop a norm_factor left over from an earlier run of this cell; otherwise the merge would
# produce norm_factor_x / norm_factor_y and the eval below would fail.
trades_without_norm = trader_analysis.drop('norm_factor', axis=1, errors='ignore')

# Attach to each trade the latest per-minute normalization factor at or before its seq.
trader_analysis = pd.merge_asof(trades_without_norm.sort_values('seq'),
                                price_bias_norm_factor,
                                left_on='seq',
                                right_index=True,
                                allow_exact_matches=True).sort_index()

trader_analysis['bias_profit'] = trader_analysis.eval("corrected_side * quantity * mid_price * (1 / norm_factor - 1)")

# Close trades always at an unbiased price
close_trades = trader_analysis['trade_type'] == 'Close'
trader_analysis.loc[close_trades, 'norm_factor'] = 1
trader_analysis.loc[close_trades, 'bias_profit'] = 0

In [None]:
def stats(trades):
    """Volume-weighted behaviour ratios for a set of trades, ignoring close trades.

    Parameters
    ----------
    trades : DataFrame
        Needs a boolean 'close_trade' column plus 'quantity', 'take_provide' and 'price_per_share'.

    Returns
    -------
    Series (float64) with, as fractions of the total non-close quantity:
        take_pct          - quantity where take_provide == 'T'
        longshot_pct      - quantity where price_per_share <= 0.1
        antilongshot_pct  - quantity where price_per_share >= 0.9
    All three are NaN when there is no non-close quantity (0 / 0).
    """
    # Filter out close trades once instead of re-querying the full frame for every ratio.
    open_trades = trades[~trades.close_trade]
    total_nonclose = open_trades.quantity.sum()

    def share(mask):
        # Fraction of the non-close quantity that falls under `mask`.
        return open_trades.loc[mask, 'quantity'].sum() / total_nonclose

    ratios = [
        share(open_trades.take_provide == 'T'),
        share(open_trades.price_per_share <= 0.1),
        share(open_trades.price_per_share >= 0.9),
    ]

    # Explicit index and dtype: keeps the row order on every pandas version and avoids the
    # dtype-less `Series()` constructor (deprecated; object dtype on newer pandas).
    return Series(ratios, index=['take_pct', 'longshot_pct', 'antilongshot_pct'], dtype=float)

# Number of orders each trader submitted.
df = orders.groupby('user_guid')['order_id'].count().to_frame('orders_sent')

# Per-trader totals of the additive trade-level columns.
summed_columns = ['quantity', 'notional', 'spread_profit', 'bias_profit',
                  'gross_pnl', 'fee', 'pnl_net_fee']
df2 = trader_analysis.groupby(['user_guid'])[summed_columns].sum()

# Per-trader behaviour ratios (take / longshot / anti-longshot share of volume).
df3 = trader_analysis.groupby(['user_guid']).apply(stats)

# Inner-join the three per-trader tables on user_guid, best net PnL first.
trader_stats_summary = df.merge(df2, left_index=True, right_index=True)
trader_stats_summary = trader_stats_summary.merge(df3, left_index=True, right_index=True)
trader_stats_summary = trader_stats_summary.sort_values('pnl_net_fee', ascending=False)

# position profit is the profit on directional price moves
# calculate this for trader_stats_summary, not trader_analysis
trader_stats_summary['position_profit'] = trader_stats_summary.eval('gross_pnl - spread_profit - bias_profit')

# Final column order for display and export.
summary_columns = ['orders_sent', 'quantity', 'notional', 'spread_profit',
                   'bias_profit', 'position_profit', 'gross_pnl', 'fee', 'pnl_net_fee',
                   'take_pct', 'longshot_pct', 'antilongshot_pct']
trader_stats_summary = trader_stats_summary[summary_columns]

Here's what that looks like. The DataFrame shows the PnL breakdown for each trader.

In [None]:
# Preview the first rows; the frame is sorted by pnl_net_fee in descending order.
trader_stats_summary.head()

## Trader Subgroups

We can use our behavior analysis to divide the traders into subgroups. The traders had different levels of engagement with the market and different approaches to trading. These choices had a big impact on the resulting profits.

Some traders traded efficiently and others were inefficient. If 75% or more of a trader's volume is for trades that were not strictly dominated by another group of trades, the trader is deemed an efficient trader. If not, they are deemed an inefficient trader.

Some traders were very active and some were not. If a trader opened and closed a position in the same contract multiple times they are deemed an active trader. If they did not they are deemed an inactive trader.

Some traders traded in small sizes and some traded in large sizes. If a trader's max loss was ever more than $50 they are deemed a large trader. If not, they are deemed a small trader.

There were 4,452 traders in this market, fewer than the 5,000-trader limit.

## Efficiency

Many traders only made one trade. Those traders will either be 100% efficient or 0% efficient.

About one quarter of the traders traded efficiently at least 75% of the time.

In [None]:
# Fraction of traders at or above the 0.75 efficiency cutoff.
(behavior_analysis.efficiency >= 0.75).sum() / len(behavior_analysis)

In [None]:
# Distribution of per-trader efficiency.
behavior_analysis.efficiency.hist(bins=25, figsize=(10, 6))

## Activity

Again, many traders made only one trade. Obviously those traders only opened 1 position on that one contract.

Even for traders that made more than one trade, many did not open and close positions on the same contract multiple times. Only about 15% can be considered active, opening and closing positions on the same contract. The remaining 85% are inactive.

In [None]:
# Fraction of traders with max_opened_pos == 1 (the "inactive" group used below).
(behavior_analysis.max_opened_pos == 1).sum() / len(behavior_analysis)

In [None]:
# Distribution of per-trader max_opened_pos.
behavior_analysis.max_opened_pos.hist(bins=25, figsize=(10, 6))

## Size

Most traders traded in small sizes. About 69% never risked more than \\$50. About 31% did, with some hitting the risk limit of \\$850 per contract.

In [None]:
# Fraction of traders whose max_loss exceeded 50 (the "large" group used below).
(behavior_analysis.max_loss > 50).sum() / len(behavior_analysis)

The maximum max-loss is over $2600, suggesting that some traders were hitting the risk limit for 3 candidates. This means they were taking on more risk than they were in the DEM market.

In [None]:
# Largest max_loss of any single trader.
behavior_analysis.max_loss.max()

In [None]:
# Distribution of per-trader max_loss.
behavior_analysis.max_loss.hist(bins=25, figsize=(10, 6))

# Trader Group Breakdown

Next we analyze the PnL of the traders by dividing by the above three metrics. The goal is to see how profitable their trading activities were. We also decompose the PnL to see where the profits and losses were coming from.

In [None]:
def make_group_stats(group_name, query, per_trader=False):
    """Aggregate order, volume and PnL statistics for one group of traders.

    Reads the notebook-level frames ``behavior_analysis``, ``orders`` and ``trader_analysis``.

    Parameters
    ----------
    group_name : str
        Name given to the resulting column.
    query : str or None
        Expression for ``behavior_analysis.query`` selecting the traders; a falsy value
        selects every trader.
    per_trader : bool, default False
        When True, every total is divided by the number of traders in the group
        (``trader_count`` itself stays a count) and only the count/volume/PnL rows are returned.

    Returns
    -------
    DataFrame
        One column named ``group_name`` with one row per statistic.
    """
    if query:
        group = behavior_analysis.query(query).index
    else:
        group = behavior_analysis.index

    # Boolean filters on user_guid instead of set_index(...).loc[group]: a trader with no
    # rows cannot raise KeyError or add a phantom NaN row, and the frames are not re-indexed
    # on every call.
    group_orders = orders[orders['user_guid'].isin(group)]
    group_trades = trader_analysis[trader_analysis['user_guid'].isin(group)]

    summed_columns = ['quantity', 'notional', 'spread_profit', 'bias_profit',
                      'gross_pnl', 'fee', 'pnl_net_fee']

    counts = Series([len(group), len(group_orders)],
                    index=['trader_count', 'orders_sent'], dtype=float)

    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    out = pd.concat([counts, group_trades[summed_columns].sum()])

    # Position profit is what remains of gross PnL after removing spread and bias profits.
    out['position_profit'] = out['gross_pnl'] - out['spread_profit'] - out['bias_profit']
    out['sum_max_loss'] = behavior_analysis.loc[group, 'max_loss'].sum()
    out['sum_max_in_pool'] = behavior_analysis.loc[group, 'max_in_pool'].sum()
    # ROI is measured against the group's summed max_in_pool.
    out['gross_roi'] = out['gross_pnl'] / out['sum_max_in_pool']
    out['net_roi'] = out['pnl_net_fee'] / out['sum_max_in_pool']

    out = pd.concat([out, stats(group_trades)])

    out.name = group_name

    if per_trader:
        out2 = out / out['trader_count']
        # Keep the head count as a count rather than count / count == 1.
        out2['trader_count'] = out['trader_count']

        return out2[['trader_count', 'orders_sent', 'quantity', 'notional', 'spread_profit', 'bias_profit',
                     'position_profit', 'gross_pnl', 'fee', 'pnl_net_fee']].to_frame()
    else:
        return out[['trader_count', 'orders_sent', 'quantity', 'notional',
                    'sum_max_loss', 'sum_max_in_pool', 'spread_profit',
                    'bias_profit', 'position_profit', 'gross_pnl', 'fee',
                    'pnl_net_fee', 'gross_roi', 'net_roi', 'take_pct', 'longshot_pct',
                    'antilongshot_pct']].to_frame()

def make_multi_group_stats(groups, **kwargs):
    """Build one statistics column per ``(group_name, query)`` pair and place them side by side.

    Extra keyword arguments are forwarded unchanged to ``make_group_stats``.
    """
    columns = []
    for group_spec in groups:
        columns.append(make_group_stats(*group_spec, **kwargs))
    return pd.concat(columns, axis=1)

Before looking at the numbers, consider the fact that without fees these markets are zero sum games. This means that gross PnL across all the traders is a zero sum game.

Now consider a hypothetical PredictIt market with no fees, spreads, or biased prices (meaning the sum of the Yes contract prices always equals $1). The only way to make money in this market is by being better at predicting future prices, ie, position profits.

In such a market imagine that there is a subset of traders that are superior at information analysis and prediction. They would make better trades. Over time they would win more money than they would lose. If this was the case we would see a net flow of cash from traders that are not in this group to traders that are in the group. This net flow of cash would also be *reliable* in that we would see this across markets.

Now imagine that all traders are making bets by throwing darts at the wall. In this market there would be no way to reliably segment the market such that we would see a net flow of cash between groups.

What we will see below is that there are segments where money is moving from one group to another. In particular, from inefficient traders to efficient traders. And what is even more interesting is *why* that money is flowing between the two groups.

## Trader Efficiency

As one might expect (or hope), as a group the efficient traders were also profitable.

In [None]:
# (label, behavior_analysis query) pairs; None selects every trader.
# Efficient vs. inefficient split at the 0.75 efficiency cutoff.
groups = [
    ('Everyone', None),
    ('Efficient_Traders', 'efficiency >= 0.75'),
    ('Inefficient_Traders', 'efficiency < 0.75')
]

make_multi_group_stats(groups)

In this market \\$17k flowed from inefficient traders to efficient traders. Why did \\$17k flow between these two groups?

Only \\$5.7k is from position profits. This is 34%, or about one third. Compare this to the ideal market with no spreads or biased prices where position profits are the only way to make money.

\\$9.2k (55%) is from price biases and the remaining \\$1.9k (11%) is from market spreads. More than half is from price biases.

Also consider that the efficient traders are grouped together because they didn't trade the strictly dominated contracts but they also had positive position profits, indicating that they were better at processing information. If these two things were uncorrelated the position_profit value here would be closer to zero. It isn't. This suggests that the ability to understand how markets work and trade efficiently is correlated to the ability to pick the right candidates to back.

Note I've also added a few more rows to this analysis. The "sum_max_in_pool" is the total of the maximum amount of money each trader had in the pool of cash any time during the life of this market. To calculate this we look for the notional exposure minus the accumulated pnl from closed out positions. It shows how big the capital pool is in the zero sum game and is critical for understanding the significance or scale of the numbers for a group. In particular, those two pnl numbers divided by the sum_max_in_pool value give the "gross_roi" and "net_roi". The efficient traders had a gross ROI of about 11%. For some groups these are rather notable. 

In [None]:
# Share of each component in the total; the three components sum to 16912.11.
# NOTE(review): the values are hard-coded, presumably copied by hand from the Efficient_Traders
# column above (position, bias, spread profit) - they go stale if the data changes.
[x / 16912.11 for x in [5770.22, 9225.32, 1916.57]]

## Trader Size

Next is size. Most traders were small and did not risk more than \$50. As a group they lost money by paying the spread, trading inefficiently, and picking the wrong candidates.

In [None]:
# Small vs. large split at a max_loss of 50.
groups = [
    ('Everyone', None),
    ('Small_Traders', 'max_loss <= 50'),
    ('Large_Traders', 'max_loss > 50')
]

make_multi_group_stats(groups)

Overall there was a negligible net flow of cash from small traders to large traders.

The Small Traders gross_roi is -1%. This contrasts with their performance in the DEM market.

## Trader Activity

Finally, trader activity. The active traders were profitable. It seems reasonable that traders who are trading in larger sizes or more frequently would put in the effort to learn how to trade profitably. These traders earned the spread, picked the right candidates and traded efficiently.

In [None]:
# Active vs. inactive split: max_opened_pos > 1 versus == 1.
groups = [
    ('Everyone', None),
    ('Active_Traders', 'max_opened_pos > 1'),
    ('Inactive_Traders', 'max_opened_pos == 1')
]

make_multi_group_stats(groups)

As one might expect there was a net flow of money from inactive traders to active traders. The net flow was \$1.5k. The active traders had positive spread and bias profit values but negative position profits.

# Trader Subgroup Breakdown

Next we do the same thing but this time grouping the traders into smaller groups using multiple characteristics at the same time.

The following is for half of Table 6.

In [None]:
# Size x activity cross-tabulation.
groups = [
    ('Everyone', None),
    ('Small_Active', 'max_opened_pos > 1 and max_loss <= 50'),
    ('Small_Inactive', 'max_opened_pos == 1 and max_loss <= 50'),
    ('Large_Active', 'max_opened_pos > 1 and max_loss > 50'),
    ('Large_Inactive', 'max_opened_pos == 1 and max_loss > 50')
]

# Transpose so groups become rows, then show only the head count of each group.
make_multi_group_stats(groups).T['trader_count']

Now let's look at trading efficiency. How about large and small efficient traders?

In [None]:
# Efficient traders, split further by size.
groups = [
    ('Everyone', None),
    ('Efficient_Traders', 'efficiency >= 0.75'),
    ('Efficient_Traders:Small', 'efficiency >= 0.75 and max_loss <= 50'),
    ('Efficient_Traders:Large', 'efficiency >= 0.75 and max_loss > 50')
]

make_multi_group_stats(groups)

The small efficient traders had position losses, indicating they backed the wrong candidates. The large and efficient traders earned more than half of their profits from the biased prices, unlike in the DEM market where it came from position profits.

In [None]:
# Share of each component in the total; the three components sum to 15969.92.
# NOTE(review): the values are hard-coded, presumably copied by hand from the Efficient_Traders:Large
# column above (position, bias, spread profit) - confirm and refresh if the data changes.
[x / 15969.92 for x in [4623.31, 9080.02, 2266.59]]

The inefficient traders did badly regardless of whether or not they traded in large or small sizes.

In [None]:
# Inefficient traders, split further by size.
groups = [
    ('Everyone', None),
    ('Inefficient_Traders', 'efficiency < 0.75'),
    ('Inefficient_Traders:Small', 'efficiency < 0.75 and max_loss <= 50'),
    ('Inefficient_Traders:Large', 'efficiency < 0.75 and max_loss > 50')
]

make_multi_group_stats(groups)

The Small and inefficient traders did better in this market than the DEM market. This is pretty good considering they took a lot of longshot bets.

Now, efficiency and trading activity?

In [None]:
# Efficient traders, split further by activity.
groups = [
    ('Everyone', None),
    ('Efficient_Traders', 'efficiency >= 0.75'),
    ('Efficient_Traders:Active', 'efficiency >= 0.75 and max_opened_pos > 1'),
    ('Efficient_Traders:Inactive', 'efficiency >= 0.75 and max_opened_pos == 1')
]

make_multi_group_stats(groups)

The efficient and active traders were profitable in spite of their negative position profits, indicating they picked the wrong candidates. Their spread profits are positive. This matches my pnl breakdown for IEM. A trader functioning as a market maker will earn the spread but lose money on position profits. The spread profits will exceed the position losses. Also observe the low take percentage.

The efficient and inactive traders were also profitable but interestingly made most of their profits from position profits. They lost money on the spread. These two groups were functioning very differently.

Now the inefficient traders. How did they do broken down by trading activity?

In [None]:
# Inefficient traders, split further by activity.
groups = [
    ('Everyone', None),
    ('Inefficient_Traders', 'efficiency < 0.75'),
    ('Inefficient_Traders:Active', 'efficiency < 0.75 and max_opened_pos > 1'),
    ('Inefficient_Traders:Inactive', 'efficiency < 0.75 and max_opened_pos == 1')
]

make_multi_group_stats(groups)

Unlike the DEM market, the inefficient traders who traded actively had negative positive position profits. However this is small compared to the sum_max_in_pool value (the total capital employed by this group).

The inefficient and inactive traders lost money on the spread, price bias and picking the wrong candidates. 59% was from the price bias, unlike the DEM market where it was mostly from position profits.

In [None]:
# Share of each component in the total loss; the three components sum to -16136.79.
# NOTE(review): the values are hard-coded, presumably copied by hand from the
# Inefficient_Traders:Inactive column above; the middle value is the ~59% share - confirm
# which rows the first and last values come from.
[x / -16136.79 for x in [-2153.76, -9548.61, -4434.42]]

Finally, grouping by all three characteristics at the same time.

As the results above would suggest the large and efficient traders made the most money, regardless of if they were active or not.

The "Efficient, Large, Active" traders had negative position profits and positive spread profits, much like a market maker. They seemed to have been employing a liquidity providing strategy.

The only profitable inefficient group are the "Inefficient, Large, Active" traders.  They were profitable but only because of their positive position profits.

In [None]:
# All eight efficiency x size x activity combinations.
# NOTE: this `groups` list is reused by the classification cell further down, which splits
# each label on whitespace into (efficiency, size, activity) - keep the labels as three words.
groups = [
    ('Efficient Small Active', 'efficiency >= 0.75 and max_loss <= 50 and max_opened_pos > 1'),
    ('Efficient Small Inactive', 'efficiency >= 0.75 and max_loss <= 50 and max_opened_pos == 1'),
    ('Efficient Large Active', 'efficiency >= 0.75 and max_loss > 50 and max_opened_pos > 1'),
    ('Efficient Large Inactive', 'efficiency >= 0.75 and max_loss > 50 and max_opened_pos == 1'),
    ('Inefficient Small Active', 'efficiency < 0.75 and max_loss <= 50 and max_opened_pos > 1'),
    ('Inefficient Small Inactive', 'efficiency < 0.75 and max_loss <= 50 and max_opened_pos == 1'),
    ('Inefficient Large Active', 'efficiency < 0.75 and max_loss > 50 and max_opened_pos > 1'),
    ('Inefficient Large Inactive', 'efficiency < 0.75 and max_loss > 50 and max_opened_pos == 1')
]

# Transposed so that each group is a row; this frame is also pickled at the end of the notebook.
group_summary = make_multi_group_stats(groups).T

group_summary.sort_values('pnl_net_fee', ascending=False)

# Trader Characterization Consistency

Are the traders characterized the same way in both markets? This question only applies for traders that actually traded in both markets, of course.

To answer this question we need to save the trader characterizations and compare them to similar data from the DEM Caucus market.

In [None]:
# One row per trader, initially empty; filled in group by group below.
classification_columns = 'category efficiency size activity'.split()
trader_classifications = DataFrame(index=behavior_analysis.index, columns=classification_columns)

# Relies on `groups` still holding the eight three-word labels defined in the previous code
# cell: each label is stored whole and also split into its efficiency / size / activity words.
for label, condition in groups:
    indx = behavior_analysis.query(condition).index
    label_words = label.split()
    trader_classifications.loc[indx, 'category'] = label
    trader_classifications.loc[indx, 'efficiency size activity'.split()] = label_words

The efficiency, size, and activity columns are there to potentially help decompose the traders who are not in the same category.

In [None]:
# Preview the per-trader classification table.
trader_classifications.head()

In [None]:
def dump_pickle(data, filename):
    """Pickle ``data`` to ``data/<filename>``, relative to the current working directory.

    The ``data`` directory is created if it does not exist yet. An existing file with the
    same name is overwritten.
    """
    target_dir = 'data'
    # Create the output directory on first use instead of failing with FileNotFoundError.
    os.makedirs(target_dir, exist_ok=True)
    with open(os.path.join(target_dir, filename), 'wb') as f:
        pickle.dump(data, f)

Also saving the trader stats summary so we can look at the correlation of trader PnL between the two markets, potentially by category.

In [None]:
# Persist the three result tables under data/, each file name prefixed with the market basename.
outputs_to_save = [
    (trader_classifications, '.trader_classifications.p'),
    (trader_stats_summary, '.trader_stats_summary.p'),
    (group_summary, '.group_summary.p'),
]
for table, suffix in outputs_to_save:
    dump_pickle(table, basename + suffix)