## Analysis

This notebook contains the code for the analysis of the final dataset. Because of the nature of the dataset, rigorous econometric analysis is limited to basic stationarity tests; even so, some basic tests for market efficiency can offer considerable insight into how the machine learning traders affected the market.

The tests we conduct look at a variety of factors, including mean-reversion tendencies, runs tests, and whether our different types of traders outperformed some simple strategies such as filters and buy-and-hold. We will also test for stationarity and for cointegration (between the fundamental price and the trading price), although these results should again be taken with a grain of salt, as the price series were generated by our simulations rather than by a true random process.

In [113]:
import collections
import csv
import glob
import math
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.tsa.stattools as sm
from tqdm import tqdm

In [5]:
def unpack_data(filepath):
    """Load every CSV directly inside ``filepath`` into a dict of DataFrames.

    Parameters
    ----------
    filepath : str
        Directory to search; the files read are those matching
        ``<filepath>/*.csv``.

    Returns
    -------
    dict
        Maps each file's base name (no directory, no extension) to the
        DataFrame read from it, using the first row as the header. Entries
        are inserted in the order ``glob.glob`` returns the files.
    """
    datadict = {}
    for csv_path in glob.glob(filepath + "/*.csv"):
        # Take the key from the file name itself. A fixed character offset
        # only works for one exact path length, so it truncates names as soon
        # as the folder name grows (e.g. "gen10_sims" versus "gen9_sims").
        run_name = os.path.splitext(os.path.basename(csv_path))[0]
        datadict[run_name] = pd.read_csv(csv_path, header=0)
    return datadict

In [9]:
# Load the run data for generations 0-50: gendict["gen<i>"] holds whatever
# unpack_data returns for that generation's "data" folder.
# NOTE(review): the absolute path below is specific to one machine; it must be
# adjusted before this cell can run anywhere else.
gendict = {}
for i in tqdm(range(51)):
    genname = f"gen{i}"
    gendict[genname] = unpack_data(f"/Users/karangarg/Documents/Year 3 Modules/EC331/Code/rae_repo/simulations/gen{i}_sims/data")

100%|██████████| 51/51 [00:14<00:00,  3.53it/s]


In [17]:
# Generation numbers 0..50, used later as the index of the summary table.
gens = np.arange(51)

In [15]:
def compute_mean_squared_diff(datadict):
    """Average mean squared gap between trading price and true price.

    For each simulation the squared difference
    ``(trading_price - true_price) ** 2`` is averaged over all periods; the
    per-simulation means are then averaged over the simulations.

    Parameters
    ----------
    datadict : dict
        Maps a simulation name to a DataFrame with ``trading_price`` and
        ``true_price`` columns.

    Returns
    -------
    float
        Mean of the per-simulation mean squared differences. NaN when
        ``datadict`` is empty (a simulation with no rows contributes NaN).
    """
    msd_list = []
    for v in datadict.values():
        gap = v["trading_price"] - v["true_price"]
        # skipna=False keeps a missing price visible as NaN instead of
        # silently averaging over the remaining periods.
        msd_list.append(float((gap ** 2).mean(skipna=False)))
    if not msd_list:
        return float("nan")
    return sum(msd_list) / len(msd_list)

def gen_msd(gendict):
    """Return an array with compute_mean_squared_diff applied to every generation.

    ``gendict`` maps a generation name to that generation's dict of
    simulations; the output has one entry per generation, in the iteration
    order of ``gendict``.
    """
    return np.array([compute_mean_squared_diff(sims) for sims in gendict.values()])

In [22]:
# Average mean squared price gap, one entry per generation in gendict.
msd = gen_msd(gendict)

In [20]:
def compute_reversion_rate(datadict, dev=1):
    """Summarise how often and for how long trading price strays from true price.

    A deviation run starts at a period whose gap
    ``abs(trading_price - true_price)`` exceeds ``dev`` while the previous
    period's gap did not. The first period of a simulation has no predecessor,
    so a deviation there always starts a run. A run "returns" at the first
    later period whose gap is back within ``dev``; if the gap is still above
    ``dev`` at the final period the run is "unreturned".

    Run lengths are measured as ``t - start``: for a returned run ``t`` is the
    period at which the price is back within ``dev``; for an unreturned run
    ``t`` is the last period of the simulation.

    Parameters
    ----------
    datadict : dict
        Maps a simulation name to a DataFrame with ``trading_price`` and
        ``true_price`` columns.
    dev : float, default 1
        Gap above which the two prices count as having deviated.

    Returns
    -------
    tuple of four floats
        ``(final_avg_return, final_avg_no_return, avg_no_returns,
        avg_no_no_returns)``:

        * mean, over simulations with at least one returned run, of that
          simulation's mean returned-run length;
        * the same for unreturned runs;
        * number of returned runs per simulation, averaged over all
          simulations in ``datadict``;
        * number of unreturned runs per simulation, averaged likewise.

        An entry is NaN when it is undefined (no run of that kind anywhere, or
        an empty ``datadict``).
    """
    final_avg_return_list = []     # per-simulation mean length of returned runs
    final_avg_no_return_list = []  # per-simulation mean length of unreturned runs
    returned_runs = 0              # total returned runs across simulations
    unreturned_runs = 0            # total unreturned runs across simulations

    for v in datadict.values():
        gap = (v["trading_price"] - v["true_price"]).abs().to_numpy()
        # Two masks instead of one: a NaN gap is neither outside nor inside,
        # exactly as with the scalar comparisons `> dev` and `<= dev`.
        outside = gap > dev
        inside = gap <= dev
        last = len(gap) - 1

        dev_return_list = []
        dev_no_return_list = []
        for i in range(len(gap)):
            if not outside[i]:
                continue
            # Only the first period of a run starts it. Index 0 is handled
            # explicitly so it is never compared with the final row, which is
            # what a negative index (i - 1 == -1) would do.
            if i > 0 and not inside[i - 1]:
                continue
            for t in range(i, len(gap)):
                if outside[t] and t == last:
                    dev_no_return_list.append(t - i)
                    break
                elif inside[t]:
                    dev_return_list.append(t - i)
                    break

        if dev_return_list:
            final_avg_return_list.append(sum(dev_return_list) / len(dev_return_list))
            returned_runs += len(dev_return_list)
        if dev_no_return_list:
            final_avg_no_return_list.append(sum(dev_no_return_list) / len(dev_no_return_list))
            unreturned_runs += len(dev_no_return_list)

    nan = float("nan")
    n_sims = len(datadict)
    final_avg_return = (
        sum(final_avg_return_list) / len(final_avg_return_list) if final_avg_return_list else nan
    )
    final_avg_no_return = (
        sum(final_avg_no_return_list) / len(final_avg_no_return_list) if final_avg_no_return_list else nan
    )
    avg_no_returns = returned_runs / n_sims if n_sims else nan
    avg_no_no_returns = unreturned_runs / n_sims if n_sims else nan
    return final_avg_return, final_avg_no_return, avg_no_returns, avg_no_no_returns

def reversion_stats(gendict):
    """Run compute_reversion_rate on every generation and collect the results.

    Returns four arrays, each with one entry per generation in the iteration
    order of ``gendict``, holding the first, second, third and fourth value
    returned by compute_reversion_rate respectively.
    """
    per_generation = [compute_reversion_rate(sims) for sims in gendict.values()]
    returned_run_length_array = np.array([stats[0] for stats in per_generation])
    unreturned_run_length_array = np.array([stats[1] for stats in per_generation])
    returned_devs_per_run_array = np.array([stats[2] for stats in per_generation])
    unreturned_devs_per_run_array = np.array([stats[3] for stats in per_generation])
    return (
        returned_run_length_array,
        unreturned_run_length_array,
        returned_devs_per_run_array,
        unreturned_devs_per_run_array,
    )

In [21]:
# Per-generation reversion statistics, unpacked in the order reversion_stats returns them.
avg_ret_run, avg_unret_run, no_ret_runs, no_unret_runs = reversion_stats(gendict)

In [25]:
def compute_inefficient_proportion(datadict, dev=1):
    """Average share of periods in which trading price and true price deviate.

    A period counts as deviating when
    ``abs(trading_price - true_price) > dev``. The share is computed per
    simulation and then averaged over the simulations.

    Parameters
    ----------
    datadict : dict
        Maps a simulation name to a DataFrame with ``trading_price`` and
        ``true_price`` columns.
    dev : float, default 1
        Gap above which the two prices count as having deviated.

    Returns
    -------
    float
        Mean per-simulation proportion of deviating periods. NaN when
        ``datadict`` is empty (a simulation with no rows contributes NaN).
    """
    prop_list = []
    for v in datadict.values():
        gap = (v["trading_price"] - v["true_price"]).abs()
        # Mean of a boolean mask = deviating periods / total periods.
        prop_list.append(float((gap > dev).mean()))
    if not prop_list:
        return float("nan")
    return sum(prop_list) / len(prop_list)

def inefficient_proportion(gendict):
    """Return an array with compute_inefficient_proportion applied to every generation.

    One entry per generation, in the iteration order of ``gendict``; the
    default deviation threshold of compute_inefficient_proportion is used.
    """
    return np.array([compute_inefficient_proportion(sims) for sims in gendict.values()])

In [26]:
# Share of periods with deviating prices, one entry per generation in gendict.
ineff_prop = inefficient_proportion(gendict)

In [35]:
def compute_volatility(datadict):
    """Average volatility of the trading price across a generation's simulations.

    Volatility of one simulation is the standard deviation (``np.std``) of its
    ``trading_price`` column; the return value is the mean of those standard
    deviations over all simulations in ``datadict``.
    """
    per_sim_std = [
        np.std(sim["trading_price"].to_numpy(copy=True))
        for sim in datadict.values()
    ]
    return np.mean(np.array(per_sim_std))

def volatility(gendict):
    """Return an array with compute_volatility applied to every generation.

    One entry per generation, in the iteration order of ``gendict``.
    """
    return np.array([compute_volatility(sims) for sims in gendict.values()])

In [36]:
# Average trading-price volatility, one entry per generation in gendict.
vol = volatility(gendict)

In [64]:
def compute_stationarity(datadict, p=0.01):
    """Proportion of a generation's simulations whose trading price tests as stationary.

    Each simulation's ``trading_price`` series is passed to
    ``sm.adfuller(..., regression="ct")``; a simulation counts as stationary
    when the test's p-value is below ``p``.

    Parameters
    ----------
    datadict : dict
        Maps a simulation name to a DataFrame with a ``trading_price`` column.
    p : float, default 0.01
        Significance level the p-value is compared against.

    Returns
    -------
    float
        Stationary simulations divided by the simulations that could be
        tested. NaN when no simulation could be tested (empty ``datadict`` or
        every test raised ``ValueError``).
    """
    stat_count = 0
    total_count = 0
    for v in datadict.values():
        price_array = v["trading_price"].to_numpy(copy=True)
        try:
            # Index the p-value rather than unpacking a fixed-length tuple: an
            # unpacking mismatch also raises ValueError and would be swallowed
            # by the handler below, silently dropping every simulation.
            pval = sm.adfuller(price_array, regression="ct")[1]
        except ValueError:
            # Best effort: a series the test rejects is left out of both the
            # numerator and the denominator.
            continue
        total_count += 1
        if pval < p:
            stat_count += 1
    if total_count == 0:
        return float("nan")
    return stat_count / total_count

def stationary_proportion(gendict, pcrit=0.01):
    """Return an array with compute_stationarity applied to every generation.

    ``pcrit`` is forwarded as the significance level. One entry per
    generation, in the iteration order of ``gendict``.
    """
    return np.array([compute_stationarity(sims, pcrit) for sims in gendict.values()])

In [65]:
# Proportion of stationary simulations per generation at the 1% (default), 5% and 10% levels.
stationary1 = stationary_proportion(gendict)
stationary5 = stationary_proportion(gendict, 0.05)
stationary10 = stationary_proportion(gendict, 0.1)

In [72]:
def compute_coint_proportion(datadict, p=0.01):
    """Proportion of a generation's simulations whose two price series test as cointegrated.

    Each simulation's ``trading_price`` and ``true_price`` series are passed
    to ``sm.coint(..., trend="nc")``; a simulation counts as cointegrated when
    the test's p-value is below ``p``.

    Parameters
    ----------
    datadict : dict
        Maps a simulation name to a DataFrame with ``trading_price`` and
        ``true_price`` columns.
    p : float, default 0.01
        Significance level the p-value is compared against.

    Returns
    -------
    float
        Cointegrated simulations divided by the simulations that could be
        tested. NaN when no simulation could be tested (empty ``datadict`` or
        every test raised ``ValueError``).
    """
    coint_count = 0
    total_count = 0
    for v in datadict.values():
        price_array = v["trading_price"].to_numpy(copy=True)
        true_array = v["true_price"].to_numpy(copy=True)
        try:
            # NOTE(review): newer statsmodels releases appear to spell the
            # no-constant option "n" instead of "nc" - confirm against the
            # installed version. An unrecognised option raising ValueError
            # would be swallowed below and leave every simulation untested.
            # The p-value is indexed rather than tuple-unpacked so that a
            # length mismatch cannot be swallowed the same way.
            pval = sm.coint(price_array, true_array, trend="nc")[1]
        except ValueError:
            # Best effort: a pair the test rejects is left out of both the
            # numerator and the denominator.
            continue
        total_count += 1
        if pval < p:
            coint_count += 1
    if total_count == 0:
        return float("nan")
    return coint_count / total_count

def cointegrated_proportion(gendict, pcrit=0.01):
    """Return an array with compute_coint_proportion applied to every generation.

    ``pcrit`` is forwarded as the significance level. One entry per
    generation, in the iteration order of ``gendict``.
    """
    return np.array([compute_coint_proportion(sims, pcrit) for sims in gendict.values()])

In [73]:
# Proportion of cointegrated simulations per generation at the 1% (default), 5% and 10% levels.
coint1 = cointegrated_proportion(gendict)
coint5 = cointegrated_proportion(gendict, 0.05)
coint10 = cointegrated_proportion(gendict, 0.1)

  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
  return np.dot(wresid, wresid) / self.df_resid


In [81]:
def compute_avg_spread(datadict):
    """Average spread across a generation's simulations.

    The ``spread`` column of each simulation is averaged first; the return
    value is the mean of those per-simulation averages.
    """
    per_sim_mean = [
        np.mean(sim["spread"].to_numpy(copy=True))
        for sim in datadict.values()
    ]
    return np.mean(np.array(per_sim_mean))
    
def avg_spread(gendict):
    """Return an array with compute_avg_spread applied to every generation.

    One entry per generation, in the iteration order of ``gendict``.
    """
    return np.array([compute_avg_spread(sims) for sims in gendict.values()])

In [82]:
# Average spread, one entry per generation in gendict.
spread = avg_spread(gendict)

In [77]:
# Collect every per-generation statistic computed above into one table,
# one row per generation (indexed by generation number).
summary_columns = {
    "vol": vol,
    "stat_prop1": stationary1,
    "stat_prop5": stationary5,
    "stat_prop10": stationary10,
    "coint_prop1": coint1,
    "coint_prop5": coint5,
    "coint_prop10": coint10,
    "spread": spread,
    "msd": msd,
    "ineff_prop": ineff_prop,
    "avg_ret_run": avg_ret_run,
    "avg_unret_run": avg_unret_run,
    "no_ret_runs": no_ret_runs,
    "no_unret_runs": no_unret_runs,
}
gendata = pd.DataFrame(data=summary_columns, index=gens)

In [91]:
def unpack_states(filepath):
    """Load every states CSV whose path starts with ``filepath`` into a dict of DataFrames.

    Unlike unpack_data, ``filepath`` is used as a path *prefix*: the files
    read are those matching ``<filepath>*.csv``, so the last path component
    selects files by the beginning of their name.

    Parameters
    ----------
    filepath : str
        Path prefix to match.

    Returns
    -------
    dict
        Maps each file's base name (no directory, no extension) to the
        DataFrame read from it, using the first row as the header. Entries
        are inserted in the order ``glob.glob`` returns the files.
    """
    datadict = {}
    for csv_path in glob.glob(filepath + "*.csv"):
        # Take the key from the file name itself. A fixed character offset
        # only works for one exact path length, so it truncates names as soon
        # as the folder name grows (e.g. "gen10_sims" versus "gen9_sims").
        state_name = os.path.splitext(os.path.basename(csv_path))[0]
        datadict[state_name] = pd.read_csv(csv_path, header=0)
    return datadict

In [96]:
# Load the final-state files for generations 0-50: fsdict["gen<i>"] holds whatever
# unpack_states returns for files whose path starts with ".../states/final".
# NOTE(review): the absolute path below is specific to one machine.
fsdict = {}
for i in tqdm(range(51)):
    fsname = f"gen{i}"
    fsdict[fsname] = unpack_states(f"/Users/karangarg/Documents/Year 3 Modules/EC331/Code/rae_repo/simulations/gen{i}_sims/states/final")    

100%|██████████| 51/51 [00:14<00:00,  3.64it/s]


In [99]:
# Load the initial-state files for generations 0-50: isdict["gen<i>"] holds whatever
# unpack_states returns for files whose path starts with ".../states/state0".
# NOTE(review): the absolute path below is specific to one machine.
isdict = {}
for i in tqdm(range(51)):
    isname = f"gen{i}"
    isdict[isname] = unpack_states(f"/Users/karangarg/Documents/Year 3 Modules/EC331/Code/rae_repo/simulations/gen{i}_sims/states/state0")    

100%|██████████| 51/51 [00:12<00:00,  4.02it/s]


In [114]:
def order_states(statedict):
    """Return an OrderedDict of ``statedict`` sorted by key in natural (numeric-aware) order.

    Digit runs inside a key are compared as integers, so ``"sim2"`` sorts
    before ``"sim10"``; a plain string sort would place ``"sim10"`` first and
    would not follow run order. Only the keys of the mapping passed in are
    sorted; nested dicts are left untouched.

    Parameters
    ----------
    statedict : dict
        Mapping whose keys contain a run or generation number.

    Returns
    -------
    collections.OrderedDict
        The same items, ordered by natural key order.
    """
    def natural_key(item):
        # re.split with a capturing group alternates text / digits / text ...,
        # so tokens at the same position are always of the same type.
        return [
            int(token) if token.isdigit() else token
            for token in re.split(r"(\d+)", str(item[0]))
        ]

    ordered_statedict = collections.OrderedDict(sorted(statedict.items(), key=natural_key))
    return ordered_statedict

# MAKE SURE ALL RESULTS ARE TURNED INTO ORDERED DICTS TO MAKE SURE THE RESULTS ARE VALID

In [108]:
def dom_flavour(fsdatadict):
    """Count, for one generation, how many runs each flavour dominates.

    For every run the rows of its ``flavour`` column are tallied over the five
    known flavours ("arb", "chart", "val", "noise", "lstm"); the flavour with
    the highest tally is that run's top flavour. Rows with any other value are
    ignored.

    Ties are resolved in favour of the flavour listed later in the order
    above. This also means a run with no known flavour at all is credited to
    "lstm".
    NOTE(review): that tie-break is inherited from the earlier count-keyed
    dict implementation - confirm it is the intended rule.

    Parameters
    ----------
    fsdatadict : dict
        Maps a run name to a DataFrame with a ``flavour`` column.

    Returns
    -------
    dict
        ``{"arb": n, "chart": n, "val": n, "noise": n, "lstm": n}`` where each
        ``n`` is the number of runs that flavour dominates.
    """
    flavours = ("arb", "chart", "val", "noise", "lstm")
    topflavourdict = dict.fromkeys(flavours, 0)
    for states in fsdatadict.values():
        # One vectorised tally per run instead of several row lookups per trader.
        tally = states["flavour"].value_counts()
        top_name, top_count = None, -1
        for name in flavours:
            count = int(tally.get(name, 0))
            # ">=" so that, on a tie, the later flavour wins.
            if count >= top_count:
                top_name, top_count = name, count
        topflavourdict[top_name] += 1
    return topflavourdict

def dom_flavour_all(fsdict):
    """Return a list with dom_flavour applied to every generation.

    One result per generation, in the iteration order of ``fsdict``.
    """
    return [dom_flavour(runs) for runs in fsdict.values()]

In [110]:
# Top-flavour counts from the final states, one dict per generation in fsdict.
topflavourslist = dom_flavour_all(fsdict)

In [None]:
def delta_flavour(isdatadict, fsdatadict):
    """Compute the average change in flavours for a generation (not implemented yet).

    Parameters
    ----------
    isdatadict : dict
        Initial-state data for one generation.
    fsdatadict : dict
        Final-state data for the same generation.

    Raises
    ------
    NotImplementedError
        Always. The body has not been written; a ``def`` without a body is a
        syntax error, so the stub fails explicitly instead.
    """
    # TODO: implement the comparison between initial and final flavour counts.
    raise NotImplementedError("delta_flavour has not been implemented yet")

In [111]:
# Inspect the final-state tables loaded for generation 0 (displays the whole dict of DataFrames).
fsdict["gen0"]

{'finalstatesim18':     Unnamed: 0    id  stock       bal       acc flavour
 0            0   id0     99     10.03  13617.58     arb
 1            1   id1     62   2224.56  10746.46     arb
 2            2   id2      0  10602.50  10602.50     arb
 3            3   id3     43   5631.57  11541.92     arb
 4            4   id4     98     36.20  13506.30     arb
 ..         ...   ...    ...       ...       ...     ...
 95          95  id95     36   6370.93  11319.13     val
 96          96  id96     27   6750.60  10461.75     val
 97          97  id97     81   1528.67  12662.12     val
 98          98  id98      0  10634.05  10634.05   noise
 99          99  id99     88   1147.16  13242.76   noise
 
 [100 rows x 6 columns],
 'finalstatesim30':     Unnamed: 0    id  stock      bal       acc flavour
 0            0   id0     99    54.64  16096.60     arb
 1            1   id1     39  6529.66  12849.22     arb
 2            2   id2     94    75.61  15307.37     arb
 3            3   id3     3

In [112]:
# Inspect the initial-state tables loaded for generation 0 (displays the whole dict of DataFrames).
isdict["gen0"]

{'state0sim39':     Unnamed: 0    id  stock   bal    acc flavour
 0            0   id0     50  5000  10000     arb
 1            1   id1     50  5000  10000     arb
 2            2   id2     50  5000  10000     arb
 3            3   id3     50  5000  10000     arb
 4            4   id4     50  5000  10000     arb
 ..         ...   ...    ...   ...    ...     ...
 95          95  id95     50  5000  10000   noise
 96          96  id96     50  5000  10000   noise
 97          97  id97     50  5000  10000   noise
 98          98  id98     50  5000  10000   noise
 99          99  id99     50  5000  10000   noise
 
 [100 rows x 6 columns],
 'state0sim11':     Unnamed: 0    id  stock   bal    acc flavour
 0            0   id0     50  5000  10000     arb
 1            1   id1     50  5000  10000     arb
 2            2   id2     50  5000  10000     arb
 3            3   id3     50  5000  10000     arb
 4            4   id4     50  5000  10000     arb
 ..         ...   ...    ...   ...    ...  