## Analysis

This notebook contains the code for analysing the final dataset. Although the nature of the dataset limits how rigorous an econometric analysis can be, some basic tests for market efficiency can still offer considerable insight into how the machine learning traders affected the market.

The tests examine a variety of factors, including mean-reversion tendencies, runs tests, and whether our different types of traders outperformed simple strategies such as filters and buy-and-hold.

In [21]:
import csv
import glob
import math
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [46]:
def unpack_data(filepath):
    """Load every CSV file in a directory into a dict of DataFrames (one per run).

    Parameters
    ----------
    filepath : str
        Directory that is searched (non-recursively) for ``*.csv`` files.

    Returns
    -------
    dict[str, pandas.DataFrame]
        Maps each file's base name, without the ``.csv`` extension, to the
        DataFrame read from that file (first row used as the header). The dict
        is empty when the directory contains no CSV files.
    """
    datadict = {}
    for csv_path in glob.glob(os.path.join(filepath, "*.csv")):
        # Build the key from the file name itself rather than from a fixed
        # character offset, so it does not depend on the directory path length.
        run_name = os.path.splitext(os.path.basename(csv_path))[0]
        datadict[run_name] = pd.read_csv(csv_path, header=0)
    # Column pruning is currently disabled:
    #    for k, v in datadict.items():
    #        v.drop(columns=["Unnamed: 0", "volume", "spread", "10_MA", "50_MA"], inplace=True)
    return datadict

In [47]:
# Root folder holding one "gen{i}_sims/data" directory of CSV runs per generation.
# NOTE: this is a machine-specific absolute path; change it here to run elsewhere.
SIMULATIONS_DIR = "/Users/karangarg/Documents/Year 3 Modules/EC331/Code/rae_repo/simulations"
N_GENERATIONS = 51  # loads gen0 .. gen50

# gendict["gen{i}"] is the dict of per-run DataFrames returned by unpack_data
# for generation i.
gendict = {}
for i in range(N_GENERATIONS):
    gendict[f"gen{i}"] = unpack_data(f"{SIMULATIONS_DIR}/gen{i}_sims/data")

# Later cells refer to generation 0 as `gen0`; reuse the data loaded above
# rather than reading the same files a second time.
gen0 = gendict["gen0"]

In [64]:
def compute_mean_squared_diff(datadict):
    """Average, across simulations, of the mean squared pricing error.

    For each simulation the squared difference between ``trading_price`` and
    ``true_price`` is averaged over all of its periods. Those per-simulation
    means are then averaged, giving every simulation equal weight.

    Parameters
    ----------
    datadict : dict[str, pandas.DataFrame]
        One DataFrame per simulation, each with ``trading_price`` and
        ``true_price`` columns.

    Returns
    -------
    float
        Average of the per-simulation mean squared differences. NaN when
        ``datadict`` is empty, or when any simulation has no rows or contains
        a missing price (the NaN propagates through the averages).
    """
    msd_list = []
    for sim in datadict.values():
        # Work on whole columns at once instead of looking rows up one by one.
        diff = (sim["trading_price"].to_numpy(dtype=float)
                - sim["true_price"].to_numpy(dtype=float))
        msd_list.append(float(np.mean(diff ** 2)) if diff.size else float("nan"))
    if not msd_list:
        return float("nan")
    return sum(msd_list) / len(msd_list)

In [65]:
# Average mean squared pricing error across the generation-0 simulations.
compute_mean_squared_diff(gendict["gen0"])

In [60]:
def _deviation_run_lengths(deviating):
    """Split a per-period deviation mask into run lengths.

    ``deviating`` is a sequence of booleans, one per period. A run is a maximal
    stretch of consecutive True values. Returns ``(returned, unreturned)``:
    ``returned`` holds, for every run followed by a False period, the number of
    periods from the run's start to that False period; ``unreturned`` holds, for
    a run still open at the final period, the last index minus the start index.
    """
    returned, unreturned = [], []
    run_start = None
    for t, is_deviating in enumerate(deviating):
        if is_deviating:
            if run_start is None:
                run_start = t
        elif run_start is not None:
            returned.append(t - run_start)
            run_start = None
    if run_start is not None:
        unreturned.append(len(deviating) - 1 - run_start)
    return returned, unreturned


def _mean_or_nan(values):
    """Arithmetic mean of ``values``, or NaN when there is nothing to average."""
    return sum(values) / len(values) if values else float("nan")


def reversion_rate(datadict, dev=1):
    """Summarise how often price deviates from the true price and how long it takes to return.

    A period is "deviating" when ``abs(trading_price - true_price) > dev``.
    Consecutive deviating periods form one deviation run. A run "returns" if a
    later period is back within ``dev``; otherwise it lasts to the end of the
    simulation. A run may begin at the very first period. Periods with a
    missing price compare as not deviating.

    Parameters
    ----------
    datadict : dict[str, pandas.DataFrame]
        One DataFrame per simulation, each with ``trading_price`` and
        ``true_price`` columns.
    dev : float, default 1
        Absolute price gap above which a period counts as deviating.

    Returns
    -------
    tuple[float, float, float, float]
        ``(final_avg_return, final_avg_no_return, avg_no_returns, avg_no_no_returns)``

        * ``final_avg_return`` - mean, over simulations with at least one
          returning run, of that simulation's average returning-run length.
        * ``final_avg_no_return`` - the same for non-returning runs, whose
          length is measured as last index minus start index.
        * ``avg_no_returns`` - number of returning runs per simulation,
          averaged over all simulations.
        * ``avg_no_no_returns`` - number of non-returning runs per simulation,
          averaged over all simulations.

        Any average with nothing to average (for example no run ever fails to
        return, or ``datadict`` is empty) is NaN instead of raising.
    """
    final_avg_return_list = []      # per-simulation average length of returning runs
    final_avg_no_return_list = []   # per-simulation average length of non-returning runs
    return_count_list = []          # per-simulation number of returning runs
    no_return_count_list = []       # per-simulation number of non-returning runs
    for sim in datadict.values():
        gap = np.abs(sim["trading_price"].to_numpy(dtype=float)
                     - sim["true_price"].to_numpy(dtype=float))
        returned, unreturned = _deviation_run_lengths((gap > dev).tolist())
        if returned:
            final_avg_return_list.append(sum(returned) / len(returned))
            return_count_list.append(len(returned))
        if unreturned:
            final_avg_no_return_list.append(sum(unreturned) / len(unreturned))
            no_return_count_list.append(len(unreturned))

    n_sims = len(datadict)
    final_avg_return = _mean_or_nan(final_avg_return_list)
    final_avg_no_return = _mean_or_nan(final_avg_no_return_list)
    # Run counts are averaged over every simulation, including those with no runs.
    avg_no_returns = sum(return_count_list) / n_sims if n_sims else float("nan")
    avg_no_no_returns = sum(no_return_count_list) / n_sims if n_sims else float("nan")
    return final_avg_return, final_avg_no_return, avg_no_returns, avg_no_no_returns

In [61]:
# Deviation-run statistics for the generation-0 simulations (default dev=1).
reversion_rate(gendict["gen0"])

(4.961647762793108, 149.50980392156862, 23.28, 0.51)

In [30]:
def inefficient_proportion(datadict, dev=1):
    """Average share of periods in which trading price deviates from true price.

    For each simulation, computes the fraction of periods where
    ``abs(trading_price - true_price) > dev``; those fractions are then averaged
    with equal weight per simulation. Periods with a missing price count as not
    deviating.

    Parameters
    ----------
    datadict : dict[str, pandas.DataFrame]
        One DataFrame per simulation, each with ``trading_price`` and
        ``true_price`` columns.
    dev : float, default 1
        Absolute price gap above which a period counts as deviating.

    Returns
    -------
    float
        Mean of the per-simulation deviating fractions. NaN when ``datadict``
        is empty or any simulation has no rows.
    """
    prop_list = []
    for sim in datadict.values():
        gap = np.abs(sim["trading_price"].to_numpy(dtype=float)
                     - sim["true_price"].to_numpy(dtype=float))
        # The mean of a boolean mask is the fraction of True entries.
        prop_list.append(float(np.mean(gap > dev)) if gap.size else float("nan"))
    if not prop_list:
        return float("nan")
    return sum(prop_list) / len(prop_list)

In [31]:
# Share of periods with a price deviation in the generation-0 simulations (default dev=1).
inefficient_proportion(gendict["gen0"])

0.5242133546690094

SyntaxError: can't assign to literal (<ipython-input-67-4d9ce7923741>, line 3)