In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Historical Data

In [2]:
# List the raw round-3 data files. NOTE: `dir` is the Windows shell command;
# on macOS/Linux the equivalent is `!ls -l` with the same path.
!dir "../csv/round-3-island-data-bottle"

 Volume in drive C is Windows
 Volume Serial Number is 4C9C-5B8B

 Directory of C:\Users\joshu\Documents\Code\prosperity2\csv\round-3-island-data-bottle

2024-04-15  09:41 PM    <DIR>          .
2024-04-15  09:41 PM    <DIR>          ..
2024-04-15  09:41 PM         2,621,062 prices_round_3_day_0.csv
2024-04-15  09:41 PM         2,620,447 prices_round_3_day_1.csv
2024-04-15  09:41 PM         2,620,492 prices_round_3_day_2.csv
2024-04-15  09:41 PM           152,390 trades_round_3_day_0_nn.csv
2024-04-15  09:41 PM           146,572 trades_round_3_day_1_nn.csv
2024-04-15  09:41 PM           150,193 trades_round_3_day_2_nn.csv
               6 File(s)      8,311,156 bytes
               2 Dir(s)  44,314,460,160 bytes free


In [3]:
# Load one prices table and one trades table per day, keyed by day number.
days = [0, 1, 2]
pricesByDay, tradesByDay = {}, {}

for day in days:
    # Both exports are semicolon-delimited.
    prices_csv = f"../csv/round-3-island-data-bottle/prices_round_3_day_{day}.csv"
    trades_csv = f"../csv/round-3-island-data-bottle/trades_round_3_day_{day}_nn.csv"
    pricesByDay[day] = pd.read_csv(prices_csv, sep=';')
    tradesByDay[day] = pd.read_csv(trades_csv, sep=';')

In [4]:
# Preview the day-0 prices table: up to three bid/ask price+volume levels per
# row, plus mid_price (appears to be one row per timestamp and product).
pricesByDay[0].head()

Unnamed: 0,day,timestamp,product,bid_price_1,bid_volume_1,bid_price_2,bid_volume_2,bid_price_3,bid_volume_3,ask_price_1,ask_volume_1,ask_price_2,ask_volume_2,ask_price_3,ask_volume_3,mid_price,profit_and_loss
0,0,0,CHOCOLATE,7999,111,,,,,8001,111,,,,,8000.0,0.0
1,0,0,STRAWBERRIES,3999,210,,,,,4001,210,,,,,4000.0,0.0
2,0,0,ROSES,14999,72,,,,,15001,72,,,,,15000.0,0.0
3,0,0,GIFT_BASKET,71348,19,71347.0,24.0,,,71362,19,71363.0,24.0,,,71355.0,0.0
4,0,100,GIFT_BASKET,71344,1,71343.0,12.0,71342.0,20.0,71355,1,71356.0,12.0,71357.0,20.0,71349.5,0.0


In [5]:
# Preview the day-0 trades table (timestamp, symbol, price, quantity, ...).
tradesByDay[0].head()

Unnamed: 0,timestamp,buyer,seller,symbol,currency,price,quantity
0,0,,,CHOCOLATE,SEASHELLS,8001.0,8
1,0,,,ROSES,SEASHELLS,15001.0,3
2,200,,,ROSES,SEASHELLS,15000.0,3
3,500,,,STRAWBERRIES,SEASHELLS,3999.0,10
4,1000,,,GIFT_BASKET,SEASHELLS,71360.0,2


## Prices

In [6]:
def plot_prices(product):
    """Scatter-plot every quoted bid and ask level of ``product``, one panel per day.

    Reads the notebook-level ``days`` list and ``pricesByDay`` mapping. For each
    day, the rows whose ``product`` column equals ``product`` are selected; the
    three bid price columns are drawn in green and the three ask price columns
    in red, all against ``timestamp``.

    Parameters
    ----------
    product : str
        Value to match in the ``product`` column (e.g. ``'GIFT_BASKET'``).

    Returns
    -------
    None
    """
    bid_columns = ['bid_price_1', 'bid_price_2', 'bid_price_3']
    ask_columns = ['ask_price_1', 'ask_price_2', 'ask_price_3']

    # Size the grid from `days` rather than a hard-coded 3. squeeze=False keeps
    # `axes` two-dimensional, so a single-day list can still be indexed.
    fig, axes = plt.subplots(nrows=len(days), figsize=(15, 25), squeeze=False)

    for ax, day in zip(axes[:, 0], days):
        dfPrices = pricesByDay[day][lambda df: df['product'] == product]
        for colName in bid_columns:
            ax.scatter(dfPrices['timestamp'], dfPrices[colName], c='g')
        for colName in ask_columns:
            ax.scatter(dfPrices['timestamp'], dfPrices[colName], c='r')
        # The panel shows both sides of the book, so the title says so.
        ax.set_title(f"Bid (green) and ask (red) prices for {product} on day {day}")
        ax.set_xlabel('timestamp')
        ax.set_ylabel('price')

In [7]:
products = ['CHOCOLATE', 'STRAWBERRIES', 'ROSES', 'GIFT_BASKET']
price_columns = ['bid_price_1', 'bid_price_2', 'bid_price_3', 'ask_price_1', 'ask_price_2', 'ask_price_3']
meanPricesByDay = {}

for day in days:
    priceCols = pricesByDay[day][price_columns]
    # mean_price is the unweighted average of whichever bid/ask levels are
    # quoted on that row; mean(axis=1) skips the NaN (unquoted) levels, which is
    # exactly sum / non-null count. assign() returns a new frame, so the raw
    # pricesByDay tables stay untouched.
    meanPricesByDay[day] = pricesByDay[day].assign(mean_price=priceCols.mean(axis=1))

# How far the mean_price deviates from mid price
(meanPricesByDay[0]['mean_price'] - meanPricesByDay[0]['mid_price']).describe()

count    40000.000000
mean         0.000021
std          0.539411
min         -5.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          5.300000
dtype: float64

In [8]:
# Number of lagged observations used as regression features for each product:
# getXy() uses P[product] as the length of the window preceding each target.
P = {
    'CHOCOLATE': 4,
    'STRAWBERRIES': 4,
    'ROSES': 4,
    'GIFT_BASKET': 4
}

# Column of meanPricesByDay that getXy() reads for both the lagged features
# and the target value.
response_name = 'mean_price' # try mid_price

def getXy(product):
    """Build lagged features and next-step targets for one product.

    Reads the notebook-level ``days``, ``meanPricesByDay``, ``P`` and
    ``response_name``. For each day, the ``response_name`` values of the rows
    whose ``product`` column equals ``product`` are taken in stored order; every
    run of ``P[product]`` consecutive values becomes one feature row and the
    value immediately after it becomes the target. Windows are built day by
    day, so no window spans two days.

    Parameters
    ----------
    product : str
        Value to match in the ``product`` column; must also be a key of ``P``.

    Returns
    -------
    X : numpy.ndarray, shape (n_obs, P[product])
        Lagged values, oldest first (column 0 is the oldest lag).
    y : numpy.ndarray, shape (n_obs,)
        The value that follows each window.

    Notes
    -----
    Prints the number of observations. If no day has more than ``P[product]``
    matching rows, ``X`` is an empty ``(0, P[product])`` array.
    """
    n_lags = P[product]
    X_parts, y_parts = [], []

    for day in days:
        frame = meanPricesByDay[day]
        series = frame.loc[frame['product'] == product, response_name].to_numpy()
        # A day needs at least n_lags + 1 rows to yield one (window, target) pair.
        if series.shape[0] <= n_lags:
            continue
        # Vectorised replacement for the per-row iloc loop: window k covers
        # series[k:k + n_lags]. The final window has no following value, so it
        # is dropped.
        windows = np.lib.stride_tricks.sliding_window_view(series, n_lags)
        X_parts.append(windows[:-1])
        y_parts.append(series[n_lags:])

    if X_parts:
        # concatenate copies, so the results are ordinary writable arrays
        # rather than read-only views into the source column.
        X = np.concatenate(X_parts)
        y = np.concatenate(y_parts)
    else:
        X = np.empty((0, n_lags))
        y = np.empty(0)

    print(f"{X.shape[0]} training observations")
    return X, y

In [9]:
from sklearn.linear_model import LinearRegression

## Regress Chocolate

In [10]:
# Regress each CHOCOLATE price on its preceding P['CHOCOLATE'] values.
X_chocolate, y_chocolate = getXy('CHOCOLATE')
lm_chocolate = LinearRegression()
lm_chocolate.fit(X_chocolate, y_chocolate)
# Shown as (intercept, coefficients); coefficients run oldest lag -> most recent.
(lm_chocolate.intercept_, lm_chocolate.coef_)

29988 training observations


(0.14240089710074244,
 array([-0.01108489,  0.00786715,  0.04070572,  0.96249293]))

## Regress Strawberries

In [11]:
# Regress each STRAWBERRIES price on its preceding P['STRAWBERRIES'] values.
X_strawberries, y_strawberries = getXy('STRAWBERRIES')
lm_strawberries = LinearRegression()
lm_strawberries.fit(X_strawberries, y_strawberries)
# Shown as (intercept, coefficients); coefficients run oldest lag -> most recent.
(lm_strawberries.intercept_, lm_strawberries.coef_)

29988 training observations


(0.37947875425743405,
 array([-0.00548191,  0.05122488,  0.1366758 ,  0.81748684]))

## Regress Roses

In [12]:
# Regress each ROSES price on its preceding P['ROSES'] values.
X_roses, y_roses = getXy('ROSES')
lm_roses = LinearRegression()
lm_roses.fit(X_roses, y_roses)
# Shown as (intercept, coefficients); coefficients run oldest lag -> most recent.
(lm_roses.intercept_, lm_roses.coef_)

29988 training observations


(4.238897536935838,
 array([-0.00885487,  0.00141759,  0.01359939,  0.99354437]))

## Regress Gift Basket

In [13]:
# Regress each GIFT_BASKET price on its preceding P['GIFT_BASKET'] values.
X_gift, y_gift = getXy('GIFT_BASKET')
lm_gift = LinearRegression()
lm_gift.fit(X_gift, y_gift)
# Shown as (intercept, coefficients); coefficients run oldest lag -> most recent.
(lm_gift.intercept_, lm_gift.coef_)

29988 training observations


(3.180094202849432,
 array([-5.15603870e-03, -9.81008814e-04, -1.12541938e-03,  1.00721665e+00]))

## Trades

In [14]:
# TODO: analyse the trades data sets (tradesByDay); nothing has been done with them yet.