# stock2vec

Create vectors for stocks based on their relative volatility.

In [1]:
import datetime
import math
import numpy as np
import pandas as pd

In [2]:
# Read only the columns used downstream; `date` is parsed into datetimes.
df = pd.read_csv(
    'data/wiki_prices.csv',
    usecols=["ticker", "date", "adj_close", "adj_high", "adj_low", "adj_volume"],
    parse_dates=['date'],
)

# Put the rows in chronological order.
df = df.sort_values('date')

# Discard rows that lack any of the adjusted high / low / close prices.
df = df.dropna(subset=['adj_high', 'adj_low', 'adj_close'])

print(df.head())

        ticker       date   adj_high    adj_low  adj_close  adj_volume
7561760     KO 1962-01-02   0.273859   0.266600   0.266600   1612800.0
5620436     GE 1962-01-02   0.341163   0.332214   0.334451   2073600.0
1005749   ARNC 1962-01-02   3.492621   3.472967   3.472967     44800.0
6739482    IBM 1962-01-02  15.738806  15.561965  15.561965    387200.0
1416507     BA 1962-01-02   0.488470   0.480022   0.480022    352200.0


## Preprocessing

Keep only the daily rows dated 2000-01-01 or later whose adjusted volume is > 10000 and whose volatility is > 0, where volatility is defined as (adj_high - adj_low) / adj_close.

In [3]:
# Keep rows dated 2000-01-01 or later with adjusted volume above 10000.
# The cutoff is a pd.Timestamp rather than a datetime.date: the `date` column
# is datetime64 (parsed by read_csv), and newer pandas versions refuse ordering
# comparisons between a datetime64 column and a plain `date` object.
df = df[(df['date'] >= pd.Timestamp('2000-01-01')) & (df['adj_volume'] > 10000)]

# Daily volatility: the high-low range relative to the close.
# assign() returns a new frame, so nothing is written into a filtered slice
# (avoids SettingWithCopyWarning).
df = df.assign(volt=(df['adj_high'] - df['adj_low']) / df['adj_close'])

# Only ticker, date and volt are needed from here on; ticker and date stay at
# column positions 0 and 1.
df = df.drop(columns=['adj_volume', 'adj_high', 'adj_low', 'adj_close'])

# Keep strictly positive, finite volatilities. The finiteness check removes the
# +inf produced when adj_close is 0, which would otherwise pass `volt > 0`.
df = df[np.isfinite(df['volt']) & (df['volt'] > 0)]

# Within each day, order stocks by volatility so that rows adjacent in the
# table have the most similar volatility.
df = df.sort_values(['date', 'volt'])

print(df.head())

         ticker       date      volt
3017202   CNBKA 2000-01-03  0.003679
1872359     BMI 2000-01-03  0.004000
5209067    FLIC 2000-01-03  0.004068
5368021    FRED 2000-01-03  0.004391
12380997    STL 2000-01-03  0.004499


**Build Batches**

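For each stock, find the stocks whose volatility is closest to that ticker's on the same day. Because the rows are sorted by date and then by volatility, these are the rows immediately before and after it in the table (up to `window_size` on each side).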

In [34]:
# Two-way mapping between ticker symbols and the integer ids assigned to them.
ticker_to_int = {}
int_to_ticker = {}

def get_ticker_int(ticker):
    """Return the integer id for `ticker`, assigning the next free id on first use.

    Ids are handed out in order of first appearance, starting at 0. A newly
    assigned id is recorded in both `ticker_to_int` and `int_to_ticker`.
    """
    if ticker in ticker_to_int:
        return ticker_to_int[ticker]
    new_id = len(ticker_to_int)
    ticker_to_int[ticker] = new_id
    int_to_ticker[new_id] = ticker
    return new_id

def get_stock_date(stocks, idx):
    """Return the value at row position `idx` in the second column of `stocks`.

    The second column (position 1) is expected to hold the date, as it does in
    the notebook's `df`.
    """
    date_cell = stocks.iloc[[idx], 1]
    return date_cell.values[0]

def get_stock_ticker(stocks, idx):
    """Return the value at row position `idx` in the first column of `stocks`.

    The first column (position 0) is expected to hold the ticker symbol, as it
    does in the notebook's `df`.
    """
    ticker_cell = stocks.iloc[[idx], 0]
    return ticker_cell.values[0]

def get_stock_int(stocks, idx):
    """Return the integer id of the ticker at row position `idx` of `stocks`.

    The ticker is registered through `get_ticker_int` if it has no id yet.
    """
    symbol = get_stock_ticker(stocks, idx)
    return get_ticker_int(symbol)

def get_window(stocks, idx, window_size=5):
    """Return the ids of the stocks neighbouring row `idx` on the same date.

    A radius R is drawn uniformly from 1..window_size (inclusive) and the rows
    at positions idx-R .. idx+R are scanned, clipped to the bounds of `stocks`.
    A neighbour is kept when its date equals the date of row `idx` and its
    ticker id differs from that of row `idx`.

    Parameters
    ----------
    stocks : DataFrame with the ticker at column position 0 and the date at
        column position 1. Presumably sorted by date and then volatility, as
        the notebook's `df` is, so that adjacent rows have similar volatility.
    idx : row position of the centre stock.
    window_size : largest radius that may be drawn.

    Returns
    -------
    list of int
        Ticker ids of the retained neighbours, in row order.

    Side effect: every scanned row's ticker is registered through
    `get_stock_int`, whether or not it ends up in the window.
    """
    # randint's upper bound is exclusive, hence window_size + 1.
    R = np.random.randint(1, window_size+1)
    start = max(idx - R, 0)
    # Clamp to the last row; without this, a centre within R rows of the end
    # of the table makes iloc raise IndexError.
    stop = min(idx + R, len(stocks) - 1)

    stock_int = get_stock_int(stocks, idx)
    stock_date = get_stock_date(stocks, idx)

    window = []

    for i in range(start, stop+1):
        nearby_stock_int = get_stock_int(stocks, i)
        nearby_stock_date = get_stock_date(stocks, i)
        # Skip the centre stock itself and anything from a different day.
        if nearby_stock_int != stock_int and nearby_stock_date == stock_date:
            window.append(nearby_stock_int)

    return window

# Print a sampled window for the rows at positions 0 and 5.
for center in (0, 5):
    print('window for', get_stock_ticker(df, center), get_stock_int(df, center))
    neighbor_ids = get_window(df, center, 5)
    for neighbor_id in neighbor_ids:
        print(neighbor_id, int_to_ticker[neighbor_id])

window for CNBKA 0
1 BMI
2 FLIC
window for WEYS 3
0 CNBKA
1 BMI
2 FLIC
4 FRED
5 STL
6 UMBF
7 CRRC
8 THO
9 MAC
10 MAS


## Vector Math

Apple - Google = ?