In [None]:
using Dates, CSV, JuliaDB, OnlineStats, Plots, PlotThemes, Interact
theme(:dark)

# Big Data Visualization Tools

## `OnlineStats.Partition`

- Plot any-sized dataset.
- Here's a plot of 100 million observations:

In [None]:
# Stream a 10^8-step random walk through a `Partition` whose sections each
# track a `Mean` and `Extrema`, then plot the partitioned summary.
o = Partition(Series(Mean(), Extrema()), 200)

# Current position of the random walk.
y = randn()

for _ in 1:10^8
    # `global` makes the update of `y` explicit, so the loop behaves the same
    # in a notebook, at the REPL, and when the cell is run as a script.
    global y += randn()
    fit!(o, y)
end

plot(o)

# Kaggle's [Huge Stock Market Dataset](https://www.kaggle.com/borismarjanovic/price-volume-data-for-all-us-stocks-etfs)

- OHLC data for every stock/ETF traded in the US (each in its own CSV file)
- Just over 700MB

In [None]:
# Directory containing the per-stock files of the Kaggle dataset.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
path = "/Users/joshday/datasets/price-volume-data-for-all-us-stocks-etfs/Stocks/"

# List the files found in that directory.
readdir(path)

In [None]:
;head "$path/aapl.us.txt"

## Statistics/Plots Directly From CSV

In [None]:
"""
    plot_high_low(stock; kw...)

Plot the low/high range over time for `stock`.

The file `<path>/<stock>.us.txt` (built from the global `path`) is read with
`CSV.File`. For every row, the `Low` and `High` values are fitted, keyed by
the row's `Date`, into an `IndexedPartition` of `Extrema`. The plot title is
the upper-cased `stock` followed by the number of fitted observations. Any
extra keyword arguments are forwarded to `plot`.
"""
function plot_high_low(stock; kw...)
    part = IndexedPartition(Date, Extrema(), 500)
    file = "$path/$stock.us.txt"
    for r in CSV.File(file)
        day = r.Date
        # Two observations per row: the low and the high for the same date.
        fit!(part, [day => r.Low, day => r.High])
    end
    heading = uppercase(stock) * " (nobs = $(nobs(part)))"
    return plot(part; xlab="Date", title=heading, legend=false, kw...)
end

In [None]:
# Plot the low/high range for the "aapl" file.
plot_high_low("aapl")

## ...But this loads the entire CSV!

Even though the plots are built incrementally with OnlineStats, `CSV.File` still loads the entire file into memory.

### `CSV.Rows`

- `CSV.Rows` lets you lazily read from a CSV file
    - **Minimal memory footprint**
    - At the cost of **no type inference** (everything is treated as `String`)

In [None]:
"""
    plot_high_low2(stock; kw...)

Plot the low/high range over time for `stock`, reading the file lazily.

The file `<path>/<stock>.us.txt` (built from the global `path`) is iterated
with `CSV.Rows`, so each field is parsed by hand: `Date` with the
`yyyy-mm-dd` format, and `Low`/`High` as `Float64`. Both values are fitted,
keyed by the parsed date, into an `IndexedPartition` of `Extrema`. The plot
title is the upper-cased `stock` followed by the number of fitted
observations. Any extra keyword arguments are forwarded to `plot`.
"""
function plot_high_low2(stock; kw...)
    o = IndexedPartition(Date, Extrema(), 500)
    # Build the date format once, outside the loop, instead of handing the
    # pattern string to `Date` for every row.
    fmt = dateformat"yyyy-mm-dd"
    for row in CSV.Rows("$path/$stock.us.txt")
        dt = Date(row.Date, fmt)
        low = parse(Float64, row.Low)
        hi = parse(Float64, row.High)
        # Two observations per row: the low and the high for the same date.
        fit!(o, [dt => low, dt => hi])
    end
    t = uppercase(stock) * " (nobs = $(nobs(o)))"
    return plot(o; xlab="Date", title=t, legend=false, kw...)
end

In [None]:
# Same plot as before, this time produced by the `CSV.Rows`-based version.
plot_high_low2("aapl")

In [None]:
# Stack the "aapl" and "msft" plots in a 2x1 layout; `link=:x` is passed to link their x-axes.
plot(plot_high_low2("aapl"), plot_high_low2("msft"), layout=(2,1), link=:x)

## Loading Multiple Datasets at Once

- Working with only one CSV at a time limits what kinds of analyses we can do.

In [None]:
using JuliaDB

In [None]:
# Load the files under `path` into one table. `filenamecol = :stock` is passed
# so rows can later be grouped by the `stock` column.
t = loadtable(path, filenamecol = :stock)

### Passing an `OnlineStat` as a reducer

In [None]:
# Reduce the `Open` column of the whole table with a `Mean` stat.
reduce(Mean(), t, select=:Open)

In [None]:
# Reduce `Open` with a `Mean` stat separately for each `stock` group.
temp = groupreduce(Mean(), t, :stock, select=:Open)
# Keep column 1 and map `value` over column 2.
# NOTE(review): presumably this unwraps each `Mean` into its number — confirm.
select(temp, (1, 2 => value))

In [None]:
# For each `stock` group, fit the selected (`Date`, `Open`) columns into an
# `IndexedPartition` of `Extrema` indexed by `Date`.
temp = groupreduce(IndexedPartition(Date, Extrema(),50), t, :stock, select=(:Date, :Open))

In [None]:
using Interact

# Browse the per-stock partitions interactively. The slider spans every row of
# `temp`, and each plot is titled with that row's `stock` value.
@manipulate for i in 1:length(temp)
    plot(temp[i].IndexedPartition, title=string(temp[i].stock))
end

# TrueFX API

In [None]:
"""
    get_data(q = "")

Download the TrueFX rates CSV and parse it with `CSV.read`.

`q` is appended verbatim to the request URL after `f=csv&`, e.g.
`"c=EUR/USD"`. The response is parsed with the fixed column names listed
below and `footerskip=1`.
"""
function get_data(q = "")
    # Column names supplied to the parser via `header`.
    columns = [:pair, :utc, :big_bid_figure, :bid_points, :offer_bid_figure,
               :offer_points, :high, :low, :open]
    url = "https://webrates.truefx.com/rates/connect.html?f=csv&$q"
    # `download` returns the path of the file it saved the response to.
    localfile = download(url)
    return CSV.read(localfile; header=columns, footerskip=1)
end

In [None]:
# Fetch rates with no extra query parameters.
get_data()

In [None]:
# Fetch rates with `c=EUR/USD` added to the query string.
get_data("c=EUR/USD")

In [None]:
using OnlineStats, Plots
# Switch the plot theme and select the GR backend for the live plot.
theme(:juno)
gr()

# Extrema summarised over an `Int` index; the index fed below is the `utc` field.
o = IndexedPartition(Int, Extrema(), 10)

# Poll USD/JPY indefinitely, redrawing the plot after each new observation.
# The loop has no exit condition, so it runs until the cell is interrupted.
# NOTE(review): there is no pause between requests — confirm this polling rate
# is acceptable for the endpoint.
while true
    IJulia.clear_output(true)
    df = get_data("c=USD/JPY")
    # Fit (utc, bid_points) taken from the first row of the response.
    fit!(o, (df[1, :utc], df[1, :bid_points]))
    display(plot(o, xlab="UTC Time", ylab="Bid Points", title="nobs = $(nobs(o))"))
end