# Data Overview 

This notebook explores and visualizes the forex data stored in the `data` folder.

## Connect to Spark

The `findspark` package locates the local Spark installation so that `pyspark` can be imported in the notebook.

In [2]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark import SparkContext
import pyspark

# Build the Spark session for this notebook; getOrCreate() reuses a session
# that is already running instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("Paktolos")
    .getOrCreate()
)

# Handles to the underlying SparkContext and an SQLContext wrapped around it.
sc = SparkContext.getOrCreate()
sqlc = SQLContext(sc)

## Table display

Display a slice of the data (3 to 5 January 2018) as a pandas table.

In [14]:
# `dt` is needed for the date bounds below; importing it here keeps the cell
# runnable on a fresh kernel.
import datetime as dt

dfjoinclean = spark.read.parquet("data/forexdatajoinclean/")

# Date window to display; Column.between() is inclusive on both ends.
beg = dt.datetime(year=2018, month=1, day=3)
end = dt.datetime(year=2018, month=1, day=5)

# Filter in Spark first so that only the selected slice is collected into pandas.
df = dfjoinclean.filter(dfjoinclean.date.between(beg, end)).toPandas()
df

Unnamed: 0,date,O_AUDUSD,C_AUDUSD,H_AUDUSD,L_AUDUSD,tick_AUDUSD,O_GBPUSD,C_GBPUSD,H_GBPUSD,L_GBPUSD,...,O_EURCHF,C_EURCHF,H_EURCHF,L_EURCHF,tick_EURCHF,O_USDCAD,C_USDCAD,H_USDCAD,L_USDCAD,tick_USDCAD
0,2018-01-03 00:00:00,0.782500,0.78263,0.78279,0.782500,186.0,1.35898,1.359060,1.359100,1.358960,...,1.17162,1.171650,1.17186,1.17142,371.0,1.251330,1.251420,1.25147,1.25131,162.0
1,2018-01-03 00:01:00,0.782630,0.78253,0.78263,0.782470,160.0,1.35906,1.359060,1.359080,1.358980,...,1.17165,1.171740,1.17188,1.17149,216.0,1.251420,1.251410,1.25156,1.25137,298.0
2,2018-01-03 00:02:00,0.782530,0.78261,0.78266,0.782490,146.0,1.35906,1.359090,1.359090,1.359010,...,1.17174,1.171730,1.17183,1.17165,83.0,1.251410,1.251330,1.25143,1.25133,407.0
3,2018-01-03 00:03:00,0.782610,0.78261,0.78263,0.782580,290.0,1.35909,1.359110,1.359110,1.359000,...,1.17173,1.171600,1.17177,1.17154,139.0,1.251330,1.251400,1.25141,1.25131,63.0
4,2018-01-03 00:04:00,0.782610,0.78272,0.78275,0.782590,215.0,1.35911,1.359060,1.359110,1.359040,...,1.17160,1.171890,1.17194,1.17160,532.0,1.251400,1.251160,1.25144,1.25113,255.0
5,2018-01-03 00:05:00,0.782720,0.78273,0.78275,0.782710,31.0,1.35906,1.359130,1.359140,1.359070,...,1.17189,1.171730,1.17205,1.17167,1414.0,1.251160,1.251020,1.25119,1.25101,355.0
6,2018-01-03 00:06:00,0.782730,0.78272,0.78280,0.782710,97.0,1.35913,1.359100,1.359170,1.359080,...,1.17173,1.171730,1.17197,1.17147,1420.0,1.251020,1.251070,1.25108,1.25101,375.0
7,2018-01-03 00:07:00,0.782720,0.78272,0.78274,0.782700,47.0,1.35910,1.359130,1.359160,1.359070,...,1.17173,1.172020,1.17207,1.17156,450.0,1.251070,1.250990,1.25113,1.25097,210.0
8,2018-01-03 00:08:00,0.782720,0.78275,0.78278,0.782730,49.0,1.35913,1.359290,1.359340,1.359090,...,1.17202,1.171860,1.17206,1.17178,155.0,1.250990,1.251010,1.25103,1.25087,224.0
9,2018-01-03 00:09:00,0.782750,0.78275,0.78277,0.782710,80.0,1.35929,1.359310,1.359360,1.359210,...,1.17186,1.171760,1.17199,1.17171,626.0,1.251010,1.251020,1.25110,1.25100,77.0


## Correlation between pairs

Compute the pairwise Pearson correlation between the `O_*` (open price) columns of the currency pairs over ten years of data.

In [11]:
from pyspark.mllib.stat import Statistics
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go

def corr_matrix(inpcols, beg=None, end=None, path="data/forexdatajoinclean/"):
    """Compute the Pearson correlation matrix of the selected columns.

    Parameters
    ----------
    inpcols : list of str
        Names of the columns to correlate.
    beg, end : optional
        Inclusive lower / upper bound on the ``date`` column. Either bound
        may be omitted to leave that side of the range open; when both are
        omitted the whole data set is used.
    path : str, optional
        Location of the parquet data set to read.

    Returns
    -------
    list
        The collected result of ``Correlation.corr``. The correlation matrix
        is read from the first field of the first row, i.e. ``result[0][0]``.
    """
    dfilter = spark.read.parquet(path)

    # Apply each bound on its own so that a single bound is honoured instead
    # of being silently ignored. With both bounds given this is the same
    # inclusive range as Column.between(beg, end).
    if beg is not None:
        dfilter = dfilter.filter(dfilter["date"] >= beg)
    if end is not None:
        dfilter = dfilter.filter(dfilter["date"] <= end)

    # dropna() with default arguments removes every row that has a null in
    # any column, not only in the columns listed in `inpcols`.
    dfilter = dfilter.dropna()

    # Correlation.corr works on a single vector column, so pack the selected
    # columns into "features" first.
    vec_assembler = VectorAssembler(inputCols=inpcols, outputCol="features")
    dffeatures = vec_assembler.transform(dfilter)
    return Correlation.corr(dffeatures, "features", "pearson").collect()

dfjoin = spark.read.parquet("data/forexdatajoinclean/")

# Keep one column per currency pair: the ones prefixed with "O_".
# The prefix filter already excludes "date", so no separate removal is needed.
inpcols = [x for x in dfjoin.columns if x.startswith("O_")]
rowsize = len(inpcols)

# No date bounds: correlate over the whole data set.
corrpearson = corr_matrix(inpcols=inpcols)

# The first field of the first collected row holds the correlation matrix.
# toArray() returns it as a 2-D numpy array in the matrix's own layout,
# whereas reshaping the flat `.values` buffer is only correct because a
# correlation matrix happens to be symmetric.
trace = go.Heatmap(
    z=corrpearson[0][0].toArray(),
    zmin=-1,
    zmax=1,
    name="10yearsCorr",
    x=inpcols,
    y=inpcols)

data = [trace]

layout = go.Layout(
    title='Currency pairs correlation based on minute intervals since 2008 computed using spark'
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='corr heatmap')
