# Data Overview 

This notebook explores and visualizes the forex data stored in the `data` folder.

## Connect to Spark

The `findspark` package locates the local Spark installation and makes `pyspark` importable from the notebook.

In [2]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark import SparkContext
import pyspark

# Build the notebook's Spark session, reusing an existing one if present.
spark = (
    SparkSession.builder
    .appName("Paktolos")
    .getOrCreate()
)

# Get (or create) the SparkContext and wrap it in a SQLContext.
sc = SparkContext.getOrCreate()
sqlc = SQLContext(sc)

## Table display

Display a slice of the data as a pandas table.

In [12]:
# `dt` is used below but was never imported anywhere in the notebook;
# import it here so the cell also runs on a fresh kernel.
import datetime as dt

dfjoinclean = spark.read.parquet("data/forexdatajoinclean/")

# Date window to display; `between` keeps both bounds.
beg = dt.datetime(year=2018, month=1, day=3)
end = dt.datetime(year=2018, month=1, day=5)

# Filter in Spark first so that only the selected slice is converted to pandas.
df = dfjoinclean.filter(dfjoinclean.date.between(beg, end)).toPandas()
df.head()

Unnamed: 0,date,O_AUDUSD,C_AUDUSD,H_AUDUSD,L_AUDUSD,tick_AUDUSD,O_GBPUSD,C_GBPUSD,H_GBPUSD,L_GBPUSD,...,O_EURCHF,C_EURCHF,H_EURCHF,L_EURCHF,tick_EURCHF,O_USDCAD,C_USDCAD,H_USDCAD,L_USDCAD,tick_USDCAD
0,2018-01-03 00:00:00,0.7825,0.78263,0.78279,0.7825,186.0,1.35898,1.35906,1.3591,1.35896,...,1.17162,1.17165,1.17186,1.17142,371.0,1.25133,1.25142,1.25147,1.25131,162.0
1,2018-01-03 00:01:00,0.78263,0.78253,0.78263,0.78247,160.0,1.35906,1.35906,1.35908,1.35898,...,1.17165,1.17174,1.17188,1.17149,216.0,1.25142,1.25141,1.25156,1.25137,298.0
2,2018-01-03 00:02:00,0.78253,0.78261,0.78266,0.78249,146.0,1.35906,1.35909,1.35909,1.35901,...,1.17174,1.17173,1.17183,1.17165,83.0,1.25141,1.25133,1.25143,1.25133,407.0
3,2018-01-03 00:03:00,0.78261,0.78261,0.78263,0.78258,290.0,1.35909,1.35911,1.35911,1.359,...,1.17173,1.1716,1.17177,1.17154,139.0,1.25133,1.2514,1.25141,1.25131,63.0
4,2018-01-03 00:04:00,0.78261,0.78272,0.78275,0.78259,215.0,1.35911,1.35906,1.35911,1.35904,...,1.1716,1.17189,1.17194,1.1716,532.0,1.2514,1.25116,1.25144,1.25113,255.0


## Correlation between pairs

Compute the Pearson correlation between every pair of `O_*` columns (one per currency pair) over the full ten-year data set.

In [11]:
from pyspark.mllib.stat import Statistics
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go

def corr_matrix(inpcols, beg=None, end=None, path="data/forexdatajoinclean/"):
    """Compute the Pearson correlation matrix of the selected columns.

    Parameters
    ----------
    inpcols : list of str
        Names of the columns to assemble into the feature vector and correlate.
    beg, end : optional
        Inclusive lower / upper bound on the ``date`` column. Each bound is
        applied independently, so passing only one of them yields an
        open-ended range. With both ``None`` the whole dataset is used.
    path : str, optional
        Location of the parquet dataset to read. Defaults to the dataset used
        throughout this notebook.

    Returns
    -------
    list
        The collected result of ``Correlation.corr``; the correlation matrix
        is available as ``result[0][0]``.
    """
    dfjoin = spark.read.parquet(path)

    # Apply each bound on its own. Previously a single bound was silently
    # ignored because the filter required both `beg` and `end` to be set.
    dfilter = dfjoin
    if beg is not None:
        dfilter = dfilter.filter(dfjoin.date >= beg)
    if end is not None:
        dfilter = dfilter.filter(dfjoin.date <= end)

    # Only drop rows with missing values in the columns actually correlated;
    # nulls in unrelated columns should not shrink the sample.
    columns = list(inpcols)
    dfilter = dfilter.dropna(subset=columns)

    vec_assembler = VectorAssembler(inputCols=columns, outputCol="features")
    dffeatures = vec_assembler.transform(dfilter)
    pearsonCorr = Correlation.corr(dffeatures, 'features', 'pearson').collect()
    return pearsonCorr

dfjoin = spark.read.parquet("data/forexdatajoinclean/")

# Keep only the "O_" columns. The prefix test already excludes "date", so no
# separate removal is needed (list.remove would raise if "date" were absent).
inpcols = [x for x in dfjoin.columns if x.startswith("O_")]
rowsize = len(inpcols)

corrpearson = corr_matrix(inpcols=inpcols)

# corrpearson[0][0] holds the correlation matrix. toArray() returns it as a
# 2-D array directly, instead of reshaping the flat `.values` buffer, which
# only gave the right layout because a correlation matrix is symmetric.
corrvalues = corrpearson[0][0].toArray()

trace = go.Heatmap(
    z=corrvalues,
    zmin=-1,  # fix the colour scale to the full correlation range
    zmax=1,
    name="10yearsCorr",
    x=inpcols,
    y=inpcols)

data = [trace]

layout = go.Layout(
    title='Currency pairs correlation based on minute intervals since 2008 computed using spark'
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='corr heatmap')
