## Prevendo a cotação de criptomoedas em tempo real

Bilbiotecas, frameworks e outras ferramentas utilizdas:

- PySpark
- Pandas
- Numpy
- Seaborn
- Machine Learning

Fonte de Dados:
https://bitcoincharts.com/charts/

In [1]:
# Importa o findspark e inicializa
import findspark
findspark.init()

In [2]:
# Imports
import pyspark
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StringIndexer
from pyspark.ml.regression import LinearRegression
from pyspark.mllib.evaluation import RegressionMetrics
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StringIndexer
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, CrossValidatorModel
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.evaluation import RegressionEvaluator

In [3]:
# Formatação das saídas
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_colwidth', 400)
from matplotlib.axes._axes import _log as matplotlib_axes_logger
matplotlib_axes_logger.setLevel('ERROR')

## Preparando o Ambiente Spark

In [4]:
# Definindo semente aleatória (seed) para reprodutibilidade do notebook
rnd_seed = 23
np.random.seed = rnd_seed
np.random.set_state = rnd_seed

In [5]:
# Criando o Spark Context
sc = SparkContext(appName = 'Mini-Projeto3')

23/03/07 09:18:58 WARN Utils: Your hostname, ingo-Vostro-3583 resolves to a loopback address: 127.0.1.1; using 192.168.1.10 instead (on interface wlo1)
23/03/07 09:18:58 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/07 09:18:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
# Criando a sessão Spark
spark_session = SparkSession.Builder().getOrCreate()

In [7]:
# Visualiza o objeto spark_session
spark_session

## Caregando os Dados

In [8]:
# Carrega os dados a partir da sessão Spark
df_spark = spark_session.read.csv('dataset.csv', header = 'true', inferSchema = 'true')

                                                                                

In [9]:
# Tipo do objeto
type(df_spark)

pyspark.sql.dataframe.DataFrame

In [10]:
# Visualiza os dados
df_spark.show()

+----------+----+----+----+-----+------------+-----------------+--------------+
| Timestamp|Open|High| Low|Close|Volume_(BTC)|Volume_(Currency)|Weighted_Price|
+----------+----+----+----+-----+------------+-----------------+--------------+
|1325317920|4.39|4.39|4.39| 4.39|  0.45558087|     2.0000000193|          4.39|
|1325319300| NaN| NaN| NaN|  NaN|         NaN|              NaN|           NaN|
|1325319360| NaN| NaN| NaN|  NaN|         NaN|              NaN|           NaN|
|1325319420| NaN| NaN| NaN|  NaN|         NaN|              NaN|           NaN|
|1325319480| NaN| NaN| NaN|  NaN|         NaN|              NaN|           NaN|
|1325319540| NaN| NaN| NaN|  NaN|         NaN|              NaN|           NaN|
|1325319600| NaN| NaN| NaN|  NaN|         NaN|              NaN|           NaN|
|1325319660| NaN| NaN| NaN|  NaN|         NaN|              NaN|           NaN|
|1325345040| NaN| NaN| NaN|  NaN|         NaN|              NaN|           NaN|
|1325345100| NaN| NaN| NaN|  NaN|       

In [11]:
# Visualiza os metadados (schema)
df_spark.printSchema()

root
 |-- Timestamp: integer (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume_(BTC): double (nullable = true)
 |-- Volume_(Currency): double (nullable = true)
 |-- Weighted_Price: double (nullable = true)



In [12]:
# Verifica o número de linhas
df_spark.count()

                                                                                

4856600

## Data Wrangling com SparkSQL

In [13]:
# Cria uma tabela temporária a partir do dataframe
# As tabelas temporárias são úteis quando você deseja que o conjunto de resuldados fique visível
# para todas as outras sessões Spark
df_spark.createOrReplaceTempView('dados_bitcoin')

In [14]:
# Executa uma consulta SQL
df_bitcoin = spark_session.sql('select *, from_unixtime(Timestamp) as `dateTime` from dados_bitcoin')

In [15]:
type(df_bitcoin)

pyspark.sql.dataframe.DataFrame

In [16]:
# Visualiza os dados
df_bitcoin.show()

+----------+----+----+----+-----+------------+-----------------+--------------+-------------------+
| Timestamp|Open|High| Low|Close|Volume_(BTC)|Volume_(Currency)|Weighted_Price|           dateTime|
+----------+----+----+----+-----+------------+-----------------+--------------+-------------------+
|1325317920|4.39|4.39|4.39| 4.39|  0.45558087|     2.0000000193|          4.39|2011-12-31 05:52:00|
|1325319300| NaN| NaN| NaN|  NaN|         NaN|              NaN|           NaN|2011-12-31 06:15:00|
|1325319360| NaN| NaN| NaN|  NaN|         NaN|              NaN|           NaN|2011-12-31 06:16:00|
|1325319420| NaN| NaN| NaN|  NaN|         NaN|              NaN|           NaN|2011-12-31 06:17:00|
|1325319480| NaN| NaN| NaN|  NaN|         NaN|              NaN|           NaN|2011-12-31 06:18:00|
|1325319540| NaN| NaN| NaN|  NaN|         NaN|              NaN|           NaN|2011-12-31 06:19:00|
|1325319600| NaN| NaN| NaN|  NaN|         NaN|              NaN|           NaN|2011-12-31 06:20:00|


In [17]:
# Remove valores NA (Não faz sentido qualquer tratamento aqui, pois todas as colunas de cotações estão como NA)
df_bitcoin = df_bitcoin.dropna('any')

In [18]:
# Visualiza os dados
df_bitcoin.show()

+----------+----+----+----+-----+------------+-----------------+--------------+-------------------+
| Timestamp|Open|High| Low|Close|Volume_(BTC)|Volume_(Currency)|Weighted_Price|           dateTime|
+----------+----+----+----+-----+------------+-----------------+--------------+-------------------+
|1325317920|4.39|4.39|4.39| 4.39|  0.45558087|     2.0000000193|          4.39|2011-12-31 05:52:00|
|1325346600|4.39|4.39|4.39| 4.39|        48.0|           210.72|          4.39|2011-12-31 13:50:00|
|1325350740| 4.5|4.57| 4.5| 4.57| 37.86229723|     171.38033753|  4.5264114983|2011-12-31 14:59:00|
|1325350800|4.58|4.58|4.58| 4.58|         9.0|            41.22|          4.58|2011-12-31 15:00:00|
|1325391360|4.58|4.58|4.58| 4.58|       1.502|          6.87916|          4.58|2012-01-01 02:16:00|
|1325431680|4.84|4.84|4.84| 4.84|        10.0|             48.4|          4.84|2012-01-01 13:28:00|
|1325457900| 5.0| 5.0| 5.0|  5.0|        10.1|             50.5|           5.0|2012-01-01 20:45:00|


In [19]:
# Numero de registros
df_bitcoin.count()

                                                                                

3613769

In [20]:
# Vamos renoemar algumas colunas para facilitar a manipulação dos dados
df_bitcoin = df_bitcoin.withColumnRenamed('Volume_(BTC)', 'VolBTC').withColumnRenamed('Volume_(Currency)', 'VolCurrency')

In [21]:
df_bitcoin.show()

+----------+----+----+----+-----+-----------+------------+--------------+-------------------+
| Timestamp|Open|High| Low|Close|     VolBTC| VolCurrency|Weighted_Price|           dateTime|
+----------+----+----+----+-----+-----------+------------+--------------+-------------------+
|1325317920|4.39|4.39|4.39| 4.39| 0.45558087|2.0000000193|          4.39|2011-12-31 05:52:00|
|1325346600|4.39|4.39|4.39| 4.39|       48.0|      210.72|          4.39|2011-12-31 13:50:00|
|1325350740| 4.5|4.57| 4.5| 4.57|37.86229723|171.38033753|  4.5264114983|2011-12-31 14:59:00|
|1325350800|4.58|4.58|4.58| 4.58|        9.0|       41.22|          4.58|2011-12-31 15:00:00|
|1325391360|4.58|4.58|4.58| 4.58|      1.502|     6.87916|          4.58|2012-01-01 02:16:00|
|1325431680|4.84|4.84|4.84| 4.84|       10.0|        48.4|          4.84|2012-01-01 13:28:00|
|1325457900| 5.0| 5.0| 5.0|  5.0|       10.1|        50.5|           5.0|2012-01-01 20:45:00|
|1325534640| 5.0| 5.0| 5.0|  5.0|     19.048|       95.24|  

A coluna dataTime fornece detalhes da data de cotação. Vamos separar os elementos da data em diferentes colunas.

In [22]:
# Vamos dividir o dataframe extraindo data
df_data = df_bitcoin.withColumn('date', split(col('dateTime'),' ').getItem(0))

In [23]:
df_data.printSchema()

root
 |-- Timestamp: integer (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- VolBTC: double (nullable = true)
 |-- VolCurrency: double (nullable = true)
 |-- Weighted_Price: double (nullable = true)
 |-- dateTime: string (nullable = true)
 |-- date: string (nullable = true)



In [24]:
type(df_data)

pyspark.sql.dataframe.DataFrame

In [25]:
# Vamos dividir o dataframe extraindo tempo
df_data = df_data.withColumn('time', split(col('dateTime'),' ').getItem(1))

In [26]:
# Schema
df_data.printSchema()

root
 |-- Timestamp: integer (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- VolBTC: double (nullable = true)
 |-- VolCurrency: double (nullable = true)
 |-- Weighted_Price: double (nullable = true)
 |-- dateTime: string (nullable = true)
 |-- date: string (nullable = true)
 |-- time: string (nullable = true)



In [27]:
# Visualiza os dados
df_data.show()

+----------+----+----+----+-----+-----------+------------+--------------+-------------------+----------+--------+
| Timestamp|Open|High| Low|Close|     VolBTC| VolCurrency|Weighted_Price|           dateTime|      date|    time|
+----------+----+----+----+-----+-----------+------------+--------------+-------------------+----------+--------+
|1325317920|4.39|4.39|4.39| 4.39| 0.45558087|2.0000000193|          4.39|2011-12-31 05:52:00|2011-12-31|05:52:00|
|1325346600|4.39|4.39|4.39| 4.39|       48.0|      210.72|          4.39|2011-12-31 13:50:00|2011-12-31|13:50:00|
|1325350740| 4.5|4.57| 4.5| 4.57|37.86229723|171.38033753|  4.5264114983|2011-12-31 14:59:00|2011-12-31|14:59:00|
|1325350800|4.58|4.58|4.58| 4.58|        9.0|       41.22|          4.58|2011-12-31 15:00:00|2011-12-31|15:00:00|
|1325391360|4.58|4.58|4.58| 4.58|      1.502|     6.87916|          4.58|2012-01-01 02:16:00|2012-01-01|02:16:00|
|1325431680|4.84|4.84|4.84| 4.84|       10.0|        48.4|          4.84|2012-01-01 13:2

In [28]:
# Vamos dividir o dataframe extraindo a hora
df_data_hora = df_data.withColumn('hour', split(col('time'),':').getItem(0))

In [29]:
df_data_hora.printSchema()

root
 |-- Timestamp: integer (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- VolBTC: double (nullable = true)
 |-- VolCurrency: double (nullable = true)
 |-- Weighted_Price: double (nullable = true)
 |-- dateTime: string (nullable = true)
 |-- date: string (nullable = true)
 |-- time: string (nullable = true)
 |-- hour: string (nullable = true)



In [30]:
df_data_hora.show()

+----------+----+----+----+-----+-----------+------------+--------------+-------------------+----------+--------+----+
| Timestamp|Open|High| Low|Close|     VolBTC| VolCurrency|Weighted_Price|           dateTime|      date|    time|hour|
+----------+----+----+----+-----+-----------+------------+--------------+-------------------+----------+--------+----+
|1325317920|4.39|4.39|4.39| 4.39| 0.45558087|2.0000000193|          4.39|2011-12-31 05:52:00|2011-12-31|05:52:00|  05|
|1325346600|4.39|4.39|4.39| 4.39|       48.0|      210.72|          4.39|2011-12-31 13:50:00|2011-12-31|13:50:00|  13|
|1325350740| 4.5|4.57| 4.5| 4.57|37.86229723|171.38033753|  4.5264114983|2011-12-31 14:59:00|2011-12-31|14:59:00|  14|
|1325350800|4.58|4.58|4.58| 4.58|        9.0|       41.22|          4.58|2011-12-31 15:00:00|2011-12-31|15:00:00|  15|
|1325391360|4.58|4.58|4.58| 4.58|      1.502|     6.87916|          4.58|2012-01-01 02:16:00|2012-01-01|02:16:00|  02|
|1325431680|4.84|4.84|4.84| 4.84|       10.0|   

In [31]:
# Vamos ajustar o formato da data para extrair o dia da semana
df_data_hora = df_data_hora.withColumn("date", df_data_hora["date"].cast(DateType())).withColumn("hour", df_data_hora["hour"].cast(DoubleType())).withColumn("dateTime", df_data_hora["dateTime"].cast(DateType()))


In [32]:
df_data_hora.show()

+----------+----+----+----+-----+-----------+------------+--------------+----------+----------+--------+----+
| Timestamp|Open|High| Low|Close|     VolBTC| VolCurrency|Weighted_Price|  dateTime|      date|    time|hour|
+----------+----+----+----+-----+-----------+------------+--------------+----------+----------+--------+----+
|1325317920|4.39|4.39|4.39| 4.39| 0.45558087|2.0000000193|          4.39|2011-12-31|2011-12-31|05:52:00| 5.0|
|1325346600|4.39|4.39|4.39| 4.39|       48.0|      210.72|          4.39|2011-12-31|2011-12-31|13:50:00|13.0|
|1325350740| 4.5|4.57| 4.5| 4.57|37.86229723|171.38033753|  4.5264114983|2011-12-31|2011-12-31|14:59:00|14.0|
|1325350800|4.58|4.58|4.58| 4.58|        9.0|       41.22|          4.58|2011-12-31|2011-12-31|15:00:00|15.0|
|1325391360|4.58|4.58|4.58| 4.58|      1.502|     6.87916|          4.58|2012-01-01|2012-01-01|02:16:00| 2.0|
|1325431680|4.84|4.84|4.84| 4.84|       10.0|        48.4|          4.84|2012-01-01|2012-01-01|13:28:00|13.0|
|132545790

In [33]:
# Vamos extrair o dia da semana
df_data_hora = df_data_hora.withColumn('day_of_week', dayofweek(df_data_hora.date))

In [34]:
df_data_hora.printSchema()

root
 |-- Timestamp: integer (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- VolBTC: double (nullable = true)
 |-- VolCurrency: double (nullable = true)
 |-- Weighted_Price: double (nullable = true)
 |-- dateTime: date (nullable = true)
 |-- date: date (nullable = true)
 |-- time: string (nullable = true)
 |-- hour: double (nullable = true)
 |-- day_of_week: integer (nullable = true)



In [35]:
# Vamos extrair o ano da cotação
df_data_hora_ano = df_data_hora.withColumn('year', split(col('date'), '-').getItem(0))

In [36]:
df_data_hora_ano.show()

+----------+----+----+----+-----+-----------+------------+--------------+----------+----------+--------+----+-----------+----+
| Timestamp|Open|High| Low|Close|     VolBTC| VolCurrency|Weighted_Price|  dateTime|      date|    time|hour|day_of_week|year|
+----------+----+----+----+-----+-----------+------------+--------------+----------+----------+--------+----+-----------+----+
|1325317920|4.39|4.39|4.39| 4.39| 0.45558087|2.0000000193|          4.39|2011-12-31|2011-12-31|05:52:00| 5.0|          7|2011|
|1325346600|4.39|4.39|4.39| 4.39|       48.0|      210.72|          4.39|2011-12-31|2011-12-31|13:50:00|13.0|          7|2011|
|1325350740| 4.5|4.57| 4.5| 4.57|37.86229723|171.38033753|  4.5264114983|2011-12-31|2011-12-31|14:59:00|14.0|          7|2011|
|1325350800|4.58|4.58|4.58| 4.58|        9.0|       41.22|          4.58|2011-12-31|2011-12-31|15:00:00|15.0|          7|2011|
|1325391360|4.58|4.58|4.58| 4.58|      1.502|     6.87916|          4.58|2012-01-01|2012-01-01|02:16:00| 2.0|  

In [None]:
# Converte o dataframe do Spark para Pandas.
# Porque isso vai facilitar a análise exploratória de dados.
df_pandas = df_data_hora_ano.toPandas()

In [None]:
type(df_pandas)

In [None]:
df_pandas.sample(10)

In [None]:
# Obtendo os valores individuais para usar nos gráficos 
hour = df_pandas['hour'].values.tolist()
weighted_price = df_pandas['Weighted_Price'].values.tolist()
volume_BTC = df_pandas['VolBTC'].values.tolist()
date_of_week = df_pandas['day_of_week'].values.tolist()
year = df_pandas['year'].values.tolist()

## Análise Exploratória

In [None]:
# Heatmap para visualizar a correlação
corr = df_pandas.corr()
f,ax = plt.subplots(figsize = (10, 10))
sns.heatmap(corr, annot = True, linewidths = .5, fmt = '.1f', ax = ax)

Idealmente queremos alta correlação entre as variáveis de entrada e a variável de saída e baixa correlação entre as variáveis de entrada!

In [None]:
# Scatter Plot Bitcoin x Volume Moeda
plt.figure(figsize = (12, 5))
sns.set(style = 'whitegrid')
df_pandas.plot(kind = 'scatter', x = 'VolBTC', y = 'VolCurrency')
plt.xlabel('Volume Bitcoin')
plt.ylabel('Volume Moeda')
plt.title('Scatter Plot Volume Bitcoin x Volume Moeda')
plt.show()

In [None]:
# Line Plot Cotação Open x High
plt.figure(figsize = (16,5))
df_pandas.Open.plot(kind = 'line',
                    color = 'r',
                    label = 'Open',
                    alpha = 0.5,
                    linewidth = 5,
                    grid = True,
                    linestyle = ':')
df_pandas.High.plot(color = 'g',
                    label = 'High',
                    linewidth = 1,
                    alpha = 0.5,
                    grid = True,
                    linestyle = '-.')
plt.legend(loc = 'upper left')
plt.xlabel('Tempo')
plt.ylabel('Cotação')
plt.title('Line Plot Cotação Open x High')
plt.show()

In [None]:
# Histograma da cotação de abertura
df_pandas.Open.plot(kind = 'hist', bins = 50)

In [None]:
# Plot do valor ponderado da cotação (nosso target) por hora
plt.plot(hour, weighted_price, 'g*')
plt.xlabel('Hora')
plt.ylabel('Valor Ponderado da Cotação')
plt.title('Valor Ponderado da Cotação por Hora')
plt.show()

In [None]:
# Plot do valor ponderado da cotação por dia da semana
plt.plot(date_of_week, weighted_price, 'b*')
plt.xlabel('Dia da Semana')
plt.ylabel('Valor Ponderado da Cotação')
plt.title('Valor Ponderado da Cotação por Dia da Semana')
plt.show()

In [None]:
# Plot do VolBTC por hora
plt.plot(hour, volume_BTC, 'r*')
plt.xlabel('Hora')
plt.ylabel('Volume Bitcoin')
plt.title('Volume negocioado por Hora')
plt.show()

In [None]:
# Plor de VolBTC por dia da semana
plt.plot(date_of_week, volume_BTC, 'yo')
plt.xlabel('Dia da Semana')
plt.ylabel('Volume Negocioado')
plt.title('Volume de Bitcoin negociado por Dia da Semana')
plt.show()

In [None]:
# Plot do valor ponderado da cotaçao por ano
plt.plot(year, weighted_price, 'm^')
plt.xlabel('Ano')
plt.ylabel('Valor Ponderado da Cotação')
plt.title('Valor Ponderado da Cotação por Ano')
plt.show()

In [None]:
# Plot do Volume por ano
plt.plot(year, volume_BTC, 'kD')
plt.xlabel('Ano')
plt.ylabel('Volume Negociado')
plt.title('Volume de Bitcoin Negociano por Ano')
plt.show()

## Engenheria de Atributos com PySpark

In [37]:
df_bitcoin.printSchema()

root
 |-- Timestamp: integer (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- VolBTC: double (nullable = true)
 |-- VolCurrency: double (nullable = true)
 |-- Weighted_Price: double (nullable = true)
 |-- dateTime: string (nullable = true)



In [40]:
# Prepara o vetor de atributos utilizando três variáveis com base na análise exploratória
assembler = VectorAssembler(inputCols = ['Open', 'VolBTC', 'VolCurrency'],
                            outputCol = 'features')

In [41]:
# Cria o dataframe do vetor de atributos
df_assembled = assembler.transform(df_bitcoin)

In [42]:
# Visualiza os dados
df_assembled.show(10, truncate = False)

+----------+----+----+----+-----+-----------+------------+--------------+-------------------+------------------------------+
|Timestamp |Open|High|Low |Close|VolBTC     |VolCurrency |Weighted_Price|dateTime           |features                      |
+----------+----+----+----+-----+-----------+------------+--------------+-------------------+------------------------------+
|1325317920|4.39|4.39|4.39|4.39 |0.45558087 |2.0000000193|4.39          |2011-12-31 05:52:00|[4.39,0.45558087,2.0000000193]|
|1325346600|4.39|4.39|4.39|4.39 |48.0       |210.72      |4.39          |2011-12-31 13:50:00|[4.39,48.0,210.72]            |
|1325350740|4.5 |4.57|4.5 |4.57 |37.86229723|171.38033753|4.5264114983  |2011-12-31 14:59:00|[4.5,37.86229723,171.38033753]|
|1325350800|4.58|4.58|4.58|4.58 |9.0        |41.22       |4.58          |2011-12-31 15:00:00|[4.58,9.0,41.22]              |
|1325391360|4.58|4.58|4.58|4.58 |1.502      |6.87916     |4.58          |2012-01-01 02:16:00|[4.58,1.502,6.87916]          |


## Normalização

In [46]:
# Divisão em dados de treino e teste
dados_treino, dados_teste = df_assembled.randomSplit([.7,.3], seed = rnd_seed)

In [47]:
type(dados_treino)

pyspark.sql.dataframe.DataFrame

In [49]:
# Cria o scaler
scaler = MinMaxScaler(inputCol = 'features', outputCol = 'scaled_features')

In [52]:
# Fit nos dados de treino
scalerModel = scaler.fit(dados_treino)

                                                                                

In [53]:
# Fit e transform nos dados de treino
dados_treino_scaled = scalerModel.transform(dados_treino)

Exception ignored in: <function JavaWrapper.__del__ at 0x7f612296a430>
Traceback (most recent call last):
  File "/home/ingo/anaconda3/lib/python3.9/site-packages/pyspark/ml/wrapper.py", line 53, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
AttributeError: 'VectorAssembler' object has no attribute '_java_obj'
Exception ignored in: <function JavaWrapper.__del__ at 0x7f612296a430>
Traceback (most recent call last):
  File "/home/ingo/anaconda3/lib/python3.9/site-packages/pyspark/ml/wrapper.py", line 53, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
AttributeError: 'VectorAssembler' object has no attribute '_java_obj'
Exception ignored in: <function JavaWrapper.__del__ at 0x7f612296a430>
Traceback (most recent call last):
  File "/home/ingo/anaconda3/lib/python3.9/site-packages/pyspark/ml/wrapper.py", line 53, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
AttributeError: '

In [68]:
# Transform nos dados de teste
dados_teste_scaled = scalerModel.transform(dados_teste)

In [59]:
dados_treino_scaled.select('features', 'scaled_features').show(10, truncate = False)

[Stage 21:>                                                         (0 + 1) / 1]

+-------------------------------+-------------------------------------------------------------------+
|features                       |scaled_features                                                    |
+-------------------------------+-------------------------------------------------------------------+
|[4.39,48.0,210.72]             |[9.557729822093386E-6,0.008199728766573702,1.51589789206087E-5]    |
|[4.5,37.86229723,171.38033753] |[1.1339679449941311E-5,0.006467928495112392,1.2328924278777876E-5] |
|[4.58,9.0,41.22]               |[1.263564281564889E-5,0.001537449143732569,2.9653241795154263E-6]  |
|[4.84,10.0,48.4]               |[1.6847523754198516E-5,0.001708276826369521,3.4818459555688167E-6] |
|[5.0,10.1,50.5]                |[1.9439450485613674E-5,0.0017253595946332162,3.63291778421953E-6]  |
|[5.14,0.68,3.4952]             |[2.170738637560193E-5,1.1616282419312745E-4,2.5144107404760596E-7] |
|[5.26,29.31939163,154.21999997]|[2.36513314241633E-5,0.00500856372847815,1.109442

                                                                                

In [60]:
dados_treino_scaled.columns

['Timestamp',
 'Open',
 'High',
 'Low',
 'Close',
 'VolBTC',
 'VolCurrency',
 'Weighted_Price',
 'dateTime',
 'features',
 'scaled_features']

## Machine Learning

Versão 1 do Modelo (Benchmark)

In [63]:
# Cria o modelo de regressão
modelo_lr_v1 = (LinearRegression(featuresCol = 'scaled_features',
                                 labelCol = 'Weighted_Price',
                                 predictionCol = 'Predicted_price',
                                 maxIter = 100,
                                 regParam = 0.3,
                                 elasticNetParam = 0.8,
                                 standardization = False))

In [64]:
# Treino o modelo
modelo_v1 = modelo_lr_v1.fit(dados_treino_scaled)

                                                                                

In [66]:
# Salva o modelo em disco
modelo_v1.write().overwrite().save('modelo_v1')

                                                                                

## Avaliação do Modelo

In [69]:
# Previsões com dados de teste
previsoes_v1 = modelo_v1.transform(dados_teste_scaled)

In [70]:
# Seleciona oas colunas
pred_data_v1 = previsoes_v1.select('Predicted_Price', 'Weighted_Price').show(10)

[Stage 30:>                                                         (0 + 1) / 1]

+-----------------+--------------+
|  Predicted_Price|Weighted_Price|
+-----------------+--------------+
|7.452060192894729|          4.39|
|7.641960478636686|          4.58|
|8.061740057645222|           5.0|
|8.381572117842202|          5.32|
|8.351587862198736|          5.29|
|8.351587862198736|          5.29|
| 8.25164034338718|          5.19|
|8.421551125366825|  5.3604618117|
|8.811346448731895|   5.777027027|
|9.061215245760785|           6.0|
+-----------------+--------------+
only showing top 10 rows



                                                                                

In [71]:
# Mean Absolute Error
print('Mean Absolute Error (MAE) nos dados de teste: {0}'.format(modelo_v1.summary.meanAbsoluteError))

Mean Absolute Error (MAE) nos dados de teste: 5.017385733181623


In [72]:
# Cria um avaliador para o modelo regressão
evaluator = RegressionEvaluator(labelCol = 'Weighted_Price',
                                predictionCol = 'Predicted_price',
                                metricName = 'rmse')

In [73]:
# Aplica o avaliador
rmse_v1 = evaluator.evaluate(previsoes_v1)
print('Root Mean Squared Error (RMSE) nos dados de teste = %g' % rmse_v1)



Root Mean Squared Error (RMSE) nos dados de teste = 11.2879


                                                                                

In [74]:
# Extrai as previsões
pred_results_v1 = modelo_v1.evaluate(dados_teste_scaled)

                                                                                

In [90]:
# Valores reais de Y sendo convertidos para o formato de Pandas
Y = pred_results_v1.predictions.select('Weighted_Price').toPandas()

                                                                                

In [91]:
# Valores previstos de Y sendo convertidos para o formato de Pandas
_Y = pred_results_v1.predictions.select("Predicted_price").toPandas()

                                                                                

In [94]:
# Distribuição dos valores reais x valores previstos
sns.set_style("dark")
ax1 = sns.displot(Y, color = "r", label = "Valores Reais")
sns.displot(_Y, color = "b", label = "Valores Previstos")

<seaborn.axisgrid.FacetGrid at 0x7f6150548160>

In [95]:
# Plot dos valores reais x valores previstos
plt.figure(figsize = (12,7))
plt.plot(Y, color = 'green', marker = '*', linestyle = 'dashed', label = 'Predicted Price')
plt.plot(_Y, color = 'red', label = 'Weighted_Price')
plt.title('Resuldado do Modelo')
plt.xlabel('Valor Real')
plt.ylabel('Valor Previsto')
plt.legend()

<matplotlib.legend.Legend at 0x7f611a0a1c40>

Versão 2 do Modelo (Otimização de Hiperparâmetros)

In [97]:
# Cria o modelo
modelo_lr_v2 = (LinearRegression(featuresCol = 'scaled_features',
                                  labelCol = 'Weighted_Price',
                                  predictionCol = 'Predicted_price'))

In [98]:
# Cria um grid para otimização de hiperparâmetros
grid = ParamGridBuilder().addGrid(modelo_lr_v2.maxIter, [50, 100]).build()

In [103]:
# Cria o avaliador (será usado na validação cruzada)
evaluator = RegressionEvaluator(labelCol = 'Weighted_Price',
                                predictionCol = 'Predicted_price',
                                metricName = 'rmse')

In [104]:
# Cria o CrossValidator
cv = CrossValidator(estimator = modelo_lr_v2, estimatorParamMaps = grid, evaluator = evaluator, parallelism = 2)

In [105]:
# Treino o CrossValidator
cvModel = cv.fit(dados_treino_scaled)

23/03/07 10:46:12 WARN CacheManager: Asked to cache already cached data.
23/03/07 10:46:12 WARN CacheManager: Asked to cache already cached data.
23/03/07 10:46:12 WARN Instrumentation: [e303192a] regParam is zero, which might cause numerical instability and overfitting.
23/03/07 10:46:12 WARN Instrumentation: [1cc12bcb] regParam is zero, which might cause numerical instability and overfitting.


Exception ignored in: <function JavaWrapper.__del__ at 0x7f612296a430>          
Traceback (most recent call last):
  File "/home/ingo/anaconda3/lib/python3.9/site-packages/pyspark/ml/wrapper.py", line 53, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
AttributeError: 'RegressionEvaluator' object has no attribute '_java_obj'
                                                                                

23/03/07 10:46:24 WARN Instrumentation: [ebde19dd] regParam is zero, which might cause numerical instability and overfitting.
23/03/07 10:46:24 WARN Instrumentation: [f5fca755] regParam is zero, which might cause numerical instability and overfitting.


[Stage 61:====>             (2 + 6) / 8][Stage 63:==>               (1 + 2) / 8]

23/03/07 10:46:34 WARN MemoryStore: Not enough space to cache rdd_328_1 in memory! (computed 41.8 MiB so far)
23/03/07 10:46:34 WARN BlockManager: Persisting block rdd_328_1 to disk instead.


                                                                                

23/03/07 10:46:50 WARN Instrumentation: [e8b67c2a] regParam is zero, which might cause numerical instability and overfitting.
23/03/07 10:46:50 WARN Instrumentation: [b423dc29] regParam is zero, which might cause numerical instability and overfitting.


[Stage 73:====>             (2 + 6) / 8][Stage 75:==>               (1 + 2) / 8]

23/03/07 10:47:00 WARN MemoryStore: Not enough space to cache rdd_426_1 in memory! (computed 41.8 MiB so far)
23/03/07 10:47:00 WARN BlockManager: Persisting block rdd_426_1 to disk instead.
23/03/07 10:47:00 WARN MemoryStore: Not enough space to cache rdd_426_2 in memory! (computed 41.8 MiB so far)
23/03/07 10:47:00 WARN BlockManager: Persisting block rdd_426_2 to disk instead.


                                                                                

23/03/07 10:47:14 WARN Instrumentation: [186d3747] regParam is zero, which might cause numerical instability and overfitting.


                                                                                

In [106]:
# Extrai o melhor modelo do CrossValidator
modelo_v2 = cvModel.bestModel

In [107]:
modelo_v2

LinearRegressionModel: uid=LinearRegression_85def05fec26, numFeatures=3

In [108]:
# Salva o modelo em disco
modelo_v2.write().overwrite().save('modelo_v2')

## Avaliação do Modelo

In [109]:
# Previsões com dados de teste
previsoes_v2 = modelo_v2.transform(dados_teste_scaled)

In [111]:
# Seleciona as colunas
pred_data_v2 = previsoes_v2.select('Predicted_price', 'Weighted_price').show(10)

[Stage 94:>                                                         (0 + 1) / 1]

+------------------+--------------+
|   Predicted_price|Weighted_price|
+------------------+--------------+
| 4.508108472357593|          4.39|
| 4.697545264575934|          4.58|
| 5.108091101746675|           5.0|
| 5.437041429505176|          5.32|
| 5.392532805262221|          5.29|
| 5.402256707831288|          5.29|
|5.3069234754546475|          5.19|
| 5.470987379974378|  5.3604618117|
| 5.860328227652778|   5.777027027|
| 6.117129927737251|           6.0|
+------------------+--------------+
only showing top 10 rows



                                                                                

In [112]:
# Mean Absolute Error
print('MAE: {0}'.format(modelo_v2.summary.meanAbsoluteError))

MAE: 3.388028733539672


In [113]:
evaluator = RegressionEvaluator(labelCol = 'Weighted_Price',
                                predictionCol = 'Predicted_price',
                                metricName = 'rmse')

In [114]:
# Aplica o avaliador
rmse_v2 = evaluator.evaluate(previsoes_v2)
print('Root Mean Squared Error (RMSE) nos dados de teste = %g' % rmse_v2)



Root Mean Squared Error (RMSE) nos dados de teste = 10.3524


                                                                                

In [116]:
# Plot dos valores reais x valores previstos

# Extrai as previsoes
pred_results_v2 = modelo_v2.evaluate(dados_teste_scaled)

# Valores reais de Y sendo convertidos para o formato de Pandas
Y = pred_results_v2.predictions.select('Weighted_Price').toPandas()

# Valores previstos de Y sendo convertidos para o formato de Pandas
Y = pred_results_v2.predictions.select('Predicted_price').toPandas()

# Plot
sns.set_style('dark')
ax1 = sns.displot(Y, color = 'r', label = 'Valores Reais')
sns.displot(_Y, color = 'b', label = 'Valores Previstos')

                                                                                

<seaborn.axisgrid.FacetGrid at 0x7f610dfbfbe0>

In [117]:
# Plot dos valores reais x valores previstos
plt.figure(figsize = (12,7))
plt.plot(Y, color = 'green', marker = '*', linestyle = 'dashed', label = 'Predicted Price')
plt.plot(_Y, color = 'red', label = 'Weighted Price')
plt.title('Resultado do Modelo')
plt.xlabel('Valor Real')
plt.ylabel('Valor Previsto')
plt.legend()

<matplotlib.legend.Legend at 0x7f6111b9d730>

Considerando que os erros são muito próximos dos dois modelos, e que as mensagens de WARN que aparecem no treinamento do modelo 2 parecem indicar instabilidade e possivelmente oevertifing, vamos escolher o modelo 1

## Previsões em Tempo Real

In [118]:
# Novos dados
novos_dados = [[20546.29, 3422.57, 72403082.02], [21620.85, 3271.14, 71319207.5]]

In [120]:
# Prepara o dataframe do Pandas
df_novos_dados = pd.DataFrame(novos_dados, columns = ['Open', 'VolBTC', 'VolCurrency'])

In [121]:
# Visualiza
df_novos_dados

Unnamed: 0,Open,VolBTC,VolCurrency
0,20546.29,3422.57,72403082.02
1,21620.85,3271.14,71319207.5


In [123]:
# Converte o dataframe do Pandas para dataframe do Spark
df_novos_dados_spark = spark_session.createDataFrame(df_novos_dados)

  for column, series in pdf.iteritems():
  for column, series in pdf.iteritems():


In [124]:
# Schema
df_novos_dados_spark.printSchema()

root
 |-- Open: double (nullable = true)
 |-- VolBTC: double (nullable = true)
 |-- VolCurrency: double (nullable = true)



In [126]:
# Visualiza
df_novos_dados_spark.show()

+--------+-------+-------------+
|    Open| VolBTC|  VolCurrency|
+--------+-------+-------------+
|20546.29|3422.57|7.240308202E7|
|21620.85|3271.14| 7.13192075E7|
+--------+-------+-------------+



[Stage 103:>                                                        (0 + 1) / 1]                                                                                

In [127]:
# Cria o dataframe do vetor de atributos
df_assembled = assembler.transform(df_novos_dados_spark)

In [128]:
# Visualiza os dados
df_assembled.show()

+--------+-------+-------------+--------------------+
|    Open| VolBTC|  VolCurrency|            features|
+--------+-------+-------------+--------------------+
|20546.29|3422.57|7.240308202E7|[20546.29,3422.57...|
|21620.85|3271.14| 7.13192075E7|[21620.85,3271.14...|
+--------+-------+-------------+--------------------+



In [129]:
# Normaliza os dados
df_assembled_scaled = scalerModel.transform(df_assembled)

In [130]:
# Previsões com os novos dados
previsoes = modelo_v1.transform(df_assembled_scaled)

In [131]:
# Imprime as previsões
pred_data = previsoes.select('Predicted_price').show()

+------------------+
|   Predicted_price|
+------------------+
| 20538.57142694398|
|21612.567485085438|
+------------------+



In [133]:
# Encerra a sessão Spark
spark_session.stop()