# EDA & visualizations

LF-invest is a financial company mainly specialised in real estate and stocks market.

Considering the new opportunities deriving from the cryptocurrencies market and the potential future enlargement of it we have been asked to develop a trading model applicable in a real world scenario.

## Imports

In [1]:
import chart_studio.plotly as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.offline import iplot, init_notebook_mode
# Using plotly + cufflinks in offline mode
import cufflinks
cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)
import requests 
import pickle
import random
import json
import os
import decimal
import hmac
import time
import itertools
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
%matplotlib inline
import mplfinance as mpf
import mpl_finance
from mpl_finance import candlestick_ohlc
import matplotlib.dates as mdates
import numpy as np
style.use('ggplot')
import warnings
warnings.filterwarnings('ignore')

In [2]:
ls

 Volume in drive C is OS
 Volume Serial Number is D816-28B0

 Directory of C:\Users\luigi\FlatironSchool\Module06\Project\dsc-capstone-project-v2-online-ds-pt-051319

19/03/2020  12:24    <DIR>          .
19/03/2020  12:24    <DIR>          ..
19/03/2020  12:24    <DIR>          .ipynb_checkpoints
18/03/2020  16:37        14,214,202 1) EDA & visualizations.ipynb
18/03/2020  12:56           207,243 2) Time_series_prediction.ipynb
18/03/2020  11:46           326,662 3) Machine_learning.ipynb
19/03/2020  12:23            20,814 4) Neural_network_prediction.ipynb
18/03/2020  16:36        13,148,480 500crypto_joined_close.csv
21/02/2020  12:53    <DIR>          crypto_dfs
18/03/2020  16:21    <DIR>          crypto_project_images
18/03/2020  16:36             7,538 crypto_tickers.pickle
18/03/2020  16:01         1,675,381 Cryptocurrencies_pred_presentation.pdf
19/03/2020  12:23    <DIR>          data_BTCUSD
19/03/2020  12:22    <DIR>          logs
19/03/2020  12:24    <DIR>          models
1

## Get trading data for BTCUSD

Request from Binance API symbol's with hourly interval and create a df.
Increasing the range gives us further data in the past.

In [3]:
df_btcusd = pd.DataFrame()

for x in range(5):
    
    if x == 0: 
    
        response = requests.get('https://api.binance.com/api/v1/klines?symbol=BTCUSDT&interval=1h')
        data_symbol = json.loads(response.text)

        # put in dataframe and drop columns we don't neeed
        df = pd.DataFrame.from_dict(data_symbol)
        df = df.drop(range(6, 12), axis=1)

        # rename columns
        col_names = ['Time', 'Open', 'High', 'Low', 'Close', 'Volume']
        df.columns = col_names

        # transform values from strings to floats
        for col in col_names[1:6]:
            df[col] = df[col].astype(float)
            
        df = df.sort_values(by='Time', ascending=False, kind='quicksort')
            
        df_btcusd = df_btcusd.append(df)
        
        timestamp = int(df.loc[0, 'Time'])
            
    else:
        
        response = requests.get(f'https://api.binance.com/api/v1/klines?symbol=BTCUSDT&interval=1h&endTime={timestamp}')
        data_symbol = json.loads(response.text)
        
        # put in dataframe and drop columns we don't neeed
        df1 = pd.DataFrame.from_dict(data_symbol)
        df1 = df1.drop(range(6, 12), axis=1)
        
        # rename columns
        col_names = ['Time', 'Open', 'High', 'Low', 'Close', 'Volume']
        df1.columns = col_names
        
        # transform values from strings to floats
        for col in col_names[1:6]:
            df1[col] = df1[col].astype(float)
            
        df1.drop(499, inplace=True)
        
        df1 = df1.sort_values(by='Time', ascending=False, kind='quicksort')
        
        df_btcusd = df_btcusd.append(df1)
        
        df_btcusd.reset_index(drop = True, inplace = True)
        
        timestamp = int(df1.loc[0, 'Time'])
        
df_btcusd['Date'] = pd.to_datetime(df_btcusd['Time'], unit='ms')

df_btcusd.to_csv('./data_arima/btcusd.csv', index = False)

FileNotFoundError: [Errno 2] No such file or directory: './data_arima/btcusd.csv'

## Plot candlestick using Plotly

In [None]:
candle = go.Candlestick(
            x = df_btcusd['Date'],
            open = df_btcusd['Open'],
            close = df_btcusd['Close'],
            high = df_btcusd['High'],
            low = df_btcusd['Low'],
            name = "Candlesticks")

fig = go.Figure(data=[candle])

fig.show()

In [None]:
df_btcusd.duplicated().any()

In [None]:
df_btcusd.drop(columns='Time', axis=1, inplace=True)

In [None]:
df_btcusd = df_btcusd.set_index('Date')

In [None]:
df_btcusd = df_btcusd.sort_index()

As we saw the 2020-03-12 at 09:00:00 the prices due to the corona virus are suffering a fall.

I am for this reason removing the data after that point going forward.

In [None]:
df_btcusd = df_btcusd[df_btcusd.index < '2020-03-12 09:00:00']

In [None]:
df_btcusd.head()

## Plot BTCUSD using interactive Plotly 

In [None]:
df_btcusd['Close'].iplot(mode='lines+text',
                        xTitle='Time',
                        yTitle='Price', 
                        title='Hourly BTCUSD Close',
                        opacity=0.8)

## Plot BTCUSD & moving avg using Matplotlib

In [None]:
plt.figure(figsize=(20,10))
# Moving Average 100
df_btcusd['100MA'] = df_btcusd['Close'].rolling(window=100, min_periods=0).mean()

ax1 = plt.subplot2grid((6, 1), (0, 0), rowspan=5, colspan=1)
ax2 = plt.subplot2grid((6, 1), (5, 0), rowspan=1, colspan=1, sharex=ax1)

ax1.plot(df_btcusd.index, df_btcusd['Close'], df_btcusd['100MA'])
ax2.bar(df_btcusd.index, df_btcusd['Volume'])

plt.show()

Resample close and volume colums by 1 day.

In [None]:
df_ohlc = df_btcusd['Close'].resample('1d').ohlc()
df_volume = df_btcusd['Volume'].resample('1d').sum()
df_ohlc.head()

Change date to matplotlib timestamp to plot ohlc candlestick.

In [None]:
df_ohlc.index = df_ohlc.index.map(mdates.date2num)
df_ohlc.head()

In [None]:
df_ohlc.reset_index(inplace=True)
df_ohlc.head()

In [None]:
df_ohlc.columns = [['Date', 'Open', 'High', 'Low', 'Close']]
df_ohlc.head()

## Plot candlestick using Matplotlib

In [None]:
plt.figure(figsize=(20,10))
ax1 = plt.subplot2grid((6, 1), (0, 0), rowspan=5, colspan=1)
ax2 = plt.subplot2grid((6, 1), (5, 0), rowspan=1, colspan=1, sharex=ax1)



candlestick_ohlc(ax1, df_ohlc.values, width=2, colorup='g', colordown='r')
ax1.grid(True)
ax1.set_axisbelow(True)
ax1.set_title('BTC Share Price', color='white', fontsize=27)
ax1.set_facecolor('black')
ax1.figure.set_facecolor('#121212')
ax1.tick_params(axis='x', colors='white')
ax1.tick_params(axis='y', colors='white')
ax1.axes.get_xaxis().set_visible(False)
ax1.set_ylabel('Price', fontsize = 20) 

ax1.plot(df_btcusd['Close'].rolling(window=100).mean(), color='b',  label='100MA')
ax1.legend(loc="upper right", fontsize=15)

plt.xticks(rotation=45,fontsize=15)
plt.yticks(fontsize=15)
ax1.xaxis_date()


ax2.fill_between(df_volume.index.map(mdates.date2num), df_volume.values, 0, color = 'magenta',  label='Volume')
ax2.set_ylabel('Volume', fontsize = 20) 
ax2.tick_params(axis='x', colors='white')
ax2.tick_params(axis='y', colors='white')
ax2.grid(True)
ax2.set_facecolor('black')
ax2.legend(loc="upper right", fontsize=15)
plt.legend()
plt.show()


# Get All Trading (currently) Symbols

In [None]:
def get_trading_symbols():
    '''
     Get trading symbols from Binance API

     Returns: 
         symbols: symbols actively traded in Binance
    '''
    
    response = requests.get('https://api.binance.com' + '/api/v1/exchangeInfo')
    data_symbols = json.loads(response.text)
    
    symbols = []

    for x in data_symbols['symbols']:
        if x['status'] == 'TRADING':
            symbols.append(x['symbol'])
            
    return symbols

In [None]:
symbols = get_trading_symbols()

In [None]:
symbols

# Get Data for all the Symbols

Use 500 symbols.

In [None]:
symbols = symbols[:500]

Create pickle object so that later we can reference it.

In [None]:
with open('crypto_tickers.pickle', 'wb') as f:
    pickle.dump(symbols, f)

In [None]:
def get_data_from_binance(reload_symbols=False):
    '''
     Get data for each cryptocurrency, increasing range gives us further data in the past
     
     Args: 
         reload_symbols: check if path with symbols already exists, if not it calls function to get symbols
    '''
    
    if reload_symbols:
        symbols = get_trading_symbols()
    else:
        with open('crypto_tickers.pickle', 'rb') as f:
            symbols = pickle.load(f)
            
    if not os.path.exists('crypto_dfs'):
        os.makedirs('crypto_dfs')
        
    for symbol in symbols:
        
        print(symbol)
        
        if not os.path.exists('crypto_dfs/{}.csv'.format(symbol)):
            
            
            df_crypto = pd.DataFrame()

            for x in range(5):

                if x == 0: 

                    response = requests.get(f'https://api.binance.com/api/v1/klines?symbol={symbol}&interval=1h')
                    data_symbol = json.loads(response.text)

                    # put in dataframe and drop columns we don't neeed
                    df = pd.DataFrame.from_dict(data_symbol)
                    df = df.drop(range(6, 12), axis=1)

                    # rename columns
                    col_names = ['Time', 'Open', 'High', 'Low', 'Close', 'Volume']
                    df.columns = col_names

                    # transform values from strings to floats
                    for col in col_names[1:6]:
                        df[col] = df[col].astype(float)

                    df = df.sort_values(by='Time', ascending=False, kind='quicksort')

                    df_crypto = df_crypto.append(df)

                    timestamp = int(df.loc[0, 'Time'])

                else:

                    response = requests.get(f'https://api.binance.com/api/v1/klines?symbol={symbol}&interval=1h&endTime={timestamp}')
                    data_symbol = json.loads(response.text)

                    # put in dataframe and drop columns we don't neeed
                    df1 = pd.DataFrame.from_dict(data_symbol)
                    df1 = df1.drop(range(6, 12), axis=1)

                            # rename columns
                    col_names = ['Time', 'Open', 'High', 'Low', 'Close', 'Volume']
                    df1.columns = col_names

                    # transform values from strings to floats
                    for col in col_names[1:6]:
                        df1[col] = df1[col].astype(float)

                    df1.drop(df.tail(1).index,inplace=True)

                    df1 = df1.sort_values(by='Time', ascending=False, kind='quicksort')

                    df_crypto = df_crypto.append(df1)

                    df_crypto.reset_index(drop = True, inplace = True)

                    timestamp = int(df1.loc[0, 'Time'])
                    

            df_crypto['Date'] = pd.to_datetime(df_crypto['Time'], unit='ms')
            
            df_crypto.to_csv('crypto_dfs/{}.csv'.format(symbol))
            
        else:
            print('Already have {}'.format(symbol))

In [None]:
get_data_from_binance()

In [None]:
def compile_data():
    '''
     Create main_df containing close column for each symbol 

     Returns: 
         main_df: main_df contaning close column for each cryptocurrency
    '''
    with open('crypto_tickers.pickle', 'rb') as f:
        symbols = pickle.load(f)
    
    main_df = pd.DataFrame()
    
    for count, symbol in enumerate(symbols):
        df =pd.read_csv(f'crypto_dfs/{symbol}.csv',  index_col=0)
        df.set_index('Date', inplace=True)
        
        df.rename(columns={'Close' : symbol}, inplace=True)
        df.drop(['Time','Open','High','Low','Volume'], 1, inplace=True)
        
        if main_df.empty:
            main_df = df
            
        else:
            main_df = main_df.join(df)
            
    print(main_df.head())    
        
    main_df.to_csv('500crypto_joined_close.csv')
    
    return main_df

In [None]:
main_df = compile_data()

In [None]:
main_df.head()

In [None]:
main_df.duplicated().any()

# Find Relationship with different Cryptocurrencies

Check correlation between all cryptocurrency.

In [None]:
df = pd.read_csv('500crypto_joined_close.csv')
corrs = df.corr()
corrs

In [None]:
corrs.index

In [None]:
def visualize_data():
    ''' Visualize correlation heatmap 
    '''
    df = pd.read_csv('500crypto_joined_close.csv')
    corrs = df.corr()

    fig = go.Figure(data=go.Heatmap(
                    z=corrs.values,
                    x=list(corrs.columns),
                    y=list(corrs.index),
                    showscale=True,
                    colorscale='Inferno',
                    hoverongaps = False))
    fig.show()

## Visualize interactive correlation heatmap using Plotly

In [None]:
visualize_data()

Next we will be going to do a prediction using time series modeling.