In [2]:
import pandas as pd

import pandas as pd
import numpy as np 
import missingno as mno
import pickle 
import json
import time 
import gc
import random
import sklearn

#For Data Visualization
import matplotlib.pyplot as plt
#%matplotlib inline 
#output of plotting commands is displayed inline within frontends like the Jupyter notebook,
#directly below the code cell that produced it. The resulting plots will then also be stored in the notebook document.

import seaborn as sns
sns.set(rc={'figure.figsize':(10,6)})
custom_colors = ["#4e89ae", "#c56183","#ed6663","#ffa372"]

#NetworkX
import networkx as nx
import plotly.express as px 
import plotly.graph_objects as go #To construct network graphs
from plotly.subplots import make_subplots #To make multiple plots

#Suppress unnecessary DeprecationWarning and FutureWarning messages.
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

from plotly.offline import init_notebook_mode, iplot
from IPython.core.display import display, HTML #To display html content in a code cell
init_notebook_mode(connected=True)

gc.collect()

28

In [4]:
# Load the raw Bitcoin price history from the project's CSV export.
raw_data = pd.read_csv(filepath_or_buffer='../database/top_100//bitcoin.csv')
def date_format(df):
    """Parse the ``Date`` column and derive ``Year``, ``Month`` and ``Day`` columns.

    The frame is modified in place and the same object is returned, so callers
    may either use the return value or keep working with the frame they passed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame expected to contain a ``Date`` column that ``pd.to_datetime``
        can parse.

    Returns
    -------
    The object that was passed in. When it is a DataFrame with a ``Date``
    column, ``Date`` is converted to datetime and ``Year`` / ``Month`` / ``Day``
    are added as zero-padded strings (e.g. ``'2010'``, ``'07'``, ``'18'``).
    Anything else is returned untouched after printing a notice.
    """
    # Report both kinds of unusable input; a DataFrame without a Date column
    # used to slip through without any message.
    if not isinstance(df, pd.DataFrame) or 'Date' not in df.columns:
        print("Invalid entry, please use a data frame with a Date column.")
        return df

    df['Date'] = pd.to_datetime(df['Date'])

    # Vectorised formatting: avoids building one Series per row and also works
    # on an empty frame. The parts stay strings, exactly as before.
    dates = df['Date'].dt
    df['Year'] = dates.strftime('%Y')
    df['Month'] = dates.strftime('%m')
    df['Day'] = dates.strftime('%d')
    return df

# Parse the dates, then coerce the price columns to float and the derived
# calendar columns to int, one column at a time.
data = date_format(raw_data)
for _column, _dtype in {
    'Close': float,
    'Open': float,
    'High': float,
    'Low': float,
    'Year': int,
    'Month': int,
    'Day': int,
}.items():
    data[_column] = data[_column].astype(_dtype)

In [6]:
#Seaborn plot viz of the missing values in the data 
def missing_values(data):
    """Draw a horizontal bar chart of the missing-value percentage per column.

    Only columns that actually contain missing values are plotted, sorted from
    the highest percentage to the lowest. When the frame has no rows, or no
    column has any missing value, nothing is drawn and the function returns
    immediately instead of handing an empty table to seaborn.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
    """
    row_count = len(data)
    if row_count == 0:
        return

    # Share of null entries per column, as a percentage rounded to 2 decimals.
    percent = (data.isnull().sum() * 100 / row_count).round(2)
    missed = pd.DataFrame({'column': data.columns, 'percent': percent.to_numpy()})
    missed = missed.sort_values('percent', ascending=False)
    missed = missed[missed['percent'] > 0]
    if missed.empty:
        return

    # `orient` is seaborn's documented orientation argument for barplot.
    ax = sns.barplot(
        x=missed['percent'],
        y=missed['column'],
        orient='h',
        palette="winter",
    )
    ax.set_title('Missed values percent for every column')


def triple_plot(x, title, c):
    """Show a histogram + KDE, a boxplot and a violin plot of ``x`` stacked vertically.

    The three panels share one x-axis so the distributions line up.

    Parameters
    ----------
    x : array-like
        Numeric values to plot (the caller passes a DataFrame column).
    title : str
        Currently unused; kept so existing callers that pass it keep working.
    c : str
        Colour applied to all three panels.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    _, axes = plt.subplots(3, 1, figsize=(20, 10), sharex=True)

    # NOTE(review): distplot is deprecated in recent seaborn releases; consider
    # histplot(..., kde=True) once the minimum seaborn version is confirmed.
    sns.distplot(x, ax=axes[0], color=c)
    axes[0].set(xlabel=None)
    axes[0].set_title('Histogram + KDE')

    # Pass the values as `x=` explicitly: newer seaborn treats the first
    # positional argument as `data`, which would draw these panels vertically
    # and break the shared x-axis.
    sns.boxplot(x=x, ax=axes[1], color=c)
    axes[1].set(xlabel=None)
    axes[1].set_title('Boxplot')

    sns.violinplot(x=x, ax=axes[2], color=c)
    axes[2].set(xlabel=None)
    axes[2].set_title('Violin plot')

    plt.show()

#Info, missing values and describe of the dataframe along with the triple plot of integer variables
def data_understand(df):
    """Render an overview report for ``df`` inside the notebook.

    The report consists of, in order:

    1. ``df.info()``;
    2. ``describe()`` (rounded to 2 decimals) for the first six numeric columns;
    3. per-column missing-value counts, plus the missing-values bar chart when
       at least one value is missing;
    4. a ``triple_plot`` for each of the first five numeric columns, each under
       an HTML banner;
    5. a closing note about the shape of the distributions.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. The numeric columns are selected by position, so
        which columns get plotted depends on the column order of ``df``.

    Returns
    -------
    None
    """
    display(HTML('<div class="alert alert-info"><h4><center>Data Information</center></h4></div><br>'))
    # info() prints its own report and returns None; wrapping it in print()
    # added a stray "None" line to the output.
    df.info()

    display(HTML('<br><div class="alert alert-info"><h4><center>Data Description</center></h4></div><br>'))
    numeric_cols = df.select_dtypes(include=np.number).columns.to_list()
    display(df[numeric_cols[0:6]].describe().round(2))

    display(HTML('<br><div class="alert alert-info"><h4><center>Missing values</center></h4></div><br>'))
    na_counts = df.isna().sum()
    print(na_counts)
    if na_counts.any():
        print('\n{} column(s) contain missing values.'.format(int((na_counts > 0).sum())))
        # The chart is a nice-to-have: report a failure instead of hiding it,
        # but keep the rest of the report going.
        try:
            missing_values(df)
            plt.show()
        except Exception as exc:
            print('Could not draw the missing-values chart: {}'.format(exc))
    else:
        print('\nNo missing values were present! 💯')
    print('\n\n')

    display(HTML('<br><div class="alert alert-info"><h4><center>Distribution of OHLC Values of Bitcoin</center></h4></div><br>'))
    banner = '<div style="color:white;display:fill;border-radius:5px;background-color:#5642C5;font-size:150%;font-family:Verdana;letter-spacing:0.5px;text-align:center"><p style="padding: 10px;color:white;"><center>{}</center></p></div>'
    for col in numeric_cols[0:5]:
        if col != 'Volume':
            label = ' BTC {} Price'.format(col)
        else:
            label = ' BTC trading {} '.format(col)
        display(HTML(banner.format(label)))
        # Each column gets a randomly chosen colour from the notebook palette.
        triple_plot(df[col], str(col).upper(), random.choice(custom_colors))
        print('\n')

    display(HTML('<br><div class="alert alert-info"><h4><center>The Distributions of OHLC(Opening, Highest, Lowest, and Closing price) follow a log-normal distribution. Due to the very reason using mean as a central tendency measure wouldnt be ideal in this case. Since the distribution is log-normal, the mean values could be skewed. Therefore, for that very reason, we will be using median values as they are less prone to skewness.</center></h4></div><br>'))

In [None]:
# Render the overview report (info, description, missing values, distribution plots) for the BTC frame.
data_understand(data)

In [10]:
# Display the prepared frame as the cell output.
data

Unnamed: 0,Date,Open,High,Low,Close,Volume,Currency,Year,Month,Day
0,2010-07-18,0.0,0.1,0.1,0.1,75,USD,2010,7,18
1,2010-07-19,0.1,0.1,0.1,0.1,574,USD,2010,7,19
2,2010-07-20,0.1,0.1,0.1,0.1,262,USD,2010,7,20
3,2010-07-21,0.1,0.1,0.1,0.1,575,USD,2010,7,21
4,2010-07-22,0.1,0.1,0.1,0.1,2160,USD,2010,7,22
...,...,...,...,...,...,...,...,...,...,...
4415,2022-08-19,23201.6,23202.3,20807.8,20831.3,339472,USD,2022,8,19
4416,2022-08-20,20830.7,21357.4,20784.8,21138.9,206943,USD,2022,8,20
4417,2022-08-21,21138.9,21692.4,21077.4,21517.2,177522,USD,2022,8,21
4418,2022-08-22,21516.8,21517.4,20912.1,21416.3,251833,USD,2022,8,22


In [8]:
# Line chart of the closing price, labelled with the year, on the dark theme.
# NOTE(review): x is the integer Year column, so every row of a given year
# shares one x position and carries a text label; confirm that x="Date"
# (or a per-year aggregate) was not the intent.
fig = (
    px.line(data, x="Year", y="Close", text='Year', title='BTC price over the years')
    .update_traces(textposition="bottom right")
    .update_layout(
        template='plotly_dark',
        yaxis_title='Closing price',
        hovermode="x",
        title_x=0.5,
    )
)
fig.show()

In [13]:
# OHLC chart of the full price history, without the range slider.
fig = go.Figure(
    data=go.Ohlc(
        x=data['Date'],
        open=data['Open'],
        high=data['High'],
        low=data['Low'],
        close=data['Close'],
    )
)

fig.update_layout(
    xaxis_rangeslider_visible=False,
    template='plotly_dark',
    title='BTC Price over the years',
    title_x=0.5,
    xaxis_title="Year",
    yaxis_title='BTC Price',
)
# The y-axis is in dollars.
fig.update_yaxes(tickprefix="$", showgrid=True)
fig.update_xaxes(showgrid=True)
fig.show()

In [12]:
# OHLC chart with vertical marker lines and bull/bear labels.
fig = go.Figure(
    data=go.Ohlc(
        x=data['Date'],
        open=data['Open'],
        high=data['High'],
        low=data['Low'],
        close=data['Close'],
    )
)

# NOTE(review): the first and last labels sit at 2017-01-01 and 2021-08-01,
# while their marker lines are at 2017-09-01 and 2021-05-01; confirm the
# offsets are intentional.
fig.update_layout(
    template='plotly_dark',
    title='BTC Price over the years',
    title_x=0.5,
    xaxis_title="Year",
    yaxis_title='BTC Price',
    # One full-height vertical line per marker date.
    shapes=[
        dict(x0=day, x1=day, y0=0, y1=1, xref='x', yref='paper', line_width=2)
        for day in ('2017-09-01', '2018-01-10', '2020-11-01', '2021-05-01')
    ],
    # Labels are anchored near the bottom of the plotting area.
    annotations=[
        dict(x=day, y=0.05, xref='x', yref='paper', showarrow=False, xanchor='left', text=label)
        for day, label in (
            ('2017-01-01', 'bull🐂'),
            ('2018-01-10', 'bear🐻'),
            ('2020-11-01', 'bull🐂'),
            ('2021-08-01', 'bear🐻'),
        )
    ],
)

# The y-axis is in dollars.
fig.update_yaxes(tickprefix="$", showgrid=False)
fig.update_xaxes(showgrid=True)

fig.show()