<a href="https://colab.research.google.com/github/knowhrishi/DLTA/blob/main/dlta_eda.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install -q kaggle

from google.colab import files
files.upload()


Saving kaggle.json to kaggle.json
unzip:  cannot find or open train.csv.zip, train.csv.zip.zip or train.csv.zip.ZIP.


In [3]:

!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

!kaggle competitions download -c g-research-crypto-forecasting

!mkdir train
!unzip train.csv.zip -d train

mkdir: cannot create directory ‘/root/.kaggle’: File exists
Downloading __init__.py to /content
  0% 0.00/59.0 [00:00<?, ?B/s]
100% 59.0/59.0 [00:00<00:00, 23.0kB/s]
Downloading competition.cpython-37m-x86_64-linux-gnu.so to /content
  0% 0.00/458k [00:00<?, ?B/s]
100% 458k/458k [00:00<00:00, 63.0MB/s]
Downloading train.csv.zip to /content
100% 1.02G/1.02G [00:06<00:00, 152MB/s]
100% 1.02G/1.02G [00:06<00:00, 167MB/s]
Downloading example_sample_submission.csv to /content
  0% 0.00/406 [00:00<?, ?B/s]
100% 406/406 [00:00<00:00, 346kB/s]
Downloading asset_details.csv to /content
  0% 0.00/444 [00:00<?, ?B/s]
100% 444/444 [00:00<00:00, 443kB/s]
Downloading supplemental_train.csv.zip to /content
 91% 105M/116M [00:00<00:00, 129MB/s] 
100% 116M/116M [00:00<00:00, 158MB/s]
Downloading example_test.csv to /content
  0% 0.00/5.78k [00:00<?, ?B/s]
100% 5.78k/5.78k [00:00<00:00, 4.82MB/s]
mkdir: cannot create directory ‘train’: File exists
Archive:  train.csv.zip
  inflating: train/train.csv    

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

In [4]:
# Read the extracted training data and the asset metadata table.
train_csv_path = '/content/train/train.csv'
asset_details_csv_path = '/content/asset_details.csv'
data = pd.read_csv(train_csv_path)
asset_details = pd.read_csv(asset_details_csv_path)

# <center>EDA</center> 

## Column Description
*   **timestamp**: All timestamps are Unix timestamps in seconds (the number of seconds elapsed since 1970-01-01 00:00:00.000 UTC). Timestamps in this dataset are multiples of 60, indicating minute-by-minute data.
*   **Asset_ID**: The asset ID corresponding to one of the cryptocurrencies (e.g. `Asset_ID = 1` for Bitcoin). The mapping from `Asset_ID` to crypto asset is contained in `asset_details.csv`.
*   **Count**: Total number of trades in the time interval (last minute).
*   **Open**:	Opening price of the time interval (in USD).
*   **High**:	Highest price reached during time interval (in USD).
*   **Low**: Lowest price reached during time interval (in USD).
*   **Close**:	Closing price of the time interval (in USD).
*   **Volume**:	The number of cryptoasset units traded during the minute.
*   **VWAP**: The average price of the asset over the time interval, weighted by volume. VWAP is an aggregated form of trade data.
*   **Target**: Residual log-returns for the asset over a 15 minute horizon. 


In [5]:
# Preview the first five rows of the training frame.
data.head(n=5)

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target
0,1514764860,2,40.0,2376.58,2399.5,2357.14,2374.59,19.233005,2373.116392,-0.004218
1,1514764860,0,5.0,8.53,8.53,8.53,8.53,78.38,8.53,-0.014399
2,1514764860,1,229.0,13835.194,14013.8,13666.11,13850.176,31.550062,13827.062093,-0.014643
3,1514764860,5,32.0,7.6596,7.6596,7.6567,7.6576,6626.71337,7.657713,-0.013922
4,1514764860,7,5.0,25.92,25.92,25.874,25.877,121.08731,25.891363,-0.008264


### Checking Null Rows

In [6]:
# Missing-value count for every column.
data.isna().sum()

timestamp         0
Asset_ID          0
Count             0
Open              0
High              0
Low               0
Close             0
Volume            0
VWAP              9
Target       750338
dtype: int64

In [7]:
# Number and share of training rows whose Target is missing.
null_target_rows = data["Target"].isnull().sum()
null_target_pct = null_target_rows * 100 / data.shape[0]
print("Total Null Target Rows = ", null_target_rows)
print("Percentage of NUll rows in Training Data = {:.2f}%".format(null_target_pct))

Total Null Target Rows =  750338
Percentage of NUll rows in Training Data = 3.10%


# <center>DATA DISTRIBUTION</center> 

### Training Data Distribution among different Assets (Crypto Currencies)

In [8]:
# Count rows per asset in one pass, then align the counts to the asset table
# so each bar carries the name that belongs to its Asset_ID. This avoids a
# hard-coded number of assets and one full scan of the data per asset.
assets_sorted = asset_details.sort_values("Asset_ID")
asset_count = (
    data["Asset_ID"]
    .value_counts()
    .reindex(assets_sorted["Asset_ID"], fill_value=0)  # assets with no rows get 0
    .tolist()
)
fig = px.bar(x = assets_sorted["Asset_Name"],
             y = asset_count,
             color = asset_count,
             color_continuous_scale="Emrld")
fig.update_xaxes(title="Assets")
fig.update_yaxes(title = "Number of Rows")
fig.update_layout(showlegend = True,
    title = {
        'text': 'Data Distribution ',
        'y':0.95,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'} ,
        template="plotly_white")
fig.show()

# <center>CANDLESTICK CHARTS</center> 

In [13]:
def crypto_df(asset_id, data=data):
    """Return the rows of one asset, indexed by timestamp.

    Parameters
    ----------
    asset_id : value compared against the ``Asset_ID`` column.
    data : frame with ``Asset_ID`` and ``timestamp`` columns. The default is
        the notebook-level ``data`` frame as it exists when this cell runs.

    Returns
    -------
    A new frame holding only the matching rows, with ``timestamp`` converted
    from Unix seconds to datetimes and used as the index.
    """
    asset_rows = data.loc[data["Asset_ID"] == asset_id].reset_index(drop=True)
    asset_rows["timestamp"] = pd.to_datetime(asset_rows["timestamp"], unit="s")
    return asset_rows.set_index("timestamp")
# Per-asset frames used by the chart cells, one per Asset_ID listed here.
# NOTE: `bin` shadows the Python builtin of the same name; the name is kept
# because other cells refer to it.
btc, eth, mon, dog, bin = (
    crypto_df(asset_id=asset_id) for asset_id in (1, 6, 11, 4, 0)
)

In [14]:
def candelstick_chart(data, title):
    """Build a Plotly candlestick figure from an OHLC frame.

    Parameters
    ----------
    data : frame with ``Open``, ``High``, ``Low`` and ``Close`` columns; its
        index is used as the x axis.
    title : label placed in front of the chart title.

    Returns
    -------
    The configured ``go.Figure`` (not shown; the caller displays it).
    """
    trace = go.Candlestick(
        x=data.index,
        open=data["Open"],
        high=data["High"],
        low=data["Low"],
        close=data["Close"],
    )
    figure = go.Figure(data=[trace])

    # Time axis with a range slider for zooming into a sub-window.
    figure.update_xaxes(title_text="Time", rangeslider_visible=True)

    # Centred title near the top of the figure.
    title_layout = dict(
        text="{:} Candelstick Chart".format(title),
        y=0.90,
        x=0.5,
        xanchor="center",
        yanchor="top",
    )
    figure.update_layout(title=title_layout, template="plotly_white")

    figure.update_yaxes(title_text="Price in USD", ticksuffix="$")
    return figure

### Bitcoin, Ethereum, Monero, Dogecoin, Binance Coin Candlestick Chart for last few rows

In [15]:
# One candlestick chart per asset, each over the trailing rows of its frame.
btc_plot = candelstick_chart(btc[-100:],title = "Bitcoin(BTC)")
btc_plot.show()

# Ethereum's ticker is ETH; ETC is Ethereum Classic, a different asset.
eth_plot = candelstick_chart(eth[-250:],title = "Ethereum(ETH)")
eth_plot.show()

mon_plot = candelstick_chart(mon[-500:],title = "Monero")
mon_plot.show()

dog_plot = candelstick_chart(dog[-1100:],title = "Dogecoin")
dog_plot.show()

bin_plot = candelstick_chart(bin[-1200:],title = "Binance Coin")
bin_plot.show()


# <center>OHLC CHARTS</center> 

In [16]:
def ohlc_chart(data, title):
    """Build a Plotly OHLC figure from an OHLC frame.

    Parameters
    ----------
    data : frame with ``Open``, ``High``, ``Low`` and ``Close`` columns; its
        index is used as the x axis.
    title : label placed in front of the chart title.

    Returns
    -------
    The configured ``go.Figure``.
    """
    trace = go.Ohlc(
        x=data.index,
        open=data["Open"],
        high=data["High"],
        low=data["Low"],
        close=data["Close"],
    )
    figure = go.Figure(data=[trace])

    # Time axis with a range slider for zooming into a sub-window.
    figure.update_xaxes(title_text="Time", rangeslider_visible=True)

    # Centred title near the top of the figure.
    title_layout = dict(
        text="{:} OHLC Chart".format(title),
        y=0.9,
        x=0.5,
        xanchor="center",
        yanchor="top",
    )
    figure.update_layout(title=title_layout, template="plotly_white")

    figure.update_yaxes(title_text="Price in USD", ticksuffix="$")
    return figure

### Bitcoin(BTC) OHLC Chart for first 200 rows

In [19]:
# OHLC chart over the first 200 Bitcoin rows.
ohlc_chart(btc.head(200), title="Bitcoin(BTC)")

### Ethereum OHLC Chart for first 200 rows

In [18]:
# OHLC chart over the first 200 Ethereum rows.
ohlc_chart(eth.head(200), title="Ethereum")

### Monero OHLC Chart for first 150 rows

In [None]:
# OHLC chart over the first 150 Monero rows.
ohlc_chart(mon.head(150), title="Monero")

### Dogecoin OHLC Chart for first 200 rows

In [None]:
# OHLC chart over the first 200 Dogecoin rows.
ohlc_chart(dog.head(200), title="Dogecoin")

### Binance Coin OHLC Chart for first 200 rows

In [None]:
# OHLC chart over the first 200 Binance Coin rows.
ohlc_chart(bin.head(200), title="Binance Coin")

# <center>AREA PLOTS</center> 

In [20]:
def vol_traded(data ,title,color):
    """Build a Plotly area chart of the ``Volume`` column over the frame's index.

    Parameters
    ----------
    data : frame with a ``Volume`` column; its index is used as the x axis.
    title : label placed in front of the chart title.
    color : line colour applied to the trace.

    Returns
    -------
    The configured Plotly figure.
    """
    area = px.area(data_frame=data,
               x = data.index ,
               y = "Volume",
               markers = True)
    area.update_traces(line_color=color)
    area.update_xaxes(
        title_text = 'Time',
        rangeslider_visible = True)
    # Volume is the number of asset units traded in the minute; the number of
    # trades is the separate Count column, so the axis is labelled as units.
    area.update_yaxes(title_text = 'Units traded every minute')
    area.update_layout(showlegend = True,
        title = {
            'text': '{:} Volume Traded'.format(title),
            'y':0.94,
            'x':0.5,
            'xanchor': 'center',
            'yanchor': 'top'},
        template="plotly_white")
    return area

### Ethereum(ETH) Volume Traded for last 50 rows

In [22]:
# Volume over the last 50 Ethereum rows.
vol_traded(eth.tail(50), "Ethereum (ETH)", color="Red")

### Ethereum Volume Traded for last 300 rows

In [24]:
# Volume over the last 300 Ethereum rows.
vol_traded(eth.tail(300), "Ethereum", color="Blue")

### Monero Volume Traded for last 200 rows

In [25]:
# Volume over the last 200 Monero rows.
vol_traded(mon.tail(200), "Monero", color="Orange")

### Dogecoin Volume Traded for last 250 rows


In [26]:
# Volume over the last 250 Dogecoin rows.
vol_traded(dog.tail(250), "Dogecoin", color="Purple")

### Binance Coin Volume Traded for last 300 rows

In [27]:
# Volume over the last 300 Binance Coin rows: plot the `bin` frame, not the
# Dogecoin one, and title it with the asset name.
vol_traded(bin[-300:], "Binance Coin",color = "Red")