In [86]:
import pandas as pd
from functools import reduce

## Get data from BigQuery

In [48]:
%%bigquery df
-- Select the identifying columns, the date, and three fields of `market_data`
-- from the crypto_btc table; the magic stores the result in the variable `df`.
SELECT
    id,
    symbol,
    name,
    date,
    market_data.current_price AS current_price,
    market_data.market_cap AS market_cap,
    market_data.total_volume AS total_volume
FROM `crispy-computing-machine.crispy_dwh.crypto_btc`

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 738.82query/s] 
Downloading: 100%|██████████| 1909/1909 [00:00<00:00, 2265.38rows/s]


Create an integer time index `t` that numbers the rows in chronological order (sorted by `date`)

In [49]:
# Build the integer time index `t`: parse `date`, sort chronologically and
# number the rows 0..n-1 in that order (`t` ends up as the first column).
# A `t` column left over from a previous execution of this cell is dropped
# first, so re-running the cell does not produce a duplicate `t` column.
# NOTE(review): `t` is a row counter, so it only equals "days since the first
# date" if the table holds exactly one row per day — confirm against the data.
df = (
    df.drop(columns=["t"], errors="ignore")
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .sort_values(by=["date"], ascending=True)
    .reset_index(drop=True)
    .rename_axis("t")
    .reset_index()
)

Summary of the DataFrame: percentage of non-null values, number of missing values, and dtype for each column

In [50]:
# Per-column quality summary of `df`: share of non-null values (in percent),
# count of nulls, and dtype. Each piece is a two-column frame keyed by
# "columna"; the three pieces are then joined on that key.
null_counts = df.isnull().sum()
missings = null_counts.rename("missings").rename_axis("columna").reset_index()
tipo = df.dtypes.rename("type").rename_axis("columna").reset_index()
completitud = (
    ((1 - null_counts / df.shape[0]) * 100)
    .rename("completitud")
    .rename_axis("columna")
    .reset_index()
    .merge(missings, how="inner", on="columna")
    .merge(tipo, how="inner", on="columna")
)
completitud

Unnamed: 0,columna,completitud,missings,type
0,t,100.0,0,int64
1,id,100.0,0,object
2,symbol,100.0,0,object
3,name,100.0,0,object
4,date,100.0,0,datetime64[ns]
5,current_price,100.0,0,float64
6,market_cap,99.947617,1,float64
7,total_volume,100.0,0,float64


## Feature Engineering 

In [95]:
class window_time(object):
    """Rolling-window feature builder over an integer time column ``t``.

    For every anchor ``ancla`` in ``anclai..anclaf`` and every look-back
    length ``k`` in ``range(step, vobs + step, step)``, the mean and the
    standard deviation of each value column are computed per ``symbol`` over
    the rows with ``ancla - k + 1 <= t <= ancla``.

    Parameters
    ----------
    vobs : int
        Length of the observation window; the first anchor is ``vobs - 1``.
    vdes : int
        Number of time units subtracted from the maximum ``t`` to obtain the
        last anchor.
    step : int
        Increment between successive look-back lengths ``k``.
    incremental : float
        Stored on the instance; no method of this class reads it.
    """

    # Columns that get aggregated and the statistics computed for each one.
    VALUE_COLUMNS = ('current_price', 'market_cap', 'total_volume')
    STATISTICS = ('mean', 'std')

    def __init__(self,vobs,vdes,step,incremental):
        self.vobs = vobs
        self.vdes = vdes
        self.step = step
        self.incremental = incremental
        # Anchor bounds are unknown until anclas() has seen the time column.
        self.anclai = None
        self.anclaf = None

    def anclas(self, time_column):
        """Set the first and last anchor from the time column.

        ``anclai`` is ``vobs - 1`` and ``anclaf`` is ``time_column.max() - vdes``.
        """
        self.anclai = self.vobs-1
        self.anclaf = time_column.max()- self.vdes

    def ing_X(self,df,ancla,k):
        """Return the features of one anchor for one look-back length.

        Parameters
        ----------
        df : pandas.DataFrame
            Must contain ``symbol``, ``t`` and the columns in ``VALUE_COLUMNS``.
        ancla : int
            Last value of ``t`` included in the window.
        k : int
            Window length; rows with ``ancla - k + 1 <= t <= ancla`` are used.

        Returns
        -------
        pandas.DataFrame
            One row per symbol with columns ``symbol``, ``ancla`` and
            ``x_<column>_<mean|std>_<k>`` for every value column.
        """
        value_columns = list(self.VALUE_COLUMNS)
        ventana = df.loc[(df['t'] <= ancla) & (df['t'] >= (ancla-k+1))]

        # Collapse duplicate (symbol, t) rows by summing them. min_count=1
        # keeps a cell whose values are all missing as NaN instead of turning
        # it into 0, so missing observations are skipped by mean/std rather
        # than dragging them towards zero.
        por_t = ventana.groupby(['symbol','t'])[value_columns].sum(min_count=1)

        # One pass yields mean and std (sample std, ddof=1) of every column.
        stats = por_t.groupby(level='symbol').agg(list(self.STATISTICS))
        stats.columns = [f'x_{col}_{stat}_{k}' for col, stat in stats.columns]

        stats = stats.reset_index()
        stats.insert(1,'ancla',ancla)

        return stats

    def eng_X(self,df,um):
        """Build the feature matrix for every anchor and look-back length.

        Parameters
        ----------
        df : pandas.DataFrame
            Input passed unchanged to :meth:`ing_X`.
        um : list of str
            Key columns used to join the per-``k`` frames of one anchor
            (the frames carry ``symbol`` and ``ancla``).

        Returns
        -------
        pandas.DataFrame
            The per-anchor frames stacked vertically with a fresh index.

        Raises
        ------
        RuntimeError
            If :meth:`anclas` has not been called yet.
        ValueError
            If the anchor range ``anclai..anclaf`` is empty.
        """
        if self.anclai is None or self.anclaf is None:
            raise RuntimeError('call anclas(time_column) before eng_X')

        cruzar = lambda x,y:pd.merge(x,y,on=um,how='outer')
        ventanas = range(self.step,self.vobs+self.step,self.step)

        por_ancla = [
            reduce(cruzar,(self.ing_X(df,ancla,k) for k in ventanas))
            for ancla in range(self.anclai,self.anclaf+1)
        ]
        if not por_ancla:
            raise ValueError(
                f'no anchors to process: anclai={self.anclai} > anclaf={self.anclaf}'
            )

        # Stack once at the end; appending frame by frame copies the
        # accumulated result on every step and DataFrame.append no longer
        # exists in pandas 2.x.
        return pd.concat(por_ancla,ignore_index=True)

In [96]:
# Configure the window builder, derive the anchor range from `t`, and compute
# the feature matrix keyed by symbol and anchor.
window = window_time(
    vobs=360,
    vdes=7,
    step=30,
    incremental=0.1,
)
window.anclas(time_column=df['t'])
X = window.eng_X(df=df, um=['symbol', 'ancla'])

Save the X DataFrame to disk, because computing it takes a long time

In [None]:
# Persist the feature matrix through pandas' own pickle writer: the notebook
# never imports `pickle`, so the previous pickle.dump call raised NameError on
# a fresh kernel. The file stays a regular pickle readable by pd.read_pickle.
X.to_pickle('X_v1.pkl')

## Feature selection

In [None]:
# Reload the cached feature matrix with pandas (the notebook never imports
# `pickle`, so the previous pickle.load call raised NameError on a fresh kernel).
# NOTE: unpickling can execute arbitrary code — only load a file that this
# notebook (or another trusted source) wrote.
X = pd.read_pickle('X_v1.pkl')