In [1]:
import numpy as np
import pandas as pd

In [19]:
# Column labels are supplied explicitly through `names`.
colnames = [
    'Date', 'Time',
    'Open', 'High', 'Low', 'Close',
    'Volume',
]
# NOTE(review): parse_dates=True only asks pandas to parse the index; the
# 'Date' column appears to stay a plain string here -- confirm if intended.
data = pd.read_csv(
    'D.csv',
    names=colnames,
    parse_dates=True,
    skipinitialspace=True,
)
data.head()

Unnamed: 0,Date,Time,Open,High,Low,Close,Volume
0,2011.02.03,00:00,1.3809,1.3825,1.3609,1.3633,86287
1,2011.02.04,00:00,1.3633,1.3676,1.3543,1.358,77766
2,2011.02.07,00:00,1.3569,1.3626,1.3508,1.3582,80872
3,2011.02.08,00:00,1.3582,1.3688,1.3572,1.3625,88784
4,2011.02.09,00:00,1.3625,1.3743,1.3611,1.3732,84264


In [20]:
# Count missing values per column of the freshly loaded frame.
data.isna().sum()

Date      0
Time      0
Open      0
High      0
Low       0
Close     0
Volume    0
dtype: int64

In [21]:
def generate_features(df):
    """Generate technical-indicator features from historical price data.

    Works for any instrument (stock/index/currency/commodity) as long as the
    rows are ordered in time, because every indicator is built from shifted or
    rolling views of the preceding rows.

    Args:
        df: DataFrame with the columns "Date", "Open", "High", "Low",
            "Close" and "Volume". The frame is only read, never modified.

    Returns:
        A new DataFrame sharing ``df``'s index with the columns
        date, open, high, low, high_open, open_low, total_pips, return_1,
        return_5, RSI, UpperBB, LowerBB, MACD, Signal, EVM, ROC, ForceIndex
        and close (the target). Rows containing a NaN in any column -- i.e.
        the warm-up rows of the rolling/shifted windows -- are dropped.

    Raises:
        KeyError: if one of the required columns is missing from ``df``.
    """
    required = ('Date', 'Open', 'High', 'Low', 'Close', 'Volume')
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(
            'generate_features: missing column(s): %s' % ', '.join(missing)
        )

    # Window lengths used by the indicators below.
    week = 5            # 1 trading week: returns, RSI, EVM, ROC, Force Index
    bollinger_window = 20

    open_, high, low = df['Open'], df['High'], df['Low']
    close, volume = df['Close'], df['Volume']

    df_new = pd.DataFrame(index=df.index)

    # 4 original features
    df_new['date'] = df['Date']
    df_new['open'] = open_
    df_new['high'] = high
    df_new['low'] = low

    # Intraday moves: high-open, open-low and their sum (= high-low range)
    df_new['high_open'] = high - open_
    df_new['open_low'] = open_ - low
    df_new['total_pips'] = df_new['high_open'] + df_new['open_low']

    # Simple returns over 1 day and 1 week (5 days)
    prev_close = close.shift(1)
    prev_week_close = close.shift(week)
    df_new['return_1'] = (close - prev_close) / prev_close
    df_new['return_5'] = (close - prev_week_close) / prev_week_close

    # RSI based on simple moving averages of gains and losses
    delta = close.diff()
    gains, losses = delta.copy(), delta.copy()
    gains[gains < 0] = 0
    losses[losses > 0] = 0
    avg_gain = gains.rolling(week).mean()
    avg_loss = losses.rolling(week).mean().abs()
    # A window without losses gives RS = inf and therefore RSI = 100;
    # a completely flat window gives 0/0 = NaN and is dropped at the end.
    rs = avg_gain / avg_loss
    df_new['RSI'] = 100.0 - (100.0 / (1.0 + rs))

    # Bollinger Bands: moving average +/- 2 standard deviations
    rolling_close = close.rolling(window=bollinger_window)
    mid_band = rolling_close.mean()
    band_width = 2 * rolling_close.std()
    df_new['UpperBB'] = mid_band + band_width
    df_new['LowerBB'] = mid_band - band_width

    # MACD (difference of exponential moving averages) and its signal line
    ema_fast = close.ewm(span=12, adjust=False).mean()
    ema_slow = close.ewm(span=26, adjust=False).mean()
    macd = ema_fast - ema_slow
    df_new['MACD'] = macd
    df_new['Signal'] = macd.ewm(span=9, adjust=False).mean()

    # EVM: midpoint move divided by the volume/range "box ratio", smoothed
    midpoint_move = ((high + low) / 2) - ((high.shift(1) + low.shift(1)) / 2)
    box_ratio = (volume / 100000000) / (high - low)
    df_new['EVM'] = (midpoint_move / box_ratio).rolling(week).mean()

    # Rate of Change (ROC) over one week
    df_new['ROC'] = close.diff(week) / close.shift(week)

    # Force Index: weekly price change weighted by volume.
    # Uses the *argument's* volume, not a notebook-level global.
    df_new['ForceIndex'] = close.diff(week) * volume

    # the target
    df_new['close'] = close

    return df_new.dropna(axis=0)
    

    


In [22]:
# Swap the raw frame for its engineered features and persist them (index
# omitted). NOTE: `data` is rebound here, so re-running this cell on its own
# fails because the feature frame no longer has the raw 'Date' column;
# re-run the load cell first.
data = generate_features(data)
data.to_csv('transformed_data.csv', index=False, encoding='utf-8')

In [23]:
# Confirm no NaN values remain after feature generation (it drops NaN rows).
data.isna().sum()    


date          0
open          0
high          0
low           0
high_open     0
open_low      0
total_pips    0
return_1      0
return_5      0
RSI           0
UpperBB       0
LowerBB       0
MACD          0
Signal        0
EVM           0
ROC           0
ForceIndex    0
close         0
dtype: int64

In [24]:
# Preview the first rows of the generated feature table.
data.head()

Unnamed: 0,date,open,high,low,high_open,open_low,total_pips,return_1,return_5,RSI,UpperBB,LowerBB,MACD,Signal,EVM,ROC,ForceIndex,close
19,2011.03.02,1.3776,1.389,1.3743,0.0114,0.0033,0.0147,0.006461,0.00851,71.586716,1.387593,1.344617,0.004981,0.002451,0.028358,0.00851,1008.774,1.3865
20,2011.03.03,1.3865,1.3973,1.3833,0.0108,0.0032,0.014,0.007501,0.01232,76.234568,1.393238,1.342332,0.006497,0.00326,0.045849,0.01232,1357.977,1.3969
21,2011.03.04,1.3968,1.4006,1.3941,0.0038,0.0027,0.0065,0.001217,0.017016,89.795918,1.39828,1.34135,0.007746,0.004157,0.052665,0.017016,1646.8686,1.3986
22,2011.03.07,1.3994,1.4035,1.3955,0.0041,0.0039,0.008,-0.001287,0.011734,81.395349,1.402068,1.341422,0.008493,0.005024,0.056328,0.011734,1197.5688,1.3968
23,2011.03.08,1.3968,1.3988,1.3862,0.002,0.0106,0.0126,-0.004582,0.009292,71.917808,1.404229,1.342051,0.008471,0.005714,0.026534,0.009292,926.8736,1.3904


In [25]:
# Check mean of the numeric columns. numeric_only=True skips the non-numeric
# 'date' column explicitly; relying on pandas to drop it silently is
# deprecated and raises TypeError on pandas >= 2.0.
data.mean(numeric_only=True)

open           1.207656
high           1.212402
low            1.203110
high_open      0.004745
open_low       0.004546
total_pips     0.009292
return_1      -0.000038
return_5      -0.000188
RSI           49.287252
UpperBB        1.228587
LowerBB        1.188048
MACD          -0.000481
Signal        -0.000489
EVM           -0.004516
ROC           -0.000188
ForceIndex   -18.475339
close          1.207696
dtype: float64

In [26]:
# Check variance of the numeric columns. numeric_only=True skips the
# non-numeric 'date' column explicitly; relying on pandas to drop it silently
# is deprecated and raises TypeError on pandas >= 2.0.
data.var(numeric_only=True)

open          1.183331e-02
high          1.190194e-02
low           1.170167e-02
high_open     1.834217e-05
open_low      1.898322e-05
total_pips    2.639067e-05
return_1      2.663546e-05
return_5      1.303791e-04
RSI           7.426056e+02
UpperBB       1.239038e-02
LowerBB       1.143274e-02
MACD          4.330843e-05
Signal        3.813590e-05
EVM           8.113593e-03
ROC           1.303791e-04
ForceIndex    1.514421e+06
close         1.183058e-02
dtype: float64

In [27]:
# Check standard deviation of the numeric columns. numeric_only=True skips the
# non-numeric 'date' column explicitly; relying on pandas to drop it silently
# is deprecated and raises TypeError on pandas >= 2.0.
data.std(numeric_only=True)

open             0.108781
high             0.109096
low              0.108174
high_open        0.004283
open_low         0.004357
total_pips       0.005137
return_1         0.005161
return_5         0.011418
RSI             27.250791
UpperBB          0.111312
LowerBB          0.106924
MACD             0.006581
Signal           0.006175
EVM              0.090075
ROC              0.011418
ForceIndex    1230.618087
close            0.108768
dtype: float64

In [28]:
# (rows, columns) of the feature table.
data.shape

(2546, 18)