In [0]:
# Colab-only setup: mount Google Drive at /content/gdrive so the project files
# stored there can be reached through the regular file system.
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
import os
# Make the competition folder on the mounted Drive the working directory so
# that relative paths used later (e.g. 'train.csv') resolve inside it.
# NOTE(review): hard-coded absolute path — adjust it if the Drive layout differs.
os.chdir("/content/gdrive/My Drive/4995_Competition/4995_kaggle_competition")

# Data Processing

### Load data

In [0]:
import pandas as pd
import numpy as np

# Fix NumPy's global random seed for reproducibility.
# NOTE(review): only NumPy is seeded here; no other library's RNG is seeded
# in this cell.
np.random.seed(7)

# Load the training table; the 'Unnamed: 0' column is kept separately as `dates`.
data = pd.read_csv('train.csv')
dates = data['Unnamed: 0']

# Cell output: shapes of the full table and of the date column.
data.shape, dates.shape

### Fill NaN values

In [0]:
# Fill every gap with the NEXT known value in the same column (backfill).
# `.bfill()` is the direct replacement for `fillna(method='backfill')`, whose
# `method` argument is deprecated in recent pandas releases.
# NOTE(review): backfilling copies later observations into earlier rows, i.e.
# it looks ahead in time — confirm this is acceptable for the modelling task.
data = data.bfill(axis=0)

# Anything still missing (no later value exists in that column) becomes 0.
data = data.fillna(0)

# Keep every column except the first one (the 'Unnamed: 0' column stored in `dates`).
prices = data.iloc[:, 1:]

### Manually create dataset for recurrent network
- default lookback period: 25 days (the call below overrides it with 50)
- sample dimension: lookback period x 505 stocks

In [0]:
# Row-over-row percentage change of every price column; NaN entries (such as
# the first row, which has no predecessor) are replaced by a tiny constant.
rets = prices.pct_change().fillna(1e-7)

In [0]:
from sklearn.preprocessing import MinMaxScaler

def create_recurrent_data(prices, rets, lookback_period=25, normalize=True):
    """Build sliding-window samples for a recurrent network.

    For every start row ``s`` the sample is the block of ``lookback_period``
    consecutive rows of ``prices`` beginning at ``s``; its target is the row
    of ``rets`` that immediately follows that block.

    Args:
        prices: DataFrame of prices (rows are time steps, columns are series).
        rets: DataFrame of returns, indexed positionally like ``prices``.
        lookback_period: number of rows in each input window.
        normalize: when True, each window is independently rescaled to [0, 1]
            per column with a ``MinMaxScaler`` refit on that window.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays: the stacked windows and the stacked
        target rows.
    """
    scaler = MinMaxScaler(feature_range=(0, 1)) if normalize else None
    windows, next_returns = [], []

    n_samples = prices.shape[0] - lookback_period
    for start in range(n_samples):
        stop = start + lookback_period
        history = prices.iloc[start:stop, :].to_numpy()
        target = rets.iloc[stop, :].to_numpy()

        if scaler is not None:
            # The scaler is refit on every window, so scaling is per-window.
            history = scaler.fit_transform(history)

        # Diagnostic output: report any target row that still contains NaN.
        if np.isnan(target).any():
            print("yes")
            print(stop, target)

        windows.append(history)
        next_returns.append(target)

    return np.array(windows), np.array(next_returns)

# Build the full dataset with a 50-row lookback and raw (unnormalized) prices.
X, y = create_recurrent_data(prices, rets, lookback_period=50, normalize=False)

### Train/Test Split
- default: 80/20 split

In [0]:
def train_test_split(X, y, train_size=0.8):
    """Split paired sequences into a leading and a trailing part, in order.

    The first ``int(len(X) * train_size)`` items form the first part and the
    rest form the second part; nothing is shuffled.

    Args:
        X: sliceable collection of samples.
        y: sliceable collection of targets, same length as ``X``.
        train_size: fraction of items assigned to the first part.

    Returns:
        X_train, y_train, X_test, y_test

    Raises:
        AssertionError: if ``X`` and ``y`` differ in length.
    """
    n_samples = len(X)
    assert n_samples == len(y)

    cut = int(n_samples * train_size)
    X_head, X_tail = X[:cut], X[cut:]
    y_head, y_tail = y[:cut], y[cut:]
    return X_head, y_head, X_tail, y_tail

# Ordered 80/20 split: the first 80% of the windows are used for training and
# the remaining 20% for validation (no shuffling is performed by the helper).
X_train, y_train, X_val, y_val = train_test_split(X, y, train_size=0.8)

In [0]:
# Sanity check: print a marker for every training sample / target row that
# contains NaN. The loop variables are deliberately not named `x` / `y`:
# iterating with `for y in y_train` would rebind the notebook-level `y` array
# to a single row once the loop finishes.
for sample in X_train:
    if np.isnan(sample).any():
        print('x')

for target in y_train:
    if np.isnan(target).any():
        print('y')

# Create model


In [0]:
from keras.models import Sequential
from keras.layers import Dense, CuDNNLSTM, Dropout, BatchNormalization
from keras.optimizers import Adam

def create_model(window, n_stocks=506, lr=0.1, dropout=0.2, loss='mse'):
    """Build and compile a three-layer stacked CuDNNLSTM regressor.

    Args:
        window: number of time steps in each input sample.
        n_stocks: number of features per time step; also the width of the
            final Dense output layer.
        lr: learning rate passed to the Adam optimizer.
        dropout: rate used by the two Dropout layers.
        loss: loss passed to ``compile``.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    layers = [
        # First two recurrent layers emit the full sequence for the next LSTM.
        CuDNNLSTM(units=256, return_sequences=True, input_shape=(window, n_stocks)),
        Dropout(dropout),
        BatchNormalization(),
        CuDNNLSTM(units=256, return_sequences=True),
        Dropout(dropout),
        BatchNormalization(),
        # Last recurrent layer returns only its final output.
        CuDNNLSTM(units=256),
        BatchNormalization(),
        Dense(units=n_stocks),
    ]
    network = Sequential(layers)
    network.compile(optimizer=Adam(lr=lr), loss=loss)
    return network

# Lookback length fed to the network; it has to equal the lookback_period
# that was used to build X (50).
window = 50

# Take the feature count from the training tensor, whose last axis is the
# number of price columns, instead of relying on create_model's hard-coded
# default. This keeps the input layer and the Dense output in sync with the
# data regardless of how many columns `prices` actually has.
lstm = create_model(window, n_stocks=X_train.shape[2])
lstm.summary()

# Train

In [0]:
# Train without shuffling. validation_data is passed as the documented
# (x_val, y_val) tuple rather than a list.
lstm.fit(X_train, y_train, epochs=100, validation_data=(X_val, y_val), shuffle=False)

In [0]:
# Single-series experiment: take the first price column and build a model
# with one input feature / one output and a smaller learning rate.
A = prices.iloc[:,0]
model = create_model(window, n_stocks=1, lr=0.001, dropout=0.2, loss='mse')

In [0]:
# Turn the selected price column into a 2-D column array of shape (n_rows, 1).
# np.asarray accepts both a pandas Series and an ndarray, so re-running this
# cell is harmless (calling `.to_numpy()` on the already-converted array would
# raise AttributeError on a second run).
A = np.reshape(np.asarray(A), (-1, 1))
A.shape

In [0]:
from sklearn.preprocessing import MinMaxScaler

def create_recurrent_data(data, lookback_period=25, normalize=True):
    """Build sliding-window samples from a single array.

    NOTE: this definition reuses the name of the earlier
    ``create_recurrent_data(prices, rets, ...)`` helper and replaces it once
    this cell has run.

    Each sample is ``lookback_period`` consecutive rows of ``data``; its
    target is the row of ``data`` that immediately follows the window.

    Args:
        data: array indexed by row (time step).
        lookback_period: number of rows in each input window.
        normalize: when True, each window is independently rescaled to [0, 1]
            with a ``MinMaxScaler`` refit on that window. Targets are returned
            as-is (they are not passed through the scaler).

    Returns:
        Tuple ``(X, y)`` of NumPy arrays: the stacked windows and the stacked
        targets.
    """
    scaler = MinMaxScaler(feature_range=(0, 1)) if normalize else None
    windows, targets = [], []

    n_samples = data.shape[0] - lookback_period
    for start in range(n_samples):
        stop = start + lookback_period
        history = data[start:stop]

        if scaler is not None:
            # Refit per window, so every window spans [0, 1] on its own.
            history = scaler.fit_transform(history)

        windows.append(history)
        targets.append(data[stop])

    return np.array(windows), np.array(targets)

# Window the single-column array with the same lookback as the model input;
# `normalize` is left at its default (True), so each window is min-max scaled.
X_a, y_a = create_recurrent_data(A, lookback_period=window)

In [0]:
# Fit the single-series model; no epochs, validation or shuffle arguments are
# passed, so Keras' defaults for `fit` apply.
model.fit(X_a, y_a)

# Create submission

In [0]:
def create_submission(model, Xte, file_name, round_predictions=False, expected_rows=51630):
    """Predict on ``Xte`` and write the predictions to ``<file_name>.csv``.

    The CSV has two columns: ``dataid`` (1-based row number) and
    ``prediction`` (the flattened model output).

    Args:
        model: object exposing ``predict(X)`` that returns a NumPy array.
        Xte: test inputs; ``Xte.shape[0]`` determines the ``dataid`` range.
        file_name: output path without the ``.csv`` extension.
        round_predictions: when True, predictions are rounded to the nearest
            integer with ``np.rint`` and ``'_round'`` is appended to
            ``file_name``.
        expected_rows: required number of prediction rows; pass ``None`` to
            skip the check.

    Raises:
        ValueError: if the number of prediction rows differs from
            ``expected_rows``.
    """
    # Use the function's own argument (the original body read an undefined
    # `Xtest` name instead of the `Xte` parameter).
    predictions = model.predict(Xte)

    if expected_rows is not None and predictions.shape[0] != expected_rows:
        raise ValueError(
            "WRONG SIZE: expected {} prediction rows, got {}".format(
                expected_rows, predictions.shape[0]))

    if round_predictions:
        file_name = file_name + '_round'
        predictions = np.rint(predictions)

    dataid = np.arange(1, Xte.shape[0] + 1)
    df = pd.DataFrame({'dataid': dataid, 'prediction': predictions.flatten()})
    df.to_csv(file_name + '.csv', index=False)

    print("Submission: {}.csv Created".format(file_name))