In [1]:
##port_1_julia_chan
##stock price prediction system

In [2]:
import os
#show current working directory
os.getcwd()
os.listdir()

['.ipynb_checkpoints', 'stock_prediction.ipynb']

In [3]:
# input the working directory
os.chdir('C:\\Users\\JC\\iCloudDrive\\Polyu\\COMP machine learning\\project\\project file\\20191224')

In [4]:
"""
Model Training / Testing / Visualizing
"""

#libraries to use
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import glob
import os
import json
import re
import datetime
import random
import warnings
import train

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import AdaBoostRegressor

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

#rom model_utils import ModelUtils

from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, LSTM
from keras.wrappers.scikit_learn import KerasRegressor

In [5]:
# Modify environment variable value of which info and warning messages logging output are ignored & not printed
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
pd.options.mode.chained_assignment = None
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [6]:
## Parameters Setup

# set training data file path 
price_train_dir = r'data\raw_price_train'
tweet_train_dir = r'data\tweet_train'

# set prediction data file path
tweet_pred_dir = r'data\tweet_test'

# set training data date range start & end dates
train_dates = ['2014-01-01', '2015-12-20']

# set prediction data dates
pred_dates = ['2015-12-21', '2015-12-22', '2015-12-23', '2015-12-24', '2015-12-28', '2015-12-29', '2015-12-30','2015-12-31']

# set +ve integer number of previous days to predict price trend with time series
window_size = 20

# models to train
models = [    
    #'LinearRegression', 
    #'RidgeRegression', 
    #'LassoRegression',
    #'SVRegression', 
    #'KNNRegression',  
    #'AdaBoostLSTM',
    'LongShortTermMemory']

predict_model = 'LongShortTermMemory'

In [7]:
# define training model 

class ModelUtils():

    def __lstm_build__(x_shape_1, x_shape_2):
        model = Sequential()
        model.add(LSTM(units=200, activation='softsign', return_sequences=True, input_shape = (x_shape_1, x_shape_2)))
        model.add(LSTM(units=200, activation='softsign', return_sequences=True, dropout=0.1))
        model.add(LSTM(units=200, dropout=0.1))
        model.add(Dense(1))
        model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mse'])
        return model

    def isRegression(name):
        return (name == 'LinearRegression' or name == 'RidgeRegression' or name == 'LassoRegression' or
                name == 'SVRegression' or name == 'KNNRegression')

    def isLSTM(name):
        return (name == 'LongShortTermMemory' or name == 'AdaBoostLSTM')

    def init(name, x_shape_1 = 0, x_shape_2 = 0):
        if name == 'TfIdfVectorizer':
            model = TfidfVectorizer(analyzer='word', stop_words='english', max_df=0.90, min_df=0.05)
        elif name == 'LinearRegression':
            model = LinearRegression()
        elif name == 'RidgeRegression':
            model = RidgeCV(alphas=np.logspace(-3, 3, 7), cv=5)
        elif name == 'LassoRegression':
            model = LassoCV(alphas=np.logspace(-3, 3, 7), cv=5)
        elif name == 'SVRegression':
            parameters = [{'kernel':['poly', 'rbf', 'sigmoid'], 'degree':[2, 3, 4], 'C':[1, 10, 100, 1000], 'epsilon':[0.05, 0.1]}]
            model = GridSearchCV(SVR(gamma='scale'), parameters, cv=5)
        elif name == 'KNNRegression':
            model = GridSearchCV(KNeighborsRegressor(), {'n_neighbors': np.arange(1, 20)}, cv=5)
        elif name == 'LongShortTermMemory':
            model = KerasRegressor(build_fn=ModelUtils.__lstm_build__, x_shape_1=x_shape_1, x_shape_2=x_shape_2, epochs=120, batch_size=500)
        elif name == 'AdaBoostLSTM':         
            model = KerasRegressor(build_fn=ModelUtils.__lstm_build__, x_shape_1=x_shape_1, x_shape_2=x_shape_2, epochs=100, batch_size=500)
            model = AdaBoostRegressor(base_estimator=model, n_estimators=20, learning_rate=0.2)
        return model;

    def print_result(name, model):
        if name == 'LinearRegression':
            pass
        elif name == 'RidgeRegression':
            print( "RidgeRegression: alpha={}".format(model.alpha_))
        elif name == 'LassoRegression':
            print( "LassoRegression: alpha={}".format(model.alpha_))
        elif name == 'SVRegression':
            print( "SVRegression: kernel={}, degree={}, C={}, epsilon={}".format(
                model.best_params_['kernel'], model.best_params_['degree'], model.best_params_['C'], model.best_params_['epsilon']))
        elif name == 'KNNRegression':
            print( "KNNRegresion: n={}".format(model.best_params_['n_neighbors']))
        elif name == 'LongShortTermMemory':
            plt.plot(model.model.history.history['mse'])
            plt.title('LSTM MSE vs Epoch')
            plt.ylabel('mse')
            plt.xlabel('epoch')
            plt.show()
            pass
        elif name == 'AdaBoostLSTM':
            pass

In [8]:
## preview of price train data
df = pd.read_csv('data/raw_price_train/1_r_price_train.csv', header=0, index_col=0)
df.dropna(axis=0, how='all', thresh=None, subset=None, inplace=True)
df.index = pd.to_datetime(df.index)
df = df.sort_index(ascending=True, axis=0)
print(df.head())

                 Open       High        Low      Close  Adj Close     Volume
Date                                                                        
2012-09-04  95.108574  96.448570  94.928574  96.424286  87.121140   91973000
2012-09-05  96.510002  96.621429  95.657143  95.747147  86.509338   84093800
2012-09-06  96.167145  96.898575  95.828575  96.610001  87.288956   97799100
2012-09-07  96.864288  97.497147  96.538574  97.205711  87.827171   82416600
2012-09-10  97.207146  97.612854  94.585716  94.677139  85.542564  121999500


In [9]:
## preview of tweet train data
content=[]
with open('data/tweet_train/1_tweet_train/2014-01-01') as file:
    for line in file.readlines():
        content.append((json.loads(line)['text'],json.loads(line)['created_at']))
    print(content)

[(['rt', 'AT_USER', 'summary', 'of', "yesterday's", 'webcast', 'featuring', '$', 'aapl', '$', 'wynn', '$', 'goog', '$', 'lgf', 'tradereducation', 'options', 'hedgingstrategies', '-', '-', 'URL'], 'Wed Jan 01 03:59:03 +0000 2014'), (['rt', 'AT_USER', 'summary', 'of', "yesterday's", 'webcast', 'featuring', '$', 'aapl', '$', 'wynn', '$', 'goog', '$', 'lgf', 'tradereducation', 'options', 'hedgingstrategies', '-', '-', 'URL'], 'Wed Jan 01 03:29:29 +0000 2014'), (['itv', 'will', 'boost', 'apple', 'URL', '$', 'aapl', 'apple'], 'Wed Jan 01 18:08:47 +0000 2014'), (['iphone', 'users', 'are', 'more', 'intelligent', 'than', 'samsung', ',', 'blackberry', 'and', 'htc', 'owners', ',', '$', 'aapl', '$', 'bbry', ',', 'URL'], 'Wed Jan 01 01:52:31 +0000 2014'), (['rt', 'AT_USER', 'summary', 'of', "yesterday's", 'webcast', 'featuring', '$', 'aapl', '$', 'wynn', '$', 'goog', '$', 'lgf', 'tradereducation', 'options', 'hedgingstrategies', '-', '-', 'URL'], 'Wed Jan 01 01:18:36 +0000 2014'), (['2013', 'wrap-u

In [10]:
## Training price & tweet data import

def price_tweet_data_import():
    
    # get canonical price data file path and resolve symbolic links
    price_train_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), price_train_dir)
    price_train_file = glob.glob(os.path.join(price_train_path, "*.csv"))
    
    # get canonical tweet data file directory and resolve symbolic links
    tweet_train_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), tweet_train_dir)
    tweet_train_dir = glob.glob(os.path.join(tweet_train_path, "*"))
    
    # initialize train data dict
    price_train_dict = {}
    tweet_train_dict = {}
    
    # import price training data
    for file in price_train_file:
        df = pd.read_csv(file, header=0, index_col=0)
        
        # data cleaning
        df.dropna(axis=0, how='all', thresh=None, subset=None, inplace=True)
        
        # set date index & sorting
        df.index = pd.to_datetime(df.index)
        df = df.sort_index(ascending=True, axis=0)
        
        #return stock ID of price data file
        stock = os.path.basename(file)[:os.path.basename(file).find('_')]
        
        #extract stock price data dict within training date range
        price_train_dict[stock] = df.loc[train_dates[0]:train_dates[1]]
        
    # import tweet training data
    for dir in tweet_train_dir:
        
        # get tweet data file path
        tweet_train_file = glob.glob(os.path.join(dir, "*"))
        
        # return stock ID of tweet data file
        stock = os.path.basename(os.path.normpath(dir))[:os.path.basename(os.path.normpath(dir)).find('_')]
        
        # initialize list of tweet dates & content
        all_date = []
        all_content = []
        
        # extract tweet json file content by dates 
        for filename in tweet_train_file:
            
            #get tweet train dates
            date = os.path.basename(filename)
            
            #read and save tweet content
            content = []
            with open(filename) as file:
                for line in file.readlines():
                    content.append((json.loads(line)['text'], json.loads(line)['created_at']))
            
            all_date.append(date)
            all_content.append(content)
        
        # generate tweet training dataframe
        df = pd.DataFrame(data=all_content, index=all_date, columns=['tweet'])
        
        # set date index & sorting
        df.index = pd.to_datetime(df.index)
        df = df.sort_index(ascending=True, axis=0)
        
        #extract tweet content dict within training date range
        tweet_train_dict[stock] = df.loc[train_dates[0]:train_dates[1]]
        
    # Process tweets in non-trading days with no stock price
    for stock, df in tweet_train_dict.items():
        start_date = datetime.datetime.strptime(train_dates[0], '%Y-%m-%d')
        end_date = datetime.datetime.strptime(train_dates[1], '%Y-%m-%d')
        
        prev_date = None
        for date in (start_date + datetime.timedelta(n) for n in range(int((end_date - start_date).days + 1))):
            if date in price_train_dict[stock].index:
                prev_date = date
            if date in tweet_train_dict[stock].index:
                if not date in price_train_dict[stock].index:
                    if not prev_date is None:
                        if prev_date in tweet_train_dict[stock].index:
                            df.at[prev_date, 'tweet'] = df.at[prev_date, 'tweet'] + df.at[date, 'tweet']
                        else:
                            df.at[prev_date, 'tweet'] = df.at[date, 'tweet']
                    df.drop([date], inplace=True)
        tweet_train_dict[stock] = df.sort_index(ascending=True, axis=0)
        
    return price_train_dict, tweet_train_dict

In [11]:
## tweet text data preprocessing

def tweet_vectorizing_model_train(tweet_train_dict):
    lines_dict = {"all": []}
    
    # Process Tweets using NLTK and RegEx
    for stock, df in tweet_train_dict.items():
        for index, row in df.iterrows():
            words = []
            for tweet, time in row['tweet']:
                words.extend(tweet)
            lines_dict["all"].append(process_tweet_sentence(" ".join(words)))
            
    # Train Vectorizor
    for stock, lines in lines_dict.items():
        vectorizer = ModelUtils.init('TfIdfVectorizer')
        vectorizer.fit(lines) #vocabulary dictionary of all tokens in raw documents
        print("Number of unique words extracted:{}".format(len(vectorizer.vocabulary_))) #mapping of terms to feature indices
        ModelUtils.save(vectorizer, "{}-TfidfVectorizer.pkl".format(stock))

In [12]:
## Feature extraction and label preprocessing

def features_label_preprocess(price_train_dict, tweet_train_dict):
    X_dict = {}
    y_dict = {}

    for stock, df in price_train_dict.items():
    
        # Feature 1: Stock Closing Price
        df_proc = df[['Adj Close']]

        # Feature 2: Tweet Sentiment Score
        analyser = SentimentIntensityAnalyzer()
        tweet_scores = []

        for date in df_proc.index:
            scores = []
            if (date in tweet_train_dict[stock].index):
                for tweet, create_at in tweet_train_dict[stock].loc[date]['tweet']:
                    score = analyser.polarity_scores(' '.join(tweet))
                    if (score['compound'] < -0.05) or (score['compound'] > 0.05):
                        scores.append(score['compound'])
            if len(scores) > 0:
                tweet_scores.append(sum(scores) / len(scores))
            else:
                tweet_scores.append(0)
        df_proc['Tweet Score'] = tweet_scores

        # Feature 3: Tweet Bag of Words TF-IDF
        lines = []
        for date in df_proc.index:
            words = []
            if (date in tweet_dict[stock].index):
                for tweet, time in tweet_train_dict[stock].loc[date]['tweet']:
                    words.extend(tweet)
            lines.append(process_tweet_sentence(" ".join(words)))
        
        vectorizer = ModelUtils.load('{}-TfidfVectorizer.pkl'.format("all"))
        df_vec = pd.DataFrame(data=vectorizer.transform(lines).toarray(), index=df_proc.index)
        df_proc = pd.concat([df_proc, df_vec], axis='columns', join='inner')

        # Feature 4: Weekday (Mon or Fri)
        df_proc['Weekday'] = df.index.weekday
        
        # Feature 5: Stock Indicator
        df_proc['Stock'] = stock
        
        # Label: Stock Price Daily Change (predict Next Day value)
        df_proc['D+1 Change'] = df[['Adj Close']].pct_change().shift(-1)
        
        # Create preprocessed features & labels dict
        df_proc.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True)
        np_proc = np.array(df_proc, dtype=np.float)

        X, y = np_proc[:, :-1], np_proc[:,-1]

        X_dict[stock] = X
        y_dict[stock] = y

    return X_dict, y_dict

In [13]:
## Feature transformation

def features_transform(X_dict):
    X_transform_dict = {}

    for stock, x_array in X_dict.items():
        transformers=[('normalize_close_price_feature', 
                        MinMaxScaler(), 
                        [0]),
                      ('normalize_remaining_continuous_features', 
                        MinMaxScaler(), 
                        slice(1, x_array.shape[1] - 2)),
                      ("create_dummies_for_monday_and_friday", 
                        OneHotEncoder(categories=[[0,4]], handle_unknown='ignore'), 
                        [x_array.shape[1] - 2]),
                      ("create_dummies_for_stock_indicators", 
                        OneHotEncoder(categories=[[1,2,3,4,5,6,7,8]], drop='first'),  
                        [x_array.shape[1] - 1])]

        colTransformer = ColumnTransformer(transformers, remainder='passthrough')
        np_proc = colTransformer.fit_transform(x_array)
        X_transform_dict[stock] = np_proc

    return X_transform_dict

In [14]:
## Feature reshape

def features_label_reshape(X_dict, y_dict, isLSTM=False):
    X_reshape_dict = {}
    y_reshape_dict = {}

    for stock in X_dict.keys():

        # Construct 3D feature matrix for LSTM model
        if isLSTM:           
            X_reshape = np.zeros((X_dict[stock].shape[0] - trend_days + 1, trend_days, X_dict[stock].shape[1]))
        
            for i in range(0, X_reshape.shape[0]):
                for j in range(0, trend_days):
                    for k in range(0, X_reshape.shape[2]):
                        X_reshape[i, j, k] = X_dict[stock][i + trend_days - j - 1, k]

            y_reshape = np.reshape(y_dict[stock][trend_days - 1:], (y_dict[stock].shape[0] - trend_days + 1, 1))

        # Add past prices to 2D feature matrix for regression models
        else:
            X_reshape = np.zeros((X_dict[stock].shape[0] - trend_days + 1, X_dict[stock].shape[1] + trend_days - 1))
        
            for i in range(0, X_reshape.shape[0]):
                for j in range(0, trend_days):
                    X_reshape[i, j] = X_dict[stock][i + trend_days - j - 1, 0]

                for j in range(trend_days, X_reshape.shape[1]):
                    X_reshape[i, j] = X_dict[stock][i + trend_days - 1, j - trend_days + 1]

            y_reshape = y_dict[stock][trend_days - 1:]

        X_reshape_dict[stock] = X_reshape
        y_reshape_dict[stock] = y_reshape

    return X_reshape_dict, y_reshape_dict


In [15]:
## train test data split and merge 

def features_label_split_merge(X_dict, y_dict):
    X_train_dict = {}
    y_train_dict = {}
    X_test_dict = {}
    y_test_dict = {}

    # Spliting for training/testing
    for stock in X_dict.keys():

        # Sorted Splitting
        mid = round(X_dict[stock].shape[0] * 0.8)

        X_train_dict[stock] = X_dict[stock][:mid]
        X_test_dict[stock] = X_dict[stock][mid:]

        y_train_dict[stock] = y_dict[stock][:mid]
        y_test_dict[stock] = y_dict[stock][mid:]

    # Merging training data
    X_train = np.array([])
    y_train = np.array([])

    for stock in X_train_dict.keys():
        if (X_train.size > 0):
            X_train = np.r_[X_train, X_train_dict[stock]]
            y_train = np.r_[y_train, y_train_dict[stock]]
        else:
            X_train = X_train_dict[stock]
            y_train = y_train_dict[stock]

    X_train_dict = { 'all': X_train }
    y_train_dict = { 'all': y_train }
    
    return X_train_dict, X_test_dict, y_train_dict, y_test_dict


In [2]:
## model train and evaluation

def model_train_evaluate(model_name, X_train_dict, X_test_dict, y_train_dict, y_test_dict, isLSTM=False):
    
    # Train Models
    for stock in X_train_dict.keys():
        model_dict = {}
        if isLSTM:
            model_dict[stock] = ModelUtils.init(model_name, X_train_dict[stock].shape[1], X_train_dict[stock].shape[2])
            model_dict[stock].fit(X_train_dict[stock], y_train_dict[stock])
        else:
            model_dict[stock] = ModelUtils.init(model_name)
            model_dict[stock].fit(X_train_dict[stock], y_train_dict[stock])

    # Evaluate Models  
    result_list = []
    for stock in X_test_dict.keys():                                  
        y_pred = model_dict['all'].predict(X_test_dict[stock])
        if isLSTM:           
            y_pred = np.reshape(y_pred, (y_pred.shape[0]))
        y_test = np.reshape(y_test_dict[stock], (y_test_dict[stock].shape[0]))
        mse = mean_squared_error(y_test, y_pred)
        result_list.append((y_test, y_pred, mse))
    return result_list;

In [18]:
def model_result_plot(result_list, axisLine=True):

    col = len(result_list.keys())
    if (col > 0):
        row = len(list(result_list.values())[0])
    
        fig, axes = plt.subplots(nrows=row, ncols=col, figsize=(2+3*col, 1+row))
        fig.canvas.set_window_title('Predicting data')

        c = -1
    
        for (model_name, stock_list) in result_list.items():
            c += 1
            r = -1

            for (y_true, y_pred, mse) in stock_list:
                r += 1
            
                if col > 1:
                    ax = axes[r, c]
                elif row > 1:
                    ax = axes[r]
                else:
                    ax = axes

                ax.set_title(model_name + ' MSE={0:.5g}'.format(mse), fontsize=10)

                ax.set_xticklabels(())
                ax.set_yticklabels(())

                ax.plot(y_true, 'b', label='actual')
                ax.plot(y_pred, 'r', label='predict')
        
                if axisLine:
                    ax.axhline(y=0, color='k', linestyle='-', linewidth=0.5)

                ax.fill_between(
                    np.arange(0, len(y_true), 1),
                    y_true,
                    y_pred,
                    color='r',
                    alpha=0.2
                )

        handles, labels = ax.get_legend_handles_labels()
        fig.legend(handles, labels, loc='lower right')
        fig.tight_layout()

        plt.show()

In [19]:
def process_tweet_sentence(sentence): 
    stemmer = PorterStemmer()
    #lemmatizer = WordNetLemmatizer()

    stopword_list = stopwords.words('english')
    stopword_list.extend(["AT_USER", 'URL', 'tradereducation', 'hedgingstrategies'])

    sentence = re.sub(r'((((\$ )+)(.*?) )|(\$ (.*?))$|[^A-Za-z_\s])', '', sentence)

    words = word_tokenize(sentence.lower())

    words = [word for word in words if len(word) > 2 if not word in stopword_list]
    words = [stemmer.stem(word) for word in words]
    #words = [lemmatizer.lemmatize(word) for word in words]

    return " ".join(words)

In [20]:
if __name__ == '__main__':  
    if len(sys.argv) > 1:
        del sys.argv[0]
        models = sys.argv

    print('  *** Importing price/tweet data for training')
    price_dict, tweet_dict = price_tweet_data_import()
   
    print('  *** Training tweet vectorizing models')
    tweet_vectorizing_model_train(tweet_dict)

    print('  *** Processing training data into features and label')
    X_dict, y_dict = features_label_preprocess(price_dict, tweet_dict)

    print('  *** Transforming training data features')
    X_transform_dict = features_transform(X_dict)       

    result_list = {};

    data_ready = False
    for name in models:
        if ModelUtils.isRegression(name):
            if not data_ready:
                print('    - Regression Models')

                print('  *** Reshaping features and label according to model requirement')
                X_reshape_dict, y_reshape_dict = features_label_reshape(X_transform_dict, y_dict, False)

                print('  *** Splitting features/labels for training/testing')
                X_train_dict, X_test_dict, y_train_dict, y_test_dict = features_label_split_merge(X_reshape_dict, y_reshape_dict)

                data_ready = True;

            print('  *** Training and evaluating models')
            result_list[name] = model_train_evaluate(name, X_train_dict, X_test_dict, y_train_dict, y_test_dict, False)

    data_ready = False
    for name in models:
        if ModelUtils.isLSTM(name):
            if not data_ready:
                print('    - LSTM Models')

                print('  *** Reshaping features and label according to model requirement')
                X_reshape_dict, y_reshape_dict = features_label_reshape(X_transform_dict, y_dict, True)

                print('  *** Splitting features/labels for training/testing')
                X_train_dict, X_test_dict, y_train_dict, y_test_dict = features_label_split_merge(X_reshape_dict, y_reshape_dict)

                data_ready = True;

            print('  *** Training and evaluating models')
            result_list[name] = model_train_evaluate(name, X_train_dict, X_test_dict, y_train_dict, y_test_dict, True)

    print('  *** Visualizing model testing results')
    model_result_plot(result_list)


  *** Importing price/tweet data for training


NameError: name '__file__' is not defined

In [None]:
def price_tweet_pred_data_import(price_dict, tweet_dict):
    tweet_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), tweet_pred_dir)
    tweet_pred_dirs = glob.glob(os.path.join(tweet_path, "*", ""))

    # Create placeholder for price data
    for price_df in price_dict.values():
        for date in dates[:-1]:
            price_df.loc[date] = [0, 0, 0, 0, 1, 0]
        price_df.index = pd.to_datetime(price_df.index)

    # Import tweet data       
    for dir in tweet_pred_dirs:       
        tweet_files = glob.glob(os.path.join(dir, "*"))
        stock = os.path.basename(os.path.normpath(dir))[0]

        for filename in tweet_files:
            date = os.path.basename(filename)
            content = []

            with open(filename) as file:
                for line in file.readlines():
                    content.append((json.loads(line)['text'], json.loads(line)['created_at']))

            tweet_dict[stock].at[date, 'tweet'] = content
        tweet_dict[stock].index = pd.to_datetime(tweet_dict[stock].index)
            
    # Process tweets in non-trading days
    for stock, tweet_df in tweet_dict.items():
        start_date = datetime.datetime.strptime(dates[0], '%Y-%m-%d')
        end_date = datetime.datetime.strptime(dates[-1], '%Y-%m-%d')

        prev_date = start_date
        while True:
            prev_date = prev_date - datetime.timedelta(days=1)
            if prev_date in price_dict[stock].index:
                break

        for date in (start_date + datetime.timedelta(n) for n in range(int ((end_date - start_date).days))):      
            if date in price_dict[stock].index:
                prev_date = date

            if date in tweet_dict[stock].index:
                if not date in price_dict[stock].index:

                    if prev_date in tweet_dict[stock].index:
                        tweet_df.at[prev_date, 'tweet'] = tweet_df.at[prev_date, 'tweet']  + tweet_df.at[date, 'tweet']
                        
                    else:
                        tweet_df.at[prev_date, 'tweet'] = tweet_df.at[date, 'tweet']

                    tweet_df.drop([date], inplace=True)
                    tweet_dict[stock] = tweet_df.sort_index(ascending=True, axis=0)

    return price_dict, tweet_dict

In [None]:
def features_transform(X_dict):
    X_transform_dict = {}
    x_price_dict = {}

    for stock, x_array in X_dict.items():
        colTransformer = ModelUtils.load('{}-ColumnTransformer.pkl'.format(stock))

        np_proc = x_array[x_array.shape[0] - (len(dates) + train.trend_days - 1):]
        X_transform_dict[stock] = colTransformer.transform(np_proc)

    return X_transform_dict

In [None]:
def price_predict(X_dict, isLSTM=False):
    result_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), result_dir)
    if not os.path.exists(result_path):
        os.makedirs(result_path)

    if isLSTM:
        filename = "{}-{}.joblib".format("all", predict_model)                         
    else:
        filename = "{}-{}.pkl".format("all", predict_model)
    model = ModelUtils.load(filename)

    y_pred_dict = {}
    for stock, X in X_dict.items():  
        y_pred_dict[stock] = [];

        transformer = ModelUtils.load('{}-ColumnTransformer.pkl'.format(stock)).transformers_[0][1]

        if isLSTM:
            price = transformer.inverse_transform([[X_dict[stock][0,0,0]]])[0,0]
        else:
            price = transformer.inverse_transform([[X_dict[stock][0,0]]])[0,0]

        print("{} Day0 Close = {}".format(stock, price))

        for i in range(0, X.shape[0]):
            if isLSTM:
                y_pred = model.predict(X[i:(i+1),:,:])
                x_price = X[i, 0, 0]
            else:
                y_pred = model.predict(X[i:(i+1),:])
                x_price = X[i, 0]

            y_pred = np.reshape(y_pred, (1, 1))
            price = price * (1 + y_pred[0,0])

            print("{} Next Close = {} | Y_pred = {}".format(stock, price, y_pred))

            y_transform = transformer.transform([[price]])

            if i < (X.shape[0] - 1):
                for j in range(i + 1, X.shape[0]):
                    if isLSTM:
                        X[j, j-i-1, 0] = y_transform[0,0]

                    else:
                        X[j, j-i-1] = y_transform[0,0]

            y_pred_dict[stock].append(price)

    result_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), result_dir)
    if not os.path.exists(result_path):
        os.makedirs(result_path)

    y_pred_list = []
    for list in y_pred_dict.values():
        y_pred_list = y_pred_list + list

    pickle.dump(y_pred_list, open(os.path.join(result_path, "19005718G.pkl"), 'wb'))


In [None]:
if __name__ == '__main__':  
    if len(sys.argv) > 1:
        predict_model = sys.argv[1]

    print('  *** Importing price/tweet data for predicting')
    price_dict, tweet_dict = train.price_tweet_data_import()
    price_tweet_pred_data_import(price_dict, tweet_dict)

    print('  *** Processing data for prediction into features and label')
    X_dict, y_dict = train.features_label_preprocess(price_dict, tweet_dict)

    print('  *** Transforming training data features')
    X_transform_dict = features_transform(X_dict)

    print('  *** Reshaping features and label according to model requirement')
    X_reshape_dict, y_reshape_dict = train.features_label_reshape(X_transform_dict, y_dict, ModelUtils.isLSTM(predict_model))

    print('  *** Predicting stock price')
    if ModelUtils.isLSTM(predict_model):
        price_predict(X_reshape_dict, True)
    elif ModelUtils.isRegression(predict_model):
        price_predict(X_reshape_dict, False)