## preprocessing.py

In [1]:
from helper_funcs.data import cleaned_market, load_market
from os.path import exists
import pandas as pd
import ta
import pickle
from darts import TimeSeries
from darts.utils.missing_values import fill_missing_values
import numpy as np
from darts.utils.model_selection import train_test_split

### Old preprocessing

In [2]:
def timeseries_init(time_col, static_cols,\
    value_cols, freq, fill_missing, type, ticker=None, group_col=None, EXP_MA=14):

    if type=='MULTI':
        data = cleaned_market()
    elif type=='UNI':
        data = cleaned_market()
    elif type=='Sentiment':
        try:
            data = pd.read_pickle('Datasets/market/market_sentiment.pkl')
        except:
            timeseries = timeseries_init(
                time_col='Date',
                static_cols=[],
                value_cols=[
                    'Adj Close',
                    'Close',
                    'High',
                    'Low',
                    'Open',
                    'Volume'],
                freq=freq,
                fill_missing=True,
                group_col='Ticker',
                type='MULTI'
                )
 
            dictionary = sentiment_init(timeseries[0], EXP_MA)

            data = pd.DataFrame([])
            for i in range(6):
                df = timeseries[i].pd_dataframe().reset_index()
                df['Ticker'] = timeseries[i].static_covariates_values()[0][0]
                data = pd.concat([data, df], axis=0)
            data = data.sort_values(by=['Date', 'Ticker']).reset_index(drop=True)
            data.Date = data.Date.dt.date.apply(lambda x: str(x))

            data['sentiment'] = None
            for key in list(dictionary.keys()):
                data['sentiment'] = data['sentiment'].combine_first(
                    pd.merge(data, dictionary[key], how ='left', on=['Date', 'Ticker']).sentiment_score
                    )
            data.to_pickle('Datasets/market/market_sentiment.pkl')
    

    if type=='MULTI':
        timeseries = TimeSeries.from_group_dataframe(
            df=data,
            time_col=time_col,
            group_cols=group_col,  # individual time series are extracted by grouping `df` by `group_cols`
            static_cols=static_cols,
            value_cols=value_cols,
            freq=freq,
            fill_missing_dates=fill_missing
            )

        for i in range(len(timeseries)):
            timeseries[i] = fill_missing_values(timeseries[i], method='ffill', limit_direction='forward')
            timeseries[i] = timeseries[i].add_holidays(country_code='US')

        

    elif type=='Sentiment':
        timeseries = TimeSeries.from_group_dataframe(
            df=data,
            time_col=time_col,
            group_cols=group_col,  # individual time series are extracted by grouping `df` by `group_cols`
            static_cols=static_cols,
            value_cols=value_cols,
            freq=freq
            )

        for i in range(len(timeseries)):
            # timeseries[i] = fill_missing_values(timeseries[i], method='ffill', limit_direction='forward')
            timeseries[i] = timeseries[i].add_holidays(country_code='US')
    
    else:
        timeseries = None


    return timeseries


def get_covariates(type, data, target_col, past_cov, future_cov):

    if type=='MULTI':
        target_train, past_train, future_train,\
            target_val, past_val, future_val = [[] for _ in range(6)]


        for series in data['train']:
            target_train.append(series[target_col])
            past_train.append(series[past_cov])
            future_train.append(series[future_cov])

        for series in data['val']:
            target_val.append(series[target_col])
            past_val.append(series[past_cov])
            future_val.append(series[future_cov])
    else:
        return None


    return target_train, past_train, future_train,\
        target_val, past_val, future_val

def frac(neg, pos):
    if neg == 0:
        return 1
    if pos == 0:
        return 0
    return neg / pos

def sentiment_init(timeseries, EXP_MA):
    df = pd.read_pickle("Datasets/results/preprocessing/sentiment_finetuned.pkl")
    tickers_df = pd.read_csv("Datasets/kaggle/Company_Tweet.csv")

    df = df.merge(tickers_df, on='tweet_id', how='inner')

    df = df[['post_date', 'sentiment_score', 'ticker_symbol']]

    df_positive = df[df.sentiment_score==1]
    df_positive['post_date'] = df_positive['post_date'].apply(lambda x: x.to_pydatetime().date())
    df_positive = df_positive.groupby(['ticker_symbol', 'post_date']).count()

    df_negative = df[df.sentiment_score==0]
    df_negative['post_date'] = df_negative['post_date'].apply(lambda x: x.to_pydatetime().date())
    df_negative = df_negative.groupby(['ticker_symbol', 'post_date']).count()

    business_days = pd.DataFrame(
        {'Date': timeseries.time_index.to_frame().reset_index(drop=True)['Date'].dt.date})

    sentiment_dict = {}
    tickers = ['AAPL', 'GOOG', 'GOOGL', 'AMZN', 'MSFT', 'TSLA']
    
    for ticker in tickers:
        negative = df_negative.xs(ticker).reindex(pd.date_range('2015-01-01', '2019-12-31'), fill_value=0)
        negative.index = negative.index.date
        positive = df_positive.xs(ticker).reindex(pd.date_range('2015-01-01', '2019-12-31'), fill_value=0)
        positive.index = positive.index.date

        lst_negative = []
        i = 0
        for date_b in business_days.Date:
            sum = 0
            for date in negative.index[i:]:
                i += 1
                sum += negative.loc[date].sentiment_score
                if date_b == date:
                    lst_negative.append(sum)
                    break

        lst_positive = []
        i = 0
        for date_b in business_days.Date:
            sum = 0
            for date in positive.index[i:]:
                i += 1
                sum += positive.loc[date].sentiment_score
                if date_b == date:
                    lst_positive.append(sum)
                    break
            
        new_df = pd.concat(
            [business_days.Date, pd.Series(lst_negative), pd.Series(lst_positive)],
            axis=1
            ).rename(columns = {0:'Negative', 1:'Positive'})

        sentiment_score = pd.DataFrame(
            {
                'sentiment_score': new_df.apply(lambda row: frac(row['Negative'], row['Positive']), axis=1),
                'Date': business_days.Date.apply(lambda x: str(x))
                }
                )
        
        sentiment_score.sentiment_score = sentiment_score.sentiment_score.ewm(EXP_MA).mean()
        
        sentiment_score['Ticker'] = ticker
        
        sentiment_dict[ticker] = sentiment_score
    
    return sentiment_dict

### New preprocessing

In [3]:
# fraction of negative tweets in all tweets
def frac_by_all(negative, positive):
    if negative == 0:
        return 0
    return negative / (negative + positive)

def frac(negative, positive):
    if negative == 0:
        return 0
    elif positive == 0:
        return 1
    return negative / positive


def market_init(freq='B', EXP_MA=15):
    data = load_market()
    tickers_df = pd.read_csv("Datasets/kaggle/Company_Tweet.csv")
    tickers = tickers_df.ticker_symbol.value_counts().index.to_list()
    tickers.sort()


    data['Volatility'] = ta.volatility.AverageTrueRange(
        high=data.High, low=data.Low, close=data.Close, window=EXP_MA).average_true_range()

    data = data[data.Date > '2014.12.31'].reset_index(drop=True)

    for ticker in tickers:
        df = data[data.Ticker == ticker]
        df = df.set_index('Date').resample(freq).ffill().reset_index()
    
        if ticker == 'AAPL':
            market_df = df
        else:
            market_df = pd.concat([market_df, df])
    
    market_df = market_df.sort_values(by=['Date', 'Ticker']).reset_index(drop=True)

    return market_df


def sentiment_init(freq='B', EXP_MA=15):
    tickers_df = pd.read_csv("Datasets/kaggle/Company_Tweet.csv")
    df_sentiment = pd.read_pickle("Datasets/results/preprocessing/sentiment_finetuned.pkl")
    tickers = tickers_df.ticker_symbol.value_counts().index.to_list()
    tickers.sort()

    df_sentiment = df_sentiment.merge(tickers_df, on='tweet_id', how='inner')
    df_sentiment = df_sentiment[['post_date', 'sentiment_score', 'ticker_symbol']]
    df_sentiment.loc[:, 'post_date'] = df_sentiment.post_date.apply(lambda x: x.to_pydatetime().date())
    df_sentiment = df_sentiment.groupby(['ticker_symbol', 'post_date']).value_counts().unstack(fill_value=0)

    for ticker in tickers:
        # first we add missing dates and fill thew with 0
        company = df_sentiment.xs(ticker).reindex(pd.date_range('2015-01-02', '2019-12-31'), fill_value=0)

        # then resample to business days and add up dropped past values up to the date that is not dropped
        company = company.resample(rule=freq, origin='end').sum().rename(columns = {0:'Negative', 1:'Positive'})

        # taking fractions of negative tweeets to calculate the final score 
        company['sentiment_score_1'] = company.apply(lambda row: frac_by_all(row['Negative'], row['Positive']), axis=1)
        company['sentiment_score_2'] = company.apply(lambda row: frac(row['Negative'], row['Positive']), axis=1)

        # calculating moving average
        company['sentiment_score_1'] = company.sentiment_score_1.ewm(span=EXP_MA).mean()
        company['sentiment_score_2'] = company.sentiment_score_2.ewm(span=EXP_MA).mean()

        company['Ticker'] = ticker
        company = company.reset_index().rename(
            columns={
                'index': 'Date'
                })
        
        if ticker == 'AAPL':
            sentiment_df = company
        else:
            sentiment_df = pd.concat([sentiment_df, company])
    
    sentiment_df = sentiment_df.sort_values(by=['Date', 'Ticker']).reset_index(drop=True)
    sentiment_df.columns.name = None

    return sentiment_df


def embeddings_init(freq='B'):
    with open('Datasets/results/preprocessing/embeddings_2.pkl', "rb") as file:
        embeddings = pickle.load(file)
        file.close()
    tickers_df = pd.read_csv("Datasets/kaggle/Company_Tweet.csv")
    tickers = tickers_df.ticker_symbol.value_counts().index.to_list()
    tickers.sort()

    df = pd.read_pickle("Datasets/results/preprocessing/final.pkl")
    df = df[['tweet_id', 'post_date']]
    df = df.merge(tickers_df, on='tweet_id', how='inner').rename(
        columns={
            'post_date': 'Date',
            'ticker_symbol': 'Ticker'
        })

    df['Date'] = df['Date'].apply(lambda x: x.date())
    df = df.reset_index()
    df = df.groupby(['Ticker', 'Date']).agg({'index': lambda x: embeddings[list(x)].mean(axis=0).tolist()})
    df = df.rename(columns={
        'index': 'embeddings'
    })

    df.embeddings = df.embeddings.apply(np.array)

    for ticker in tickers:
        # first we add missing dates and fill thew with 0
        company = df.xs(ticker).reindex(pd.date_range('2015-01-02', '2019-12-31'), fill_value=np.zeros(384))

        # then resample to business days and add up dropped past values up to the date that is not dropped
        company = company.resample(rule=freq, origin='end').mean()

        company['Ticker'] = ticker
        
        if ticker == 'AAPL':
            embeddings_df = company
        else:
            embeddings_df = pd.concat([embeddings_df, company])
    
    embeddings_df = embeddings_df.reset_index().rename(
            columns={
                'index': 'Date'
                })
    embeddings_df = embeddings_df.sort_values(by=['Date', 'Ticker']).reset_index(drop=True)
    return embeddings_df


def total_init():
    if exists('Datasets/results/total_df.pkl'):
        return pd.read_pickle('Datasets/results/total_df.pkl')

    market = market_init()
    sentiment = sentiment_init()
    embeddings = embeddings_init()

    total = pd.merge(market, sentiment, on=['Date', 'Ticker'])
    total = pd.merge(total, embeddings, on=['Date', 'Ticker'])
    total= total.join(pd.DataFrame(total['embeddings'].to_list()))

    total.to_pickle('Datasets/results/total_df.pkl')

    return total


def total_timeseries(market=True, sentiment=True, embeddings=True):

    market_columns = []
    if market is True:
        market_columns = ['Adj Close', 'Close', 'High', 'Low', 'Open', 'Volume', 'Volatility']

    sentiment_columns = []
    if sentiment is True:
        sentiment_columns = ['Negative', 'Positive', 'sentiment_score_1', 'sentiment_score_2']
    
    embeddings_columns = []
    if embeddings is True:
        embeddings_columns = np.arange(384).tolist()

    value_columns = market_columns + sentiment_columns + embeddings_columns
    
    data = total_init()

    timeseries_total = TimeSeries.from_group_dataframe(
        df=data,
        time_col='Date',
        group_cols='Ticker',
        static_cols=[],
        value_cols=value_columns,
        freq='B'
    )

    for i in range(len(timeseries_total)):
        timeseries_total[i] = timeseries_total[i].add_holidays(country_code='US')

    return timeseries_total


def get_covariates(data, past_covariates, embeddings=False):
    train = data[0]
    val = data[1]
    emb = np.arange(384).tolist()
    emb = list(map(str, emb))

    if embeddings:
       past_covariates = past_covariates + emb

    target_train, past_train, future_train,\
        target_val, past_val, future_val = [[] for _ in range(6)]

    for series_1, series_2 in zip(train, val):
        target_train.append(series_1['Close'])
        past_train.append(series_1[past_covariates])
        future_train.append(series_1['holidays'])

        target_val.append(series_2['Close'])
        past_val.append(series_2[past_covariates])
        future_val.append(series_2['holidays'])

    return target_train, past_train, future_train,\
        target_val, past_val, future_val

In [4]:
total_init()

[*********************100%***********************]  6 of 6 completed


Unnamed: 0,Date,Ticker,Adj Close,Close,High,Low,Open,Volume,Volatility,Negative,...,374,375,376,377,378,379,380,381,382,383
0,2015-01-02,AAPL,24.603205,27.332500,27.860001,26.837500,27.847500,212818400,15.702441,227,...,0.019345,0.008206,-0.009246,-0.011367,-0.029173,0.010009,0.034422,-0.060718,-0.017831,0.031880
1,2015-01-02,AMZN,15.426000,15.426000,15.737500,15.348000,15.629000,55664000,15.454578,37,...,0.034497,0.009232,-0.017027,-0.001370,-0.030030,0.007514,0.019768,-0.060930,-0.028438,0.025051
2,2015-01-02,GOOG,26.168653,26.168653,26.490770,26.133251,26.378078,28951268,15.161924,16,...,0.040227,-0.012218,0.004907,-0.013846,-0.034779,0.007376,0.028838,-0.044456,-0.006530,0.022659
3,2015-01-02,GOOGL,26.477501,26.477501,26.790001,26.393999,26.629999,26480000,14.192552,11,...,0.038495,0.017129,-0.010664,-0.010565,-0.037532,0.014501,0.028705,-0.053237,-0.022100,0.029239
4,2015-01-02,MSFT,40.811424,46.759998,47.419998,46.540001,46.660000,27913900,14.642549,15,...,0.019906,0.000464,0.004415,-0.016480,-0.010017,0.022173,0.016566,-0.054880,-0.004975,0.011813
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7813,2019-12-31,AMZN,92.391998,92.391998,92.663002,91.611504,92.099998,50130000,52.694159,42,...,0.023598,0.009558,-0.021011,0.001861,-0.025918,0.008945,0.015516,-0.062164,-0.035174,0.032179
7814,2019-12-31,GOOG,66.850998,66.850998,66.900002,66.454247,66.505501,19236000,50.910398,12,...,0.032021,0.003363,-0.024404,-0.004225,-0.025769,0.015236,0.058303,-0.037929,0.003818,0.031112
7815,2019-12-31,GOOGL,66.969498,66.969498,67.032997,66.606499,66.789497,19514000,47.544805,11,...,0.014872,0.005204,-0.027154,-0.012286,-0.023972,0.020188,0.028995,-0.057463,-0.012534,0.012191
7816,2019-12-31,MSFT,153.313156,157.699997,157.770004,156.449997,156.770004,18369400,50.428518,26,...,0.008740,0.011812,-0.013947,0.003805,-0.019997,0.009847,0.024076,-0.067404,-0.013164,0.028100


In [None]:
total = total_timeseries()

In [None]:
train, val = train_test_split(
    total,
    axis=1,
    test_size=0.2,
    input_size=15,
    horizon=5,
    vertical_split_type='model-aware'
)

data = [train, val]

target_train, past_train, future_train, target_val, past_val, future_val = get_covariates(
    data, 
    [
        'Adj Close',
        'High',
        'Low',
        'Open',
        'Volume',
        'Volatility',
        'Negative',
        'Positive',
        'sentiment_score_1',
        'sentiment_score_2'
        ],
    embeddings=True
    )