In [None]:
"Baseline"

import lightgbm as lgb
import numpy as np
import pandas as pd
from kaggle.competitions import twosigmanews
from sklearn.svm import SVC
from sklearn import preprocessing

print('Training lightgbm')

# LightGBM hyper-parameters for a binary classifier scored with log-loss.
# The trailing comments give the scikit-learn-style alias of each option.
# NOTE(review): no visible cell passes PARAMS to lightgbm -- the models that
# are actually fitted further down are SVCs built from PARAMS_SVC.
PARAMS = dict(
    objective="binary",
    metric="binary_logloss",
    num_leaves=60,
    max_depth=-1,
    learning_rate=0.01,
    bagging_fraction=0.9,  # subsample
    feature_fraction=0.9,  # colsample_bytree
    bagging_freq=5,  # subsample_freq
    bagging_seed=2018,
    verbosity=-1,
)

# Keyword arguments unpacked into sklearn.svm.SVC for every per-asset model.
PARAMS_SVC = dict(
    C=1,
    decision_function_shape='ovr',
    kernel='linear',
)

# Column names shared by the market frame, the news frame and the model code.
RETURN_10_NEXT = 'returnsOpenNextMktres10'
TIME = 'time'
ASSET = 'assetCode'


def _to_day_number(timestamps: pd.Series) -> pd.Series:
    """Collapse datetimes to integer calendar days of the form YYYYMMDD."""
    return timestamps.dt.strftime("%Y%m%d").astype(int)


def _finite_ratio(numerator: pd.Series, denominator: pd.Series) -> pd.Series:
    """Divide two Series element-wise, mapping +/-inf (x / 0) to NaN.

    0 / 0 already yields NaN, which ``prepare_data`` later fills with 0;
    turning x / 0 into NaN as well makes both cases end up as 0 instead of
    leaking infinities into the feature matrix.
    """
    return (numerator / denominator).replace([np.inf, -np.inf], np.nan)


def prepare_data(market_df: pd.DataFrame, news_df: pd.DataFrame) -> pd.DataFrame:
    """Build one feature row per market row, enriched with same-day news.

    Neither input frame is modified; the function works on new frames so the
    cell calling it can be re-run safely.

    Parameters
    ----------
    market_df :
        Must contain a datetime ``time`` column plus ``assetCode``,
        ``assetName``, ``open``, ``close`` and ``volume``.
    news_df :
        Must contain a datetime ``time`` column, ``assetCodes`` (the string
        form of a set of codes), the four count columns used for the
        ``position`` / ``coverage`` ratios and every column named in
        ``droplist`` below. Columns that survive the drop are averaged, so
        they have to be numeric.

    Returns
    -------
    pd.DataFrame
        The market rows (minus ``assetName`` and ``volume``) with
        ``bartrend``, ``average`` and ``pricevolume`` added, left-joined on
        (``time``, ``assetCode``) with the per-day, per-asset mean of the
        news columns. ``time`` is an integer YYYYMMDD. Every missing value --
        including market rows without any news -- is replaced by 0.
    """
    # Function-scope import: the notebook's import cell does not provide it.
    import ast

    print('preparing data...')

    def first_asset_code(raw_codes):
        # literal_eval only accepts Python literals, unlike eval, so a
        # malformed cell cannot execute code. Sorting makes the chosen code
        # reproducible: iteration order of a set of strings can change from
        # one interpreter run to the next.
        return sorted(ast.literal_eval(raw_codes))[0]

    # a bit of feature engineering
    # TODO NEED ADD MA , SHIFTS
    market = market_df.drop(columns=['assetName', 'volume']).assign(**{
        TIME:          _to_day_number(market_df[TIME]),
        'bartrend':    _finite_ratio(market_df['close'], market_df['open']),
        'average':     (market_df['close'] + market_df['open']) / 2,
        'pricevolume': market_df['volume'] * market_df['close'],
    })

    # get rid of extra junk from news data
    droplist = ['sourceTimestamp', 'firstCreated', 'sourceId', 'headline',
                'takeSequence', 'provider', 'firstMentionSentence',
                'sentenceCount', 'bodySize', 'headlineTag', 'marketCommentary',
                'subjects', 'audiences', 'sentimentClass',
                'assetName', 'assetCodes', 'urgency', 'wordCount',
                'sentimentWordCount']
    # Drop first, then derive the new columns from the untouched input: this
    # avoids copying the discarded columns and leaves news_df unchanged.
    news = news_df.drop(columns=droplist).assign(**{
        TIME:       _to_day_number(news_df[TIME]),
        ASSET:      news_df['assetCodes'].map(first_asset_code),
        'position': _finite_ratio(news_df['firstMentionSentence'],
                                  news_df['sentenceCount']),
        'coverage': _finite_ratio(news_df['sentimentWordCount'],
                                  news_df['wordCount']),
    })

    # combine multiple news reports for same assets on same day
    news_gp = news.groupby([TIME, ASSET], sort=False).mean().reset_index()

    # join news reports to market data, note many assets will have many
    # days without news data
    return pd.merge(market, news_gp, how='left', on=[TIME, ASSET]).fillna(0)




# Create the competition environment and load the training history it
# exposes as a (market, news) pair, then turn it into the modelling frame.
ENV = twosigmanews.make_env()
market_df, news_df = ENV.get_training_data()
data_df = prepare_data(market_df=market_df, news_df=news_df)




In [None]:
# Progress marker for this cell, which defines the label helpers and then
# fits the per-asset models.
print('building training set...')

def get_ans(all_data, windows=None) -> pd.Series:
    """Label every row of one asset's history as buy (1), sell (-1) or hold (0).

    One signal is produced per moving-average window by ``_answer_sma``; the
    signals are summed and only the sign of the total is kept, so the result
    is a majority vote between buy and sell signals.

    Parameters
    ----------
    all_data :
        Frame with a ``close`` column; rows are taken in their stored order.
    windows :
        Iterable of moving-average lengths. Defaults to 10..30 inclusive.

    Returns
    -------
    pd.Series
        Integer labels in {-1, 0, 1}, indexed like ``all_data``.
    """
    if windows is None:
        windows = range(10, 30 + 1)

    # Start from an explicit zero Series so an empty ``windows`` still
    # returns a Series rather than the bare int 0.
    votes = pd.Series(0, index=all_data.index, dtype=int)
    for window in windows:
        votes = votes + _answer_sma(all_data, window)

    return np.sign(votes)


def _answer_sma(all_data: pd.DataFrame, window):
    """Return the buy/sell signal obtained from a single SMA window.

    ``f_sma`` is the trailing mean of ``close`` over ``window`` rows and
    ``b_sma`` is the same series moved ``window`` rows earlier, i.e. the mean
    of the *following* ``window`` rows. The sign changes of their difference
    are turned into signals by ``_get_intersections``.
    """
    close = all_data['close']
    f_sma = close.rolling(window=window, min_periods=window).mean()
    # The last ``window`` rows have no look-ahead mean; fall back to the
    # close itself. fillna returns a new Series, so the result must be kept.
    b_sma = f_sma.shift(-window).fillna(close)

    indicator = f_sma - b_sma
    return _get_intersections(indicator, close)


def _get_intersections(indicator, time_series):
    """Place a signal at the local extremum next to each zero-crossing.

    Parameters
    ----------
    indicator :
        Series whose sign changes mark crossings. It is matched to
        ``time_series`` by position, so both must have the same length and
        row order.
    time_series :
        Price series in which the extrema are searched.

    Returns
    -------
    pd.Series
        Integers indexed like ``time_series``. Where ``indicator`` goes from
        negative to non-negative, -1 (sell) is written at the maximum of the
        5-row window centred on the crossing; where it goes from non-negative
        to negative, 1 (buy) is written at the minimum of that window. All
        other rows are 0. Crossings in the first or last two rows have no
        full window and are ignored.
    """
    window, half = 5, 2
    n_points = len(time_series)
    signal = pd.Series(0, index=time_series.index, dtype=int)
    if n_points < window:
        # No row has a complete centred window.
        return signal

    previous = indicator.shift(1)
    # Comparisons with NaN are False, so rows without an indicator value
    # never count as a crossing.
    sell_mask = ((indicator >= 0) & (previous < 0)).values[half:-half]
    buy_mask = ((indicator < 0) & (previous >= 0)).values[half:-half]

    rolling = time_series.rolling(window, center=True)
    # Row number at which each centred window starts.
    window_start = np.arange(n_points - 2 * half)

    def absolute_position(reducer):
        # raw=True hands plain ndarrays to the reducer; its result is the
        # offset inside the window, which window_start turns into a row
        # number of time_series.
        inside = rolling.apply(reducer, raw=True).iloc[half:-half]
        return inside.astype(int).values + window_start

    peak_rows = absolute_position(np.argmax)
    trough_rows = absolute_position(np.argmin)

    # Sells first, buys second: a buy overrides a sell on the same row.
    signal.iloc[peak_rows[sell_mask]] = -1
    signal.iloc[trough_rows[buy_mask]] = 1

    return signal
    




# Identifier and target columns; they are kept out of the feature matrix.
GROUP_COLS = [TIME, RETURN_10_NEXT, ASSET, 'universe']

# Every other column of the prepared frame is used as a model input.
TRAIN_COLS = [name for name in data_df.columns if name not in GROUP_COLS]

# NOTE: a date-based 95% / 5% train/validation split used to be sketched here
# as commented-out code; no such split is applied.

# Only rows flagged universe == 1 are used, grouped per asset.
d = data_df.loc[data_df['universe'] == 1]
tickers_df = d.groupby(ASSET)

# Fitted models and their feature scalers, keyed by asset code.
svms, scalers = {}, {}

# Only the first five asset groups are fitted.
for ticker, ticker_rows in list(tickers_df)[:5]:
    print("-----------", ticker, "------------------------------")
    model = SVC(**PARAMS_SVC)
    scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))

    raw_features = ticker_rows[TRAIN_COLS]
    labels = get_ans(ticker_rows)
    scaled_features = scaler.fit_transform(raw_features)

    try:
        model.fit(scaled_features, labels)
    except ValueError as error:
        # The asset is skipped: nothing is stored for it.
        print(error)
    else:
        svms[ticker] = model
        scalers[ticker] = scaler





In [None]:
# Iterable of prediction days; the prediction loop unpacks each item as
# (market_df, news_df, pred_template_df).
# NOTE(review): presumably single-use -- confirm before re-running this cell.
pred_days = ENV.get_prediction_days()



In [None]:
# For every prediction day: build the features, score each requested asset
# with its own (scaler, SVC) pair and hand the filled template back to ENV.
for market_df, news_df, pred_template_df in pred_days:
    data_df_predict = prepare_data(market_df, news_df)
    result = []
    for asset in pred_template_df[ASSET]:
        asset_features = data_df_predict.loc[
            data_df_predict[ASSET] == asset, TRAIN_COLS]
        try:
            scaler = scalers[asset]
            model = svms[asset]
        except KeyError as error:
            # No model was fitted for this asset: neutral confidence.
            print(error)
            result.append(0.0)
            continue

        if asset_features.empty:
            # The asset is in the template but has no market row today;
            # transforming an empty frame would raise, so stay neutral.
            result.append(0.0)
            continue

        predicted = model.predict(scaler.transform(asset_features))
        # predict() returns one label per input row; store a plain float
        # (the last row's label) so the column stays numeric instead of
        # holding arrays.
        result.append(float(predicted[-1]))

    pred_template_df['confidenceValue'] = result
    ENV.predict(pred_template_df)
    print(pred_template_df)


ENV.write_submission_file()


In [None]:
# NOTE(review): this cell runs after the prediction loop has already called
# ENV.predict for every day and after ENV.write_submission_file(); it sends
# the last day's template again, and pred_template_df is undefined if the
# loop never ran. Looks like a leftover scratch cell -- confirm whether the
# environment accepts a repeated call, otherwise remove it.
ENV.predict(pred_template_df)