In [None]:
import lightgbm as lgb
from hyperopt import STATUS_OK, hp, tpe, Trials, fmin
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.util import ngrams
import nltk
import datetime
import numpy as np
import pandas as pd
import pytz
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing
from sklearn.metrics import r2_score
from tqdm import tqdm
import time

# Turn off pandas' chained-assignment warning for the whole notebook.
pd.set_option('mode.chained_assignment', None)

In [None]:
from kaggle.competitions import twosigmanews
# You can only call make_env() once, so don't lose it!
# `env` is reused by later cells for get_training_data(), get_prediction_days(),
# predict() and write_submission_file().
env = twosigmanews.make_env()

In [None]:
# Training frames supplied by the competition environment: market rows and news rows.
market_train_df, news_train_df = env.get_training_data()

In [None]:
def plot_market_averages(market_df):
    """Plot the mean close price and mean volume per timestamp on twin y-axes.

    The exploratory plot used to live inside a string literal, which made it
    dead code and caused the notebook to echo the string as cell output. It is
    now an opt-in helper that is only drawn when explicitly called.

    Parameters
    ----------
    market_df : pandas.DataFrame
        Frame with `time`, `close` and `volume` columns.

    Returns
    -------
    matplotlib.figure.Figure
        The figure that was drawn.
    """
    grp_market = market_df.groupby('time')

    fig, ax1 = plt.subplots()
    avgClose = grp_market['close'].mean()
    ax1.plot(avgClose)
    ax1.set_xlabel('time')
    ax1.set_ylabel('mean close')
    ax1.set_title('Mean close and mean volume per timestamp')

    # Second y-axis: volume lives on a very different scale than price.
    ax2 = ax1.twinx()
    avgVolume = grp_market['volume'].mean()
    ax2.plot(avgVolume, color='red', alpha=.4)
    ax2.set_ylabel('mean volume')

    plt.show()
    return fig

# Opt-in exploration: uncomment to draw the plot.
# plot_market_averages(market_train_df)

## Main Loop
The cells below prepare the data and train a model; the final loop then goes through all the prediction days and submits the model's predictions for each one. The generator returned from `get_prediction_days` will simply stop returning values once you've reached the end.

In [None]:
# Flip to True to iterate quickly on the 2007 rows only.
testing = False

# Working views of the training frames; the full data unless `testing` is set.
market_obs, news_obs = market_train_df, news_train_df
if testing:
    market_obs = market_obs.loc[market_obs['time'].dt.year == 2007]
    news_obs = news_obs.loc[news_obs['time'].dt.year == 2007]

In [None]:
# LabelEncoder will be available outside of function
le = preprocessing.LabelEncoder()

# this function engineers features and combines both datasets into one
def process_data(market_obs, news_obs, actual=False, all_asset_codes=None):
    """Merge per-day, per-asset news sentiment into the market rows and add features.

    Parameters
    ----------
    market_obs : pandas.DataFrame
        Market rows. The code reads `time`, `assetName`, `assetCode`, `open`
        and `close`, plus `returnsOpenNextMktres10` unless `actual` is true.
    news_obs : pandas.DataFrame
        News rows. The code reads `time`, `assetName`, `relevance`,
        `sentimentNegative`, `sentimentPositive` and `sentimentWordCount`.
        The frame is not modified.
    actual : bool, default False
        When true, no target column is expected and only the feature frame
        is returned.
    all_asset_codes : array-like or None
        Asset codes used to fit the module-level encoder `le`. When None the
        encoder is fitted on `market_obs['assetCode']`.

    Returns
    -------
    x : pandas.DataFrame
        Returned alone when `actual` is true.
    (x, y) : tuple
        Features and the `returnsOpenNextMktres10` target otherwise.

    Notes
    -----
    Side effect: the module-level `le` is refitted on every call.
    NOTE(review): refitting on a different set of codes can change the integer
    assigned to a given asset between training and prediction -- confirm this
    is acceptable for a model that treats `assetCode` as categorical.
    """
    sentiment_cols = ['sentimentNegative', 'sentimentPositive', 'sentimentWordCount']
    group_keys = ['timeKey', 'assetName']

    # Stamp every article with 22:00 UTC of its own calendar day (presumably to
    # line up with the market rows' `time`, since the frames are merged on it --
    # confirm). The key is derived from the `news_obs` argument; it used to be
    # read from the global training frame, which gave prediction-day news the
    # wrong (index-aligned) timestamps.
    news_time = news_obs['time']
    if news_time.dt.tz is not None:
        # Keep the wall-clock date, exactly like reading x.year/x.month/x.day.
        news_time = news_time.dt.tz_localize(None)
    time_key = (news_time.dt.normalize() + pd.Timedelta(hours=22)).dt.tz_localize('UTC')

    # Work on a narrow copy so the caller's frame does not grow a `timeKey` column.
    news = news_obs[['assetName', 'relevance'] + sentiment_cols].assign(timeKey=time_key)

    # should find a way to update weekend news to market-open days
    # https://stackoverflow.com/questions/47184507/groupby-and-weighted-average
    if (testing):
        # Quick-iteration mode: skip the averaging, use all-zero sentiment per group.
        group_index = news.groupby(group_keys, observed=True).size().index
        sentimentDf = pd.DataFrame(0, index=group_index, columns=sentiment_cols)
    else:
        # Relevance-weighted mean per (day, asset): sum(value * w) / sum(w).
        # One vectorised pass instead of three per-group apply() calls.
        weighted = news[group_keys + ['relevance']].copy()
        for col in sentiment_cols:
            weighted[col] = news[col] * news['relevance']
        # observed=True keeps only (day, asset) pairs that actually occur when
        # a grouping key is categorical.
        sums = weighted.groupby(group_keys, observed=True).sum()
        # A group whose relevance sums to zero yields NaN/inf here instead of raising.
        sentimentDf = sums[sentiment_cols].div(sums['relevance'], axis=0)

    sentimentDf = sentimentDf.reset_index().rename(columns={'timeKey': 'time'})

    # merge data and engineer features
    if (all_asset_codes is None):
        le.fit(market_obs['assetCode'])
    else:
        le.fit(all_asset_codes)
    # Left join: market rows without news keep NaN sentiment values.
    data = pd.merge(market_obs, sentimentDf, how='left', on=['time', 'assetName'])
    data['dayofweek'], data['month'] = data.time.dt.dayofweek, data.time.dt.month
    # True when the close is above the open (the comparison used to be inverted).
    data['closedHigher'] = data.close > data.open
    data['assetCode'] = le.transform(data['assetCode'])

    # segment into x and y DataFrames
    if (actual):
        return data.drop(['time', 'assetName'], axis=1)

    x = data.drop(['returnsOpenNextMktres10', 'time', 'assetName'], axis=1)
    y = data['returnsOpenNextMktres10']
    return (x, y)

In [None]:
# Feature columns LightGBM should treat as categorical rather than numeric.
categorical_cols = [
    'assetCode',
    'dayofweek',
    'month',
    'closedHigher',
    'universe',
]

# Parameters handed to lgb.train for the final model.
lgb_params = {
    'objective': 'regression_l1',
    'learning_rate': 0.1,
    'num_leaves': 3,
    'max_depth': -1,
    'min_data_in_leaf': 1000,
    'bagging_fraction': 0.5,
    'bagging_freq': 2,
    'feature_fraction': 0.75,
    'lambda_l1': 0.0,
    'lambda_l2': 0.0,
    # The string 'None' requests no built-in evaluation metric; this notebook
    # does not pass a custom one to lgb.train either.
    'metric': 'None',
    'seed': 42,  # Change for better luck! :)
}

# Hyperopt search space for the (currently disabled) tuning cell.
# Every hyperopt label now matches its dictionary key, so values reported back
# under the label can be used directly as parameter names
# ('colsample_by_tree' used to be mislabelled).
# NOTE(review): `class_weight` looks like a classifier option -- confirm it is
# wanted alongside the regression objective used in `lgb_params`.
space = {
    'class_weight': hp.choice('class_weight', [None, 'balanced']),
    'boosting_type': 'gbdt',
    'num_leaves': hp.choice('num_leaves', np.arange(30, 150, dtype=int)),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)),
    'subsample_for_bin': hp.choice('subsample_for_bin', [20000, 30000, 40000]),
    'min_data_in_leaf': hp.choice('min_data_in_leaf', np.arange(20, 500, 5, dtype=int)),
    'reg_alpha': hp.uniform('reg_alpha', 0.0, 1.0),
    'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0)
}

In [None]:
# Build the feature frame and target from the selected observations, timing the call.
t0 = time.time()
x, y = process_data(market_obs, news_obs)
totalTime = time.time() - t0

print('Took', totalTime, 'to process data')

In [None]:
# IGNORE HYPERTUNING FOR NOW.
# https://towardsdatascience.com/automated-machine-learning-hyperparameter-tuning-in-python-dfda59b72f8a
# The hyperopt search below is disabled by wrapping it in a string literal;
# nothing in it runs.
# NOTE(review): before re-enabling, note that it references `train`, which no
# visible cell defines, and that it scores with 'auc' while `lgb_params` uses a
# regression objective.
'''
def objective(params, n_folds = 5):
    # n-fold CV with hyperparameters; early stopping based on ROC/AUC
    cv_results = lgb.cv(params, train, nfold = n_folds, num_boost_round = 500,
                        early_stopping_rounds = 100, metrics = 'auc', seed = 42,
                        stratified = False, shuffle = False)
    best_score = max(cv_results['auc-mean'])
    loss = 1 - best_score
    
    pbar.update()
    return {'loss': loss, 'params': params, 'status': STATUS_OK}

MAX_EVALS = 10
pbar = tqdm(total=MAX_EVALS, desc="Hyperopt")
bayes_trials = Trials()
bestParams = fmin(fn = objective, space = space, algo = tpe.suggest, 
             max_evals = MAX_EVALS, trials = bayes_trials)
pbar.close()
'''

In [None]:
# sanity check with R^2 on a held-out tail of the existing training dataset
def holdout_r2_score(features, target, params, categorical_feature, train_fraction=0.8):
    """Train on the first rows, score R^2 on the remaining rows.

    The check used to live inside a string literal and referenced a `train`
    dataset that was never built; it is now an opt-in helper that constructs
    the LightGBM dataset from the training split itself.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature frame, split by row position (no shuffling).
    target : pandas.Series
        Target values aligned row-for-row with `features`.
    params : dict
        Parameters passed to `lgb.train`.
    categorical_feature : list of str
        Column names LightGBM should treat as categorical.
    train_fraction : float, default 0.8
        Share of leading rows used for training.

    Returns
    -------
    float
        `r2_score` of the predictions on the held-out rows.
    """
    n_train = int(features.shape[0] * train_fraction)
    trainX, trainY = features.iloc[:n_train], target.iloc[:n_train]
    testX, testY = features.iloc[n_train:], target.iloc[n_train:]

    train = lgb.Dataset(trainX, trainY, categorical_feature=categorical_feature)
    model = lgb.train(params, train)
    prediction = model.predict(testX)
    return r2_score(testY, prediction)

# Opt-in check: uncomment to run it.
# print(holdout_r2_score(x, y, lgb_params, categorical_cols))

In [None]:
# Fit the final model on every processed training row.
full_data = lgb.Dataset(
    data=x,
    label=y,
    categorical_feature=categorical_cols,
    free_raw_data=False,
)
full_model = lgb.train(params=lgb_params, train_set=full_data)

In [None]:
# Score each prediction day with the trained model and hand the result to the environment.
# NOTE(review): `full_model` was trained on every column process_data returned
# for the training frame -- confirm the prediction-day frames yield the same columns.
for i, (market_obs_df, news_obs_df, predictions_template_df) in enumerate(env.get_prediction_days(), start=1):
    # Encoder input: every asset code seen in training plus today's codes.
    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    asset_codes = pd.concat([market_obs['assetCode'], market_obs_df['assetCode']])
    newX = process_data(market_obs_df, news_obs_df, actual=True, all_asset_codes=asset_codes)

    # Pair each prediction with the original asset code string; `le` was just
    # fitted inside process_data, so inverse_transform undoes its encoding.
    scored = pd.DataFrame({
        'assetCode': le.inverse_transform(newX['assetCode'].values),
        'confidenceValue': full_model.predict(newX),
    }).drop_duplicates(subset='assetCode')  # one row per asset so the merge cannot fan out

    # Left join onto the template keeps exactly the rows (and order) the
    # environment asked for; the old outer join could add rows for assets that
    # are not in the template.
    submission = predictions_template_df[['assetCode']].merge(scored, on='assetCode', how='left')
    # Assets without a prediction get 0 -- presumably a neutral value for the
    # competition; confirm against its rules.
    submission['confidenceValue'] = submission['confidenceValue'].fillna(0.0)
    env.predict(submission)

    # TODO: update market_obs & news_obs with the new day's rows
    print(i)

print('Done!')

In [None]:
# Ask the competition environment to write the submission file; run this only
# after the prediction loop above has finished.
env.write_submission_file()