In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb

# Demo switch: when True, the data-loading cell keeps only the trailing rows of
# the market and news training frames so the notebook runs on a smaller sample.
demo = False

In [None]:
# Competition module that provides the environment object used below to fetch
# the training data.
from kaggle.competitions import twosigmanews
# NOTE(review): the competition's starter material reportedly allows make_env()
# to be called only once per kernel session — confirm before re-running this cell.
env = twosigmanews.make_env()

In [None]:
# Fetch the market and news training frames from the competition environment.
market_train_df, news_train_df = env.get_training_data()

if demo:
    # Demo mode: work on the trailing rows only.
    market_train_df = market_train_df.iloc[-100000:]
    news_train_df = news_train_df.iloc[-300000:]

# Save the universe and time columns for later use
universe = market_train_df['universe']
time = market_train_df['time']

In [None]:
def sigma_score(preds, valid_data):
    """Custom LightGBM evaluation function: mean of ``preds * labels`` over its std.

    Parameters
    ----------
    preds : array-like
        Predictions for the validation rows.
    valid_data : object exposing ``get_label()``
        Validation dataset; its labels are multiplied element-wise with `preds`.

    Returns
    -------
    tuple
        ``('sigma_score', score, True)``: the metric name, its value, and a flag
        telling LightGBM that larger values are better.

    Notes
    -----
    No 'universe' factor is applied here because the validation rows are
    already restricted to universe > 0 before the dataset is built. The
    products are not summed per timestamp before the mean/std are taken.
    """
    weighted_returns = preds * valid_data.get_label()
    mean_return = weighted_returns.mean()
    spread = weighted_returns.std()
    return 'sigma_score', mean_return / spread, True

In [None]:
#%%time
# code mostly taken from this kernel: https://www.kaggle.com/ashishpatel26/bird-eye-view-of-two-sigma-xgb

def feature_engineering(market_df,news_df):
    """Feature-engineering hook; currently returns `market_df` unchanged.

    Every transformation step below is commented out, so no columns are added,
    no rows are dropped and `news_df` is not used.

    Parameters
    ----------
    market_df : market data frame; returned as-is.
    news_df : news data frame; currently unused.

    Returns
    -------
    The same `market_df` object that was passed in.
    """
    # --- Disabled market features ---
#     market_df['time'] = market_df.time.dt.date
#     market_df['returnsOpenPrevRaw1_to_volume'] = market_df['returnsOpenPrevRaw1'] / market_df['volume']
#     market_df['close_to_open'] = market_df['close'] / market_df['open']
#     market_df['volume_to_mean'] = market_df['volume'] / market_df['volume'].mean()
    
    # --- Disabled news features and per-asset aggregation ---
#     news_df['time'] = news_df.time.dt.hour
#     news_df['sourceTimestamp']= news_df.sourceTimestamp.dt.hour
#     news_df['firstCreated'] = news_df.firstCreated.dt.date
#     news_df['assetCodesLen'] = news_df['assetCodes'].map(lambda x: len(eval(x)))
#     news_df['assetCodes'] = news_df['assetCodes'].map(lambda x: list(eval(x))[0])
#     news_df['headlineLen'] = news_df['headline'].apply(lambda x: len(x))
#     news_df['assetCodesLen'] = news_df['assetCodes'].apply(lambda x: len(x))
#     news_df['asset_sentiment_count'] = news_df.groupby(['assetName', 'sentimentClass'])['time'].transform('count')
#     news_df['asset_sentence_mean'] = news_df.groupby(['assetName', 'sentenceCount'])['time'].transform('mean')
#     lbl = {k: v for v, k in enumerate(news_df['headlineTag'].unique())}
#     news_df['headlineTagT'] = news_df['headlineTag'].map(lbl)
#     kcol = ['firstCreated', 'assetCodes']
#     news_df = news_df.groupby(kcol, as_index=False).mean()

    # --- Disabled merge of news features onto the market frame ---
#     market_df = pd.merge(market_df, news_df, how='left', left_on=['time', 'assetCode'], 
#                             right_on=['firstCreated', 'assetCodes'])

    # --- Disabled integer encoding of assetCode ---
#     lbl = {k: v for v, k in enumerate(market_df['assetCode'].unique())}
#     market_df['assetCodeT'] = market_df['assetCode'].map(lbl)
    
#     market_df = market_df.dropna(axis=0)
    
    return market_df

# Run the feature-engineering step and preview only the first rows instead of
# rendering the whole frame.
market_train = feature_engineering(market_train_df, news_train_df)
market_train.head()

In [None]:
# Save universe data for later use (taken from the engineered frame so the rows
# line up with `market_train`).
universe = market_train['universe']
time = market_train['time']

print(market_train.shape)
# True where the target column returnsOpenNextMktres10 is non-negative.
up = market_train.returnsOpenNextMktres10 >= 0

# Columns that are never fed to the model: identifiers, text fields, timestamps,
# the universe flag, the target itself and a few derived columns.
excluded_cols = {
    'assetCode', 'assetCodes', 'assetCodesLen', 'assetName', 'assetCodeT', 'volume_to_mean',
    'firstCreated', 'headline', 'headlineTag', 'marketCommentary', 'provider', 'returnsOpenPrevRaw1_to_volume',
    'returnsOpenNextMktres10', 'sourceId', 'subjects', 'time', 'time_x', 'universe', 'sourceTimestamp',
}

# Build the feature list from the frame that is actually indexed with it
# (`market_train`), not from the raw `market_train_df`; the two only share the
# same columns while feature_engineering is a pass-through.
fcol = [c for c in market_train.columns if c not in excluded_cols]

In [None]:
# Feature frame and regression target, both kept as pandas objects.
X = market_train[fcol]
# np.asarray rather than `.values`: once `up` has been converted to an ndarray,
# `up.values` would raise AttributeError if this cell were executed again.
up = np.asarray(up)
y = market_train.returnsOpenNextMktres10

In [None]:
# Min-max scaling of X: each column is mapped onto [0, 1] using the column-wise
# minimum and maximum of the whole feature frame.
mins = X.min(axis=0)
maxs = X.max(axis=0)
rng = maxs - mins
X = 1 - (maxs - X) / rng

In [None]:
# Positional hold-out split: the first 80% of the rows are used for training,
# the remaining rows for validation.
n_train = int(len(X) * 0.8)

X_train = X.iloc[:n_train]
y_train = y.iloc[:n_train]

X_valid = X.iloc[n_train:]
y_valid = y.iloc[n_train:]

In [None]:
# For valid data, keep only those with universe > 0. This will help calculate the metric
u_valid = universe.iloc[n_train:] > 0

# Always slice from the full X / y / time rather than re-filtering X_valid and
# y_valid in place: filtering an already-filtered frame with the full-length
# mask fails, so the in-place version could not be executed twice.
X_valid = X.iloc[n_train:][u_valid]
y_valid = y.iloc[n_train:][u_valid]
t_valid = time.iloc[n_train:][u_valid]
del u_valid

In [None]:
# Create lgb datasets
train_cols = list(X.columns)
categorical_cols = []  # candidates: ['assetCode', 'assetName', 'dayofweek', 'month']

# Keyword arguments shared by the training and validation datasets.
dataset_options = dict(
    feature_name=train_cols,
    categorical_feature=categorical_cols,
    free_raw_data=False,
)

# Note: the labels are passed as pandas Series (not `.values`); `sigma_score`
# does not currently group them by time.
dtrain = lgb.Dataset(X_train.values, y_train, **dataset_options)
dvalid = lgb.Dataset(X_valid.values, y_valid, **dataset_options)

In [None]:
# LightGBM training parameters.
lgb_params = {
    'objective': 'regression_l1',
    'learning_rate': 0.1,
    'num_leaves': 127,
    'max_depth': -1,
    # 'min_data_in_leaf': 1000,
    # 'min_sum_hessian_in_leaf': 10,
    'bagging_fraction': 0.75,
    'bagging_freq': 2,
    'feature_fraction': 0.5,
    'lambda_l1': 0.0,
    'lambda_l2': 1.0,
    # No built-in metric: evaluation relies on the custom sigma_score function
    # passed to lgb.train instead of the objective's loss.
    'metric': 'None',
    'seed': 42,  # Change for better luck! :)
}

In [None]:
# Train with early stopping on the custom sigma_score metric; the per-round
# validation scores are collected in `evals_result`.
evals_result = {}
train_options = dict(
    num_boost_round=1000,
    valid_sets=(dvalid,),
    valid_names=('valid',),
    feval=sigma_score,
    early_stopping_rounds=100,
    verbose_eval=25,
    evals_result=evals_result,
)
m = lgb.train(lgb_params, dtrain, **train_options)

# One row per boosting round, one column per recorded metric of the 'valid' set.
df_result = pd.DataFrame(evals_result['valid'])

In [None]:
# Plot the validation metric per round and mark the best round with a red '+'.
best_round = df_result['sigma_score'].idxmax()
best_score = df_result['sigma_score'].max()

ax = df_result.plot(figsize=(12, 8))
ax.scatter(best_round, best_score, marker='+', color='red')

In [None]:
# Report the best validation score and the round it was reached on.
sigma_scores = df_result['sigma_score']
num_boost_round = sigma_scores.idxmax() + 1  # row labels start at 0, rounds at 1
valid_score = sigma_scores.max()

print(lgb_params)
print(f'Best score was {valid_score:.5f} on round {num_boost_round}')

In [None]:
import matplotlib.pyplot as plt

# Side-by-side feature importance plots: default importance type on the left,
# gain-based importance on the right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(14, 14))
default_ax, gain_ax = ax
lgb.plot_importance(m, ax=default_ax)
lgb.plot_importance(m, importance_type='gain', ax=gain_ax)
fig.tight_layout()

In [None]:
# def make_predictions(predictions_template_df, market_obs_df, news_obs_df, le):
#     market_obs_df = feature_engineering(market_obs_df, news_obs_df)
#     predictions_template_df.confidenceValue = np.clip(model.predict(x), -1, 1)

In [None]:
# days = env.get_prediction_days()

# for (market_obs_df, news_obs_df, predictions_template_df) in days:
#     make_predictions(predictions_template_df, market_obs_df, news_obs_df, le)
#     env.predict(predictions_template_df)
# print('Done!')

In [None]:
# env.write_submission_file()

In [3]:
# days = env.get_prediction_days()
# for (market_obs_df, news_obs_df, predictions_template_df) in days:
#     print(market_obs_df.groupby('time').count())
    
# (1820, 5) Only the latest 2017-01-03

                       time           ...           returnsOpenPrevMktres10
0 2017-01-03 22:00:00+00:00           ...                          0.001985
1 2017-01-03 22:00:00+00:00           ...                               NaN
2 2017-01-03 22:00:00+00:00           ...                         -0.015277
3 2017-01-03 22:00:00+00:00           ...                          0.011201
4 2017-01-03 22:00:00+00:00           ...                         -0.010078

[5 rows x 14 columns]
ERROR: You must call `predict` before you can get the data for the next prediction day.


TypeError: 'NoneType' object is not iterable