In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from kaggle.competitions import twosigmanews
from math import sqrt
import matplotlib.pyplot as plt

# Create the competition environment and fetch the two training frames it provides.
env = twosigmanews.make_env()
(m_train_df, n_train_df) = env.get_training_data()

# Drop rows containing any missing value (mutates m_train_df in place).
m_train_df.dropna(inplace=True)

# Keep 2013 onwards. The explicit .copy() makes this an independent frame, so
# columns added to it later are not written onto a slice of m_train_df
# (which is what triggers pandas' SettingWithCopyWarning).
m_train_df_2013 = m_train_df[m_train_df.time.dt.year >= 2013].copy()

In [None]:
# Restrict the news frame to 2013 onwards; .copy() detaches the result from n_train_df.
n_train_df_2013 = n_train_df.loc[n_train_df.time.dt.year.ge(2013)].copy()

In [None]:
# News columns kept for modelling. 'time' and 'assetName' are used to build the
# join keys; the remaining columns are the per-article values that get aggregated.
news_var = [
    # keys
    'time', 'assetName',
    # article size / structure
    'bodySize', 'companyCount', 'sentenceCount', 'wordCount',
    'firstMentionSentence', 'relevance',
    # sentiment
    'sentimentClass', 'sentimentNegative', 'sentimentNeutral',
    'sentimentPositive', 'sentimentWordCount',
    # novelty counts per look-back window
    'noveltyCount12H', 'noveltyCount24H', 'noveltyCount3D',
    'noveltyCount5D', 'noveltyCount7D',
    # volume counts per look-back window
    'volumeCounts12H', 'volumeCounts24H', 'volumeCounts3D',
    'volumeCounts5D', 'volumeCounts7D',
]

In [None]:
# Keep only the selected news columns and derive a calendar-date join key.
# .assign() returns a new frame, so the 'date' column is never written onto a
# column-subset slice (the original item assignment could raise
# SettingWithCopyWarning).
n_train_df_2013 = n_train_df_2013[news_var].assign(
    date=lambda news: news.time.dt.date
)

In [None]:
# Preview how many news rows fall into each (date, assetName) group.
n_train_df_2013.groupby(['date', 'assetName']).size().iloc[:15]

In [None]:
# Collapse the news to one row per (date, assetName) by averaging the numeric columns.
# numeric_only=True makes the exclusion of the datetime 'time' column explicit:
# older pandas dropped it silently, while newer releases no longer do, and an
# aggregated 'time' column would clash with the market frame's own 'time' column
# when the two frames are merged.
n_train_df_grp = (
    n_train_df_2013
    .groupby(['date', 'assetName'])
    .mean(numeric_only=True)
    .reset_index()
)

In [None]:
# Attach the daily news aggregates to every market row of the same asset and day.
# The date key is added with .assign(), which builds a new frame instead of writing
# a column onto a filtered slice (avoids SettingWithCopyWarning).
# how='left' keeps market rows that have no news; their news columns become NaN.
m_train_df_2013 = pd.merge(
    m_train_df_2013.assign(date=m_train_df_2013.time.dt.date),
    n_train_df_grp,
    how='left',
    on=['assetName', 'date'],
)

In [None]:
# Fraction of missing values per column (mean of the boolean NA mask).
m_train_df_2013.isna().mean()

In [None]:
# Replace every remaining NaN with 0; after the left merge these are the news
# columns of market rows that had no matching news.
m_train_df_2013 = m_train_df_2013.fillna(0)

In [None]:
# gc is not part of the notebook's import cell, so it is imported here;
# without it, gc.collect() raises NameError on a fresh kernel.
import gc

# Release the aggregated news frame now that it has been merged into the market data.
del n_train_df_grp
gc.collect()

In [None]:
# Correlation of every numeric column with the target column.
# Non-numeric columns (for example the object-typed 'date' join key) are removed
# up front: older pandas dropped them silently inside corr(), newer releases
# raise instead. select_dtypes works the same way on both.
corr_1 = m_train_df_2013.select_dtypes(include=['number', 'bool']).corr()

print(corr_1['returnsOpenNextMktres10'].sort_values(ascending = False))
# The matrix is only needed for the printout above.
del corr_1

In [None]:
# Keep only the rows whose universe flag equals 1; copy to detach the result.
m_train_df_2013 = m_train_df_2013.loc[m_train_df_2013['universe'].eq(1)].copy()

In [None]:
# Hold out calendar year 2016 for testing; every other row is used for training.
id_test = m_train_df_2013.time.dt.year == 2016
id_train = ~id_test

# Target column.
dep_var = 'returnsOpenNextMktres10'

# Model inputs: the market columns first, followed by the aggregated news columns.
ind_var = [
    'volume', 'close', 'open',
    'returnsClosePrevRaw1', 'returnsOpenPrevRaw1',
    'returnsClosePrevMktres1', 'returnsOpenPrevMktres1',
    'returnsClosePrevRaw10', 'returnsOpenPrevRaw10',
    'returnsClosePrevMktres10', 'returnsOpenPrevMktres10',
    'bodySize', 'companyCount', 'sentenceCount', 'wordCount',
    'firstMentionSentence', 'relevance',
    'sentimentClass', 'sentimentNegative', 'sentimentNeutral',
    'sentimentPositive', 'sentimentWordCount',
    'noveltyCount12H', 'noveltyCount24H', 'noveltyCount3D',
    'noveltyCount5D', 'noveltyCount7D',
    'volumeCounts12H', 'volumeCounts24H', 'volumeCounts3D',
    'volumeCounts5D', 'volumeCounts7D',
]

# Feature matrix and target vector for the training side ...
df_train = m_train_df_2013.loc[id_train, ind_var]
y_train = m_train_df_2013.loc[id_train, dep_var]

# ... and for the held-out side.
df_test = m_train_df_2013.loc[id_test, ind_var]
y_test = m_train_df_2013.loc[id_test, dep_var]

print("{0} training rows and {1} testing rows".format(len(df_train), len(df_test)))

In [None]:
# Gradient-boosted regressor, one hyper-parameter per line.
# NOTE(review): newer XGBoost releases name this objective 'reg:squarederror';
# confirm against the installed version before changing it.
xg_reg = xgb.XGBRegressor(
    objective='reg:linear',
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    alpha=10,
    n_estimators=100,
)

# Fit on the training split.
xg_reg.fit(df_train, y_train)

In [None]:
# The figure size must be configured BEFORE the figure is created; setting it
# after plot_importance() (as before) had no effect on this plot.
plt.rcParams['figure.figsize'] = [5, 5]

# Show the 15 most important features of the fitted model.
xgb.plot_importance(xg_reg, max_num_features=15)
plt.show()

In [None]:
# Predict on both sides of the split first ...
pred_train = xg_reg.predict(df_train)
pred_test = xg_reg.predict(df_test)

# ... then turn each mean squared error into a root-mean-squared error.
rms_train = sqrt(mean_squared_error(y_train, pred_train))
rms_test = sqrt(mean_squared_error(y_test, pred_test))

print('Train RMSE: {0} Test RMSE: {1}'.format(rms_train, rms_test))

In [None]:
# Score the hold-out predictions.
# .copy() makes pred_test_df an independent frame, so the columns added below are
# not written onto a slice of m_train_df_2013 (avoids SettingWithCopyWarning).
pred_test_df = m_train_df_2013.loc[
    id_test, ['time', 'assetCode', 'universe', 'returnsOpenNextMktres10']
].copy()

# The hold-out rows all come from a single year (2016), so day-of-year identifies a day.
pred_test_df['dayofyear'] = pred_test_df.time.dt.dayofyear

# Sign of the prediction as the confidence: +1 for predictions >= 0, otherwise -1.
# Vectorised with np.where instead of a Python-level loop over every prediction.
pred_test_df['confidence'] = np.where(np.asarray(pred_test) >= 0, 1, -1)

# Per-row contribution: universe flag * realised return * confidence.
pred_test_df['score'] = (
    pred_test_df.universe
    * pred_test_df.returnsOpenNextMktres10
    * pred_test_df.confidence
)
print(pred_test_df.confidence.value_counts())

# Sum the contributions per day, then report mean / standard deviation of the daily sums.
score_1 = pred_test_df.groupby(['dayofyear']).score.sum()
score_2 = score_1.mean() / score_1.std()
print("\n Competition Score: ", np.round(score_2, 4))

#### Submission creation

In [None]:
# Prediction-day source from the environment; it is iterated further down as
# (market observations, news observations, predictions template) triples.
days = env.get_prediction_days()

In [None]:
def make_predictions(market_obs_df,news_obs_df,predictions_df,ind_var,news_var,xg_reg):
    """Fill ``predictions_df['confidenceValue']`` for one batch of observations.

    The news rows are averaged per (date, assetName), left-joined onto the market
    rows, and the resulting feature matrix is passed to ``xg_reg.predict``. Each
    prediction is turned into a confidence of +1 (prediction >= 0) or -1.

    Parameters
    ----------
    market_obs_df : pandas.DataFrame
        Market rows; must provide ``time``, ``assetName`` and the market columns
        listed in ``ind_var``. It is NOT modified.
    news_obs_df : pandas.DataFrame
        News rows; must provide every column in ``news_var``.
    predictions_df : pandas.DataFrame
        Modified in place: its ``confidenceValue`` column is overwritten. It must
        have one row per row of ``market_obs_df``, in the same order.
    ind_var : list of str
        Feature columns, in the order the model expects them.
    news_var : list of str
        News columns to keep; must include ``time`` and ``assetName``.
    xg_reg : object
        Fitted model exposing ``predict(features)``.

    Returns
    -------
    None
    """
    # Process news data. .assign() builds a new frame, so the 'date' column is not
    # written onto a slice of the caller's news frame.
    news = news_obs_df.loc[:, news_var].assign(date=lambda df: df.time.dt.date)

    # One row per (date, assetName). numeric_only=True keeps the datetime 'time'
    # column out of the aggregate regardless of the pandas version.
    news_daily = (
        news
        .groupby(['date', 'assetName'])
        .mean(numeric_only=True)
        .reset_index()
    )

    # Merge the market and news data. The date key is added to a copy so the
    # caller's market_obs_df does not gain an extra column as a side effect.
    market = market_obs_df.assign(date=market_obs_df.time.dt.date)
    merged = pd.merge(market, news_daily, how='left', on=['assetName', 'date'])

    # Fill 0 for NA's in the feature columns only (assets without news that day);
    # columns that are not model inputs are left untouched.
    features = merged.loc[:, ind_var].fillna(0)

    # Item assignment always targets the column; attribute assignment would
    # silently create a plain attribute if the column were missing.
    raw_predictions = np.asarray(xg_reg.predict(features))
    predictions_df['confidenceValue'] = np.where(raw_predictions >= 0, 1, -1)

In [None]:
# Walk through the prediction days: score each day's observations and hand the
# filled-in template back to the environment.
for (market_obs_df, news_obs_df, predictions_template_df) in days:
    # Writes the confidence values into predictions_template_df in place.
    make_predictions(market_obs_df,news_obs_df,predictions_template_df,ind_var,news_var,xg_reg)
    # NOTE(review): presumably the environment requires predict() to be called
    # before the next day is yielded — confirm against the competition API.
    env.predict(predictions_template_df)
print('Done!')

In [None]:
# Ask the environment to write the submission file; run this only after the
# prediction loop above has finished.
env.write_submission_file()