In [None]:
import pandas as pd
import gc
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from kaggle.competitions import twosigmanews
from sklearn import *
from lightgbm import LGBMClassifier
import time
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
import numpy as np

**Obtaining Data**

Create Environment

In [None]:
# Create the competition environment object; later cells use it to fetch the
# training data, iterate over prediction days and write the submission file.
env = twosigmanews.make_env()

Obtain Market and News data

In [None]:
# Unpack the two training frames returned by the environment.
market, news = env.get_training_data()

**Initial analysis of the data**

In [None]:
# Count market rows per calendar date once and reuse the result for both axes
# (the original recomputed value_counts().sort_index() for x and again for y).
records_per_date = market.time.dt.date.value_counts().sort_index()

fig, axes = plt.subplots(1, 1, figsize=(20, 10))
axes.set_title("Records per year")
axes.set_ylabel("Records")
axes.set_xlabel("Year")
# Each plotted point is the number of rows for one date; the x axis spans the years.
axes.plot(records_per_date.index, records_per_date.values)

In [None]:
# Column dtypes, non-null counts and memory footprint of the raw market frame.
market.info()

In [None]:
# Column dtypes, non-null counts and memory footprint of the raw news frame.
news.info()

**Helper Functions**

In [None]:
# Convert from comma separated to their own rows
def expandAssetCodes(news):
    """Explode the comma-separated ``assetCodes`` column into one row per code.

    Parameters
    ----------
    news : pandas.DataFrame
        Must contain an ``assetCodes`` column whose values are strings of
        asset codes separated by ``", "`` (comma followed by a space). The
        index labels must be convertible with ``int()``.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``newsIndex`` (int64; the index label of the originating
        news row, repeated once per code) and ``assetCode`` (one code per
        row), in the order the rows and codes appear in ``news``.
    """
    codes = []
    indexes = []
    # Series.items() is the supported spelling; Series.iteritems() was removed
    # in pandas 2.0 and raises AttributeError there.
    for i, values in news['assetCodes'].items():
        explode = values.split(", ")
        codes.extend(explode)
        indexes.extend([int(i)] * len(explode))
    indexOfFrames = pd.DataFrame({'newsIndex': indexes, 'assetCode': codes})
    # Pin the key dtype so that an empty input (no news rows) still yields an
    # integer key that can be merged against the integer news index.
    indexOfFrames['newsIndex'] = indexOfFrames['newsIndex'].astype('int64')
    del codes, indexes
    gc.collect()
    return indexOfFrames

# Join every exploded asset code back onto the news row it came from
def expandNewsByAssetByIndex(news, expandedAssetCodes):
    """Left-join ``expandedAssetCodes`` with ``news`` on ``newsIndex``.

    A ``newsIndex`` column holding a copy of the row labels is written onto
    ``news`` itself (the caller's frame is modified) so that it can serve as
    the join key. The result has one row per row of ``expandedAssetCodes``,
    carrying that frame's columns followed by the remaining ``news`` columns.
    """
    news['newsIndex'] = news.index.copy()
    return pd.merge(expandedAssetCodes, news, how='left', on='newsIndex')

def processMarketData(market, isTraining):
    """Filter and trim a market frame down to the columns used for modelling.

    Steps, in order:
      1. keep rows whose ``time`` falls on or after 2013-01-01;
      2. when ``isTraining`` is truthy, keep rows with ``universe >= 1.0`` and
         drop the ``universe`` column;
      3. drop the four ``returns*PrevMktres*`` columns;
      4. add a ``date`` column (calendar date of ``time``) and drop ``time``
         and ``assetName``.

    The input frame is not modified; a new frame is returned.
    """
    # The date cut-off is a debugging shortcut (the full history takes too long
    # to analyse and to try options on); it is meant to be commented out for
    # the final submission.
    cutoff = datetime(2013, 1, 1, 0, 0, 0).date()
    on_or_after_cutoff = market['time'].dt.date >= cutoff
    market = market.loc[on_or_after_cutoff].reset_index(drop=True)

    if isTraining:
        in_universe = market['universe'] >= 1.0
        market = market.loc[in_universe].reset_index(drop=True)
        market = market.drop(['universe'], axis=1)

    # Columns found to contain nulls during data analysis.
    null_prone = [
        'returnsClosePrevMktres1',
        'returnsOpenPrevMktres1',
        'returnsClosePrevMktres10',
        'returnsOpenPrevMktres10',
    ]
    market = market.drop(null_prone, axis=1)

    # `date` is the key later used to join market rows with news rows.
    market['date'] = market.time.dt.date
    return market.drop(['time', 'assetName'], axis=1)

def processNewsData(news):
    """Turn a raw news frame into one row per (news item, asset code).

    Steps, in order:
      1. keep rows whose ``time`` falls on or after 2013-01-01;
      2. normalise ``assetCodes`` by removing its first and last character and
         every single quote (presumably a set-literal-like string such as
         ``"{'A.N', 'B.N'}"`` -- confirm against the data);
      3. one-hot encode ``marketCommentary`` into ``commentaryyes`` /
         ``commentaryno``;
      4. drop columns judged to carry little or no predictive signal;
      5. explode the asset codes with ``expandAssetCodes`` and join them back
         with ``expandNewsByAssetByIndex``;
      6. add a ``date`` column (calendar date of ``time``) and drop the
         remaining bookkeeping / text columns.
    """
    # The date cut-off is a debugging shortcut (the full history takes too long
    # to analyse and to try options on); it is meant to be commented out for
    # the final submission.
    cutoff = datetime(2013, 1, 1, 0, 0, 0).date()
    news = news.loc[news['time'].dt.date >= cutoff].reset_index(drop=True)

    def _strip_wrapping(raw):
        # Remove the enclosing first/last character, then all single quotes.
        return raw[1:-1].replace("'", "")

    news['assetCodes'] = news['assetCodes'].apply(_strip_wrapping)

    # One indicator column per boolean value of marketCommentary.
    for flag, column in ((True, 'commentaryyes'), (False, 'commentaryno')):
        news[column] = 0
        news.loc[news['marketCommentary'] == flag, column] = 1

    # Columns that have too low or no indication of playing a role in the prediction.
    low_signal = [
        'urgency', 'bodySize', 'sentimentClass', 'sentimentWordCount',
        'sentenceCount', 'wordCount',
        'volumeCounts7D', 'volumeCounts5D', 'volumeCounts3D',
        'volumeCounts24H', 'volumeCounts12H',
        'noveltyCount7D', 'noveltyCount5D', 'noveltyCount3D',
        'noveltyCount24H', 'noveltyCount12H',
        'firstMentionSentence',
    ]
    news.drop(['marketCommentary'] + low_signal, axis=1, inplace=True)

    assetIndex = expandAssetCodes(news)
    newsExpanded = expandNewsByAssetByIndex(news, assetIndex)
    del news, assetIndex
    gc.collect()

    # `date` is the key later used to join news rows with market rows.
    newsExpanded['date'] = newsExpanded.time.dt.date
    bookkeeping = [
        'time', 'newsIndex', 'sourceTimestamp', 'firstCreated', 'subjects',
        'audiences', 'headlineTag', 'headline', 'assetCodes', 'assetName',
        'sourceId',
    ]
    return newsExpanded.drop(bookkeeping, axis=1)
    

**Pre-processing Market data**

In [None]:
# Peek at the first rows of the raw market data before any processing.
market.head(n=5)

In [None]:
# Build the training view of the market data (isTraining=True also filters on
# `universe`), then release the raw frame to reclaim memory.
market_train = processMarketData(market, True)
del market
gc.collect()

In [None]:
# Confirm which columns and how many rows survive the market pre-processing.
market_train.info()

**Pre-processing News data**

In [None]:
# Expand the news to one row per (news item, asset code), then release the raw
# frame to reclaim memory.
news_train = processNewsData(news)
del news
gc.collect()

In [None]:
# Peek at the first rows of the processed, per-asset news data.
news_train.head(n=5)

In [None]:
# Confirm which columns and how many rows survive the news pre-processing.
news_train.info()

**Combining Market and News data for training**

In [None]:
# Left join: every market row is kept; a market row gains one output row per
# matching news row for the same asset code and date.
marketNews_train = pd.merge(market_train, news_train, how='left', on=['assetCode', 'date'])
del market_train, news_train
gc.collect()

In [None]:
# Inspect the merged frame; because the join is a left join, market rows
# without matching news carry nulls in the news columns.
marketNews_train.info()

In [None]:
# Keep only rows with no missing values in any column. The index is not
# reset, so the surviving rows keep their original labels (later cells select
# rows by label with .loc).
marketNews_train = marketNews_train.dropna()

**Analyzing combined News and Market data**

In [None]:
# Row and column summary after rows with missing values were removed.
marketNews_train.info()

In [None]:
# Correlation matrix of the numeric columns, rendered as a heat-mapped table.
# The frame still contains non-numeric columns here (e.g. `assetCode`, `date`).
# pandas >= 2.0 no longer skips those silently in DataFrame.corr() and raises
# instead, so select the numeric/bool columns explicitly. select_dtypes works
# on old and new pandas alike.
(
    marketNews_train
    .select_dtypes(include=['number', 'bool'])
    .corr()
    .style.format("{:.2}")
    .background_gradient(cmap=plt.get_cmap('coolwarm'), axis=1)
)

In [None]:
# Bar chart of the mean of each sentiment score over the training rows.
sentiment_columns = ['sentimentPositive', 'sentimentNegative', 'sentimentNeutral']
sentiment_axes = marketNews_train[sentiment_columns].mean().plot.bar()
sentiment_axes.set_title("Sentiment chart")
plt.show()

In [None]:
# Bar chart of the share of rows flagged / not flagged as market commentary
# (the mean of a 0/1 indicator is a proportion).
commentary_columns = ['commentaryyes', 'commentaryno']
commentary_axes = marketNews_train[commentary_columns].mean().plot.bar()
commentary_axes.set_title("Market Commentary")
plt.show()

**Extracting feature to be predicted**

In [None]:
# Regression-style target: the raw forward return, stored compactly.
num_target = marketNews_train['returnsOpenNextMktres10'].astype('float32')
# Classification target: 1 when the forward return is non-negative, else 0.
bin_target = marketNews_train['returnsOpenNextMktres10'].ge(0).astype('int8')

**Splitting Train vs. Test data (80/20 split)**

In [None]:
# Split the row labels (not the rows themselves) 80/20 with a fixed seed; the
# labels are later used with .loc on both the features and the target.
# `model_selection` comes from the `from sklearn import *` at the top.
# NOTE(review): this is a random split over time-ordered rows, and the
# market/news left join can repeat one market row once per matching news item,
# so near-identical rows may land on both sides and inflate the hold-out
# scores -- consider a date-based split.
train_indexes, test_indexes = model_selection.train_test_split(marketNews_train.index.values, test_size=0.20, random_state = 11)

In [None]:
# Gradient-boosted binary classifier for "forward return >= 0".
lgbmClassifier = LGBMClassifier(
    objective='binary',
    boosting='gbdt',
    learning_rate = 0.05,
    max_depth = 8,
    num_leaves = 80,
    n_estimators = 400,
    bagging_fraction = 0.8,
    # LightGBM only applies bagging_fraction when bagging_freq is non-zero (the
    # default 0 disables bagging), so without this line the 0.8 above is inert.
    bagging_freq = 1,
    feature_fraction = 0.9)

In [None]:
# The target was copied into num_target / bin_target above; remove it from the
# feature frame so the model cannot see it.
marketNews_train.drop(columns=['returnsOpenNextMktres10'], inplace=True)
gc.collect()
marketNews_train.head(1)

In [None]:
# The join keys are identifiers, not features; remove them before fitting.
marketNews_train.drop(columns=['date', 'assetCode'], inplace=True)
gc.collect()

In [None]:
# Final check of the feature columns that will be fed to the classifier.
marketNews_train.info()

**Training time**

In [None]:
# Fit on the training portion only; features and binary target are selected by
# the same row labels so they stay aligned.
print('Fitting Model')
lgbmClassifier.fit(marketNews_train.loc[train_indexes],bin_target.loc[train_indexes])
print('Fitting Model Completed')

In [None]:
# Slice the hold-out rows once instead of re-indexing the frame for every metric.
X_test = marketNews_train.loc[test_indexes]
y_test = bin_target.loc[test_indexes]

# scikit-learn metrics take (y_true, y_pred / y_score) in that order. Accuracy
# is symmetric, so the printed value is unchanged.
print("Accuracy : %f" % accuracy_score(y_test, lgbmClassifier.predict(X_test)))
print("AUC factor: %f" % roc_auc_score(y_test.values, lgbmClassifier.predict_proba(X_test)[:, 1]))

# Release the temporary copies; later cells re-slice what they need.
del X_test, y_test

In [None]:
import matplotlib.pyplot as plt
from mlxtend.evaluate import confusion_matrix
from mlxtend.plotting import plot_confusion_matrix
import seaborn as sns
%matplotlib inline
import matplotlib as mpl
# Larger default font sizes for titles, axis labels and tick labels in the
# figures that follow.
mpl.rcParams.update({
    'axes.titlesize': 20,
    'axes.labelsize': 16,
    'xtick.labelsize': 16,
    'ytick.labelsize': 16,
})

In [None]:
# Rescale the predicted probability of class 1 from [0, 1] onto [-1, 1] so it
# reads as a signed confidence, then plot its distribution on the hold-out rows.
test_confidence = lgbmClassifier.predict_proba(marketNews_train.loc[test_indexes])[:, 1] * 2 - 1
plt.hist(test_confidence, bins='auto', alpha=0.5, color='green')
plt.xlabel("Confidence")
plt.ylabel("Count")
plt.title("Predicted confidence")
plt.show()

In [None]:
# `confusion_matrix` here is the mlxtend function imported in the plotting
# setup cell (it shadows the scikit-learn one imported at the top).
actual_labels = np.array(bin_target.loc[test_indexes])
predicted_labels = lgbmClassifier.predict(marketNews_train.loc[test_indexes]).tolist()
confusionMatrix = confusion_matrix(y_target=actual_labels, y_predicted=predicted_labels)
fig, ax = plot_confusion_matrix(conf_mat=confusionMatrix)
plt.show()

In [None]:
# Pair each feature column with the importance the fitted model assigned to
# it, most important first.
featureImportance = pd.DataFrame({
    "feature": marketNews_train.columns,
    "value": lgbmClassifier.feature_importances_,
}).sort_values(by='value', ascending=False)

# Horizontal bars: one per feature, length = importance.
plt.figure(figsize=(15,10))
featureImportanceGraph = sns.barplot(y="feature", x="value", data=featureImportance)

In [None]:
def mypredict(market_obs_df, news_obs_df, predictions_template_df):
    """Score one prediction day and submit the result to the competition env.

    Relies on module-level names: ``processMarketData``, ``processNewsData``,
    the fitted ``lgbmClassifier`` and the competition ``env``.

    Parameters
    ----------
    market_obs_df, news_obs_df : pandas.DataFrame
        Raw market and news observations for the day, in the layout the two
        ``process*`` helpers expect.
    predictions_template_df : pandas.DataFrame
        Template with an ``assetCode`` column and a ``confidenceValue`` column
        to be filled in.

    Returns
    -------
    None
        The filled template is passed to ``env.predict``; nothing is returned.
    """
    marketp = processMarketData(market_obs_df, False)
    del market_obs_df
    gc.collect()
    newsp = processNewsData(news_obs_df)
    del news_obs_df
    gc.collect()
    # Left join keeps every market row; rows without news get NaN news features.
    observedMarketNews = marketp.merge(newsp, how='left', on=['assetCode', 'date'])
    del marketp, newsp
    gc.collect()
    feats = [c for c in observedMarketNews.columns if c not in ['assetCode', 'date', 'returnsOpenNextMktres10']]
    # predict_proba yields P(return >= 0) in [0, 1]. The submission expects a
    # signed confidence in [-1, 1], so rescale exactly as the notebook's
    # "Predicted confidence" histogram does. Without this every asset would be
    # submitted with a non-negative (always "long") confidence.
    preds = lgbmClassifier.predict_proba(observedMarketNews[feats])[:, 1] * 2 - 1
    sub = pd.DataFrame({'assetCode': observedMarketNews['assetCode'], 'confidence': preds})
    # Attach the scores to the template on the shared column(s). Assets with no
    # score get 0, which is neutral on the [-1, 1] scale.
    predictions_template_df = predictions_template_df.merge(sub, how='left').drop(
        'confidenceValue', axis=1).fillna(0).rename(columns={'confidence':'confidenceValue'})
    # An asset with several news rows has several scores; average them.
    predictions_template_df = predictions_template_df.groupby(['assetCode'], as_index=False).agg('mean')
    env.predict(predictions_template_df)
    del observedMarketNews, predictions_template_df, preds, sub
    gc.collect()
    

In [None]:
#days = env.get_prediction_days()

In [None]:
#(market_obs_df, news_obs_df, predictions_template_df) = next(days)

In [None]:
#testds = mypredict(market_obs_df, news_obs_df, predictions_template_df)

In [None]:
#testds.info()

In [None]:
# Walk through every prediction day the environment yields and submit a
# prediction for each one via mypredict (which calls env.predict).
for (market_obs_df, news_obs_df, predictions_template_df) in  env.get_prediction_days():
    mypredict(market_obs_df, news_obs_df, predictions_template_df)
print('Done!')

In [None]:
# Ask the environment to write out the submission file from the predictions
# submitted in the loop above.
env.write_submission_file()

In [None]:
# We've got a submission file!
# We've got a submission file!
# List files in the working directory whose name contains '.csv'.
import os
print([filename for filename in os.listdir('.') if '.csv' in filename])

In [None]:
# Load the written submission back in to sanity-check its contents.
submission_dataset  = pd.read_csv('submission.csv')
submission_dataset.head(3)

In [None]:
# Row count, column dtypes and null counts of the submission file.
submission_dataset.info()

In [None]:
# Distribution of the submitted confidence values, for comparison with the
# hold-out "Predicted confidence" histogram produced after training.
plt.hist(submission_dataset.confidenceValue, bins='auto', color='red')
plt.xlabel("Confidence")
plt.ylabel("Count")
plt.title("Confidence against real prediction (non trainning data)")
plt.show()