# Stock Market Prediction - Starter Kernel
### Created by Magichanics


In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from itertools import chain
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import mean_squared_error # wouldn't recommend since we're not being evaluated on MSE
from pandas.tseries.holiday import USFederalHolidayCalendar
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import datetime
import gc

from kaggle.competitions import twosigmanews
# You can only call make_env() once, so don't lose it!
env = twosigmanews.make_env()
print('Done!')

Loading the data... This could take a minute.
Done!
Done!


### Importing Dataframes

In [None]:
(market_train_df, news_train_df) = env.get_training_data()

In [None]:
# decide the length of the dataset
# note if sampling, len(news_train_df) > len(market_train_df)
market_train_df = market_train_df.tail(100_000)
news_train_df = news_train_df.tail(300_000)

In [None]:
market_train_df.head()

In [None]:
news_train_df.head()

### Information on the Training Data
* There are no Unknown ``assetName`` in ``news_train_df``, but there are 24 479 rows with Unknown as the ``assetName`` in ``market_train_df``. Merging by ``assetCode`` leaves out Unknown rows, which could be problematic.
* ``Volume`` has the highest correlation in terms of ``returnsOpenNextMktres10``
* Merging by just ``assetCodes`` greatly increases the dataframe (with just 100k rows, it has turned into 10 million rows), although merging by ``assetCodes`` and ``time`` greatly decrease the original dataframe.

### Aggregations on News Data

In [6]:
news_agg_cols = [f for f in news_train_df.columns if 'novelty' in f or
                'volume' in f or
                'sentiment' in f or
                'bodySize' in f or
                'Count' in f or
                'marketCommentary' in f or
                'relevance' in f]
news_agg_dict = {}
for col in news_agg_cols:
    news_agg_dict[col] = ['mean', 'sum', 'max', 'min']
news_agg_dict['urgency'] = ['min', 'count']
news_agg_dict['takeSequence'] = ['max']

### Joining Market & News Data

In [18]:
#tempcode
# decide the length of the dataset
# note if sampling, len(news_train_df) > len(market_train_df)
(market_train_df, news_train_df) = env.get_training_data()
market_train_df = market_train_df.tail(100_000)
news_train_df = news_train_df.tail(300_000)

In [19]:
# update market dataframe to only contain the specific rows with matching indecies.
def check_index(index, indecies):

    if index in indecies:
        return True
    else:
        return False

def join_market_news(market_df, news_df, nulls=False):

    print('market_df :' + str(market_df.shape))
    
    # Fix asset codes (str -> list)
    news_df['assetCodes'] = news_df['assetCodes'].str.findall(f"'([\w\./]+)'")

    # Expand assetCodes
    assetCodes_expanded = list(chain(*news_df['assetCodes']))
    assetCodes_index = news_df.index.repeat( news_df['assetCodes'].apply(len) )

    assert len(assetCodes_index) == len(assetCodes_expanded)
    df_assetCodes = pd.DataFrame({'level_0': assetCodes_index, 'assetCode': assetCodes_expanded})
    
    # get rid of any rows that will cause null values in one dataframe or the other.
    if not nulls:
        
        # gget new dataframe
        temp_news_df_expanded = pd.merge(df_assetCodes, news_df[['time', 'assetCodes']], left_on='level_0', right_index=True, suffixes=(['','_old']))
        
        # groupby dataframes
        temp_news_df = temp_news_df_expanded.copy()[['time', 'assetCode']]
        temp_market_df = market_df.copy()[['time', 'assetCode']]
        
        # get indecies on both dataframes
        temp_news_df['news_index'] = temp_news_df.index.values
        temp_market_df['market_index'] = temp_market_df.index.values
        
        # set multiindex and join the two
        temp_news_df.set_index(['time', 'assetCode'], inplace=True)
        
        # join the two
        temp_market_df_2 = temp_market_df.join(temp_news_df, on=['time', 'assetCode'])
        del temp_market_df, temp_news_df
        
        # drop nulls in any columns
        temp_market_df_2 = temp_market_df_2.dropna()
        print('dataframe relation: ' + str(temp_market_df_2.shape))
        
        # get indecies
        market_valid_indecies = temp_market_df_2['market_index'].tolist()
        news_valid_indecies = temp_market_df_2['news_index'].tolist()
        del temp_market_df_2
            
        # get index column
        market_df['market_index'] = market_df.index.values
        market_df['is_news'] = market_df['market_index'].apply(lambda x: check_index(x, market_valid_indecies))
        market_df = market_df[market_df.is_news == True]
        print('new market dataframe: ' + str(market_df.shape))
        del market_df['market_index'], market_df['is_news']
    
    # create dataframe based on groupby
    news_col = ['time', 'assetCodes', 'headline'] + sorted(list(news_agg_dict.keys()))
    news_df_expanded = pd.merge(df_assetCodes, news_df[news_col], left_on='level_0', right_index=True, suffixes=(['','_old']))
    
    # check if the columns are in the index
    if news_valid_indecies:
        news_df_expanded['news_index'] = news_df_expanded.index.values
        news_df_expanded['is_market'] = news_df_expanded['news_index'].apply(lambda x: check_index(x, news_valid_indecies))
        news_df_expanded = news_df_expanded[news_df_expanded.is_market == True]
        print('new news dataframe: ' + str(news_df_expanded.shape))
        del news_df_expanded['news_index'], news_df_expanded['is_market']

    print('creating grouped data...')

    def news_df_feats(x):
        if x.name == 'headline':
            return list(x)
    
    # groupby time and assetcode
    news_df_expanded = news_df_expanded.reset_index()
    news_groupby = news_df_expanded.groupby(['time', 'assetCode'])
    
    # get aggregated df
    news_df_aggregated = news_groupby.agg(news_agg_dict).apply(np.float32).reset_index()
    news_df_aggregated.columns = ['_'.join(col).strip() for col in news_df_aggregated.columns.values]
    
    # get any important string dataframes
    news_df_cat = news_groupby.transform(lambda x: news_df_feats(x))['headline'].to_frame()
    new_news_df = pd.concat([news_df_aggregated, news_df_cat], axis=1)
    
    # cleanup
    del news_df_aggregated
    del news_df_cat
    del news_df
    
    # rename columns
    new_news_df.rename(columns={'time_': 'time', 'assetCode_': 'assetCode'}, inplace=True)
    new_news_df.set_index(['time', 'assetCode'], inplace=True)
    
    print('merging data...')
    
    # Join with train
    market_df = market_df.join(new_news_df, on=['time', 'assetCode'])

    # cleanup
    gc.collect()
    
    print('X shape :' + str(market_df.shape))
    
#     if not nulls:
#         market_df = market_df[market_df.headline.isnull() == False]
    
    return market_df


In [20]:
%%time
join_market_news(market_train_df, news_train_df, nulls=False)

market_df :(100000, 16)
dataframe relation: (91, 4)
new market dataframe: (66, 18)
new news dataframe: (91, 30)
creating grouped data...
merging data...
X shape :(66, 104)
CPU times: user 4.43 s, sys: 52 ms, total: 4.48 s
Wall time: 4.47 s


Unnamed: 0,time,assetCode,assetName,volume,close,open,returnsClosePrevRaw1,returnsOpenPrevRaw1,returnsClosePrevMktres1,returnsOpenPrevMktres1,returnsClosePrevRaw10,returnsOpenPrevRaw10,returnsClosePrevMktres10,returnsOpenPrevMktres10,returnsOpenNextMktres10,universe,bodySize_mean,bodySize_sum,bodySize_max,bodySize_min,companyCount_mean,companyCount_sum,companyCount_max,companyCount_min,marketCommentary_mean,marketCommentary_sum,marketCommentary_max,marketCommentary_min,sentenceCount_mean,sentenceCount_sum,sentenceCount_max,sentenceCount_min,wordCount_mean,wordCount_sum,wordCount_max,wordCount_min,relevance_mean,relevance_sum,relevance_max,relevance_min,...,noveltyCount24H_mean,noveltyCount24H_sum,noveltyCount24H_max,noveltyCount24H_min,noveltyCount3D_mean,noveltyCount3D_sum,noveltyCount3D_max,noveltyCount3D_min,noveltyCount5D_mean,noveltyCount5D_sum,noveltyCount5D_max,noveltyCount5D_min,noveltyCount7D_mean,noveltyCount7D_sum,noveltyCount7D_max,noveltyCount7D_min,volumeCounts12H_mean,volumeCounts12H_sum,volumeCounts12H_max,volumeCounts12H_min,volumeCounts24H_mean,volumeCounts24H_sum,volumeCounts24H_max,volumeCounts24H_min,volumeCounts3D_mean,volumeCounts3D_sum,volumeCounts3D_max,volumeCounts3D_min,volumeCounts5D_mean,volumeCounts5D_sum,volumeCounts5D_max,volumeCounts5D_min,volumeCounts7D_mean,volumeCounts7D_sum,volumeCounts7D_max,volumeCounts7D_min,urgency_min,urgency_count,takeSequence_max,headline
3974154,2016-10-14 22:00:00+00:00,CE.N,Celanese Corp,1485997.0,63.37,64.17,-0.004243,0.008804,-0.004546,0.002671,-0.047927,-0.026695,-0.003754,-0.004261,0.155559,1.0,2523.000000,2523.0,2523.0,2523.0,1.000000,1.0,1.0,1.0,0.0,0.0,0.0,0.0,17.000000,17.0,17.0,17.0,391.000000,391.0,391.0,391.0,1.000000,1.000000,1.000000,1.000000,...,0.000000,0.0,0.0,0.0,1.000000,1.0,1.0,1.0,2.000000,2.0,2.0,2.0,2.000000,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,5.0,5.0,5.0,5.0,3.0,1.0,1.0,[Celanese Announces Vinyl Acetate Based Emulsi...
3982765,2016-10-20 22:00:00+00:00,SU.N,Suncor Energy Inc,2703565.0,29.35,29.02,0.001365,-0.001033,0.002637,-0.002138,0.050841,0.039026,0.050039,0.039631,0.060042,1.0,1275.000000,2550.0,2550.0,0.0,1.000000,2.0,1.0,1.0,0.0,0.0,0.0,0.0,10.500000,21.0,20.0,1.0,217.000000,434.0,424.0,10.0,1.000000,2.000000,1.000000,1.000000,...,0.000000,0.0,0.0,0.0,1.000000,2.0,2.0,0.0,1.000000,2.0,2.0,0.0,1.000000,2.0,2.0,0.0,2.0,4.0,2.0,2.0,2.0,4.0,2.0,2.0,13.0,26.0,13.0,13.0,13.0,26.0,13.0,13.0,18.0,36.0,18.0,18.0,1.0,2.0,1.0,[Suncor Energy to release third quarter 2016 f...
3989025,2016-10-26 22:00:00+00:00,CYH.N,Community Health Systems Inc,2836350.0,10.03,10.15,-0.020508,-0.022158,-0.014183,-0.002202,-0.041109,-0.006849,-0.041482,-0.005922,-0.288420,0.0,21267.000000,21267.0,21267.0,21267.0,1.000000,1.0,1.0,1.0,0.0,0.0,0.0,0.0,79.000000,79.0,79.0,79.0,3263.000000,3263.0,3263.0,3263.0,1.000000,1.000000,1.000000,1.000000,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,[Suncor Energy to release third quarter 2016 f...
3989898,2016-10-26 22:00:00+00:00,PRIM.O,Primoris Services Corp,175819.0,19.60,19.54,-0.003559,-0.019076,0.000932,-0.001320,0.005644,-0.006104,0.005089,-0.004471,0.128136,0.0,2045.000000,2045.0,2045.0,2045.0,1.000000,1.0,1.0,1.0,0.0,0.0,0.0,0.0,14.000000,14.0,14.0,14.0,369.000000,369.0,369.0,369.0,1.000000,1.000000,1.000000,1.000000,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,1.000000,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,"[Community Health Systems, Inc. Previews Third..."
3997808,2016-11-02 22:00:00+00:00,ACN.N,Accenture PLC,5361141.0,118.61,116.14,0.021267,-0.003090,0.024020,0.002864,0.015323,-0.004884,0.035732,0.027647,-0.005935,1.0,9011.000000,9011.0,9011.0,9011.0,1.000000,1.0,1.0,1.0,0.0,0.0,0.0,0.0,38.000000,38.0,38.0,38.0,1328.000000,1328.0,1328.0,1328.0,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.0,1.0,1.0,2.000000,2.0,2.0,2.0,2.000000,2.0,2.0,2.0,2.000000,2.0,2.0,2.0,6.0,6.0,6.0,6.0,10.0,10.0,10.0,10.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,26.0,26.0,26.0,26.0,3.0,1.0,1.0,[Primoris Services Corporation Schedules Confe...
3998102,2016-11-02 22:00:00+00:00,CE.N,Celanese Corp,819837.0,71.82,72.14,-0.008285,-0.015960,0.010989,0.007388,0.023322,0.039822,0.135773,0.109919,0.066203,1.0,2170.000000,2170.0,2170.0,2170.0,1.000000,1.0,1.0,1.0,0.0,0.0,0.0,0.0,14.000000,14.0,14.0,14.0,320.000000,320.0,320.0,320.0,1.000000,1.000000,1.000000,1.000000,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,1.000000,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,4.0,4.0,4.0,3.0,1.0,1.0,[Accenture Completes Acquisition of Australian...
4003362,2016-11-07 22:00:00+00:00,AGU.N,Agrium Inc,427224.0,92.62,90.84,0.034745,0.015426,0.002283,-0.000320,-0.002370,-0.001868,0.013739,0.037225,0.049207,0.0,6988.000000,13976.0,6988.0,6988.0,2.000000,4.0,2.0,2.0,0.0,0.0,0.0,0.0,33.000000,66.0,33.0,33.0,1071.000000,2142.0,1071.0,1071.0,1.000000,2.000000,1.000000,1.000000,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,3.000000,6.0,3.0,3.0,3.000000,6.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,55.0,110.0,55.0,55.0,55.0,110.0,55.0,55.0,3.0,2.0,1.0,[Celanese Announces Acetic Acid Price Increase...
4004609,2016-11-07 22:00:00+00:00,POT.N,Potash Corporation of Saskatchewan Inc,4697058.0,16.41,16.08,0.029486,0.004372,0.006155,-0.007926,-0.018541,-0.018315,-0.001854,0.023907,0.062845,1.0,4658.666504,13976.0,6988.0,0.0,1.666667,5.0,2.0,1.0,0.0,0.0,0.0,0.0,22.333334,67.0,33.0,1.0,718.333313,2155.0,1071.0,13.0,1.000000,3.000000,1.000000,1.000000,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,4.333333,13.0,5.0,3.0,4.333333,13.0,5.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0,90.0,30.0,30.0,30.0,90.0,30.0,30.0,1.0,3.0,1.0,[AGRIUM AND POTASHCORP PROVIDE UPDATE ON APPRO...
4006484,2016-11-08 22:00:00+00:00,Q.N,Quintiles IMS Holdings Inc,2252115.0,75.94,74.27,0.018099,0.009927,0.013083,-0.001926,0.020288,-0.000673,0.020369,-0.000635,-0.042716,1.0,3233.000000,3233.0,3233.0,3233.0,1.000000,1.0,1.0,1.0,0.0,0.0,0.0,0.0,16.000000,16.0,16.0,16.0,426.000000,426.0,426.0,426.0,0.288675,0.288675,0.288675,0.288675,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.0,19.0,19.0,19.0,3.0,1.0,1.0,[AGRIUM AND POTASHCORP PROVIDE UPDATE ON APPRO...
4007707,2016-11-09 22:00:00+00:00,GPN.N,Global Payments Inc,1104843.0,72.81,70.43,0.002478,-0.014965,-0.008680,-0.015872,-0.011402,-0.033615,-0.026035,-0.030894,-0.020443,1.0,4681.000000,4681.0,4681.0,4681.0,1.000000,1.0,1.0,1.0,0.0,0.0,0.0,0.0,34.000000,34.0,34.0,34.0,762.000000,762.0,762.0,762.0,1.000000,1.000000,1.000000,1.000000,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,4.0,4.0,4.0,3.0,1.0,1.0,[Agrium and PotashCorp Provide Update on Appro...


#### Aggregations on X_train

In [None]:
def aggregations(df):
    
    # get columns
    lst_of_lst_of_cols = [[f for f in df.columns if 'return' in f and f != 'returnsOpenNextMktres10'],
                              [f for f in df.columns if 'novelty' in f],
                              [f for f in df.columns if 'volume' in f],
                              [f for f in df.columns if 'sentiment' in f]]
    
    agg_suffixes = ['aggReturn ', 'aggNovelty ', 'aggVolume ', 'aggSentiment ']
    
    for i_cols in range(len(lst_of_lst_of_cols)):
        
        # setup map of aggregations
        agg_dict = {}
        for col in lst_of_lst_of_cols[i_cols]:
            agg_dict[col] = ['mean', 'var', 'sum', 'std', 'max', 'min']
            
        # preform aggregations
        df_agg = df.groupby('sourceId').agg(agg_dict)
        df_agg.columns = pd.Index(['agg_' + e[0] + "_" + e[1].lower() for e in df_agg.columns.tolist()])
        
        # clean up dataframe and merge
        df = df.join(df_agg, how = 'left', on = 'sourceId', lsuffix = agg_suffixes[i_cols])
        del df_agg
        gc.collect()
        
        print('finished ' + agg_suffixes[i_cols])
        
    print('New dataframe shape: ' + str(df.shape))
    return df

X_train = aggregations(X_train)

### Text Processing with MultinomialNB

In [None]:
def get_headline(headlines_df):
    
    # get headlines as list
    headlines_lst = []
    for row in range(0,len(headlines_df.index)):
        headlines_lst.append(headlines_df.iloc[row])

    # split headlines to separate words
    basicvectorizer = CountVectorizer()
    headlines_vectorized = basicvectorizer.fit_transform(headlines_lst)
    
    print(headlines_vectorized.shape)
    return headlines_vectorized, basicvectorizer

def headline_mapping(target, headlines_vectored, headline_vectorizer):
    
    # round target values if using logistic regression
    target = round(target,0)
    
    # get model (testing with model that isn't )
    from sklearn.naive_bayes import MultinomialNB
    headline_model = MultinomialNB()
    headline_model = headline_model.fit(headlines_vectored, target)
    
    # get coefficients
    basicwords = headline_vectorizer.get_feature_names()
    basiccoeffs = headline_model.coef_.tolist()[0]
    coeff_df = pd.DataFrame({'Word' : basicwords, 
                            'Coefficient' : basiccoeffs})
    
    # convert dataframe to dictionary of coefficients
    coefficient_dict = dict(zip(coeff_df.Word, coeff_df.Coefficient))

    return coefficient_dict, coeff_df['Coefficient'].mean()

def get_coeff_col(headlines_df, coeff_dict, coeff_default):
    
    def get_coeff(word_lst):
        
        # iter through every word
        coeff_sum = 0
        for word in word_lst:
            if word in coeff_dict:
                coeff_sum += coeff_dict[word]
            else:
                coeff_sum += coeff_default
        
        # get average coefficient
        return coeff_sum / len(word_lst)
        
    basicvectorizer = CountVectorizer()
    
    # loop through every item
    headlines_coeff_lst = []
    for row in range(0,len(headlines_df.index)):
        headlines_coeff_lst.append(get_coeff(str(headlines_df.iloc[row]).split(' ')))
    
    return pd.Series(headlines_coeff_lst)

coefficient_dict, coefficient_default = headline_mapping(X_train['returnsOpenNextMktres10'],
                                            *get_headline(X_train['headline']))

X_train['headline_coeff_mean'] = get_coeff_col(X_train['headline'], coefficient_dict, coefficient_default)

### Get Time Features

In [None]:
# ripped from my previous kernel, NYC Taxi Fare

# first get dates
def split_time(df):
    
    # convert to string (will find a more efficient way to do this without converting to string)
    df['time'] = df['time'].dt.strftime('%Y-%m-%d %H:%M:%S')
    
    # split date_time into categories
    df['time_day'] = df['time'].str.slice(8,10)
    df['time_month'] = df['time'].str.slice(5,7)
    df['time_year'] = df['time'].str.slice(0,4)
    df['time_hour'] = df['time'].str.slice(11,13)
    
    # source: https://www.kaggle.com/nicapotato/taxi-rides-time-analysis-and-oof-lgbm
    df['temp_time'] = df['time'].str.replace(" UTC", "")
    df['temp_time'] = pd.to_datetime(df['temp_time'], format='%Y-%m-%d %H:%M:%S')
    
    df['time_day_of_year'] = df.temp_time.dt.dayofyear
    df['time_week_of_year'] = df.temp_time.dt.weekofyear
    df["time_weekday"] = df.temp_time.dt.weekday
    df["time_quarter"] = df.temp_time.dt.quarter
    
    del df['temp_time']
    gc.collect()
    
    # convert to non-object columns
    time_feats = ['time_day', 'time_month', 'time_year', 'time_hour']
    df[time_feats] = df[time_feats].apply(pd.to_numeric)
    
    # determine whether the day is set on a holiday
    cal = USFederalHolidayCalendar()
    holidays = cal.holidays(start='2007-01-01', end='2018-09-27').to_pydatetime()
    df['on_holiday'] = df['time'].str.slice(0,10).apply(lambda x: 1 if x in holidays else 0)
    
    # note to self: encode time later on
    
    return df

X_train = split_time(X_train)

In [None]:
def get_misc_features(X_df):
    
    # Adding daily difference
    new_col = X_df["close"] - X_df["open"]
    X_df.insert(loc=6, column="daily_diff", value=new_col)
    X_df['close_to_open'] =  np.abs(X_df['close'] / X_df['open'])

### Label Encoding

In [None]:
def group_delete(df, del_features):
    for f in del_features:
        del df[f]

def encoding(df, categorical_feats):
    df_encoded = pd.get_dummies(df[categorical_feats])
    df.join(df_encoded, how = 'right')
    group_delete(df, categorical_feats)
    print('new shape: ' + str(df.shape))
    return df

group_delete(X_train, ['time', 'sourceId', 'headline', 'assetCodes'])
X_train = encoding(X_train, [f for f in X_train.columns if X_train[f].dtype == 'object'])

### Cleaning Data

In [None]:
# will use a more efficient way later on
fcol = [c for c in X_train.columns if c not in ['sourceTimestamp', 'firstCreated', 'returnsOpenNextMktres10', 
                                                'assetName_x', 'universe', 'provider', 'subjects',
                                               'audiences', 'marketCommentary', 'assetName_y', 'sourceTimestamp'
                                               'firstCreated']] #<---- added


### Using LGBM for Modelling

In [None]:
# prepare x dataframes for modelling/prediction
def convert_to_X(market_obs_df, news_obs_df):
    
    # this repeats everything that was done previously
    X_test = join_market_news(market_obs_df, news_obs_df)
    X_test = aggregations(X_test)
    X_test['headline_coeff_mean'] = get_coeff_col(X_test['headline'], coefficient_dict, coefficient_default)
    X_test = split_time(X_test)
    group_delete(X_test, ['time', 'sourceId', 'headline', 'assetCodes'])
    X_test = encoding(X_test, ['assetCode', 'headlineTag'])
    X_test = X_test[[f for f in X_test.columns if 'int' in str(X_test[f].dtype) or 'float' in str(X_test[f].dtype)]]
    
    return X_test

In [None]:
y_train = X_train['returnsOpenNextMktres10']
del X_train['returnsOpenNextMktres10']

In [None]:
import lightgbm as lgb
import time

# set model and parameters
params = {'learning_rate': 0.02, 
          'boosting': 'gbdt', 
          'objective': 'regression', 
          'seed': 2018}

In [None]:
#split data (for cross validation)
x1, x2, y1, y2 = train_test_split(X_train[fcol], 
                                  y_train, 
                                  test_size=0.25, 
                                  random_state=99)

In [None]:
# train
t = time.time()
print('Fitting Up')

# cross validation
lgb_model = lgb.train(params, 
                        lgb.Dataset(x1, label=y1), 
                        5000, 
                        lgb.Dataset(x2, label=y2), 
                        verbose_eval=100, 
                        early_stopping_rounds=200)

# lgb_model = lgb.train(params, 
#                         lgb.Dataset(X_train[fcol], label=y_train),
#                         verbose_eval=100)

print(f'Done, time = {time.time() - t}')

In [None]:
def make_predictions(market_obs_df, news_obs_df):
    
    print('market_obs_df shape: ' + str(market_obs_df.shape))
    print('news_obs_df shape: ' + str(news_obs_df.shape))
    
    # predict using given model
    X_test = convert_to_X(market_obs_df, news_obs_df)
    print('Created X_test with features: ' + str(X_test[fcol].columns))
    
    # there is an error:
    # ValueError: Length of values does not match length of index
    prediction_values = np.clip(lgb_model.predict(X_test[fcol]), -1, 1)
    
    print('finished predictions')

    return prediction_values

### Making Predictions

Now the difference between the training and test data would be these two columns,  ``['returnsOpenNextMktres10', 'universe']``. We will be trying to predict ``returnsOpenNextMktres10`` and using that as the ``confidenceValue``.

In [None]:
for (market_obs_df, news_obs_df, predictions_template_df) in env.get_prediction_days(): # Looping over days from start of 2017 to 2019-07-15
    
    print('predictions_template_df shape: ' + str(predictions_template_df.shape))
    # make predictions
    predictions_template_df['confidenceValue'] = make_predictions(market_obs_df, news_obs_df)
    
    # save predictions
    env.predict(predictions_template_df)


### Export Submission

In [None]:
env.write_submission_file() # Writes your submission file
print('finished!')

### Sources:
* [Getting Started - DJ Sterling](https://www.kaggle.com/dster/two-sigma-news-official-getting-started-kernel)
* [Bare bones script - William Cukierski](https://www.kaggle.com/wcukierski/bare-bones-script-loop-with-comments)
* [Extra data - aaron7sun](https://www.kaggle.com/aaron7sun/stocknews)
* [Text Preprocessing - Andrew Gelé](https://www.kaggle.com/ndrewgele/omg-nlp-with-the-djia-and-reddit)
* [fake news - SamLloyd](https://www.kaggle.com/sjdlloyd/it-s-fake-news-this-is-top-of-the-leaderboard)
* [a simple model - Bruno G. do Amaral](https://www.kaggle.com/bguberfain/a-simple-model-using-the-market-data)
* [LGBM Model - the1owl](https://www.kaggle.com/the1owl/my-two-sigma-cents-only)