In [211]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import GridSearchCV
import data_engineering as de
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.preprocessing import MinMaxScaler
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score

## Data Wrangling

1. Load the Data
2. Create related features (e.g. article counts, open price rolling averages)
3. split by stock

In [149]:
df = pd.read_csv('../data/complete_next_open.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69433 entries, 0 to 69432
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Publishing Time  63703 non-null  object 
 1   Market Date      69433 non-null  object 
 2   Ticker           69433 non-null  object 
 3   Sector           69433 non-null  object 
 4   finvader_neg     63703 non-null  float64
 5   finvader_neu     63703 non-null  float64
 6   finvader_pos     63703 non-null  float64
 7   finvader_tot     63703 non-null  float64
 8   Source           63703 non-null  object 
 9   Headline         63703 non-null  object 
 10  Text             63703 non-null  object 
 11  URL              63703 non-null  object 
 12  Open             69433 non-null  float64
 13  High             69433 non-null  float64
 14  Low              69433 non-null  float64
 15  Close            69433 non-null  float64
 16  Volume           69433 non-null  int64  
 17  Dividends   

In [150]:
def overall_sentiment(x:int):
    threshold = .1
    if x > threshold:
        return 'pos'
    elif x < -threshold:
        return 'neg'
    else:
        return 'neu'

In [151]:
df['overall_sen'] = df['finvader_tot'].apply(overall_sentiment)
df['overall_sen'] = df['overall_sen'].astype('category')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69433 entries, 0 to 69432
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   Publishing Time  63703 non-null  object  
 1   Market Date      69433 non-null  object  
 2   Ticker           69433 non-null  object  
 3   Sector           69433 non-null  object  
 4   finvader_neg     63703 non-null  float64 
 5   finvader_neu     63703 non-null  float64 
 6   finvader_pos     63703 non-null  float64 
 7   finvader_tot     63703 non-null  float64 
 8   Source           63703 non-null  object  
 9   Headline         63703 non-null  object  
 10  Text             63703 non-null  object  
 11  URL              63703 non-null  object  
 12  Open             69433 non-null  float64 
 13  High             69433 non-null  float64 
 14  Low              69433 non-null  float64 
 15  Close            69433 non-null  float64 
 16  Volume           69433 non-null  int64  

In [152]:
counts = df.groupby(['Market Date', 'Ticker'])['overall_sen'].value_counts()
counts.loc['2019-03-15', 'AAPL']['pos']


0

In [153]:
features = ['finvader_neg',
            'finvader_neu',
            'finvader_pos',
            'finvader_tot',
            'Open',
            'High',
            'Low',
            'Close',
            'Volume',
            'Dividends',
            'Stock Splits']
df_mean = df.groupby(['Market Date', 'Ticker'])[features].mean().reset_index()
df_mean

Unnamed: 0,Market Date,Ticker,finvader_neg,finvader_neu,finvader_pos,finvader_tot,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,2019-03-01,AAPL,,,,,41.887973,42.097075,41.553888,42.053814,103544800.0,0.0,0.0
1,2019-03-01,ABBV,,,,,62.740368,63.589807,62.354977,62.999920,8567900.0,0.0,0.0
2,2019-03-01,AMZN,,,,,82.756500,83.712997,82.550003,83.586502,99498000.0,0.0,0.0
3,2019-03-01,BAC,,,,,25.918994,26.201778,25.812949,25.901320,45771500.0,0.0,0.0
4,2019-03-01,GOOGL,,,,,56.549999,57.500000,56.549999,57.425999,34086000.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19180,2024-03-28,MSFT,,,,,420.202052,421.110417,418.365369,419.962494,21871200.0,0.0,0.0
19181,2024-03-28,NVDA,,,,,900.000000,913.000000,891.929993,903.559998,43521200.0,0.0,0.0
19182,2024-03-28,UNH,,,,,495.000000,495.869995,489.299988,494.700012,3820000.0,0.0,0.0
19183,2024-03-28,V,,,,,277.975547,279.283124,276.608082,278.564453,5844400.0,0.0,0.0


In [154]:
labels = {'pos_art_count':'pos', 'neg_art_count':'neg', 'neu_art_count':'neu'}
for l in labels:
    df_mean[l] = df_mean.apply(lambda x: counts.loc[x['Market Date'], x['Ticker']][labels[l]], axis = 1)
df_mean['total_articles'] = df_mean['pos_art_count'] + df_mean['neg_art_count'] + df_mean['neu_art_count']


In [155]:
df_mean['Market Date'] = pd.to_datetime(df_mean['Market Date'])
df_mean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19185 entries, 0 to 19184
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Market Date     19185 non-null  datetime64[ns]
 1   Ticker          19185 non-null  object        
 2   finvader_neg    13455 non-null  float64       
 3   finvader_neu    13455 non-null  float64       
 4   finvader_pos    13455 non-null  float64       
 5   finvader_tot    13455 non-null  float64       
 6   Open            19185 non-null  float64       
 7   High            19185 non-null  float64       
 8   Low             19185 non-null  float64       
 9   Close           19185 non-null  float64       
 10  Volume          19185 non-null  float64       
 11  Dividends       19185 non-null  float64       
 12  Stock Splits    19185 non-null  float64       
 13  pos_art_count   19185 non-null  int64         
 14  neg_art_count   19185 non-null  int64         
 15  ne

In [156]:
tickers = df_mean['Ticker'].unique()
ticker_frames = {}
for tick in tickers:
    ticker_frames[tick] = df_mean[df_mean['Ticker'] == tick].set_index('Market Date').drop(columns  = ['Ticker', 'Dividends'])
ticker_frames['AAPL']

Unnamed: 0_level_0,finvader_neg,finvader_neu,finvader_pos,finvader_tot,Open,High,Low,Close,Volume,Stock Splits,pos_art_count,neg_art_count,neu_art_count,total_articles
Market Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2019-03-01,,,,,41.887973,42.097075,41.553888,42.053814,103544800.0,0.0,0,0,1,1
2019-03-04,,,,,42.226871,42.721989,41.813471,42.265327,109744800.0,0.0,0,0,1,1
2019-03-05,,,,,42.286956,42.301376,41.950465,42.188412,78949600.0,0.0,0,0,1,1
2019-03-06,,,,,41.981718,42.178806,41.806265,41.945667,83241600.0,0.0,0,0,1,1
2019-03-07,,,,,41.789428,41.926429,41.344785,41.460152,99185600.0,0.0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-03-22,,,,,171.527346,172.815607,169.829652,172.046646,71106600.0,0.0,0,0,1,1
2024-03-25,,,,,170.338972,171.707111,169.220478,170.618591,54288300.0,0.0,0,0,1,1
2024-03-26,,,,,169.769734,171.187808,169.350304,169.480133,57388400.0,0.0,0,0,1,1
2024-03-27,,,,,170.179175,173.364857,169.879579,173.075241,60273300.0,0.0,0,0,1,1


In [157]:
for tick, frame in ticker_frames.items():
    frame['3avg Open'] = frame['Open'].rolling(window = 3).mean()
    frame['7avg Open'] = frame['Open'].rolling(window=-7).mean()
ticker_frames['AAPL']

Unnamed: 0_level_0,finvader_neg,finvader_neu,finvader_pos,finvader_tot,Open,High,Low,Close,Volume,Stock Splits,pos_art_count,neg_art_count,neu_art_count,total_articles,3avg Open,7avg Open
Market Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2019-03-01,,,,,41.887973,42.097075,41.553888,42.053814,103544800.0,0.0,0,0,1,1,,
2019-03-04,,,,,42.226871,42.721989,41.813471,42.265327,109744800.0,0.0,0,0,1,1,,
2019-03-05,,,,,42.286956,42.301376,41.950465,42.188412,78949600.0,0.0,0,0,1,1,42.133933,
2019-03-06,,,,,41.981718,42.178806,41.806265,41.945667,83241600.0,0.0,0,0,1,1,42.165181,
2019-03-07,,,,,41.789428,41.926429,41.344785,41.460152,99185600.0,0.0,0,0,1,1,42.019367,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-03-22,,,,,171.527346,172.815607,169.829652,172.046646,71106600.0,0.0,0,0,1,1,174.606511,173.838503
2024-03-25,,,,,170.338972,171.707111,169.220478,170.618591,54288300.0,0.0,0,0,1,1,172.892170,173.504671
2024-03-26,,,,,169.769734,171.187808,169.350304,169.480133,57388400.0,0.0,0,0,1,1,170.545351,173.337754
2024-03-27,,,,,170.179175,173.364857,169.879579,173.075241,60273300.0,0.0,0,0,1,1,170.095960,172.601608


In [158]:
for tick, frame in ticker_frames.items():
    frame['indicator'] = -frame['Open'] + frame.shift(-1)['Open']
    frame['indicator'] = frame['indicator'].apply(lambda x: 1 if x >= 0 else 0)
    ticker_frames[tick] = frame[frame['finvader_tot'].notna()]
ticker_frames['AAPL']

Unnamed: 0_level_0,finvader_neg,finvader_neu,finvader_pos,finvader_tot,Open,High,Low,Close,Volume,Stock Splits,pos_art_count,neg_art_count,neu_art_count,total_articles,3avg Open,7avg Open,indicator
Market Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2019-03-15,0.000000,0.660000,0.340000,0.039600,44.428467,45.024531,44.161680,44.733707,156171600.0,0.0,0,0,1,1,44.144049,42.942763,1
2019-03-18,0.027000,0.805000,0.168000,0.419750,44.656790,45.279293,44.654385,45.190365,104879200.0,0.0,1,0,1,2,44.428463,43.352386,1
2019-03-19,0.022364,0.773273,0.204182,0.304418,45.269685,45.423508,44.685636,44.832249,126585600.0,0.0,7,0,4,11,44.784981,43.971457,0
2019-03-20,0.055800,0.777100,0.167000,0.184520,44.760132,45.543671,44.399609,45.224007,124140800.0,0.0,4,1,5,10,44.895536,44.340218,1
2019-03-21,0.047333,0.743417,0.209083,0.235392,45.671060,47.187659,45.620585,46.889626,204136800.0,0.0,7,3,2,12,45.233626,44.684259,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-03-12,0.068500,0.773375,0.158000,0.227625,172.915453,173.794265,170.778352,172.995346,59825400.0,0.0,4,1,3,8,171.464095,171.511650,0
2024-03-13,0.101700,0.768400,0.129900,-0.054850,172.535987,172.955416,170.528699,170.898209,52488700.0,0.0,4,6,0,10,172.719064,171.029449,1
2024-03-14,0.078636,0.755727,0.165636,0.091591,172.675796,174.073894,171.816961,172.765671,72913500.0,0.0,6,4,1,11,172.709079,171.336177,0
2024-03-15,0.058900,0.835100,0.105900,0.045200,170.938152,172.386185,170.059339,172.386185,121664700.0,0.0,4,3,3,10,172.049978,171.351871,1


## Logistic Regression

Time to model.

1. Throw in 1 ,3 ,7 day averages as features. 
2. Could also do averages of finvader scores. 
3. Can change indicator flag to be whether next day price is higher than the average of previous stock prices, rather than the current days -- For now stick to current indicator flags
4. Should generate confusion matrices, look at other scoring methods as well! 
5. Upgrade Random Forest (XGboost?) ? -- Being Done by Jem

In [226]:
lr = LogisticRegression(penalty = 'l1', solver = 'liblinear')
dummy = DummyClassifier(strategy= 'most_frequent')
 

In [227]:
parameters = {'C' : [.001, .01, .1, 1, 10, 100]}
clf = GridSearchCV(lr, parameters)

In [228]:
best_para = {}
lr_scores = {}
feature_ranks = {}
dummy_scores = {}
for tick, frame in ticker_frames.items():
    test, train = de.train_test_split(frame)
    X_train  =train.drop(columns = 'indicator')
    y_train =  train['indicator']
    X_test = test.drop(columns = 'indicator')
    y_test = test['indicator'] 
    scaler = MinMaxScaler()
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    model = clf.fit(X_train_scaled, y_train)
    rfe = RFE(estimator=lr, n_features_to_select=3).fit(X_train_scaled, y_train)
    dumb = dummy.fit(X_train_scaled, y_train)
    feature_ranks[tick] = [frame.columns[i] for i in rfe.get_support(1)]
    best_para[tick] = list(model.best_params_.values())
    predict_true = model.predict(X_test_scaled)
    predict_dummy = dummy.predict(X_test_scaled)
    lr_scores[tick] = (accuracy_score(y_test, predict_true), precision_score(y_test, predict_true), 
                       recall_score(y_test, predict_true), f1_score(y_test, predict_true))
    dummy_scores[tick] = (accuracy_score(y_test, predict_dummy), precision_score(y_test, predict_dummy),
                           recall_score(y_test, predict_dummy), f1_score(y_test, predict_dummy))




  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [229]:
for tick in lr_scores:
    a,b,c,d = lr_scores[tick]
    w, x, y, z = dummy_scores[tick]
    print(tick, a-w, b-x, c-y, d-z)
print('ticker,      accuracy,       precision,      recall,     f1')

AAPL 0.1911468812877264 0.14947787282353342 -0.08148148148148149 0.08576719620657869
ABBV 0.24525043177892925 0.2669124693423425 -0.18380062305295952 0.1054166666666666
AMZN 0.2954773869346733 0.325755628034464 -0.22309197651663404 0.12829171120396465
BAC 0.30287648054145516 0.7877813504823151 0.7954545454545454 0.7915993537964459
GOOGL 0.28600823045267487 0.2949164851125635 -0.18039215686274512 0.13134873382551404
JNJ 0.2272727272727273 0.19573399237578337 -0.11111111111111116 0.10365081754514238
JPM 0.25 0.28038922155688617 -0.24649859943977592 0.0900571753932291
LLY 0.1913214990138068 0.13982663759991698 -0.05514705882352944 0.09001236444395444
MA 0.26791808873720147 0.2671711609547591 -0.1907894736842105 0.11426073971555006
MRK 0.2516666666666667 0.3331981981981983 -0.29780564263322884 0.08085364338400025
MSFT 0.16593886462882101 0.32252643070558495 -0.4684317718940937 -0.04133510345844904
NVDA 0.1511904761904762 0.32955832890777137 -0.48577680525164113 -0.05732024103090205
UNH 0.2

In [230]:
best_para, feature_ranks

({'AAPL': [100],
  'ABBV': [100],
  'AMZN': [10],
  'BAC': [10],
  'GOOGL': [100],
  'JNJ': [100],
  'JPM': [100],
  'LLY': [100],
  'MA': [100],
  'MRK': [100],
  'MSFT': [100],
  'NVDA': [10],
  'UNH': [10],
  'V': [10],
  'WFC': [10]},
 {'AAPL': ['Open', 'Close', '3avg Open'],
  'ABBV': ['Close', 'pos_art_count', '7avg Open'],
  'AMZN': ['finvader_neu', 'Open', '7avg Open'],
  'BAC': ['finvader_pos', 'Close', '3avg Open'],
  'GOOGL': ['Close', 'neu_art_count', '3avg Open'],
  'JNJ': ['Open', 'Close', '3avg Open'],
  'JPM': ['finvader_tot', 'Close', 'neg_art_count'],
  'LLY': ['Volume', '3avg Open', '7avg Open'],
  'MA': ['Close', 'pos_art_count', 'neg_art_count'],
  'MRK': ['Open', 'Close', 'neu_art_count'],
  'MSFT': ['finvader_pos', 'Close', 'Volume'],
  'NVDA': ['Close', 'Volume', '7avg Open'],
  'UNH': ['Open', 'Close', '7avg Open'],
  'V': ['finvader_neu', 'finvader_pos', 'Close'],
  'WFC': ['finvader_tot', 'Close', '7avg Open']})

## Running Simulation

In [None]:
cv_trades = [{},{},{},{}]
cv_opens = [{},{},{},{}]

for tick, frame in ticker_frames.items():
    test, train = de.train_test_split(frame)
    X_train  =train.drop(columns = 'indicator')
    y_train =  train['indicator']
    X_test = test.drop(columns = 'indicator')
    y_test = test['indicator'] 
    scaler = MinMaxScaler()
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    model = clf.fit(X_train_scaled, y_train)
    #CrossValue
    i=0
    for train_idx, test_idx in de.get_cv_splits(X_train_scaled):
        cv_opens[i][tick] = train.loc[test_idx, "Open"]

        df_tt = train.loc[train_idx].drop(columns=['Open'])
        df_ho = train.loc[test_idx].drop(columns=['Open'])

        pred_change, trades = lstm_model.run_lstm_model(df_tt, df_ho)
        cv_trades[i][tick] = trades
        i+=1

## Random Forest


outdated code below, used to mess around with basic random forest classifier and how it looks at feature importance. 

In [164]:
# forest = RandomForestClassifier()
# f_params = {'max_depth':[2,3,4,5,6,7,8], 'max_features':[1,2,3,4,5]}
# f_clf = GridSearchCV(forest, f_params)


In [165]:
# feature_importance = {}
# forest_scores = {}
# for tick, frame in ticker_frames.items():
#     test, train = de.train_test_split(frame)
#     model = f_clf.fit(train.drop(columns = 'indicator'), train['indicator'])
#     print(tick, model.best_params_)
#     feature_importance[tick] = model.best_estimator_.feature_importances_
#     score = model.score(test.drop(columns = 'indicator'), test['indicator'])
#     forest_scores[tick] = score


AAPL {'max_depth': 2, 'max_features': 4}


KeyboardInterrupt: 

In [None]:
# ticker_frames['AAPL'].info()
# feature_importance


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1254 entries, 2019-03-15 to 2024-03-18
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   finvader_neg    1254 non-null   float64
 1   finvader_neu    1254 non-null   float64
 2   finvader_pos    1254 non-null   float64
 3   finvader_tot    1254 non-null   float64
 4   Open            1254 non-null   float64
 5   High            1254 non-null   float64
 6   Low             1254 non-null   float64
 7   Close           1254 non-null   float64
 8   Volume          1254 non-null   float64
 9   Stock Splits    1254 non-null   float64
 10  pos_art_count   1254 non-null   int64  
 11  neg_art_count   1254 non-null   int64  
 12  neu_art_count   1254 non-null   int64  
 13  total_articles  1254 non-null   int64  
 14  indicator       1254 non-null   int64  
dtypes: float64(10), int64(5)
memory usage: 189.0 KB


{'AAPL': array([0.08272001, 0.09028469, 0.10744935, 0.08361805, 0.12212212,
        0.05094236, 0.08634029, 0.10851653, 0.10302431, 0.        ,
        0.05088494, 0.03210544, 0.02630264, 0.05568927]),
 'ABBV': array([0.11605411, 0.08067004, 0.09249412, 0.09974874, 0.09837868,
        0.08111936, 0.07299264, 0.09301106, 0.11541594, 0.        ,
        0.05829454, 0.0062725 , 0.00908459, 0.07646368]),
 'AMZN': array([0.05947292, 0.08151492, 0.0770803 , 0.06807625, 0.08622731,
        0.10020993, 0.07732937, 0.07278428, 0.10311641, 0.        ,
        0.10353052, 0.04542393, 0.03891933, 0.08631454]),
 'BAC': array([0.09715431, 0.09311841, 0.08983845, 0.13413093, 0.10180377,
        0.06951914, 0.06577092, 0.12704199, 0.11778439, 0.        ,
        0.03180268, 0.01552518, 0.0239796 , 0.03253023]),
 'GOOGL': array([0.06490577, 0.08834948, 0.08314039, 0.09216072, 0.08718987,
        0.08653844, 0.07934919, 0.07571266, 0.10751013, 0.        ,
        0.05754681, 0.0498483 , 0.06290091, 0.06

In [None]:
# forest_scores

{'AAPL': 0.5492957746478874,
 'ABBV': 0.49222797927461137,
 'AMZN': 0.5306532663316583,
 'BAC': 0.494077834179357,
 'GOOGL': 0.5123456790123457,
 'JNJ': 0.5698051948051948,
 'JPM': 0.4852941176470588,
 'LLY': 0.5424063116370809,
 'MA': 0.5170648464163823,
 'MRK': 0.47333333333333333,
 'MSFT': 0.5327510917030568,
 'NVDA': 0.5297619047619048,
 'UNH': 0.5087719298245614,
 'V': 0.5078809106830122,
 'WFC': 0.528830313014827}