In [105]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import GridSearchCV
import data_engineering as de
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

## Data Wrangling

In [85]:
df = pd.read_csv('../data/complete_next_open.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69433 entries, 0 to 69432
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Publishing Time  63703 non-null  object 
 1   Market Date      69433 non-null  object 
 2   Ticker           69433 non-null  object 
 3   Sector           69433 non-null  object 
 4   finvader_neg     63703 non-null  float64
 5   finvader_neu     63703 non-null  float64
 6   finvader_pos     63703 non-null  float64
 7   finvader_tot     63703 non-null  float64
 8   Source           63703 non-null  object 
 9   Headline         63703 non-null  object 
 10  Text             63703 non-null  object 
 11  URL              63703 non-null  object 
 12  Open             69433 non-null  float64
 13  High             69433 non-null  float64
 14  Low              69433 non-null  float64
 15  Close            69433 non-null  float64
 16  Volume           69433 non-null  int64  
 17  Dividends   

In [86]:
def overall_sentiment(x:int):
    threshold = .1
    if x > threshold:
        return 'pos'
    elif x < -threshold:
        return 'neg'
    else:
        return 'neu'

In [87]:
df['overall_sen'] = df['finvader_tot'].apply(overall_sentiment)
df['overall_sen'] = df['overall_sen'].astype('category')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69433 entries, 0 to 69432
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   Publishing Time  63703 non-null  object  
 1   Market Date      69433 non-null  object  
 2   Ticker           69433 non-null  object  
 3   Sector           69433 non-null  object  
 4   finvader_neg     63703 non-null  float64 
 5   finvader_neu     63703 non-null  float64 
 6   finvader_pos     63703 non-null  float64 
 7   finvader_tot     63703 non-null  float64 
 8   Source           63703 non-null  object  
 9   Headline         63703 non-null  object  
 10  Text             63703 non-null  object  
 11  URL              63703 non-null  object  
 12  Open             69433 non-null  float64 
 13  High             69433 non-null  float64 
 14  Low              69433 non-null  float64 
 15  Close            69433 non-null  float64 
 16  Volume           69433 non-null  int64  

In [88]:
counts = df.groupby(['Market Date', 'Ticker'])['overall_sen'].value_counts()
counts.loc['2019-03-15', 'AAPL']['pos']


0

In [89]:
features = ['finvader_neg',
            'finvader_neu',
            'finvader_pos',
            'finvader_tot',
            'Open',
            'High',
            'Low',
            'Close',
            'Volume',
            'Dividends',
            'Stock Splits']
df_mean = df.groupby(['Market Date', 'Ticker'])[features].mean().reset_index()
df_mean

Unnamed: 0,Market Date,Ticker,finvader_neg,finvader_neu,finvader_pos,finvader_tot,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,2019-03-01,AAPL,,,,,41.887973,42.097075,41.553888,42.053814,103544800.0,0.0,0.0
1,2019-03-01,ABBV,,,,,62.740368,63.589807,62.354977,62.999920,8567900.0,0.0,0.0
2,2019-03-01,AMZN,,,,,82.756500,83.712997,82.550003,83.586502,99498000.0,0.0,0.0
3,2019-03-01,BAC,,,,,25.918994,26.201778,25.812949,25.901320,45771500.0,0.0,0.0
4,2019-03-01,GOOGL,,,,,56.549999,57.500000,56.549999,57.425999,34086000.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19180,2024-03-28,MSFT,,,,,420.202052,421.110417,418.365369,419.962494,21871200.0,0.0,0.0
19181,2024-03-28,NVDA,,,,,900.000000,913.000000,891.929993,903.559998,43521200.0,0.0,0.0
19182,2024-03-28,UNH,,,,,495.000000,495.869995,489.299988,494.700012,3820000.0,0.0,0.0
19183,2024-03-28,V,,,,,277.975547,279.283124,276.608082,278.564453,5844400.0,0.0,0.0


In [90]:
labels = {'pos_art_count':'pos', 'neg_art_count':'neg', 'neu_art_count':'neu'}
for l in labels:
    df_mean[l] = df_mean.apply(lambda x: counts.loc[x['Market Date'], x['Ticker']][labels[l]], axis = 1)
df_mean['total_articles'] = df_mean['pos_art_count'] + df_mean['neg_art_count'] + df_mean['neu_art_count']


In [91]:
df_mean['Market Date'] = pd.to_datetime(df_mean['Market Date'])
df_mean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19185 entries, 0 to 19184
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Market Date     19185 non-null  datetime64[ns]
 1   Ticker          19185 non-null  object        
 2   finvader_neg    13455 non-null  float64       
 3   finvader_neu    13455 non-null  float64       
 4   finvader_pos    13455 non-null  float64       
 5   finvader_tot    13455 non-null  float64       
 6   Open            19185 non-null  float64       
 7   High            19185 non-null  float64       
 8   Low             19185 non-null  float64       
 9   Close           19185 non-null  float64       
 10  Volume          19185 non-null  float64       
 11  Dividends       19185 non-null  float64       
 12  Stock Splits    19185 non-null  float64       
 13  pos_art_count   19185 non-null  int64         
 14  neg_art_count   19185 non-null  int64         
 15  ne

In [92]:
tickers = df_mean['Ticker'].unique()
ticker_frames = {}
for tick in tickers:
    ticker_frames[tick] = df_mean[df_mean['Ticker'] == tick].set_index('Market Date').drop(columns  = ['Ticker', 'Dividends'])
ticker_frames['AAPL']

Unnamed: 0_level_0,finvader_neg,finvader_neu,finvader_pos,finvader_tot,Open,High,Low,Close,Volume,Stock Splits,pos_art_count,neg_art_count,neu_art_count,total_articles
Market Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2019-03-01,,,,,41.887973,42.097075,41.553888,42.053814,103544800.0,0.0,0,0,1,1
2019-03-04,,,,,42.226871,42.721989,41.813471,42.265327,109744800.0,0.0,0,0,1,1
2019-03-05,,,,,42.286956,42.301376,41.950465,42.188412,78949600.0,0.0,0,0,1,1
2019-03-06,,,,,41.981718,42.178806,41.806265,41.945667,83241600.0,0.0,0,0,1,1
2019-03-07,,,,,41.789428,41.926429,41.344785,41.460152,99185600.0,0.0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-03-22,,,,,171.527346,172.815607,169.829652,172.046646,71106600.0,0.0,0,0,1,1
2024-03-25,,,,,170.338972,171.707111,169.220478,170.618591,54288300.0,0.0,0,0,1,1
2024-03-26,,,,,169.769734,171.187808,169.350304,169.480133,57388400.0,0.0,0,0,1,1
2024-03-27,,,,,170.179175,173.364857,169.879579,173.075241,60273300.0,0.0,0,0,1,1


In [93]:
for tick, frame in ticker_frames.items():
    frame['indicator'] = -frame['Open'] + frame.shift(-1)['Open']
    frame['indicator'] = frame['indicator'].apply(lambda x: 1 if x >= 0 else 0)
    ticker_frames[tick] = frame[frame['finvader_tot'].notna()]
ticker_frames['AAPL']

Unnamed: 0_level_0,finvader_neg,finvader_neu,finvader_pos,finvader_tot,Open,High,Low,Close,Volume,Stock Splits,pos_art_count,neg_art_count,neu_art_count,total_articles,indicator
Market Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2019-03-15,0.000000,0.660000,0.340000,0.039600,44.428467,45.024531,44.161680,44.733707,156171600.0,0.0,0,0,1,1,1
2019-03-18,0.027000,0.805000,0.168000,0.419750,44.656790,45.279293,44.654385,45.190365,104879200.0,0.0,1,0,1,2,1
2019-03-19,0.022364,0.773273,0.204182,0.304418,45.269685,45.423508,44.685636,44.832249,126585600.0,0.0,7,0,4,11,0
2019-03-20,0.055800,0.777100,0.167000,0.184520,44.760132,45.543671,44.399609,45.224007,124140800.0,0.0,4,1,5,10,1
2019-03-21,0.047333,0.743417,0.209083,0.235392,45.671060,47.187659,45.620585,46.889626,204136800.0,0.0,7,3,2,12,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-03-12,0.068500,0.773375,0.158000,0.227625,172.915453,173.794265,170.778352,172.995346,59825400.0,0.0,4,1,3,8,0
2024-03-13,0.101700,0.768400,0.129900,-0.054850,172.535987,172.955416,170.528699,170.898209,52488700.0,0.0,4,6,0,10,1
2024-03-14,0.078636,0.755727,0.165636,0.091591,172.675796,174.073894,171.816961,172.765671,72913500.0,0.0,6,4,1,11,0
2024-03-15,0.058900,0.835100,0.105900,0.045200,170.938152,172.386185,170.059339,172.386185,121664700.0,0.0,4,3,3,10,1


## Logistic Regression

Time to model.

1. Throw in 1 ,3 ,7 day averages as features. 
2. Could also do averages of finvader scores. 
3. Can change indicator flag to be whether next day price is higher than the average of previous stock prices, rather than the current days
4. Should generate confusion matrices, look at other scoring methods as well! 
5. Upgrade Random Forest (XGboost?) ? 

In [94]:
lr = LogisticRegression(penalty = 'l1', solver = 'liblinear')
 

In [128]:
parameters = {'C' : [.01, .1, 1, 10]}
clf = GridSearchCV(lr, parameters)

In [135]:
best_para = {}
lr_scores = {}
for tick, frame in ticker_frames.items():
    test, train = de.train_test_split(frame)
    model = clf.fit(train.drop(columns = 'indicator'), train['indicator'])
    best_para[tick] = list(model.best_params_.values())
    score = model.score(test.drop(columns = 'indicator'), test['indicator'])
    lr_scores[tick] = score





In [136]:
best_para, lr_scores

({'AAPL': [0.1],
  'ABBV': [0.01],
  'AMZN': [10],
  'BAC': [1],
  'GOOGL': [10],
  'JNJ': [1],
  'JPM': [1],
  'LLY': [10],
  'MA': [1],
  'MRK': [0.01],
  'MSFT': [1],
  'NVDA': [0.1],
  'UNH': [0.1],
  'V': [0.01],
  'WFC': [0.01]},
 {'AAPL': 0.5472837022132797,
  'ABBV': 0.5544041450777202,
  'AMZN': 0.5417085427135678,
  'BAC': 0.7529610829103215,
  'GOOGL': 0.5154320987654321,
  'JNJ': 0.6688311688311688,
  'JPM': 0.5,
  'LLY': 0.5364891518737672,
  'MA': 0.5221843003412969,
  'MRK': 0.5316666666666666,
  'MSFT': 0.5109170305676856,
  'NVDA': 0.47619047619047616,
  'UNH': 0.8621553884711779,
  'V': 0.51138353765324,
  'WFC': 0.5271828665568369})

In [118]:
feature_ranks = {}
rfe = RFE(lr)
for tick, frame in ticker_frames.items():
    test, train = de.train_test_split(frame)
    model = rfe.fit(train.drop(columns = 'indicator'), train['indicator'])
    feature_ranks[tick] = [frame.columns[i] for i in model.get_support(1)]
feature_ranks



{'AAPL': ['finvader_tot',
  'Open',
  'High',
  'Low',
  'Close',
  'pos_art_count',
  'neg_art_count'],
 'ABBV': ['Open',
  'High',
  'Low',
  'Close',
  'pos_art_count',
  'neg_art_count',
  'total_articles'],
 'AMZN': ['Open',
  'Low',
  'Close',
  'pos_art_count',
  'neg_art_count',
  'neu_art_count',
  'total_articles'],
 'BAC': ['Open',
  'High',
  'Low',
  'Close',
  'pos_art_count',
  'neu_art_count',
  'total_articles'],
 'GOOGL': ['Open',
  'High',
  'Low',
  'Close',
  'neg_art_count',
  'neu_art_count',
  'total_articles'],
 'JNJ': ['Open',
  'High',
  'Low',
  'Close',
  'Volume',
  'pos_art_count',
  'neu_art_count'],
 'JPM': ['finvader_tot',
  'Open',
  'High',
  'Close',
  'neg_art_count',
  'neu_art_count',
  'total_articles'],
 'LLY': ['Open',
  'High',
  'Low',
  'Close',
  'neg_art_count',
  'neu_art_count',
  'total_articles'],
 'MA': ['Open',
  'High',
  'Low',
  'Close',
  'Volume',
  'pos_art_count',
  'neg_art_count'],
 'MRK': ['Open',
  'High',
  'Low',
  'Clo

## Random Forest

In [119]:
forest = RandomForestClassifier()
f_params = {'max_depth':[2,3,4,5,6,7,8], 'max_features':[1,2,3,4,5]}
f_clf = GridSearchCV(forest, f_params)


In [144]:
feature_importance = {}
forest_scores = {}
for tick, frame in ticker_frames.items():
    test, train = de.train_test_split(frame)
    model = f_clf.fit(train.drop(columns = 'indicator'), train['indicator'])
    print(tick, model.best_params_)
    feature_importance[tick] = model.best_estimator_.feature_importances_
    score = model.score(test.drop(columns = 'indicator'), test['indicator'])
    forest_scores[tick] = score


AAPL {'max_depth': 3, 'max_features': 4}
ABBV {'max_depth': 3, 'max_features': 2}
AMZN {'max_depth': 3, 'max_features': 1}
BAC {'max_depth': 5, 'max_features': 4}
GOOGL {'max_depth': 5, 'max_features': 1}
JNJ {'max_depth': 8, 'max_features': 5}
JPM {'max_depth': 8, 'max_features': 5}
LLY {'max_depth': 2, 'max_features': 4}
MA {'max_depth': 2, 'max_features': 1}
MRK {'max_depth': 8, 'max_features': 4}
MSFT {'max_depth': 8, 'max_features': 4}
NVDA {'max_depth': 3, 'max_features': 1}
UNH {'max_depth': 7, 'max_features': 5}
V {'max_depth': 8, 'max_features': 1}
WFC {'max_depth': 2, 'max_features': 1}


In [147]:
ticker_frames['AAPL'].info()
feature_importance


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1254 entries, 2019-03-15 to 2024-03-18
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   finvader_neg    1254 non-null   float64
 1   finvader_neu    1254 non-null   float64
 2   finvader_pos    1254 non-null   float64
 3   finvader_tot    1254 non-null   float64
 4   Open            1254 non-null   float64
 5   High            1254 non-null   float64
 6   Low             1254 non-null   float64
 7   Close           1254 non-null   float64
 8   Volume          1254 non-null   float64
 9   Stock Splits    1254 non-null   float64
 10  pos_art_count   1254 non-null   int64  
 11  neg_art_count   1254 non-null   int64  
 12  neu_art_count   1254 non-null   int64  
 13  total_articles  1254 non-null   int64  
 14  indicator       1254 non-null   int64  
dtypes: float64(10), int64(5)
memory usage: 189.0 KB


{'AAPL': array([0.08272001, 0.09028469, 0.10744935, 0.08361805, 0.12212212,
        0.05094236, 0.08634029, 0.10851653, 0.10302431, 0.        ,
        0.05088494, 0.03210544, 0.02630264, 0.05568927]),
 'ABBV': array([0.11605411, 0.08067004, 0.09249412, 0.09974874, 0.09837868,
        0.08111936, 0.07299264, 0.09301106, 0.11541594, 0.        ,
        0.05829454, 0.0062725 , 0.00908459, 0.07646368]),
 'AMZN': array([0.05947292, 0.08151492, 0.0770803 , 0.06807625, 0.08622731,
        0.10020993, 0.07732937, 0.07278428, 0.10311641, 0.        ,
        0.10353052, 0.04542393, 0.03891933, 0.08631454]),
 'BAC': array([0.09715431, 0.09311841, 0.08983845, 0.13413093, 0.10180377,
        0.06951914, 0.06577092, 0.12704199, 0.11778439, 0.        ,
        0.03180268, 0.01552518, 0.0239796 , 0.03253023]),
 'GOOGL': array([0.06490577, 0.08834948, 0.08314039, 0.09216072, 0.08718987,
        0.08653844, 0.07934919, 0.07571266, 0.10751013, 0.        ,
        0.05754681, 0.0498483 , 0.06290091, 0.06

In [146]:
forest_scores

{'AAPL': 0.5492957746478874,
 'ABBV': 0.49222797927461137,
 'AMZN': 0.5306532663316583,
 'BAC': 0.494077834179357,
 'GOOGL': 0.5123456790123457,
 'JNJ': 0.5698051948051948,
 'JPM': 0.4852941176470588,
 'LLY': 0.5424063116370809,
 'MA': 0.5170648464163823,
 'MRK': 0.47333333333333333,
 'MSFT': 0.5327510917030568,
 'NVDA': 0.5297619047619048,
 'UNH': 0.5087719298245614,
 'V': 0.5078809106830122,
 'WFC': 0.528830313014827}