In [None]:
## NOTE: This notebook requires the installation of GoogleNews-vectors-negative300.bin

In [49]:
import pandas as pd
import os
import datetime
import pandas_datareader.data as web
import numpy as np
import gensim
from gensim.models import Word2Vec
import nltk
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import StandardScaler, MaxAbsScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score, f1_score
from scipy.sparse import vstack, hstack
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

In [2]:
speeches_file_path = './Text_Scraping/speeches_df_2000_to_2023.csv'

speeches_df = pd.read_csv(speeches_file_path)
speeches_df['Date'] = pd.to_datetime(speeches_df['Date'])
speeches_df = speeches_df.drop(['Unnamed: 0', 'quarter', 'authorId', 'year'], axis=1)

types = ['Speech'] * speeches_df.shape[0]
speeches_df['Type'] = types

meetings_path = './Text_Scraping/FOMC_Statements_and_Minutes.csv'
meetings_df = pd.read_csv(meetings_path)
meetings_df['Date'] = pd.to_datetime(meetings_df['Date'])
meetings_df = meetings_df.drop(['Unnamed: 0'], axis=1)

authors = []
for row in meetings_df['Type']:
    if row == 'Minutes':
        authors.append('FOMC Minutes')
    elif row == 'Statement':
        authors.append('FOMC Statement')
    else:
        authors.append('None')

meetings_df['Author'] = authors

text_df = result_df = pd.concat([speeches_df, meetings_df], axis=0)

In [3]:
macro_path = 'Macro-Micro/macro_indicators.csv'

macro_df = pd.read_csv(macro_path)
macro_df['Date'] = macro_df['Unnamed: 0']
macro_df['Date'] = pd.to_datetime(macro_df['Date'])
df = pd.merge(text_df, macro_df, on='Date', how='inner')

directory = 'Macro-Micro/company_sentiment_count/'


for filename in os.listdir(directory):
    if filename == 'JPMorgan_Chase__Co.csv' or filename == 'Goldman_Sachs_Group_Inc.csv':
        continue
    f = os.path.join(directory, filename)
    
    # Read company sentiment DataFrame
    company_df = pd.read_csv(f)
    
    # Calculate sentiment score
    company_df[filename.split('.')[0] + ' sentiment'] = (company_df['positive'] - company_df['negative']) / (company_df['positive'] + company_df['negative'])
    
    
    
    # Rename 'date' column to 'Date'
    company_df.rename(columns={'date': 'Date'}, inplace=True)
    
    # Convert 'Date' column to datetime format
    company_df['Date'] = pd.to_datetime(company_df['Date'])
    
    company_df = company_df[~company_df['Date'].duplicated(keep='first')]
    
    company_df.set_index('Date', inplace=True)
    
    
    # Resample the DataFrame to include every day
    df_resampled = company_df.resample('D').asfreq()

    # Sort the DataFrame by the "Date" column
    df_resampled.sort_values(by='Date', inplace=True)

    # Forward fill missing values in all columns
    df_resampled.ffill(inplace=True)

    # Reset the index to make "Date" a column again
    df_resampled.reset_index(inplace=True)
    
    company_df = df_resampled
    
    company_df[filename.split('.')[0] + ' sentiment'] = company_df[filename.split('.')[0] + ' sentiment'].fillna(method='ffill')
    
    sentiment_col = []
    for date in df['Date']:
        if date in company_df['Date'].values:
            cell_value = company_df.loc[company_df['Date'] == date, filename.split('.')[0] + ' sentiment'].iloc[0]
            sentiment_col.append(cell_value)
        else:
            sentiment_col.append(0)
    
    df[filename.split('.')[0] + ' sentiment'] = sentiment_col

df=df.drop(['Unnamed: 0'], axis=1)
df

Unnamed: 0,Date,Author,Text,Type,year,quarter,2_YR_Treasury,Eff_Fed_Funds,Real_GDP,Core_CPI,...,UnitedHealth_Group_Inc sentiment,Cisco_Systems_Inc sentiment,Chevron_Corp sentiment,Microsoft_Corp sentiment,CocaCola_Co sentiment,Amgen_Inc sentiment,Caterpillar_Inc sentiment,Procter__Gamble_Co sentiment,Walmart_Inc sentiment,Merck__Co_Inc sentiment
0,2000-12-08,Chairman Alan Greenspan,\n \n\r\nBuildings such as this new Birmingha...,Speech,,,5.50,6.47,13262.250,2.983840,...,-0.146199,-0.361702,-0.348485,-0.606383,0.084746,-0.212121,-0.170984,-0.292929,-0.076923,-1.000000
1,2000-12-06,"Vice Chairman Roger W. Ferguson, Jr.",\n \n\n\r\n\tThank you for inviting me to the...,Speech,,,5.42,6.48,13262.250,2.983840,...,-0.146199,-0.361702,-0.348485,-0.606383,0.084746,-0.212121,-0.170984,-0.292929,-0.076923,-1.000000
2,2000-12-06,Governor Edward M. Gramlich,"\n \n\n Subprime Lending, Predatory Lending \...",Speech,,,5.42,6.48,13262.250,2.983840,...,-0.146199,-0.361702,-0.348485,-0.606383,0.084746,-0.212121,-0.170984,-0.292929,-0.076923,-1.000000
3,2000-12-05,Chairman Alan Greenspan,"\n \n\r\nTechnological innovation, and in par...",Speech,,,5.49,6.51,13262.250,2.983840,...,-0.146199,-0.361702,-0.348485,-0.606383,0.084746,-0.212121,-0.170984,-0.292929,-0.076923,-1.000000
4,2000-11-21,Governor Edward M. Gramlich,\n \n\n Financial Literacy \n \r\nPartnership...,Speech,,,5.86,6.50,13262.250,2.881613,...,-0.146199,-0.361702,-0.348485,-0.606383,0.084746,-0.212121,-0.170984,-0.292929,-0.076923,-1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1916,2010-05-09,FOMC Statement,"\nThe Federal Reserve, the central bank of the...",Statement,2010.0,2.0,0.83,0.20,15605.628,0.799868,...,-0.364532,-0.449876,-0.509537,-0.441935,-0.416938,-0.380835,-0.500000,-0.267057,-0.551020,-0.794949
1917,2020-03-03,FOMC Statement,"\nThe Federal Reserve, the central bank of the...",Statement,2020.0,1.0,0.71,1.59,18951.992,2.641068,...,-0.318504,-0.549211,-0.426654,-0.481121,-0.273030,-0.888889,-0.362930,-0.138095,-0.440520,-0.273951
1918,2020-03-23,FOMC Statement,"\nThe Federal Reserve, the central bank of the...",Statement,2020.0,1.0,0.28,0.15,18951.992,2.641068,...,-0.318504,-0.549211,-0.426654,-0.481121,-0.273030,-0.888889,-0.362930,-0.138095,-0.440520,-0.273951
1919,2020-03-31,FOMC Statement,"\nThe Federal Reserve, the central bank of the...",Statement,2020.0,1.0,0.23,0.08,18951.992,2.641068,...,-0.410000,-0.549211,-0.558648,-0.490241,-0.341693,-0.425390,-0.362930,-0.360419,-0.440520,-0.424149


In [4]:
# Setting PR Column
delay = datetime.timedelta(days=60)

future_change = []

rate_classification = []

curr_rates = []

future_rates = []

for date in df['Date']:
    data = web.DataReader(['FEDFUNDS'], 'fred', date, date + delay)
    if len(data['FEDFUNDS']) < 2:
        
        future_change.append(np.nan)
        rate_classification.append(np.nan)
        future_rates.append(np.nan)
        curr_rates.append(np.nan)
    else:
        future_change.append(data['FEDFUNDS'][-1] - data['FEDFUNDS'][0])
        curr_rates.append(data['FEDFUNDS'][0])
        future_rates.append(data['FEDFUNDS'][-1])
        
        if data['FEDFUNDS'][-1] > data['FEDFUNDS'][0]:
            rate_classification.append(1)
        elif data['FEDFUNDS'][-1] == data['FEDFUNDS'][0]:
            rate_classification.append(0)
        else:
            rate_classification.append(-1)

df['PR'] = future_change
df['Class'] = rate_classification
df['Future'] = future_rates
df['Curr'] = curr_rates
df = df.dropna()

In [5]:
def get_semantic_features(X_train, X_test, train_model, vector_size):

    X_train = [nltk.word_tokenize(row) for row in X_train]
    X_test = [nltk.word_tokenize(row) for row in X_test]
    
    
    
    train_embeddings = []
    for row in X_train:
        embedding = []
        zeros = True
        for word in row:
            if word in train_model.wv:
                embedding.append(train_model.wv[word])
                zeros = False
                
        if zeros:
            train_embeddings.append(np.zeros(vector_size))
        else:
            train_embeddings.append(np.mean(embedding, axis=0))
            
    X_train_average = train_embeddings
    
    test_embeddings = []
    for row in X_test:
        embedding = []
        zeros = True
        for word in row:
            if word in train_model.wv:
                embedding.append(train_model.wv[word])
                zeros = False
                
        if zeros:
            test_embeddings.append(np.zeros(vector_size))
        else:
            test_embeddings.append(np.mean(embedding, axis=0))
            
    X_test_average = test_embeddings
    
    return X_train_average, X_test_average

In [63]:
X_text = df['Text']
X_text_dates = df[['Date', 'Type']]
X_macro = df[['2_YR_Treasury', 'Eff_Fed_Funds',
       'Real_GDP', 'Core_CPI', 'PCE', 'Unemployment', 'Savings_Rate',
       'Retail_Sales', 'Manufacturing_PMI', 'Consumer_Sent', 'Liquidity',
       'Volatility', 'SP_500', '2_YR_Treasury_pct_chng',
       'Eff_Fed_Funds_pct_chng', 'Real_GDP_pct_chng', 'Core_CPI_pct_chng',
       'PCE_pct_chng', 'Unemployment_pct_chng', 'Savings_Rate_pct_chng',
       'Retail_Sales_pct_chng', 'Manufacturing_PMI_pct_chng',
       'Consumer_Sent_pct_chng', 'Liquidity_pct_chng', 'Volatility_pct_chng',
       'SP_500_pct_chng']]

X_micro = df[['Honeywell_International_Inc sentiment',
       'Travelers_Companies_Inc sentiment', 'Boeing_Co sentiment',
       'American_Express_Co sentiment', 'Nike_Inc sentiment',
       'Walt_Disney_Co sentiment', 'Intel_Corp sentiment',
       'Johnson__Johnson sentiment', 'McDonalds_Corp sentiment',
       'International_Business_Machines_Corp sentiment',
       'Walgreens_Boots_Alliance_Inc sentiment', '3M_Co sentiment',
       'Visa_Inc sentiment', 'Apple_Inc sentiment',
       'Verizon_Communications_Inc sentiment', 'Dow_Inc sentiment',
       'Salesforce_Inc sentiment', 'Home_Depot_Inc sentiment',
       'UnitedHealth_Group_Inc sentiment', 'Cisco_Systems_Inc sentiment',
       'Chevron_Corp sentiment', 'Microsoft_Corp sentiment',
       'CocaCola_Co sentiment', 'Amgen_Inc sentiment',
       'Caterpillar_Inc sentiment', 'Procter__Gamble_Co sentiment',
       'Walmart_Inc sentiment', 'Merck__Co_Inc sentiment']]

X_auto = df['Curr']


y_reg = df['PR']

y_class = df['Class']


X_text_train, X_text_test, X_train_macro, X_test_macro, X_train_micro, X_test_micro, X_auto_train, X_auto_test, y_reg_train, y_reg_test, y_class_train, y_class_test = train_test_split(
    X_text, X_macro, X_micro, X_auto, y_reg, y_class, test_size=0.2
)

# vectorizer the text
tfidf_vectorizer = TfidfVectorizer(lowercase=True, ngram_range=(1, 2), stop_words='english', max_df=0.7, min_df=5)
X_text_train_tfidf = tfidf_vectorizer.fit_transform(X_text_train)
X_text_test_tfidf = tfidf_vectorizer.transform(X_text_test)

# scaling for macro model
scaler = StandardScaler()
X_train_macro.replace([np.inf, -np.inf], np.nan, inplace=True)
X_train_macro.fillna(0, inplace=True)
X_test_macro.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test_macro.fillna(0, inplace=True)
X_train_macro = scaler.fit_transform(X_train_macro)
X_test_macro = scaler.transform(X_test_macro)

# semantic model
google_model_key = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True, limit=40000)
google_model = gensim.models.Word2Vec()
google_model.wv = google_model_key
X_train_google, X_test_google = get_semantic_features(X_text_train, X_text_test, google_model, 300)

In [64]:
# auto model
auto_reg = LinearRegression()
auto_reg.fit(np.array(X_auto_train).reshape(-1, 1), y_reg_train)
auto_reg_pred = auto_reg.predict(np.array(X_auto_test).reshape(-1, 1))

# text only regression
text_reg_model = LinearRegression()
text_reg_model.fit(X_text_train_tfidf, y_reg_train)
y_pred_text_reg = text_reg_model.predict(X_text_test_tfidf)

# macro only regression
macro_reg_model = LinearRegression()
macro_reg_model.fit(X_train_macro, y_reg_train)
y_pred_macro_reg = macro_reg_model.predict(X_test_macro)

# micro only regression
micro_reg_model = LinearRegression()
micro_reg_model.fit(X_train_micro, y_reg_train)
y_pred_micro_reg = micro_reg_model.predict(X_test_micro)

# semantic regression
semantic_reg_model = LinearRegression()
semantic_reg_model.fit(X_train_google, y_reg_train)
y_pred_semantic_reg = semantic_reg_model.predict(X_test_google)

# dummy model
dummy=DummyClassifier(strategy='most_frequent')
dummy.fit(X_text_train_tfidf, y_class_train)
dummy_pred = dummy.predict(X_text_test_tfidf)

# text only class
text_class_model = LogisticRegression(max_iter = 1000)
text_class_model.fit(X_text_train_tfidf, y_class_train)
y_pred_text_class = text_class_model.predict(X_text_test_tfidf)

# numerical only class
macro_class_model = LogisticRegression(max_iter = 1000)
macro_class_model.fit(X_train_macro, y_class_train)
y_pred_macro_class = macro_class_model.predict(X_test_macro)

# micro only class
micro_class_model = LogisticRegression(max_iter = 1000)
micro_class_model.fit(X_train_micro, y_class_train)
y_pred_micro_class = micro_class_model.predict(X_test_micro)

# semantic class
semantic_class_model = LogisticRegression(max_iter = 1000)
semantic_class_model.fit(X_train_google, y_class_train)
y_pred_semantic_class = semantic_class_model.predict(X_test_google)

In [65]:
print('-------Regression Models-------')
auto_mse = mean_squared_error(y_reg_test, auto_reg_pred)
print("Auto MSE:", auto_mse)

text_only_mse = mean_squared_error(y_reg_test, y_pred_text_reg)
print("Text MSE:", text_only_mse)

macro_only_mse = mean_squared_error(y_reg_test, y_pred_macro_reg)
print("Macro MSE:", macro_only_mse)

micro_only_mse = mean_squared_error(y_reg_test, y_pred_micro_reg)
print("Micro MSE:", micro_only_mse)

semantic_only_mse = mean_squared_error(y_reg_test, y_pred_semantic_reg)
print("Semantic MSE:", semantic_only_mse)

print('-------Classification Models-------')
auto_f1 = f1_score(y_class_test, dummy_pred, average='weighted')
print("Auto F1:", auto_f1)

text_only_f1 = f1_score(y_class_test, y_pred_text_class, average='weighted')
print("Text F1:", text_only_f1)

macro_only_f1 = f1_score(y_class_test, y_pred_macro_class, average='weighted')
print("Macro F1:", macro_only_f1)

micro_only_f1 = f1_score(y_class_test, y_pred_micro_class, average='weighted')
print("Micro F1:", micro_only_f1)

semantic_only_f1 = f1_score(y_class_test, y_pred_semantic_class, average='weighted')
print("Semantic F1:", semantic_only_f1)

-------Regression Models-------
Auto MSE: 0.05827756565176713
Text MSE: 0.030600491775177367
Macro MSE: 0.03442092023180792
Micro MSE: 0.04198849484010601
Semantic MSE: 1.8129433880015189
-------Classification Models-------
Auto F1: 0.26074925500212853
Text F1: 0.5961327155419982
Macro F1: 0.5688358828104745
Micro F1: 0.6100823045267488
Semantic F1: 0.39477332978381197


In [66]:
X_text_tfidf = vstack([X_text_train_tfidf, X_text_test_tfidf] )
X_macro = np.vstack([X_train_macro, X_test_macro])
X_micro = pd.concat([X_train_micro, X_test_micro], axis=0)
X_google = np.vstack([X_train_google, X_test_google])
y_reg = pd.concat([y_reg_train, y_reg_test], axis = 0)
y_class = pd.concat([y_class_train, y_class_test], axis = 0)

result_df = pd.DataFrame(columns = ['tfidf', 'macro', 'micro', 'semantic', 'mse', 'mae', 'f1', 'accuracy'])

for tfidf in range(2):
    for macro in range(2):
        for micro in range(2):
            for semantic in range(2):
                
                
                if not (macro or micro or tfidf or semantic):
                    continue
                
                all_features = None
                
                if tfidf:
                    text_features = X_text_tfidf 
                    if macro:
                        text_features = hstack([text_features, X_macro])
                    if micro:
                        text_features = hstack([text_features, X_micro])
                    if semantic:
                        text_features = hstack([text_features, X_google])
                    all_features = text_features
                else:
                    non_text_features = pd.DataFrame()
                    if macro:
                        non_text_features = pd.concat([non_text_features, pd.DataFrame(X_macro).reset_index(drop=True)], axis=1)
                    if micro:
                        non_text_features = pd.concat([non_text_features, pd.DataFrame(X_micro).reset_index(drop=True)], axis=1)
                    if semantic:
                        non_text_features = pd.concat([non_text_features, pd.DataFrame(X_google).reset_index(drop=True)], axis=1)
                    all_features = non_text_features
                    all_features.columns = all_features.columns.astype(str)
                
                
                reg_model = LinearRegression()
                log_model = LogisticRegression(max_iter=1000)
                
                scoring_metrics_reg = {
                    'mse': make_scorer(mean_squared_error),
                    'mae': make_scorer(mean_absolute_error) 
                }

                scoring_metrics_log = {
                    'accuracy': 'accuracy',
                    'f1': 'f1_weighted'
                }

                
                cv_results_reg = cross_validate(reg_model, all_features, y_reg, cv=10, scoring=scoring_metrics_reg)
                cv_results_log = cross_validate(log_model, all_features, y_class, cv=10, scoring=scoring_metrics_log)

                mse = cv_results_reg['test_mse'].mean()
                mae = cv_results_reg['test_mae'].mean()
                accuracy = cv_results_log['test_accuracy'].mean()
                f1 = cv_results_log['test_f1'].mean()

                result_df.loc[len(result_df)] = [tfidf, macro, micro, semantic, mse, mae, f1, accuracy]
                print([tfidf, macro, micro, semantic, mse, mae, f1, accuracy])
                

result_df

[0, 0, 0, 1, 0.2559737351175463, 0.33362534537271754, 0.3108413127732934, 0.45146341463414635]
[0, 0, 1, 0, 0.030241093127397516, 0.11952334603721522, 0.5371242135924592, 0.5811585365853658]
[0, 0, 1, 1, 0.43129079368945755, 0.46511571348127745, 0.548242721132067, 0.5911585365853658]
[0, 1, 0, 0, 0.021586576310486046, 0.0990380116416582, 0.5889299142908546, 0.6008536585365853]
[0, 1, 0, 1, 0.4712372334534862, 0.446133852374966, 0.6014938941654606, 0.6134146341463416]
[0, 1, 1, 0, 0.020058930259986764, 0.09741105039244033, 0.6381426984842082, 0.6507926829268293]
[0, 1, 1, 1, 10.79010118685279, 1.7580724672059453, 0.6533158037180492, 0.6607317073170731]
[1, 0, 0, 0, 0.02030545676412905, 0.08301857484801314, 0.5833399862077776, 0.6008536585365853]
[1, 0, 0, 1, 0.020285679111794026, 0.08270891558552801, 0.5882743101950539, 0.6059756097560974]
[1, 0, 1, 0, 0.018633851311659462, 0.08356535143237268, 0.6039012860784683, 0.6309756097560976]
[1, 0, 1, 1, 0.01856184501410352, 0.08318498110276012

Unnamed: 0,tfidf,macro,micro,semantic,mse,mae,f1,accuracy
0,0.0,0.0,0.0,1.0,0.255974,0.333625,0.310841,0.451463
1,0.0,0.0,1.0,0.0,0.030241,0.119523,0.537124,0.581159
2,0.0,0.0,1.0,1.0,0.431291,0.465116,0.548243,0.591159
3,0.0,1.0,0.0,0.0,0.021587,0.099038,0.58893,0.600854
4,0.0,1.0,0.0,1.0,0.471237,0.446134,0.601494,0.613415
5,0.0,1.0,1.0,0.0,0.020059,0.097411,0.638143,0.650793
6,0.0,1.0,1.0,1.0,10.790101,1.758072,0.653316,0.660732
7,1.0,0.0,0.0,0.0,0.020305,0.083019,0.58334,0.600854
8,1.0,0.0,0.0,1.0,0.020286,0.082709,0.588274,0.605976
9,1.0,0.0,1.0,0.0,0.018634,0.083565,0.603901,0.630976


In [29]:
result_df.to_csv('result_df.csv', index=False)