## Import Modules

In [None]:
import src.finbert as fb
import src.ws_yahoo as wsy
import pandas as pd

import os
import glob

## Sentiment CSV Creator

In [None]:
# Pulls website articles, puts into Dataframe
# This takes quite a while to run

def main_dataframe_creator(ticker_list, start, end):
    """Write one price/sentiment CSV per ticker into ``./csv/``.

    The process working directory is switched to ``./csv/`` and is left
    there when the function returns. For every ticker the frame returned by
    ``wsy.dataframe_price_sentiment(ticker, start, end)`` is saved as
    ``<ticker>.csv`` without its index.

    Args:
        ticker_list: Iterable of ticker symbols; each symbol is also used as
            the output file name.
        start: Start of the date range, handed to ``wsy`` unchanged.
        end: End of the date range, handed to ``wsy`` unchanged.
    """
    # Persistent directory switch: the output paths below are relative to it.
    os.chdir('./csv/')

    for symbol in ticker_list:
        frame = wsy.dataframe_price_sentiment(symbol, start, end)
        frame.to_csv(f'./{symbol}.csv', index=False)

In [None]:
# Uses article CSVs to create sentiment csvs
# This takes a VERY long time to run

def sentiment_CSV():
    """Turn every article CSV in ``../csv/`` into a sentiment CSV.

    Each ``*.csv`` file found in ``../csv/`` is loaded, passed through
    ``fb.sentiment_poster`` and written to ``../sentiment/`` as
    ``<company>-sentiment.csv``, where ``<company>`` is the value at index
    label 0 of the result's ``company`` column.

    Both folders are reached by changing the process working directory, so
    the relative paths only resolve when the function is started from a
    sibling of ``csv``/``sentiment`` (or from one of those two folders).
    The working directory is not restored afterwards.
    """
    articles_dir = '../csv/'
    output_dir = '../sentiment/'

    os.chdir(articles_dir)
    csv_files = glob.glob('*.{}'.format('csv'))
    print(csv_files)

    for csv_name in csv_files:
        # The previous pass ended in the output folder, so hop back first.
        os.chdir(articles_dir)
        print(csv_name)
        scored = fb.sentiment_poster(pd.read_csv(csv_name))

        os.chdir(output_dir)
        scored.to_csv(f'./{scored.company[0]}-sentiment.csv')

In [None]:
# Pulls CSVs out from ../sentiment and puts into one single dataframe
# This is the final dataset for feature engineering and the pipeline

def sentiment_dataframe():
    """Load every CSV in ``../sentiment/`` and stack them into one frame.

    The process working directory is changed to ``../sentiment/`` and left
    there. Each file is read with its first column as the index; the frames
    are then concatenated with a fresh 0..n-1 index.

    Returns:
        pandas.DataFrame: All sentiment rows from all files.

    Raises:
        ValueError: Raised by ``pd.concat`` when the folder holds no CSV.
    """
    os.chdir('../sentiment/')
    csv_files = glob.glob('*.{}'.format('csv'))
    print(csv_files)

    frames = [pd.read_csv(csv_name, index_col=0) for csv_name in csv_files]
    return pd.concat(frames, ignore_index=True)

## Ticker News Article Scraping

In [None]:
# Tickers to scrape. A longer example list would be:
#   ticker_list = ['WPM', 'PAAS', 'HL', 'MAG', 'CDE']
# NOTE: ws_yahoo.py currently only grabs 2 articles; that limit has to be
# changed by hand inside the script.
ticker_list = ['HBM', 'PAAS']
# Date range handed to main_dataframe_creator, written as 'YYYY-MM-DD' strings
start = '2017-01-01'
end = '2022-11-30'

# Writes one <ticker>.csv per ticker and leaves the working directory in ./csv/
main_dataframe_creator(ticker_list, start, end)

In [None]:
# Generate all the sentiment CSVs from the scraped news-article CSVs.
# This can take a VERY long time - about 30 seconds per article.
# The web scraper generally grabs 80-90 articles per ticker (when no preset limit is on).

sentiment_CSV()

In [None]:
# Compile all the sentiment CSVs into a single dataframe
# (sentiment_dataframe also switches the working directory to ../sentiment/)
df = sentiment_dataframe()

In [None]:
# Show the combined sentiment dataframe as the cell output
df

## Feature Engineering

In [None]:
def feature_set1(df):
    """Derive the sentiment features and drop the raw price/article columns.

    Added columns:

    * ``fb_body_weight`` / ``fb_head_weight`` - probability of the dominant
      class divided by the sum of the other two. The class is read from
      ``fb_<part>_stmt``: 0 uses ``posi``, 1 uses ``nega``, 2 uses ``neut``
      as the numerator. Rows with any other label stay NaN.
    * ``b_alignment`` / ``fb_alignment`` - 1 when the body and headline
      labels are equal, else 0.
    * ``all_alignment`` - 1 when the two alignment flags are equal, else 0.
    * one dummy column per distinct value of each ``*_stmt`` column.

    The input frame is left untouched; all work happens on a copy. (The
    previous version wrote the weight and alignment columns into the
    caller's frame while the dummy columns only appeared in the result.)

    Args:
        df: Frame holding the ``b_*``/``fb_*`` sentiment columns plus the
            price and article columns listed in ``columns_to_drop``.

    Returns:
        pandas.DataFrame: New frame with the features added, the raw columns
        removed and a fresh 0..n-1 index.

    Raises:
        KeyError: If a required sentiment column or any column listed in
            ``columns_to_drop`` is missing.
    """
    # Copy first so the caller's frame does not end up half-modified.
    df = df.copy()

    # weight of sentiment, depending on which sentiment, the other two weighted against the dominant
    for part in ('body', 'head'):
        stmt = df[f'fb_{part}_stmt']
        posi = df[f'fb_{part}_posi']
        nega = df[f'fb_{part}_nega']
        neut = df[f'fb_{part}_neut']
        weight = f'fb_{part}_weight'

        df.loc[stmt == 0, weight] = posi / (neut + nega)
        df.loc[stmt == 1, weight] = nega / (posi + neut)
        df.loc[stmt == 2, weight] = neut / (posi + nega)

    # do the headlines and body sentiments align
    # Order matters: all_alignment is built from the two flags created before it.
    alignment_specs = (
        ('b_body_stmt', 'b_head_stmt', 'b_alignment'),
        ('fb_body_stmt', 'fb_head_stmt', 'fb_alignment'),
        ('b_alignment', 'fb_alignment', 'all_alignment'),
    )
    for left, right, flag in alignment_specs:
        df.loc[df[left] == df[right], flag] = 1
        df.loc[df[left] != df[right], flag] = 0

    # get dummies (appended in the same column order as four separate concats)
    stmt_columns = ['b_body_stmt', 'b_head_stmt', 'fb_body_stmt', 'fb_head_stmt']
    dummies = [pd.get_dummies(df[col], prefix=col) for col in stmt_columns]
    df = pd.concat([df] + dummies, axis=1)

    columns_to_drop = [
        'Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',  # Financial columns
        'company', 'level_0', 'index', 'url', 'headline', 'source', 'body', 'date',  # Categorical columns
    ]

    return df.drop(columns=columns_to_drop).reset_index(drop=True)

In [None]:
# Engineered feature table; this is what datamodel_creation() splits below
df_final = feature_set1(df)

## ML Pipeline

In [None]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
import xgboost as xgb


## WARNINGS ARE TURNED OFF ##
# NOTE: filterwarnings is called without a category, so this silences every
# warning for the rest of the session, not only DataConversionWarning.

import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore')

# Shared estimator instances (library defaults) that are handed to pipeline() below
lr = LogisticRegression()
rfc = RandomForestClassifier()
gbc = GradientBoostingClassifier()  
xboost = xgb.XGBClassifier()

#### Datamodel Functions

In [None]:
def datamodel_creation(df):
    """Split the feature table into train and test sets.

    The ``target`` column is the label; every other column is a feature.
    The split is 80/20 with a fixed random seed.

    Args:
        df: Feature table containing a ``target`` column.

    Returns:
        tuple: ``(datamodel, model_feats)`` where ``datamodel`` is the list
        ``[X_train, y_train, X_test, y_test]`` and ``model_feats`` is the
        list of feature column names. The label is selected with a
        one-element list, so ``y_train``/``y_test`` are single-column
        frames.
    """
    label_cols = ['target']

    features = df.drop(columns=label_cols)
    labels = df[label_cols]

    # Fixed seed so repeated runs produce the same split
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.20, random_state=42
    )

    return [X_train, y_train, X_test, y_test], features.columns.tolist()

#### Metrics and Pipeline

In [None]:
def metrics(y_test, y_pred):
    """Print and return the classification scores for one set of predictions.

    Args:
        y_test: True labels.
        y_pred: Predicted labels. The ROC-AUC score is computed from these
            same values, so when hard class labels are passed (as
            ``pipeline`` in this file does) it reflects the thresholded
            predictions rather than predicted probabilities.

    Returns:
        list: ``[recall, precision, accuracy, f1, roc_auc]`` in that order.
    """
    cm = confusion_matrix(y_test, y_pred)

    # (tag, score, explanation); the order here fixes the order of the returned list.
    # The recall and precision explanations were swapped before and are corrected here.
    report = [
        ('recall', recall_score(y_test, y_pred),
         '--> Recall is the fraction correctly identified as positive out of all actual positives'),
        ('precision', precision_score(y_test, y_pred),
         '--> Precision is the fraction correctly identified as positive out of all predicted positives'),
        ('accuracy', accuracy_score(y_test, y_pred),
         '--> Accuracy is the fraction of predictions our model got correct'),
        ('f1 score', f1_score(y_test, y_pred),
         '--> F1 Score is the harmonic mean of models precision and recall'),
        ('roc auc score', roc_auc_score(y_test, y_pred),
         '--> ROC-AUC Score shows the performance of the model at all classification levels'),
    ]

    for tag, score, definition in report:
        print(f'the {tag} is: {score} {definition}')

    print(f'the confusion matrix is:\n{cm}')

    return [score for _, score, _ in report]

In [None]:
def pipeline(model, datamodel):
    """Grid-search a scaling + feature-union + classifier pipeline.

    The pipeline scales the feature columns and keeps the 3 best, feeds them
    to a union of PCA and SelectKBest, and ends in ``model``. The parameter
    grid is chosen from the type of ``model``; an estimator of any other
    type is searched with an empty grid (a single fit with its current
    settings) instead of failing on an undefined grid.

    Args:
        model: Estimator used as the final step. Grids exist for
            ``LogisticRegression``, ``RandomForestClassifier``,
            ``GradientBoostingClassifier`` and ``xgb.XGBClassifier``.
        datamodel: ``[X_train, y_train, X_test, y_test]`` as returned by
            ``datamodel_creation``; ``X_train`` must be a DataFrame because
            its column names select the columns to preprocess.

    Returns:
        tuple: ``(model, metrics_list, y_prob, y_pred, y_best)`` - the
        fitted ``GridSearchCV`` object, the list returned by ``metrics``,
        the test-set class probabilities, the test-set predictions and the
        search's ``best_score_``.
    """
    X_train, y_train, X_test, y_test = datamodel[:4]

    # Take the column names from the training frame itself instead of
    # relying on a module-level ``model_feats`` having been defined.
    feature_names = list(X_train.columns)

    numeric_transform = Pipeline([
        ('scaling', StandardScaler()),
        ('select_k_best', SelectKBest(k=3))
    ])

    preprocessing = ColumnTransformer([
        ('numeric', numeric_transform, feature_names),
    ])

    union = FeatureUnion([
        ('pca', PCA(n_components=3)),
        ('select_k_best', SelectKBest(k=3))
    ])

    # Named ``pipe`` so the local does not shadow this function's own name.
    pipe = Pipeline([
        ('preprocessing', preprocessing),
        ('features', union),
        ('model', model)
    ])

    # Feature-step grid shared by the three scikit-learn classifiers.
    # NOTE(review): the numeric step keeps only 3 columns, so values above 3
    # presumably cannot be fitted and just produce failed candidates -
    # confirm and trim.
    feature_grid = {
        'features__pca__n_components': [1, 2, 3, 4, 5],
        'features__select_k_best__k': [1, 2, 3, 4, 5],
    }

    # Dispatch on type rather than on equality with the module-level
    # instances, so a freshly constructed estimator gets its grid too.
    if isinstance(model, LogisticRegression):
        params = {
            **feature_grid,
            'model__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
            # 'l1' was misspelled '1l', which no solver accepts.
            # NOTE(review): newer scikit-learn releases spell 'none' as None - confirm for the installed version.
            'model__penalty': ['l1', 'l2', 'elasticnet', 'none'],
        }
    elif isinstance(model, RandomForestClassifier):
        params = {
            **feature_grid,
            'model__max_depth': [10, 25, 50, 100],
            'model__n_jobs': [-1],
            'model__n_estimators': [10, 50, 75, 100, 250],
        }
    elif isinstance(model, GradientBoostingClassifier):
        params = {
            **feature_grid,
            'model__n_estimators': [50, 75, 100, 250, 500],
        }
    elif isinstance(model, xgb.XGBClassifier):
        params = {
            'model__max_depth': [2, 3, 4, 5, 6],
            # NOTE(review): XGBoost documents eta as a value in [0, 1]; 2 and 3 look out of range - confirm.
            'model__eta': [1, 2, 3],
            'model__nthread': [-1],
            'model__objective': ['binary:logistic'],
        }
    else:
        # Unknown estimator: one fit with its current settings.
        params = {}

    grid_search = GridSearchCV(pipe, params, verbose=0)
    model = grid_search.fit(X_train, y_train)
    print(f'The parameters were:\n{grid_search.best_params_}')
    print(f'The best score was: {grid_search.best_score_}')

    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)
    y_best = model.best_score_

    metrics_list = metrics(y_test, y_pred)

    return model, metrics_list, y_prob, y_pred, y_best

#### Function Runs

In [None]:
# Split df_final into [X_train, y_train, X_test, y_test]; model_feats is the list of feature column names
datamodel, model_feats = datamodel_creation(df_final)

In [None]:
# Grid search with logistic regression as the final step
# 16.8s (presumably the cell's last recorded run time)
model_lr, metrics_lr, prob_lr, pred_lr, best_lr = pipeline(lr, datamodel)

In [None]:
# Grid search with the random forest as the final step
# 1m 25.6s (presumably the cell's last recorded run time)
model_rfc, metrics_rfc, prob_rfc, pred_rfc, best_rfc = pipeline(rfc, datamodel)

In [None]:
# Grid search with gradient boosting as the final step
# 21.7s (presumably the cell's last recorded run time)
model_gbc, metrics_gbc, prob_gbc, pred_gbc, best_gbc = pipeline(gbc, datamodel)

In [None]:
# Grid search with XGBoost as the final step
# 2.8s (presumably the cell's last recorded run time)
model_xgb, metrics_xgb, prob_xgb, pred_xgb, best_xgb = pipeline(xboost, datamodel)

#### Model Metrics and Graphics

In [None]:
import numpy as np
# Metric names, in the same order as the list returned by metrics()
metrics_tag = ['recall', 'precision', 'accuracy', 'f1 score', 'roc auc score']
# best_score_ of each grid search (not used further in this cell)
model_bests = [best_lr, best_rfc, best_gbc, best_xgb]

# One frame per model: a single row of scores is transposed so the metric
# names become the index and the scores sit in column 0, then a 'model'
# label column is added.
df_met_lr = pd.DataFrame(np.array(metrics_lr).reshape(1,-1), columns=metrics_tag).T
df_met_lr['model'] = 'Logistic Regression'
df_met_rfc = pd.DataFrame(np.array(metrics_rfc).reshape(1,-1), columns=metrics_tag).T
df_met_rfc['model'] = 'Random Forest'
df_met_gbc = pd.DataFrame(np.array(metrics_gbc).reshape(1,-1), columns=metrics_tag).T
df_met_gbc['model'] = 'Gradient Boosting'
df_met_xgb = pd.DataFrame(np.array(metrics_xgb).reshape(1,-1), columns=metrics_tag).T
df_met_xgb['model'] = 'XGBoost'


# Stack the four frames into long format with columns: metric, score, model
df_met = pd.concat([df_met_lr, df_met_rfc, df_met_gbc, df_met_xgb])
df_met = df_met.reset_index()
df_met = df_met.rename(columns={'index':'metric', 0:'score'})
# Bare expression; it has no effect on the statements that follow
df_met

# Wide table: one row per metric, one column per model. Each (metric, model)
# pair has exactly one score, so the sum just passes that value through.
graph = pd.pivot_table(
    df_met,
    values=['score'],
    index=['metric'],
    columns=['model'],
    aggfunc=np.sum,
)

# Grouped bar chart of every metric per model, y axis fixed to 0..1
graph.plot(
    kind='bar', 
    ylim=(0,1),
    title='Model Metrics',
    figsize=(10,7),
    fontsize=15,
    rot=45,
)