## Predict the daily buy/sell target variable with respect to the news sentiment

In [None]:
from datetime import datetime

import numpy as np
from pyspark.sql.window import Window

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

import pandas as pd

# Obtain the SparkSession used by every cell below (getOrCreate reuses an
# already-active session when there is one).
spark = SparkSession.builder.getOrCreate()


### Data Processing
- Calculate a sentiment score from the New York Times news data using the FinBERT transformer model.
- Calculate percent change in the stock from previous day and calculate labels as buy or sell based on these changes.
- Join the 2 dataframes

In [None]:
# Lazily-built FinBERT pipeline. Kept in a module-level global so that repeated
# calls inside the same Python process reuse one model instead of re-loading
# the weights for every row.
_FINBERT_PIPELINE = None


def _get_finbert_pipeline():
    """Return the FinBERT sentiment pipeline, constructing it on first use."""
    global _FINBERT_PIPELINE
    if _FINBERT_PIPELINE is None:
        finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
        tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
        _FINBERT_PIPELINE = pipeline("sentiment-analysis", model=finbert, tokenizer=tokenizer)
    return _FINBERT_PIPELINE


@udf
def get_sentiment(text):
    """Score a paragraph as (#positive sentences) - (#negative sentences).

    The paragraph is split on '.', each non-blank fragment is classified by
    FinBERT, and fragments labelled Neutral are ignored.

    Args:
        text: Paragraph text; may be None or empty.

    Returns:
        A plain Python int. Missing, empty, or all-blank text yields 0
        (treated as neutral) without invoking the model. The UDF is declared
        without a return type, so Spark exposes the result as a string column
        that the caller casts to an integer.
    """
    if not text:
        return 0

    # Drop the blank fragments produced by trailing or consecutive periods so
    # the model is never asked to classify an empty string.
    sentences = [fragment for fragment in text.split('.') if fragment.strip()]
    if not sentences:
        return 0

    results = _get_finbert_pipeline()(sentences)

    # An explicit loop is used on purpose: the notebook does
    # `from pyspark.sql.functions import *`, which shadows builtin `sum`.
    score = 0
    for result in results:
        label = result['label']
        if 'Positive' in label:
            score += 1
        elif 'Negative' in label:
            score -= 1
    return score

@udf('int')
def convert(close_change):
    """Turn a close-price change into the binary target.

    Returns 0 when the change is zero or positive and 1 when it is negative.
    """
    return 0 if close_change >= 0 else 1

In [None]:
# --- Load the NYT news collection and score AAPL articles -------------------
# NOTE(review): the credentials below are masked placeholders; real values are
# better supplied from the environment or a secrets store than from source.
mongo_username = "####"
mongo_password =  "####"
mongo_ip_address = "g14cluster.tlbgg.mongodb.net"
database_name = "g14_db"
collection_name = "nyt_news"
connection_string = f"mongodb+srv://{mongo_username}:{mongo_password}@{mongo_ip_address}/{database_name}.{collection_name}"

df = spark.read.format("com.mongodb.spark.sql.DefaultSource").option("uri",connection_string).load()

df = df.filter(df.ticker=='AAPL')
# Sentiment of each article's lead paragraph; the UDF result is cast to an integer.
df_sentiment = df.withColumn("sentiment_para", get_sentiment(df.lead_paragraph))
df_sentiment = df_sentiment.withColumn("sentiment_score2", df_sentiment["sentiment_para"].cast(IntegerType())).drop("sentiment_para")
# One row per (date, ticker) holding the mean article score for that day.
# The earlier first(...).over(Window.partitionBy('ticker').orderBy('date'))
# assigned the earliest article's score to every row and kept one row per
# article, which produced a constant 'avg_score' and duplicate rows after the
# join with the price data.
df_sents_grp = df_sentiment.groupBy('date', 'ticker').agg(avg('sentiment_score2').alias('avg_score'))

# --- Load the daily price collection and compute day-over-day close change ---
collection_name = "ts_da"
connection_string = f"mongodb+srv://{mongo_username}:{mongo_password}@{mongo_ip_address}/{database_name}.{collection_name}"

df_tsda = (
    spark.read.format("com.mongodb.spark.sql.DefaultSource")
    .option("uri", connection_string)
    .load()
)
df_tsda = df_tsda.filter(df_tsda.symbol == 'AAPL')

# Columns carried unchanged through both projections below.
tsda_keep_cols = [
    'amount', 'close', 'coefficient',
    'date', 'high', 'low', 'open',
    'symbol', 'timezone', 'volume',
]
# Rows of one symbol in date order, so lag() yields the previous row's close.
tsda_symbol_by_date = Window.partitionBy('symbol').orderBy('date')

df_tsda_prev = df_tsda.select(
    *tsda_keep_cols,
    lag('close', 1).over(tsda_symbol_by_date).alias('prev_close'),
)

# Relative change of the close versus the previous row's close.
tsda_close_change_expr = (df_tsda_prev.close - df_tsda_prev.prev_close) / df_tsda_prev.prev_close
df_tsda_chg = df_tsda_prev.select(
    *tsda_keep_cols,
    tsda_close_change_expr.alias('close_change'),
).cache()
# Inner-join prices with sentiment on symbol/ticker and date, keeping only the
# modelling columns (volume is log-transformed).
df_sentnews_tsda = (
    df_tsda_chg
    .join(
        df_sents_grp,
        (df_sents_grp.ticker == df_tsda_chg.symbol) & (df_sents_grp.date == df_tsda_chg.date),
    )
    .select(
        df_tsda_chg.date,
        'symbol',
        'close_change',
        log(df_tsda_chg.volume).alias('logvolume'),
        'avg_score',
    )
)

# Remove rows containing nulls, then derive the 0/1 target from close_change.
df_sentnews_tsda = df_sentnews_tsda.na.drop()
df_sentnews_tsda = df_sentnews_tsda.withColumn('buy_sell', convert('close_change'))

# Calendar features: 'date_n' is the day before 'date', and month/year are
# taken from 'date_n' (not from 'date').
df_sentnews_tsda = (
    df_sentnews_tsda
    .withColumn("date", col("date").cast(DateType()))
    .withColumn('date_n', date_add(col('date'), -1))
)
df_sentnews_tsda = (
    df_sentnews_tsda
    .withColumn('month', date_format(col('date_n'), 'M').cast(LongType()))
    .withColumn('year', date_format(col('date_n'), 'y').cast(LongType()))
)

In [None]:
# Preview the joined sentiment/price dataset (first 5 rows requested).
df_sentnews_tsda.show(5)

+----------+------+--------------------+------------------+---------+--------+----------+-----+----+
|      date|symbol|        close_change|         logvolume|avg_score|buy_sell|    date_n|month|year|
+----------+------+--------------------+------------------+---------+--------+----------+-----+----+
|2022-01-03|  AAPL|0.025004223686435303|18.466621328112897|       -1|       0|2022-01-02|    1|2022|
|2022-01-03|  AAPL|0.025004223686435303|18.466621328112897|       -1|       0|2022-01-02|    1|2022|
|2022-01-04|  AAPL|-0.01269161035108...|18.413761239301795|       -1|       1|2022-01-03|    1|2022|
|2022-01-05|  AAPL|-0.02659988870339...| 18.36450821808277|       -1|       1|2022-01-04|    1|2022|
|2022-01-13|  AAPL|-0.01902808636700594|  18.2523302556847|       -1|       1|2022-01-12|    1|2022|
|2022-01-18|  AAPL|-0.01889408909689...| 18.32086301640156|       -1|       1|2022-01-17|    1|2022|
|2022-01-20|  AAPL|-0.01034710942669...| 18.33098046419263|       -1|       1|2022-01-19|  

### One Hot Encoding

In [None]:
def oneHotEncodeColumns(df, cols):
    """One-hot encode each column in ``cols``, replacing it in place.

    For every column name a OneHotEncoder is fitted on the current DataFrame
    (``dropLast=False`` keeps a slot for every category), the encoded vector
    is written to a temporary ``<name>-onehot`` column, the original column is
    dropped, and the temporary column is renamed back to the original name.

    Args:
        df: Input DataFrame.
        cols: Iterable of column names to encode.

    Returns:
        The DataFrame with each listed column replaced by its one-hot vector;
        ``df`` itself when ``cols`` is empty.
    """
    encoded = df
    for name in cols:
        onehot_name = name + "-onehot"
        encoder = OneHotEncoder(inputCol=name, outputCol=onehot_name, dropLast=False)
        fitted = encoder.fit(encoded)
        encoded = (
            fitted.transform(encoded)
            .drop(name)
            .withColumnRenamed(onehot_name, name)
        )
    return encoded

# Categorical columns to replace with one-hot vectors.
cat_cols = ['year', 'month']
# New joined and transformed dataframe.
# NOTE(review): the VectorAssembler inputs further down are only
# ['logvolume', 'avg_score'], so these encoded columns do not appear to reach
# the model features -- confirm whether that is intended.
df_sents_f = oneHotEncodeColumns(df_sentnews_tsda, cat_cols)

### Create a dataframe with Features and Labels using VectorAssembler + Split the data

In [None]:
# Feature columns fed to the classifier.
reg_incols = ['logvolume', 'avg_score']
va = VectorAssembler(inputCols=reg_incols, outputCol="features")

# Keep only the assembled feature vector and the target, renamed to 'label'.
df_va = va.transform(df_sents_f).select('features', col('buy_sell').alias('label'))

# 80/20 random split with a fixed seed; both parts are cached for reuse.
splits = df_va.randomSplit([0.8, 0.2], seed=1)
train, validation = (part.cache() for part in splits)

# Evaluator with its default settings.
reval = BinaryClassificationEvaluator()

### Logistic Regression

In [None]:
# Fit a logistic regression with default hyper-parameters on the training split.
lr = LogisticRegression()
model = lr.fit(train)

# Score the held-out split.
validpredicts = model.transform(validation)
# show() prints the table itself and returns None, so it must not be wrapped
# in print() (that emitted a stray "None" after the table).
validpredicts.show()
# Evaluate with the evaluator's default metric and report "<metric>:<value>".
reval = BinaryClassificationEvaluator()
print (reval.getMetricName() +":" + str(reval.evaluate(validpredicts)))

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[17.7629804723881...|    0|[-0.4175541402589...|[0.39710217077067...|       1.0|
|[17.8253622237061...|    1|[-0.3873097214379...|[0.40436509790427...|       1.0|
|[17.8883602518576...|    1|[-0.3567665143613...|[0.41174252637443...|       1.0|
|[17.9871774077935...|    0|[-0.3088571917200...|[0.42339370988722...|       1.0|
|[17.9912298019269...|    0|[-0.3068924776361...|[0.42387343048526...|       1.0|
|[18.0321929521760...|    0|[-0.2870323960227...|[0.42873054038109...|       1.0|
|[18.0564484003869...|    1|[-0.2752726759302...|[0.43161312153346...|       1.0|
|[18.0564484003869...|    1|[-0.2752726759302...|[0.43161312153346...|       1.0|
|[18.0664391227952...|    0|[-0.2704288941117...|[0.43280180518680...|       1.0|
|[18.06894067489

### Outcomes
- The model uses sentiment scores generated using FinBert transformer model as well as the log of volume traded.
- As observed above, the logistic regression model predicts whether to buy or sell the stock from the sentiment score, with an area under the ROC curve of ~0.52.
- Two models were evaluated, random forest and logistic regression; logistic regression gave better results, so only the logistic regression model is included here.

#### Save processed vector assembler data to mongo db

In [None]:
def sparseToDenseArray(sparse_array):
    """Convert a vector exposing ``toArray()`` into a plain Python list.

    Args:
        sparse_array: Object whose ``toArray()`` returns an array with a
            ``tolist()`` method.

    Returns:
        The array contents as a Python list.
    """
    dense_values = sparse_array.toArray()
    return dense_values.tolist()

# Wrap the converter as a UDF returning an array of doubles. The feature values
# come back from toArray().tolist() as 64-bit Python floats; declaring the
# element type as FloatType narrowed them to 32-bit before they were persisted.
# NOTE(review): assumes downstream readers accept double-valued arrays -- confirm.
udf_sparse_dense_array = udf(sparseToDenseArray, ArrayType(DoubleType()))
# Same rows as df_va, with 'features' turned into a plain array column.
va_df_dense_v_to_array =  df_va.select(udf_sparse_dense_array(df_va["features"]).alias("features"), df_va["label"])

In [None]:
# Target collection for the processed feature/label rows.
collection = 'senti_processed_data'
connection_string = f"mongodb+srv://{mongo_username}:{mongo_password}@{mongo_ip_address}/{database_name}.{collection}"

# Write through the Mongo Spark connector; "overwrite" mode replaces what is
# already stored at the target.
mongo_writer = (
    va_df_dense_v_to_array.write
    .format("com.mongodb.spark.sql.DefaultSource")
    .mode("overwrite")
    .option("uri", connection_string)
)
mongo_writer.save()