In [1]:
# Initialise findspark before any pyspark import in the following cells.
# NOTE(review): presumably this relies on SPARK_HOME (or a default install
# location) to find Spark — confirm for the target environment.
import findspark
findspark.init()

In [2]:
import json
import os
from urllib.parse import quote_plus

import yaml

import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import pipeline, AutoTokenizer, TFAutoModelForSequenceClassification

from pyspark.sql import SparkSession
from pyspark.sql import DataFrame
from pyspark.sql.types import StructType
from pyspark.sql.functions import col, to_date, to_timestamp
from pyspark.errors import PySparkException
from pymongo import MongoClient
from pymongo import errors

2024-03-06 19:05:33.549121: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def gen_mongo_uri(mongo_conf) -> str | None:
    """
    generate mongo connection uri based on input
    :return:
    """
    if not mongo_conf:
        return None
    return (f"mongodb+srv://{mongo_conf['user']}:"
            f"{mongo_conf['token']}@"
            f"{mongo_conf['host']}"
            f"/?retryWrites=true&w=majority")

def get_property(conf) -> dict:
    """
    Extract the JDBC connection properties from a database configuration.

    Only the keys ``user``, ``password`` and ``driver`` are copied, and only
    when they are present in ``conf``.
    """
    wanted = ('user', 'password', 'driver')
    present = conf.keys()
    return {name: conf[name] for name in wanted if name in present}

def gen_maria_jdbc(conf, db: str = '') -> str | None:
    """
    get connection jdbc string for maria database
    """
    if not conf:
        return None

    db = conf['database'] if not db else db
    return (f"jdbc:mysql://{conf['host']}:"
            f"{conf['port']}/"
            f"{db}?permitMysqlScheme")

def gen_postgres_jdbc(conf, db: str = '') -> str | None:
    """
    get connection jdbc string for maria database
    """
    if not conf:
        return None
    db = conf['database'] if not db else db
    return (f"jdbc:postgresql://{conf['host']}:"
            f"{conf['port']}/{db}")

def gen_mssql_jdbc(conf, db: str = '') -> str | None:
    """
    get connection jdbc string for maria database
    """
    if not conf:
        return None
    db = conf['database'] if not db else db
    return (f"jdbc:sqlserver://{conf['host']}:{conf['port']};"
            f"databaseName={db};encrypt=true;trustServerCertificate=true;")

def gen_azure_jdbc(conf, db: str = '') -> str | None:
    """
    get connection jdbc string for azure sql database
    """
    if not conf:
        return None
    db = conf['database'] if not db else db
    return (f"jdbc:sqlserver://{conf['host']}:{conf['port']};"
            f"databaseName={db};encrypt=true;")


def init_mongodb_client(uri: str) -> MongoClient | None:
    """
    Initialize a MongoDB client and verify that the server is reachable.

    ``MongoClient(uri)`` does not connect eagerly, so constructing it alone
    never reports an unreachable server. A cheap ``ping`` command is issued to
    force the connection; this call blocks until the server answers or the
    client's server-selection timeout expires.

    :param uri: MongoDB connection URI
    :return: a connected client, or None when the URI is invalid or the
             server cannot be reached
    """
    client = None
    try:
        # Initialize new MongoDB client
        client = MongoClient(uri)
        # Force a round trip so that connection problems surface here.
        client.admin.command('ping')
    except (errors.ConnectionFailure, errors.ConfigurationError) as e:
        # Handle connection failure gracefully
        print(f"Failed to connect to MongoDB: {e}")
        if client is not None:
            # Release the background resources of the unusable client.
            client.close()
        return None

    return client

def prepare_dataframe(spark, sc, data) -> DataFrame:
    """
    Normalize ``data`` into a Spark SQL DataFrame.

    Supported inputs are a pandas DataFrame, a Spark DataFrame, a list of
    JSON-serializable records, or a single record given as a dict. Empty
    input of any supported type (and any unsupported type) yields an empty
    DataFrame with an empty schema.

    :param spark: spark session
    :param sc: spark context, used to parallelize list/dict records
    :param data: data to be processed
    :return: spark sql dataframe
    """
    if isinstance(data, pd.DataFrame) and not data.empty:
        print('data is pandas df and not empty')
        return spark.createDataFrame(data)

    if isinstance(data, DataFrame) and not data.isEmpty():
        print('data is spark df and not empty')
        return data

    records = None
    if isinstance(data, list) and data:
        print('data is list and not empty')
        records = data
    elif isinstance(data, dict) and data:
        print('data is dictionary and not empty')
        # a single record is handled as a one-element list
        records = [data]

    if records is not None:
        # Spark infers the schema from the JSON text of each record.
        json_lines = [json.dumps(record) for record in records]
        return spark.read.json(sc.parallelize(json_lines))

    print('data empty')
    # initialize empty dataframe
    return spark.createDataFrame([], StructType([]))
    
def write_to_mongo(spark, data,
                   uri: str, db: str, col: str) -> None:
    """
    Append data to a MongoDB collection.

    :param spark: sparkSession
    :param data: input accepted by ``prepare_dataframe``
    :param uri: str, MongoDB connection uri
    :param db: str, target database name
    :param col: str, target collection name
    :return: None; nothing is written when the prepared DataFrame is empty
    """
    frame = prepare_dataframe(spark, spark.sparkContext, data)
    if frame.isEmpty():
        return

    writer = frame.write.format("mongo").options(uri=uri, database=db, collection=col)
    writer.mode("append").save()

def write_to_database(spark, data, conf,
                   db: str, write_table: str, type: str = 'mariadb') -> None:
    """
    Append data to a relational database table over JDBC.

    :param spark: spark session
    :param data: input accepted by ``prepare_dataframe``
    :param conf: database configuration (host, port, database, user, password, driver)
    :param db: database name; falls back to ``conf['database']`` when empty
    :param write_table: name of the table to append to
    :param type: one of 'mariadb', 'postgres', 'mssql' or 'azure'
    :return: None; prints 'empty dataset' when there is nothing to write
    :raises ValueError: when ``type`` is not a supported database type
    """
    # Each database type maps to the function that builds its JDBC url.
    jdbc_builders = {
        'mariadb': gen_maria_jdbc,
        'postgres': gen_postgres_jdbc,
        'mssql': gen_mssql_jdbc,
        'azure': gen_azure_jdbc,
    }
    builder = jdbc_builders.get(type)
    if builder is None:
        # Fail before any Spark work is done for an unsupported type.
        raise ValueError('error when generating jdbc')
    jdbc = builder(conf, db)

    sc = spark.sparkContext
    df = prepare_dataframe(spark, sc, data)
    properties = get_property(conf)

    if not df.isEmpty():
        df.write.jdbc(
            url=jdbc,
            table=write_table,
            mode="append",
            properties=properties
        )
    else:
        print('empty dataset')

def sentiment_analysis(data, tokenizer, model) -> pd.DataFrame:
    """
    Run sentiment analysis on the ``text`` column and return the dataframe with scores.

    A ``sentiment`` column is added to ``data`` in place; each cell holds the
    softmax class probabilities for that row's text as a numpy array. Rows
    are addressed by position, so the frame may have any index (filtered,
    non-contiguous or non-integer labels).

    :param data: pandas DataFrame with a ``text`` column (modified in place)
    :param tokenizer: tokenizer called as ``tokenizer(text, truncation=True, return_tensors="tf")``
    :param model: model whose ``predict`` result exposes ``.logits``
    :return: the same DataFrame object, with the ``sentiment`` column filled
    """
    data['sentiment'] = None
    sentiment_pos = data.columns.get_loc('sentiment')
    for row_pos, text in enumerate(data['text']):
        tokenized_news = tokenizer(text, truncation=True, return_tensors="tf")
        logits = model.predict(tokenized_news).logits
        probabilities = tf.nn.softmax(logits)
        # Positional write: the enumerate counter is a position, not an index label.
        data.iat[row_pos, sentiment_pos] = probabilities.numpy()
    return data

In [4]:
# Read the connection settings of every data store from the local YAML file.
with open('conf.yaml', 'r') as conf_file:
    config = yaml.safe_load(conf_file)

# MongoDB connection string built from the 'mongodb' section of the config.
mongo_uri = gen_mongo_uri(config['mongodb'])

In [5]:
# Create (or reuse) the local Spark session with the MongoDB and MSSQL connectors.
# Failures are reported and then re-raised: without a session every later cell
# would only fail with a less helpful NameError on `spark`.
try:
    # getting the spark instance
    spark = SparkSession.builder \
        .appName('Big Data Project ETL') \
        .config("spark.mongodb.input.uri", mongo_uri) \
        .config("spark.mongodb.output.uri", mongo_uri) \
        .config("spark.jars.packages", "com.microsoft.azure:spark-mssql-connector_2.12:1.2.0,"
                                       "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1") \
        .master("local[*]") \
        .getOrCreate()
except PySparkException as e:
    print(f"Failed to get or create Spark: {e}")
    raise
except Exception as e:
    print(f"Exception Caught: {e}")
    raise

:: loading settings :: url = jar:file:/usr/local/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml

In [6]:
# Ticker symbols whose historical price data is loaded per symbol further below.
stock_symbols = [
    'AAPL', 'MSFT', 'NVDA', 'META', 'AMZN', 'TSLA', 'GOOGL',
    'ON', 'DBD', 'DSGX', 'GTLB', 'LOGI', 'CRSR',
    'LNG', 'SWN', 'APA', 'BTU', 'CL',
    'BMY', 'THC', 'TNDM',
    'MOS', 'AXTA', 'KOP',
    'SBLK', 'EME', 'DNOW',
]


In [7]:
# news option 1
# Company news from the FINNHUB api, read from MariaDB.
# The sub-query converts the unix timestamp and renames the columns to
# date / source / symbol / text / title.
finn_news_query = (
    "(SELECT FROM_UNIXTIME(datetime) as date, source, related as symbol, summary as text, "
    "headline as title FROM finn_company_news) t"
)
maria_conf = config['mariadb']
maria_fin_news_df = spark.read.jdbc(
    url=gen_maria_jdbc(maria_conf),
    table=finn_news_query,
    properties=get_property(maria_conf),
)
maria_fin_news_df.show(5)

+-------------------+--------+------+--------------------+--------------------+
|               date|  source|symbol|                text|               title|
+-------------------+--------+------+--------------------+--------------------+
|2024-02-29 13:18:00|   Yahoo|  AAPL|Apple investors w...|Apple Shareholder...|
|2024-02-29 12:45:00|Benzinga|  AAPL|Looking for stock...|Tesla Bull Points...|
|2024-02-29 12:38:00|Benzinga|  AAPL|Looking for stock...|EXCLUSIVE: Magnif...|
|2024-02-29 12:30:58|   Yahoo|  AAPL|Major Chinese thi...|Chinese retailers...|
|2024-02-29 12:19:23|   Yahoo|  AAPL|Germany wants Big...|Germany wants Big...|
+-------------------+--------+------+--------------------+--------------------+
only showing top 5 rows



In [8]:
# news option 2
# Stock news from the FMP api, read from PostgreSQL.
# The whole table is read and reshaped in Spark; the same projection could
# alternatively be pushed down as a sub-query.
postgres_conf = config['postgres']
postgre_fmp_news_df = spark.read.jdbc(
    url=gen_postgres_jdbc(postgres_conf),
    table='fmp_stock_news',
    properties=get_property(postgres_conf),
)
# Rename to the shared column names, parse the date text into a timestamp and
# drop the columns that are not used.
postgre_fmp_news_df = (
    postgre_fmp_news_df
    .withColumnRenamed("publishedDate", "date")
    .withColumn("date", to_timestamp(col("date"), "yyyy-MM-dd HH:mm:ss"))
    .withColumnRenamed("site", "source")
    .drop("image", "url")
)
postgre_fmp_news_df.show(5)

+-------------------+---------------+------+--------------------+--------------------+
|               date|         source|symbol|                text|               title|
+-------------------+---------------+------+--------------------+--------------------+
|2024-03-03 07:05:00|The Motley Fool|   ARM|Arm Holdings stoc...|Is It Too Late to...|
|2024-03-03 07:00:00|The Motley Fool|  DOCN|It's hard to igno...|Forget Nvidia: 2 ...|
|2024-03-03 06:57:00|The Motley Fool|  PLUG|Plug Power is a l...|Plug Power Stock ...|
|2024-03-03 06:50:00|The Motley Fool|  OKTA|Okta's financial ...|Okta Stock Has 55...|
|2024-03-03 06:45:00|The Motley Fool|  PLTR|Palantir's fourth...|Why Palantir Stoc...|
+-------------------+---------------+------+--------------------+--------------------+
only showing top 5 rows



In [9]:
# testing purpose
# union news from different sources (different API)
# unionByName pairs columns by name, so a different column order in one of
# the source tables cannot silently mix e.g. `text` and `title`.
news_unioned_df = postgre_fmp_news_df.unionByName(maria_fin_news_df)
news_unioned_df.show(5)

+-------------------+---------------+------+--------------------+--------------------+
|               date|         source|symbol|                text|               title|
+-------------------+---------------+------+--------------------+--------------------+
|2024-03-03 07:05:00|The Motley Fool|   ARM|Arm Holdings stoc...|Is It Too Late to...|
|2024-03-03 07:00:00|The Motley Fool|  DOCN|It's hard to igno...|Forget Nvidia: 2 ...|
|2024-03-03 06:57:00|The Motley Fool|  PLUG|Plug Power is a l...|Plug Power Stock ...|
|2024-03-03 06:50:00|The Motley Fool|  OKTA|Okta's financial ...|Okta Stock Has 55...|
|2024-03-03 06:45:00|The Motley Fool|  PLTR|Palantir's fourth...|Why Palantir Stoc...|
+-------------------+---------------+------+--------------------+--------------------+
only showing top 5 rows



In [10]:
# Read historical data from MariaDB (MySQL protocol), one query per symbol.
# Each stock will be saved in an individual table after being joined with
# other data columns, so every symbol gets its own DataFrame here.
hist_conf = config['mariadb']
data = {}
for symbol in stock_symbols:
    hist_query = f'(SELECT Date, Close, Volume from yf_historical_data WHERE symbol="{symbol}") t'
    maria_hist_data_df = spark.read.jdbc(
        url=gen_maria_jdbc(hist_conf),
        table=hist_query,
        properties=get_property(hist_conf),
    )
    # Name the price and volume columns after the symbol.
    data[symbol] = (
        maria_hist_data_df
        .withColumnRenamed("Close", symbol)
        .withColumnRenamed("Volume", f'{symbol}_vol')
    )

In [11]:
# Sanity check: show the first three rows of one symbol's frame to confirm
# the data dictionary was constructed correctly.
data["NVDA"].show(n=3)

+-------------------+------------------+--------+
|               Date|              NVDA|NVDA_vol|
+-------------------+------------------+--------+
|2012-04-19 00:00:00|3.4149999618530273|66718800|
|2012-04-20 00:00:00|3.3475000858306885|50959200|
|2012-04-23 00:00:00| 3.307499885559082|70624400|
+-------------------+------------------+--------+
only showing top 3 rows



In [12]:
# Load insider transactions from MongoDB (database 'finance_api').
mongo_df = (
    spark.read.format('mongo')
    .options(database='finance_api', collection='finn_insider_transactions')
    .load()
)

mongo_df.show(5)

+--------------------+--------------------+------+
|                 _id|                data|symbol|
+--------------------+--------------------+------+
|{65e550a0b142680e...|[{-100000, , 2024...|  AAPL|
|{65e550bab142680e...|[{-300, , 2024-02...|  MSFT|
|{65e550d4b142680e...|[{-5000, , 2024-0...|  NVDA|
|{65e550ecb142680e...|[{-31493, , 2024-...|  META|
|{65e55104b142680e...|[{-500, , 2024-02...|  AMZN|
+--------------------+--------------------+------+
only showing top 5 rows



In [13]:
# Collect the unioned news into a pandas DataFrame for the sentiment model.
# The name is rebound from a Spark frame to a pandas frame, so the guard keeps
# this cell re-runnable (a pandas DataFrame has no toPandas()).
if not isinstance(news_unioned_df, pd.DataFrame):
    news_unioned_df = news_unioned_df.toPandas()

In [14]:
# Inspect the size and the column names of the collected news frame.
news_unioned_df.shape, news_unioned_df.columns

((12260, 5),
 Index(['date', 'source', 'symbol', 'text', 'title'], dtype='object'))

In [15]:
# Load the pretrained financial-news sentiment model and its tokenizer.
# NOTE(review): `pipe` does not appear to be used by the later cells — confirm
# whether it is still needed.
SENTIMENT_MODEL = "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"
pipe = pipeline("text-classification", model=SENTIMENT_MODEL)
model = TFAutoModelForSequenceClassification.from_pretrained(SENTIMENT_MODEL)
tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_MODEL)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
Some weights of the PyTorch model were not used when initializing the TF 2.0 mode

In [16]:
# Get the model's class-id -> label mapping so the numeric sentiment classes
# can be interpreted.
id2labels = model.config.id2label
id2labels

{0: 'negative', 1: 'neutral', 2: 'positive'}

In [17]:
# Sentiment analysis over every news row. The function adds a 'sentiment'
# column to the frame it is given and returns that same frame, so
# `news_sentiment` and `news_unioned_df` refer to the same object.
news_sentiment = sentiment_analysis(news_unioned_df, tokenizer, model)

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving




In [18]:
# Confirm the row count is unchanged and the 'sentiment' column was added.
news_sentiment.shape, news_sentiment.columns

((12260, 6),
 Index(['date', 'source', 'symbol', 'text', 'title', 'sentiment'], dtype='object'))

In [21]:
# convert datetime to date so that all news of one calendar day share a key
news_sentiment['date'] = pd.to_datetime(news_sentiment['date']).dt.date


def _dominant_sentiment(scores) -> int:
    """Average the class probabilities of one group and return the winning class id."""
    # Each score is the probability array of one article; stack them row-wise.
    stacked = np.concatenate([np.atleast_2d(np.asarray(score)) for score in scores], axis=0)
    return int(np.argmax(stacked.mean(axis=0)))


# aggregate sentiment results that happened on the same day and same stock;
# groups with a single article go through the same reducer as larger groups
news_sent_df = (
    news_sentiment.groupby(['symbol', 'date'])['sentiment']
    .agg(_dominant_sentiment)
    .astype('int8')
    .reset_index()
)

In [22]:
# Peek at the first five aggregated sentiment class ids.
news_sent_df['sentiment'][:5]

0    2
1    1
2    1
3    1
4    1
Name: sentiment, dtype: int8

In [23]:
# save dataframe into csv file; create the output directory first so the
# write does not fail on a fresh checkout
output_path = os.path.join('data', 'news_sentiment_all.csv')
os.makedirs(os.path.dirname(output_path), exist_ok=True)
news_sent_df.to_csv(output_path)

In [24]:
# Shut down the Spark session now that all results are saved.
# NOTE(review): the recorded output shows Py4J connection errors here and in
# the sentiment cell, which suggests the JVM had already exited — confirm.
spark.stop()

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving


ConnectionRefusedError: [Errno 111] Connection refused