In [19]:
import json

import numpy as np
import pandas as pd
import tensorflow as tf
import yaml
from pymongo import MongoClient
from pymongo import errors
from pyspark.errors import PySparkException
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, to_timestamp
from pyspark.sql.types import StructType
from transformers import pipeline, AutoTokenizer, TFAutoModelForSequenceClassification

In [33]:
def gen_mongo_uri(mongo_conf) -> str | None:
    """
    generate mongo connection uri based on input
    :return:
    """
    if not mongo_conf:
        return None
    return (f"mongodb+srv://{mongo_conf['user']}:"
            f"{mongo_conf['token']}@"
            f"{mongo_conf['host']}"
            f"/?retryWrites=true&w=majority")

def get_property(conf) -> dict:
    """
    Extract the connection properties from a database configuration.

    :param conf: mapping holding the database settings
    :return: new dict with only the 'user', 'password' and 'driver' entries
        that are present in conf
    """
    wanted = ('user', 'password', 'driver')
    available = conf.keys()
    return {name: conf[name] for name in wanted if name in available}

def gen_maria_jdbc(conf, db: str = '') -> str | None:
    """
    get connection jdbc string for maria database
    """
    if not conf:
        return None

    db = conf['database'] if not db else db
    return (f"jdbc:mysql://{conf['host']}:"
            f"{conf['port']}/"
            f"{db}?permitMysqlScheme")

def gen_postgres_jdbc(conf, db: str = '') -> str | None:
    """
    get connection jdbc string for maria database
    """
    if not conf:
        return None
    db = conf['database'] if not db else db
    return (f"jdbc:postgresql://{conf['host']}:"
            f"{conf['port']}/{db}")

def gen_mssql_jdbc(conf, db: str = '') -> str | None:
    """
    get connection jdbc string for maria database
    """
    if not conf:
        return None
    db = conf['database'] if not db else db
    return (f"jdbc:sqlserver://{conf['host']}:{conf['port']};"
            f"databaseName={db};encrypt=true;trustServerCertificate=true;")

def gen_azure_jdbc(conf, db: str = '') -> str | None:
    """
    get connection jdbc string for azure sql database
    """
    if not conf:
        return None
    db = conf['database'] if not db else db
    return (f"jdbc:sqlserver://{conf['host']}:{conf['port']};"
            f"databaseName={db};encrypt=true;")


def init_mongodb_client(uri: str) -> MongoClient | None:
    """
    Create a MongoDB client and check that the server can be reached.

    Constructing a MongoClient does not by itself contact the server, so a
    ``ping`` command is issued right away to force the connection attempt;
    without it the ConnectionFailure handler below would never be triggered
    by an unreachable server.

    :param uri: MongoDB connection uri
    :return: the connected client, or None when the connection fails
    """
    client = None
    try:
        client = MongoClient(uri)
        # cheap round-trip that makes an unreachable server fail here
        client.admin.command('ping')
    except errors.ConnectionFailure as e:
        # Handle connection failure gracefully
        print(f"Failed to connect to MongoDB: {e}")
        if client is not None:
            # do not leave a half-initialised client behind
            client.close()
        return None
    return client

def prepare_dataframe(spark, sc, data) -> DataFrame:
    """
    Turn data of several possible types into a Spark SQL dataframe.

    Supported inputs are a pandas dataframe, a Spark dataframe, a list of
    JSON-serialisable records and a single record given as a dictionary.
    Anything else, or an empty input of a supported type, produces an empty
    dataframe with an empty schema.

    :param spark: spark session
    :param sc: spark context
    :param data: data to be processed
    :return: spark sql dataframe
    """
    if isinstance(data, pd.DataFrame) and not data.empty:
        print('data is pandas df and not empty')
        return spark.createDataFrame(data)

    if isinstance(data, DataFrame) and not data.isEmpty():
        print('data is spark df and not empty')
        return data

    # lists and dictionaries are only used when they actually hold something,
    # so the "not empty" messages below are accurate
    records = None
    if isinstance(data, list) and data:
        print('data is list and not empty')
        records = data
    elif isinstance(data, dict) and data:
        print('data is dictionary and not empty')
        # a single record is handled as a one-element list
        records = [data]

    if records is not None:
        # each record becomes one JSON document; Spark infers the schema
        json_lines = [json.dumps(record) for record in records]
        return spark.read.json(sc.parallelize(json_lines))

    print('data empty')
    # initialize empty dataframe
    return spark.createDataFrame([], StructType([]))
    
def write_to_mongo(spark, data, uri: str, db: str, col: str) -> None:
    """
    Append data to a MongoDB collection through the Spark "mongo" data source.

    :param spark: sparkSession used to build the dataframe
    :param data: records to write, in any form prepare_dataframe accepts
    :param uri: str, MongoDB connection uri
    :param db: str, target database name
    :param col: str, target collection name
    :return: None; nothing is written when the prepared dataframe is empty
    """
    df = prepare_dataframe(spark, spark.sparkContext, data)
    if df.isEmpty():
        return

    writer = df.write.format("mongo").options(uri=uri, database=db, collection=col)
    writer.mode("append").save()

def write_to_database(spark, data, conf,
                   db: str, write_table: str, type: str = 'mariadb') -> None:
    """
    Append data to a table of a relational database over JDBC.

    :param spark: spark session used to build the dataframe
    :param data: records to write, in any form prepare_dataframe accepts
    :param conf: connection settings of the target server; passed to the
        JDBC url builder and to get_property
    :param db: database name, overriding conf['database'] when not empty
    :param write_table: name of the table the rows are appended to
    :param type: target database type: 'mariadb', 'postgres', 'mssql' or 'azure'
    :return: None; nothing is written when the prepared dataframe is empty
    :raises ValueError: when type is not one of the supported values
    """
    # each supported type maps to the builder of its own JDBC url
    jdbc_builders = {
        'mariadb': gen_maria_jdbc,
        'postgres': gen_postgres_jdbc,
        'mssql': gen_mssql_jdbc,
        'azure': gen_azure_jdbc,
    }
    # validate the type first so a bad value fails before any Spark work
    if type not in jdbc_builders:
        raise ValueError('error when generating jdbc')
    jdbc = jdbc_builders[type](conf, db)

    df = prepare_dataframe(spark, spark.sparkContext, data)
    properties = get_property(conf)

    print(jdbc)

    if df.isEmpty():
        print('empty dataset')
        return

    print('writing data to', type)
    df.write.jdbc(
        url=jdbc,
        table=write_table,
        mode="append",
        properties=properties
    )

def sentiment_analysis(data, tokenizer, model) -> pd.DataFrame:
    """
    Run the sentiment model on every entry of data['text'].

    The dataframe is modified in place: a "Sentiment" column is added (or
    overwritten) holding, for each row, the softmax of the model logits.

    :param data: pandas dataframe with a 'text' column
    :param tokenizer: callable turning one text into TensorFlow model inputs
    :param model: model whose predict() result exposes `.logits`
    :return: the same dataframe, with the "Sentiment" column filled
    """
    sentiment = []
    for text in data['text']:
        # truncate to the tokenizer's maximum length so that a long text
        # cannot exceed the input size the model accepts
        tokenized_news = tokenizer(text, return_tensors="tf", truncation=True)
        logits = model.predict(tokenized_news).logits
        # turn the raw logits into class probabilities
        sentiment.append(tf.nn.softmax(logits))
    data["Sentiment"] = sentiment
    return data

In [5]:
# parse the connection settings once; the cells below read them from `config`
with open('conf.yaml', 'r') as file:
    config = yaml.safe_load(file)

# uri handed to the Spark mongo connector settings
mongo_conf = config['mongodb']
mongo_uri = gen_mongo_uri(mongo_conf)

In [6]:
try:
    # getting the spark instance
    spark = (
        SparkSession.builder
        .appName('Big Data Project ETL')
        .config("spark.driver.bindAddress", "0.0.0.0")
        # the same uri serves both reading from and writing to mongodb
        .config("spark.mongodb.input.uri", mongo_uri)
        .config("spark.mongodb.output.uri", mongo_uri)
        # connector packages resolved when the session starts
        .config("spark.jars.packages", "com.microsoft.azure:spark-mssql-connector_2.12:1.2.0,"
                                       "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1")
        .getOrCreate()
    )
except PySparkException as e:
    print(f"Failed to get or create Spark: {e}")
    # every later cell needs `spark`; re-raise so the run stops here instead
    # of failing further down with an unrelated NameError
    raise

:: loading settings :: url = jar:file:/usr/local/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml

Ivy Default Cache set to: /home/sutring/.ivy2/cache

In [7]:
# tickers whose historical data is loaded from the database
# NOTE(review): the rows look grouped by sector — confirm before relying on it
stock_symbols = [
    'AAPL', 'MSFT', 'NVDA', 'META', 'AMZN', 'TSLA', 'GOOGL',
    'ON', 'DBD', 'DSGX', 'GTLB', 'LOGI', 'CRSR',
    'LNG', 'SWN', 'APA', 'BTU', 'CL',
    'BMY', 'THC', 'TNDM',
    'MOS', 'AXTA', 'KOP',
    'SBLK', 'EME', 'DNOW',
]


In [8]:
# news option 1
# company news from the FINNHUB api, read from maria
maria_conf = config['mariadb']
# sub-query that renames the stored columns to date / source / symbol / text / title
finn_news_query = (
    "(SELECT FROM_UNIXTIME(datetime) as date, source, related as symbol, summary as text, "
    "headline as title FROM finn_company_news) t"
)
maria_fin_news_df = spark.read.jdbc(
    url=gen_maria_jdbc(maria_conf),
    table=finn_news_query,
    properties=get_property(maria_conf),
)
maria_fin_news_df.show(5)

+-------------------+-------------+------+--------------------+--------------------+
|               date|       source|symbol|                text|               title|
+-------------------+-------------+------+--------------------+--------------------+
|2024-02-23 09:14:00|Seeking Alpha|  AAPL|Looking for stock...|Wedbush sees 1995...|
|2024-02-23 08:41:00|     TipRanks|  AAPL|Looking for stock...|Apple (NASDAQ:AAP...|
|2024-02-23 08:35:00|        Yahoo|  AAPL|AAPL, COST, WSM, ...|Zacks Market Edge...|
|2024-02-23 08:23:00|  MarketWatch|  AAPL|Looking for stock...|Two years after U...|
|2024-02-23 08:09:00|     DowJones|  AAPL|The actions of ma...|Two years after U...|
+-------------------+-------------+------+--------------------+--------------------+
only showing top 5 rows



In [9]:
# news option 2
# stock news from the FMP api, read from postgresql
postgres_conf = config['postgres']
# the whole table is read; renaming, date parsing and column dropping are done in Spark
fmp_news_raw_df = spark.read.jdbc(
    url=gen_postgres_jdbc(postgres_conf),
    table='fmp_stock_news',
    properties=get_property(postgres_conf),
)
postgre_fmp_news_df = (
    fmp_news_raw_df
    .withColumnRenamed("publishedDate", "date")
    .withColumn("date", to_timestamp(col("date"), "yyyy-MM-dd HH:mm:ss"))
    .withColumnRenamed("site", "source")
    .drop("image", "url")
)
postgre_fmp_news_df.show(5)

+-------------------+--------------------+------+--------------------+--------------------+
|               date|              source|symbol|                text|               title|
+-------------------+--------------------+------+--------------------+--------------------+
|2024-02-28 09:11:13|Zacks Investment ...|  PDCO|Patterson Cos. (P...|Patterson Cos. (P...|
|2024-02-28 09:11:12|Zacks Investment ...|  EDIT|Editas Medicine (...|Editas Medicine (...|
|2024-02-28 09:11:11|Zacks Investment ...|  NOVT|Novanta (NOVT) ca...|Novanta (NOVT) Me...|
|2024-02-28 09:10:09| Proactive Investors| VRBFF|VanadiumCorp Reso...|VanadiumCorp Reso...|
|2024-02-28 09:09:42|     The Motley Fool|  NVDA|Micron Technology...|Nvidia Stock Inve...|
+-------------------+--------------------+------+--------------------+--------------------+
only showing top 5 rows



In [24]:
# testing purpose
# union news from different sources (different API); columns are matched by
# name, so a different column order in one source cannot mix up the fields
news_unioned_df = postgre_fmp_news_df.unionByName(maria_fin_news_df)
news_unioned_df.show(5)

+-------------------+--------------------+------+--------------------+--------------------+
|               date|              source|symbol|                text|               title|
+-------------------+--------------------+------+--------------------+--------------------+
|2024-02-28 09:11:13|Zacks Investment ...|  PDCO|Patterson Cos. (P...|Patterson Cos. (P...|
|2024-02-28 09:11:12|Zacks Investment ...|  EDIT|Editas Medicine (...|Editas Medicine (...|
|2024-02-28 09:11:11|Zacks Investment ...|  NOVT|Novanta (NOVT) ca...|Novanta (NOVT) Me...|
|2024-02-28 09:10:09| Proactive Investors| VRBFF|VanadiumCorp Reso...|VanadiumCorp Reso...|
|2024-02-28 09:09:42|     The Motley Fool|  NVDA|Micron Technology...|Nvidia Stock Inve...|
+-------------------+--------------------+------+--------------------+--------------------+
only showing top 5 rows



In [10]:
# read the historical data from mysql
# each stock gets its own dataframe so it can be saved in an individual table
# after the other data columns have been joined
maria_conf = config['mariadb']
maria_url = gen_maria_jdbc(maria_conf)
maria_props = get_property(maria_conf)

data = {}
for symbol in stock_symbols:
    maria_hist_data_df = spark.read.jdbc(
        url=maria_url,
        table=f'(SELECT Date, Close, Volume from yf_historical_data WHERE symbol="{symbol}") t',
        properties=maria_props,
    )
    # name the price and volume columns after the symbol they belong to
    data[symbol] = (
        maria_hist_data_df
        .withColumnRenamed("Close", symbol)
        .withColumnRenamed("Volume", f'{symbol}_vol')
    )

In [11]:
# sanity check: the per-symbol frame should hold Date, <symbol> and <symbol>_vol
data["NVDA"].show(3)

+-------------------+------------------+--------+
|               Date|              NVDA|NVDA_vol|
+-------------------+------------------+--------+
|2004-02-26 00:00:00|1.8875000476837158|34497600|
|2004-02-27 00:00:00|1.8541669845581055|59554800|
|2004-03-01 00:00:00|1.8816670179367065|50662800|
+-------------------+------------------+--------+
only showing top 3 rows



In [12]:
# load insider transactions from mongodb
mongo_reader = spark.read.format('mongo').options(
    database='finance_api',
    collection='finn_insider_transactions',
)
mongo_df = mongo_reader.load()

mongo_df.show(5)

+--------------------+--------------------+------+
|                 _id|                data|symbol|
+--------------------+--------------------+------+
|{65de41856693865d...|[{-1852, , 2024-0...|  AAPL|
|{65de41ab6693865d...|[{-300, , 2024-02...|  MSFT|
|{65de41ce6693865d...|[{-36000, , 2024-...|  NVDA|
|{65de41f16693865d...|[{-31493, , 2024-...|  META|
|{65de42136693865d...|[{-500, , 2024-02...|  AMZN|
+--------------------+--------------------+------+
only showing top 5 rows



In [None]:
# load pretrained model and tokenizer
# one identifier shared by the pipeline, the model and the tokenizer
SENTIMENT_MODEL_ID = "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"

pipe = pipeline("text-classification", model=SENTIMENT_MODEL_ID)
model = TFAutoModelForSequenceClassification.from_pretrained(SENTIMENT_MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_MODEL_ID)

In [None]:
# sentiment analysis
# the pandas copy gets its own name so the Spark frame stays available and
# this cell can be run again
news_unioned_pdf = news_unioned_df.toPandas()
# sentiment_analysis needs the tokenizer and the model as well as the data
news_sentiment = sentiment_analysis(news_unioned_pdf, tokenizer, model)
news_df = news_sentiment[["symbol", "Sentiment"]].sort_values(by="symbol")

In [None]:
# collect the sentiment scores of each symbol into a list keyed by that symbol
news_symbols = news_df["symbol"].unique()
data_news = {
    sym: list(news_df.loc[news_df["symbol"] == sym, "Sentiment"])
    for sym in news_symbols
}

In [34]:
# append the NVDA frame to the output table
# NOTE(review): the frame written here holds the price history only — confirm
# whether the sentiment scores are meant to be joined in first
write_to_database(
    spark,
    data["NVDA"],
    config['mariadb'],
    db='finance_out',
    write_table='historical_with_sentiment',
    type='mariadb',
)

data is spark df and not empty
jdbc:mysql://13.92.123.83:3306/finance_out?permitMysqlScheme
writing data to mariadb


In [25]:
# stop the Spark session once the notebook is finished with it
spark.stop()