In [None]:
import pip
import sys
import signal
import time
import os
from dataclasses import dataclass
from dotenv import load_dotenv

import chromedriver_autoinstaller
# chromedriver_autoinstaller.install()

load_dotenv()

In [None]:
# Package installer function to handle missing packages
def install(package):
    """Install ``package`` with pip into the interpreter running this notebook.

    pip is run as ``<current python> -m pip install <package>`` in a child
    process; ``pip.main`` is not a supported programmatic entry point and is
    absent from current pip releases.

    Args:
        package: Name of the distribution to install (e.g. ``'selenium'``).

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
            The success message is only printed when the install succeeded.
    """
    # Local imports keep this cell self-contained; the notebook's import cell
    # does not bring these modules in.
    import importlib
    import subprocess

    print(package + ' package for Python not found, pip installing now....')
    # sys.executable guarantees the package lands in the kernel's environment.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])
    # Let the import system notice files that appeared after start-up.
    importlib.invalidate_caches()
    print(package + ' package has been successfully installed for Python\n Continuing Process...')

# Ensure beautifulsoup4 is installed.
# Only a failed import triggers the installer; any other error propagates.
try:
    from bs4 import BeautifulSoup
except ImportError:
    install('beautifulsoup4')
    from bs4 import BeautifulSoup

# Ensure selenium is installed
try:
    from selenium import webdriver
except ImportError:
    install('selenium')
    from selenium import webdriver

# Remaining selenium names used by the scraper below.
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

In [None]:
class MarketWatchETL:
    """Scrape stock data from MarketWatch with a Selenium-driven Chrome browser.

    Typical use::

        mw = MarketWatchETL()
        mw.login()
        key_data = mw.get_stock_key_data('MSFT')

    The browser is shut down after every scrape and a new one is started on
    the next request, so a session opened by ``login()`` only lasts until the
    first scrape has finished.
    """

    # Fallback chromedriver location, used when neither ``driver_path`` nor the
    # CHROMEDRIVER_PATH environment variable is supplied.
    _DEFAULT_DRIVER_PATH = '/Users/jerryli/Downloads/chromedriver-mac-arm64/chromedriver'

    # URL segments appended after ``financials`` for each supported ``mode``.
    _FINANCIAL_SUBPAGES = {
        'bs': ('balance-sheet',),
        'bs_qt': ('balance-sheet', 'quarter'),
        'cf': ('cash-flow',),
        'cf_qt': ('cash-flow', 'quarter'),
    }

    def __init__(self, ticker=None, driver_path=None):
        """Prepare the scraper; no browser is started here.

        Args:
            ticker: Optional default ticker symbol, used by the ``get_*``
                methods when they are called without one.
            driver_path: Optional path to the chromedriver executable.
                Falls back to ``$CHROMEDRIVER_PATH`` and then to
                ``_DEFAULT_DRIVER_PATH``.
        """
        self.driver = None
        self.ticker = ticker.lower() if ticker else None
        self._driver_path = (
            driver_path
            or os.getenv('CHROMEDRIVER_PATH')
            or self._DEFAULT_DRIVER_PATH
        )
        self._service = Service(self._driver_path)
        self._opts = webdriver.ChromeOptions()
        self._base_url = 'https://www.marketwatch.com/investing/stock/'
        self._retries = 10
        # Credentials are read from the environment (populated by load_dotenv()).
        self._MW_USER = os.getenv('MARKETWATCH_USER')
        self._MW_KEY = os.getenv('MARKETWATCH_KEY')

    def login(self):
        """Open MarketWatch and submit the stored credentials via the sign-in form.

        Raises:
            ValueError: If MARKETWATCH_USER or MARKETWATCH_KEY is not set.
        """
        if not self._MW_USER or not self._MW_KEY:
            raise ValueError(
                'MARKETWATCH_USER and MARKETWATCH_KEY must be set in the environment to log in.'
            )

        self._initiate_driver()
        self.driver.get(self._base_url)

        # find_elements returns an empty list instead of raising, so the
        # overlay is only dismissed when it is actually on the page.
        popup_close_buttons = self.driver.find_elements(
            By.XPATH, "//div[@id='cx-scrim']//div[@id='cx-scrim-wrapper']/button"
        )
        if popup_close_buttons:
            popup_close_buttons[0].click()

        profile_xpath = "//div[@class='profile logged-out']/label[@class='btn--text btn--profile j-toggle-label']"
        profile_icon = self.driver.find_element(By.XPATH, profile_xpath)
        # Clicked through JavaScript rather than a native click.
        self.driver.execute_script("arguments[0].click();", profile_icon)

        options_xpath = "//ul[@class='profile__menu j-menu-contents j-toggle--profile']/li"
        option_list = self.driver.find_elements(By.XPATH, options_xpath)
        # Second entry of the profile menu — presumably the log-in link; TODO confirm.
        option_list[1].click()

        try:
            username_input_box = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.ID, 'username'))
            )
            username_input_box.send_keys(self._MW_USER)
        except TimeoutException as e:
            # Best effort: report the timeout and carry on with the form.
            print(e)

        continue_xpath = "//button[@class='solid-button continue-submit dj-btn-primary group']"
        continue_btn = self.driver.find_element(By.XPATH, continue_xpath)
        self.driver.execute_script("arguments[0].click();", continue_btn)

        try:
            pw_xpath = "//div[@id='password-login-card-container']//div[@class='input-icon-container']/input[@id='password-login-password']"
            pw_class = "password dj-input w-full pr-10"
            # Wait until the password field exists before touching it.
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.XPATH, pw_xpath))
            )
            # The class name and the secret are passed as script arguments
            # instead of being interpolated into the JavaScript source, so a
            # quote or backslash in the password cannot break the script.
            self.driver.execute_script(
                "document.getElementsByClassName(arguments[0])[0]"
                ".setAttribute('value', arguments[1]);",
                pw_class,
                self._MW_KEY,
            )
        except TimeoutException as e:
            print(e)

        signin_xpath = "//button[@class='solid-button new-design basic-login-submit dj-btn-primary group w-full']"
        signin_btn = WebDriverWait(self.driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, signin_xpath))
        )
        signin_btn.click()

    @staticmethod
    def _cleaned_key_data_object(ticker, raw_data):
        """Turn scraped label/value tags into ``{ticker: {label_text: value_text}}``.

        Args:
            ticker: Key under which the cleaned mapping is returned.
            raw_data: Dict with ``'labels'`` and ``'values'`` lists of objects
                exposing ``get_text()``; pairs are matched by position.

        Pairs in which either tag is ``None`` are skipped.
        """
        cleaned_data = {
            str(label.get_text()): value.get_text()
            for label, value in zip(raw_data['labels'], raw_data['values'])
            if label is not None and value is not None
        }
        return {ticker: cleaned_data}

    @staticmethod
    def _cleaned_competitors_object(ticker, raw_data):
        """Turn scraped competitor cells into ``{ticker: {name: {...}}}``.

        Args:
            ticker: Key under which the cleaned mapping is returned.
            raw_data: Dict with ``'competitor_name'``, ``'price_perc_chg'``
                and ``'market_cap'`` lists of objects exposing ``get_text()``;
                entries are matched by position.

        Returns:
            ``{ticker: {name: {'per_chg': ..., 'market_cap': ...}}}``. Rows
            with a ``None`` cell are skipped.
        """
        cleaned_data = {}
        rows = zip(
            raw_data['competitor_name'],
            raw_data['price_perc_chg'],
            raw_data['market_cap'],
        )
        for name_tag, chg_tag, cap_tag in rows:
            if name_tag is None or chg_tag is None or cap_tag is None:
                continue
            cleaned_data[str(name_tag.get_text())] = {
                'per_chg': chg_tag.get_text(),
                'market_cap': cap_tag.get_text(),
            }
        return {ticker: cleaned_data}

    @classmethod
    def _financial_pages(cls, mode=None):
        """Return the ``pageN`` keyword arguments that select a financials sub-page.

        ``'financials'`` is always the first segment; a recognised ``mode``
        ('bs', 'bs_qt', 'cf', 'cf_qt') appends its own segments. Any other
        value yields the plain financials page.
        """
        extra = cls._FINANCIAL_SUBPAGES.get(mode, ()) if isinstance(mode, str) else ()
        segments = ('financials',) + tuple(extra)
        return {f'page{n}': segment for n, segment in enumerate(segments, start=1)}

    def _resolve_ticker(self, ticker=None):
        """Return the lower-cased ticker to use, falling back to the constructor's.

        Raises:
            ValueError: If neither the call nor the constructor supplied one.
        """
        symbol = ticker or getattr(self, 'ticker', None)
        if not symbol:
            raise ValueError(
                'A ticker symbol is required: pass one to this call or to MarketWatchETL(...).'
            )
        return symbol.lower()

    def _initiate_driver(self):
        """Start Chrome if no driver is active; print an error and exit on failure."""
        if self.driver is not None:
            return
        try:
            self.driver = webdriver.Chrome(
                service=self._service,
                options=self._opts)
        except Exception as exc:
            print('***SETUP ERROR: The Chrome Web Driver is either not configured or incorrectly configured!***')
            print(exc)
            sys.exit(1)

    def _shutdown_driver(self):
        """Quit the browser (if any) and reset state so a new one can be started."""
        if self.driver is None:
            return
        try:
            self.driver.quit()
        except Exception as exc:
            # Shutdown is best effort; a failure here must not mask the scrape result.
            print(f'Warning: the web driver did not shut down cleanly: {exc}')
        finally:
            self.driver = None
            # Build a fresh Service so the next driver does not depend on a
            # stopped one being restartable.
            self._service = Service(self._driver_path)

    def _site_url(self, ticker, **site_kwargs):
        """Build the page URL: base URL + ticker + every ``*page*`` keyword value.

        Keyword values are appended as path segments in the order given;
        keywords whose name does not contain ``'page'`` are ignored.
        """
        segments = [str(value) for key, value in site_kwargs.items() if 'page' in key]
        return self._base_url + ticker + ''.join('/' + segment for segment in segments)

    def _get_site(self, ticker, **site_kwargs):
        """Navigate the browser to the ticker's page, starting the driver if needed."""
        self._initiate_driver()
        self.driver.get(self._site_url(ticker, **site_kwargs))

    def _scrape_with_retries(self, extract):
        """Parse the current page up to ``self._retries`` times until ``extract`` succeeds.

        Args:
            extract: Callable taking a ``BeautifulSoup`` document and returning
                a non-empty dict, or a falsy value when the data is not there yet.

        Returns:
            The first non-empty result of ``extract``.

        The browser is shut down afterwards in every case. When all attempts
        fail, a message is printed and ``sys.exit(1)`` is called.
        """
        last_error = None
        try:
            for _ in range(self._retries):
                # Give the page time to render before reading its source.
                time.sleep(3)
                try:
                    soup = BeautifulSoup(self.driver.page_source, "html.parser")
                    raw_data_obj = extract(soup)
                except Exception as exc:
                    last_error = exc
                    continue
                if raw_data_obj:
                    return raw_data_obj
            print('Please check your internet connection!\nUnable to connect...')
            if last_error is not None:
                print(f'Last error: {last_error!r}')
            sys.exit(1)
        finally:
            self._shutdown_driver()

    @staticmethod
    def _extract_key_data(soup):
        """Collect label/value tags from every ``li.kv__item``; ``None`` if there are none."""
        labels, values = [], []
        for item in soup.find_all('li', class_="kv__item"):
            label = item.find('small', class_="label")
            value = item.find('span', class_="primary")
            # Keep the two lists aligned: only complete pairs are recorded.
            if label is not None and value is not None:
                labels.append(label)
                values.append(value)
        if not labels:
            return None
        return {'labels': labels, 'values': values}

    @staticmethod
    def _extract_competitors(soup):
        """Collect name / change / market-cap cells row by row; ``None`` if there are none."""
        # NOTE(review): this takes the first tbody.table__body on the page —
        # confirm it is the competitors table.
        table = soup.find('tbody', class_="table__body")
        if table is None:
            return None
        names, chgs, mktcap = [], [], []
        # find_all returns a list of rows, so each row is searched individually.
        for row in table.find_all('tr', class_="table__row"):
            name = row.find('td', class_="table__cell w50")
            chg = row.find('td', class_="table__cell w25")
            cap = row.find('td', class_="table__cell w25 number")
            if name is None or chg is None or cap is None:
                continue
            names.append(name)
            chgs.append(chg)
            mktcap.append(cap)
        if not names:
            return None
        return {'competitor_name': names, 'price_perc_chg': chgs, 'market_cap': mktcap}

    def _scrape_key_data(self):
        """Return ``{'labels': [...], 'values': [...]}`` scraped from the current page."""
        return self._scrape_with_retries(self._extract_key_data)

    def _scrape_competitors(self):
        """Return the competitor name / change / market-cap cells of the current page."""
        return self._scrape_with_retries(self._extract_competitors)

    def _scrape_financials(self, ticker, mode=None):
        """Placeholder for the financial-statement scraper.

        Raises:
            NotImplementedError: Always; the scraper has not been written yet.
        """
        raise NotImplementedError('Scraping of the financials pages is not implemented yet.')

    def get_stock_key_data(self, ticker=None):
        """Scrape the key-data list of a stock.

        Args:
            ticker: Ticker symbol (any case). Defaults to the ticker given to
                the constructor.

        Returns:
            ``{lowercase_ticker: {label: value}}``.
        """
        symbol = self._resolve_ticker(ticker)
        self._get_site(symbol)
        _raw_fin_data = self._scrape_key_data()
        return self._cleaned_key_data_object(symbol, _raw_fin_data)

    def get_stock_financials(self, ticker=None, mode=None):
        """Open a financials page of a stock and scrape it.

        Args:
            ticker: Ticker symbol (any case). Defaults to the ticker given to
                the constructor.
            mode: ``'bs'`` / ``'bs_qt'`` for the balance sheet, ``'cf'`` /
                ``'cf_qt'`` for cash flow (``_qt`` selects the ``quarter``
                sub-page); anything else opens the main financials page.

        Raises:
            NotImplementedError: Until ``_scrape_financials`` is implemented.
        """
        symbol = self._resolve_ticker(ticker)
        self._get_site(symbol, **self._financial_pages(mode))
        try:
            _raw_fin_data = self._scrape_financials(symbol, mode)
        finally:
            self._shutdown_driver()
        return self._cleaned_key_data_object(symbol, _raw_fin_data)

In [None]:
# Create the scraper for MSFT and sign in to MarketWatch before scraping.
mw = MarketWatchETL(ticker='MSFT')
mw.login()

In [None]:
# Scrape the MSFT key-data table with the `mw` scraper and display the result.
msft_key_data = mw.get_stock_key_data(ticker='MSFT')
msft_key_data

In [None]:
# Sample key-data payload, repeated under two ticker keys. Each ticker gets
# its own dict instance because the literal is evaluated once per iteration.
testing_results = {
    symbol: {
        'Open': '$368.48',
        'Day Range': '366.10 - 371.60',
        '52 Week Range': '219.35 - 384.30',
        'Market Cap': '$2.78T',
        'Shares Outstanding': '7.43B',
        'Public Float': '7.32B',
        'Beta': '1.19',
        'Rev. per Employee': '$987.83K',
        'P/E Ratio': '35.96',
        'EPS': '$10.33',
        'Yield': '0.81%',
        'Dividend': '$0.75',
        'Ex-Dividend Date': 'Feb 14, 2024',
        'Short Interest': '49.28M',
        '% of Float Shorted': '0.67%',
        'Average Volume': '24.82M',
    }
    for symbol in ('msft', 'abcd')
}

In [None]:
import pandas as pd

# One column per ticker, one row per metric label.
a = pd.DataFrame(data=testing_results)
# a.stack().reset_index()
a

In [None]:
# analyst estimates - `_base_url + ticker + /analystestimates`
# financials - `_base_url + ticker + /financials`
# options - `_base_url + ticker + /options`

In [None]:
def _fetch_key_data(symbol):
    """Scrape the key data of ``symbol`` with a dedicated scraper instance.

    ``login()`` is called first because it starts the browser that
    ``get_stock_key_data`` navigates with; the ticker is passed explicitly
    to the scrape call.
    """
    etl = MarketWatchETL(symbol)
    etl.login()
    return etl.get_stock_key_data(symbol)


msft_key_data = _fetch_key_data('MSFT')
aapl_key_data = _fetch_key_data('AAPL')
wfc_key_data = _fetch_key_data('WFC')