# **Dinnect Data Science Hackathon**
# Yahoo Finance Conversation Scraper
## Written by: Hamidreza Salahi
### 16 Oct 2024
## Email: salahi92.h@gmail.com


In [1]:
import numpy as np
import pandas as pd


import yfinance as yf

from datetime import datetime, timedelta
import pytz
import json
import requests
from bs4 import BeautifulSoup
import time

import demoji
import re
import string
from nltk.corpus import stopwords,wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from nltk import pos_tag
import attr
import nltk
from nltk.util import ngrams

import urllib.request
import csv
from autocorrect import Speller

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import warnings

# Silence every warning category for the rest of the session so that the
# notebook output only shows the scraper's own prints.
warnings.simplefilter("ignore")

In [5]:
# Symbols whose Yahoo Finance conversation pages are scraped.
tickers = ["ONCO", "CNEY", "TNXP", "APLD", "KTTA"]

# Boundaries of the scraping window as plain `date` objects:
# `end_date` is the newest day, `latest_start_date` the oldest one.
end_date, latest_start_date = (
    datetime.strptime(day, '%Y-%m-%d').date()
    for day in ('2024-09-26', '2022-04-13')
)

In [None]:
# Promote both boundary dates to datetimes at midnight (00:00:00).
# Despite its name, `earliest_start_datetime` is built from `end_date`,
# i.e. it is the upper bound of the window.
latest_start_datetime, earliest_start_datetime = (
    datetime.combine(day, datetime.min.time())
    for day in (latest_start_date, end_date)
)

# Unix timestamps of the two bounds. The datetimes are naive, so
# `.timestamp()` interprets them in the machine's local timezone.
cutoff_time_epoch = latest_start_datetime.timestamp()   # lower bound
max_epoch_time = earliest_start_datetime.timestamp()    # upper bound

# Function to scrape comments for a single ticker
def scrape_comments_for_ticker(ticker, msg_count_per_page=60, cutoff_time_epoch=cutoff_time_epoch, max_epoch_time=max_epoch_time):
    """Scrape the Yahoo Finance conversation (comments and replies) of one ticker.

    The ticker's community page is fetched once to read the embedded
    ``#spotim-config`` JSON, which supplies the ids needed to query the
    spot.im conversation API. The conversation is then read page by page.

    Args:
        ticker: Ticker symbol used to build the community page URL.
        msg_count_per_page: Number of messages requested per API call; also
            the step between consecutive offsets.
        cutoff_time_epoch: Lower bound (Unix seconds). Paging stops once the
            last top-level comment of a page is older than this value.
        max_epoch_time: Upper bound (Unix seconds). Comments and replies
            written after this value are skipped.

    Returns:
        A list of ``[text, written_at]`` pairs, one per text content item of
        every kept comment and reply.

    Raises:
        Errors raised while fetching the community page or the first API
        response propagate to the caller. Errors on an individual page are
        printed and that page is skipped (best effort).
    """
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0'
    # Without a timeout a stalled connection would block the scrape forever.
    request_timeout = 30

    # Yahoo Finance community page for the ticker
    page_url = f'https://finance.yahoo.com/quote/{ticker}/community?p={ticker}'
    page = requests.get(page_url, headers={'User-Agent': user_agent}, timeout=request_timeout)
    soup = BeautifulSoup(page.text, 'html.parser')
    config = json.loads(soup.select_one('#spotim-config').get_text(strip=True))['config']

    # The ids depend only on the ticker page, so they are resolved once and
    # reused for every offset instead of re-downloading the page each time.
    api_url = "https://api-2-0.spot.im/v1.0.0/conversation/read"
    post_id = config['uuid'].replace('_', '$')
    conversation_id = config['spotId'] + post_id
    headers = {
        'User-Agent': user_agent,
        'Content-Type': 'application/json',
        'x-spot-id': config['spotId'],
        'x-post-id': post_id,
    }

    def fetch_conversation(offset):
        """Return the ``conversation`` object of the API response at *offset*."""
        payload = json.dumps({
            "conversation_id": conversation_id,
            "count": msg_count_per_page,
            "offset": offset
        })
        response = requests.post(api_url, headers=headers, data=payload, timeout=request_timeout)
        return response.json()['conversation']

    def text_items(content_items, written_at):
        """Build ``[text, written_at]`` pairs for the text parts of one message."""
        return [[item['text'], written_at] for item in content_items if item['type'] == 'text']

    time.sleep(2)
    # Make the first request to get the total number of messages
    total_num_msgs = fetch_conversation(0)['messages_count']
    time.sleep(5)

    # Initialize the list to store parsed data
    parsed_data = []

    # Iterate over offsets in chunks to get all messages
    for offset in range(0, total_num_msgs, msg_count_per_page):
        try:
            comments = fetch_conversation(offset)['comments']
            if not comments:
                # Nothing left to read; further offsets would be empty too.
                break

            for comment in comments:
                # Read the timestamp from the comment itself so the stop
                # condition below never relies on a stale or unset value.
                comment_time_epoch = comment['written_at']
                if comment_time_epoch <= max_epoch_time:
                    parsed_data.extend(text_items(comment['content'], comment_time_epoch))

                if comment['replies_count'] != 0:
                    # `replies` may be missing even when the count is non-zero;
                    # treat that as "no inlined replies" instead of failing.
                    for reply in comment.get('replies') or []:
                        reply_time = reply['written_at']
                        # Apply the same upper bound as for top-level comments.
                        if reply_time > max_epoch_time:
                            continue
                        parsed_data.extend(text_items(reply['content'], reply_time))

            # Stop paging once the last comment of the page predates the window.
            # NOTE(review): assumes the API returns comments newest first - confirm.
            if comment_time_epoch < cutoff_time_epoch:
                break
            time.sleep(5)
        except Exception as e:
            # Deliberate best effort: report the failed page and move on.
            print(f"Error scraping offset {offset} for ticker {ticker}: {e}")
            time.sleep(5)
    return parsed_data

# Collect the scraped [text, timestamp] pairs of every ticker, keyed by symbol.
all_tickers_parsed_data = {}

for ticker in tickers:
    print(f"Scraping comments for {ticker}...")
    all_tickers_parsed_data[ticker] = scrape_comments_for_ticker(ticker)
    # Pause between tickers before hitting the site again.
    time.sleep(10)


In [None]:
# Persist the raw scrape so later cells can run without scraping again.
with open('all_tickers_parsed_data.json', 'w') as json_file:
    json_file.write(json.dumps(all_tickers_parsed_data))

In [7]:
# Restore the scraped data from the JSON file written by the previous cell.
with open('all_tickers_parsed_data.json', 'r') as json_file:
    all_tickers_parsed_data = json.loads(json_file.read())

In [8]:
# One DataFrame per ticker with the comment text and its calendar date.
all_tickers_comments_df = {}

for ticker, ticker_comments in all_tickers_parsed_data.items():
    comment_df = pd.DataFrame(ticker_comments, columns=['text', 'date'])
    # Epoch seconds -> UTC timestamps -> New York local time.
    utc_times = pd.to_datetime(comment_df['date'], unit='s', utc=True)
    comment_df['date'] = utc_times.dt.tz_convert('America/New_York')
    # Keep only the calendar day of each comment.
    local_times = pd.to_datetime(comment_df['date'])
    comment_df['date'] = local_times.dt.date
    all_tickers_comments_df[ticker] = comment_df

In [9]:
def clean_text(text):
    """Normalize one raw comment into a space-joined string of lemmatized tokens.

    Steps: lowercase, replace ``#``/``$`` with spaces, remove links, replace
    emojis with their descriptions, remove digits and ``-->``, strip
    ``<...>`` tags, tokenize with ``TweetTokenizer``, drop stop words
    (negations are kept), punctuation, tokens of two characters or fewer and
    tokens containing a dot, then lemmatize what remains.

    Args:
        text: The raw comment string.

    Returns:
        The cleaned tokens joined by single spaces (possibly an empty string).
    """
    # Initialize the Twitter-aware tokenizer
    tk = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    # Initialize the lemmatizer
    lemmatizer = WordNetLemmatizer()
    # Negations are kept out of the stop words because removing them would
    # change the meaning of a comment.
    negative_verbs = {"shan't", 'shouldn', "shouldn't", 'wasn', 'weren', 'won', 'wouldn', 'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven', 'isn', 'ma', 'mightn', 'mustn', "mustn't", 'needn', "needn't", "wouldn't", "won't", "weren't", "wasn't", "couldn", "not", "nor", "no", "mightn't", "isn't", "haven't", "hadn't", "hasn't", "didn't", "doesn't", "aren't", "don't", "couldn't", "never"}
    # A set gives constant-time membership tests in the token filter below.
    stop_words = set(stopwords.words('english') + ["i'll", "i'm", "should", "could"]) - negative_verbs

    # Lowercase the comment
    lower_comment = text.lower()
    # Remove hashtag and cashtag symbols
    comment = re.sub(r"[#$]", " ", lower_comment)
    # Remove links: only the URL itself (up to the next whitespace), so the
    # words that follow a link on the same line are preserved.
    comment = re.sub(r"https?://\S+", " ", comment)
    # Translate emojis into their descriptions
    comment = demoji.replace_with_desc(comment)
    # Remove digits and "-->" arrows
    comment = re.sub(r"[0-9]|-->", "", comment)
    # Remove anything that looks like a <...> tag
    comment = re.sub(r"<.*?>", " ", comment)
    # Tokenize the comment with the Twitter tokenizer
    tokens = tk.tokenize(comment)
    # Keep tokens that are not stop words or punctuation, are longer than two
    # characters and contain no dot, then lemmatize them.
    tokens = [
        lemmatizer.lemmatize(word, get_wordnet_pos(word))
        for word in tokens
        if word not in stop_words
        and word not in string.punctuation
        and len(word) > 2
        and "." not in word
    ]
    # Return the tokens as one sentence
    return " ".join(tokens)

def get_wordnet_pos(word):
    """Return the WordNet POS constant to pass to ``lemmatize()`` for *word*.

    The word is tagged on its own with ``pos_tag``; the first letter of the
    resulting tag selects adjective, noun, verb or adverb. Any other tag
    falls back to noun.
    """
    _, tag = pos_tag([word])[0]
    initial = tag[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unmapped tag are treated as nouns.
    return wordnet.NOUN

In [10]:
# Add a `cleaned` column holding the normalized version of every comment.
for ticker, comment_df in all_tickers_comments_df.items():
    comment_df['cleaned'] = comment_df["text"].apply(clean_text)


In [22]:
# Each ticker's comments go to "<ticker>.csv"; the mapping records which
# file belongs to which ticker.
csv_file_mapping = {ticker: f"{ticker}.csv" for ticker in all_tickers_comments_df}

for ticker, file_name in csv_file_mapping.items():
    df = all_tickers_comments_df[ticker]
    # Rows with any missing value are removed in place before writing.
    df.dropna(inplace=True)
    df.to_csv(file_name, index=False)

# Save the mapping of ticker names to file paths as a JSON file
with open('ticker_file_mapping.json', 'w') as json_file:
    json_file.write(json.dumps(csv_file_mapping))

In [12]:
# Read back which CSV file belongs to which ticker.
with open('ticker_file_mapping.json', 'r') as json_file:
    csv_file_mapping = json.loads(json_file.read())

# Rebuild the ticker -> DataFrame dictionary from the CSV files.
all_tickers_comments_df = {
    ticker: pd.read_csv(file_name)
    for ticker, file_name in csv_file_mapping.items()
}


In [13]:
# Score every comment with VADER and average the compound score per day.
sent = SentimentIntensityAnalyzer()
tickers_pol_df = {}
for ticker, comments in all_tickers_comments_df.items():
    # Values read back from CSV may be NaN (empty or "NA"-like comments);
    # scoring those would fail, so they get NaN and are ignored by the mean.
    comments['polarity'] = comments['text'].apply(
        lambda s: sent.polarity_scores(s)['compound'] if isinstance(s, str) else float('nan')
    )
    # Parse the dates and truncate them to midnight. This replaces the
    # deprecated `infer_datetime_format` flag and the round trip through a
    # two-digit-year string.
    comments['date'] = pd.to_datetime(comments['date']).dt.normalize()
    # Daily mean polarity, indexed by date.
    Pol_df = pd.DataFrame(comments.groupby('date')['polarity'].mean())
    tickers_pol_df[ticker] = Pol_df

In [25]:
# Each ticker's daily polarity goes to "<ticker>_Pol.csv"; the mapping
# records which file belongs to which ticker.
csv_Pol_mapping = {ticker: f"{ticker}_Pol.csv" for ticker in tickers_pol_df}

for ticker, file_name in csv_Pol_mapping.items():
    df = tickers_pol_df[ticker]
    # Days without a polarity value are removed in place before writing.
    df.dropna(inplace=True)
    # The date index is written as the first CSV column.
    df.to_csv(file_name, index=True)

# Save the mapping of ticker names to file paths as a JSON file
with open('ticker_Pol_mapping.json', 'w') as json_file:
    json_file.write(json.dumps(csv_Pol_mapping))