In [None]:
#!pip install vaderSentiment
#!pip install beautifulsoup4
#!pip install requests


In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from bs4 import BeautifulSoup
import requests
# Shared VADER analyzer instance; get_score() calls analyzer.polarity_scores()
# on each day's combined headline text.
analyzer = SentimentIntensityAnalyzer()

### Functions to build search URLs, scrape headlines and score sentiment

In [56]:
import pandas as pd
from datetime import datetime, timedelta

def generate_urls(keyword, num_days):
    """Build one Google News search URL per day, walking backwards from yesterday.

    Args:
        keyword: Search term (e.g. a ticker symbol). It is percent-encoded
            before being placed in the query string, so terms containing
            spaces, ``&``, ``#`` and similar characters no longer corrupt
            the URL. Plain alphanumeric keywords produce the same URL as before.
        num_days: Number of consecutive days to cover, starting from
            yesterday and going back in time. Values <= 0 yield empty lists.

    Returns:
        tuple[list[str], list[str]]: ``(urls, dates)`` of equal length.
        ``urls[i]`` searches for ``"<keyword> <dd/mm/yy>"`` and ``dates[i]``
        is that same day formatted as ``dd/mm/YYYY``.
    """
    from urllib.parse import quote  # local import keeps this cell self-contained

    base_url = 'https://news.google.com/search?q={keyword}%20{date}&hl=en-IN&gl=IN&ceid=IN%3Aen'
    # safe='' so that '/' and '&' inside the keyword are escaped as well.
    encoded_keyword = quote(str(keyword), safe='')
    start_date = datetime.now() - timedelta(days=1)

    urls = []
    dates = []
    for i in range(num_days):
        # Compute the day once and derive both string formats from it.
        day = start_date - timedelta(days=i)
        urls.append(base_url.format(keyword=encoded_keyword, date=day.strftime('%d/%m/%y')))
        dates.append(day.strftime('%d/%m/%Y'))
    return urls, dates


In [57]:
def scrape_headlines(url, timeout=10):
    """Download a search-results page and return its headline text as one string.

    Headlines are read from the ``aria-label`` attribute of the page's
    ``<button>`` elements.

    Args:
        url: Page to fetch.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without a timeout a stalled connection would block
            the whole scraping run indefinitely.

    Returns:
        str: The cleaned labels joined by single spaces, or ``''`` when the
        server answers with a non-success status or no labels remain.

    Raises:
        requests.exceptions.RequestException: On connection errors or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    if not response.ok:
        # An error page (rate limit, server error) carries no headlines;
        # report "nothing found" rather than scoring the error page's buttons.
        return ''

    soup = BeautifulSoup(response.text, 'lxml')
    labels = [
        label
        for label in (button.get('aria-label') for button in soup.find_all('button'))
        if label
    ]
    # Drop the first six labels and the last one.
    # NOTE(review): presumably these are non-headline UI buttons — confirm
    # against the live page markup, since the offsets are hard-coded.
    headline_labels = labels[6:-1]
    cleaned_labels = [label.replace('More -', '').strip() for label in headline_labels]
    return ' '.join(cleaned_labels)

In [58]:
import pandas as pd
from tqdm import tqdm  

def get_score(keyword, num_days):
    """Scrape daily headlines for ``keyword`` and score their sentiment.

    Args:
        keyword: Search term / ticker symbol; stored upper-cased in the result.
        num_days: Number of days to cover, as accepted by ``generate_urls``.

    Returns:
        pandas.DataFrame: One row per day with columns ``Date``,
        ``Sentiment`` (the VADER ``compound`` score of that day's combined
        headlines, or ``None`` when no headlines were found) and
        ``Stock Symbol``. The columns are present even when there are no rows.
    """
    urls, dates = generate_urls(keyword, num_days)
    symbol = keyword.upper()
    data = []

    # The context manager closes the bar even if a request raises mid-run,
    # so an aborted run does not leave a stale bar behind.
    with tqdm(total=len(urls), desc=f'Scraping headlines for {symbol}') as progress_bar:
        for url, date in zip(urls, dates):
            headlines = scrape_headlines(url)
            if headlines:
                sentiment = analyzer.polarity_scores(headlines)['compound']
            else:
                sentiment = None
            data.append({'Date': date, 'Sentiment': sentiment, 'Stock Symbol': symbol})
            progress_bar.update(1)

    # Explicit columns keep the schema stable when `data` is empty.
    return pd.DataFrame(data, columns=['Date', 'Sentiment', 'Stock Symbol'])


In [59]:
# Score AAPL headlines over 1830 daily searches, then display the result.
df_aapl = get_score(keyword='aapl', num_days=1830)
df_aapl



Scraping headlines for AAPL:   0%|                                                            | 0/1830 [00:00<?, ?it/s][A[A

Scraping headlines for AAPL:   0%|                                                    | 1/1830 [00:01<36:34,  1.20s/it][A[A

Scraping headlines for AAPL:   0%|                                                    | 2/1830 [00:02<32:51,  1.08s/it][A[A

Scraping headlines for AAPL:   0%|                                                    | 3/1830 [00:03<33:48,  1.11s/it][A[A

Scraping headlines for AAPL:   0%|                                                    | 4/1830 [00:04<30:19,  1.00it/s][A[A

Scraping headlines for AAPL:   0%|▏                                                   | 5/1830 [00:05<30:42,  1.01s/it][A[A

Scraping headlines for AAPL:   0%|▏                                                   | 6/1830 [00:06<31:44,  1.04s/it][A[A

Scraping headlines for AAPL:   0%|▏                                                   | 7/1830 [00:07<34:40, 

Unnamed: 0,Date,Sentiment,Stock Symbol
0,19/06/2024,0.9257,AAPL
1,18/06/2024,0.9804,AAPL
2,17/06/2024,0.9895,AAPL
3,16/06/2024,0.9179,AAPL
4,15/06/2024,0.8807,AAPL
...,...,...,...
1825,21/06/2019,,AAPL
1826,20/06/2019,0.0000,AAPL
1827,19/06/2019,0.2960,AAPL
1828,18/06/2019,0.0000,AAPL


In [60]:
# Score MSFT headlines over 1830 daily searches.
df_msft = get_score(keyword='msft', num_days=1830)

Scraping headlines for MSFT: 100%|█████████████████████████████████████████████████| 1830/1830 [25:20<00:00,  1.20it/s]


In [61]:
# Display the MSFT sentiment frame.
df_msft

Unnamed: 0,Date,Sentiment,Stock Symbol
0,19/06/2024,0.9388,MSFT
1,18/06/2024,0.9912,MSFT
2,17/06/2024,0.9868,MSFT
3,16/06/2024,0.9091,MSFT
4,15/06/2024,0.7717,MSFT
...,...,...,...
1825,21/06/2019,,MSFT
1826,20/06/2019,,MSFT
1827,19/06/2019,-0.6486,MSFT
1828,18/06/2019,,MSFT


In [62]:
# Score GOOGL headlines over 1830 daily searches.
df_googl = get_score(keyword='googl', num_days=1830)

Scraping headlines for GOOGL: 100%|████████████████████████████████████████████████| 1830/1830 [34:07<00:00,  1.12s/it]


In [63]:
# Display the GOOGL sentiment frame.
df_googl

Unnamed: 0,Date,Sentiment,Stock Symbol
0,19/06/2024,0.9860,GOOGL
1,18/06/2024,0.5391,GOOGL
2,17/06/2024,0.9915,GOOGL
3,16/06/2024,0.4932,GOOGL
4,15/06/2024,0.7184,GOOGL
...,...,...,...
1825,21/06/2019,0.4404,GOOGL
1826,20/06/2019,,GOOGL
1827,19/06/2019,0.4939,GOOGL
1828,18/06/2019,0.0000,GOOGL


In [64]:
# Score AMZN headlines over 1830 daily searches.
df_amzn = get_score(keyword='amzn', num_days=1830)

Scraping headlines for AMZN: 100%|█████████████████████████████████████████████████| 1830/1830 [36:20<00:00,  1.19s/it]


In [65]:
# Display the AMZN sentiment frame.
df_amzn

Unnamed: 0,Date,Sentiment,Stock Symbol
0,19/06/2024,0.9850,AMZN
1,18/06/2024,0.9282,AMZN
2,17/06/2024,0.9965,AMZN
3,16/06/2024,0.9981,AMZN
4,15/06/2024,0.9996,AMZN
...,...,...,...
1825,21/06/2019,0.9075,AMZN
1826,20/06/2019,0.9793,AMZN
1827,19/06/2019,-0.6391,AMZN
1828,18/06/2019,0.9931,AMZN


In [66]:
# Keep an independent copy of each scraped frame so the originals can be
# recovered without re-running the scrape.
df_ap, df_ms, df_g, df_am = (
    frame.copy() for frame in (df_aapl, df_msft, df_googl, df_amzn)
)

In [69]:
# Stack the four per-stock frames row-wise; each frame keeps its own
# index labels, so labels repeat across stocks.
per_stock_frames = [df_aapl, df_msft, df_googl, df_amzn]
combined_df = pd.concat(per_stock_frames, axis=0)
combined_df

Unnamed: 0,Date,Sentiment,Stock Symbol
0,19/06/2024,0.9257,AAPL
1,18/06/2024,0.9804,AAPL
2,17/06/2024,0.9895,AAPL
3,16/06/2024,0.9179,AAPL
4,15/06/2024,0.8807,AAPL
...,...,...,...
1825,21/06/2019,0.9075,AMZN
1826,20/06/2019,0.9793,AMZN
1827,19/06/2019,-0.6391,AMZN
1828,18/06/2019,0.9931,AMZN


In [70]:
# Forward-fill missing sentiment scores separately for each stock. A plain
# DataFrame.ffill() on the stacked frame lets a gap at the top of one stock's
# rows inherit the last score of the stock stacked above it.
# A gap at the very top of a stock's rows therefore stays NaN.
# NOTE(review): rows are ordered newest-first, so this fill copies a later
# day's score into an earlier day — confirm that direction is intended.
filled_sentiment = combined_df.groupby('Stock Symbol')['Sentiment'].ffill()
# Index labels may repeat across stocks, so assign by position rather than
# aligning on labels.
combined_df = combined_df.assign(Sentiment=filled_sentiment.to_numpy())
(combined_df).head()

Unnamed: 0,Date,Sentiment,Stock Symbol
0,19/06/2024,0.9257,AAPL
1,18/06/2024,0.9804,AAPL
2,17/06/2024,0.9895,AAPL
3,16/06/2024,0.9179,AAPL
4,15/06/2024,0.8807,AAPL


In [71]:
# Count the remaining missing values in each column.
combined_df.isna().sum()

Date            0
Sentiment       0
Stock Symbol    0
dtype: int64

In [72]:
# Persist the combined sentiment table as CSV.
# NOTE(review): the destination is an absolute, machine-specific Windows path,
# so this cell fails on any other machine — consider a configurable relative path.
# NOTE(review): the DataFrame index is written as the first CSV column; confirm
# downstream readers expect it before adding index=False.
combined_df.to_csv('C:\\Users\\Admin\\Desktop\\stock_analysis\\sentiment_data.csv')