In [1]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
import bs4

# Download NLTK resources
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import requests

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/valentinaolaritei/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/valentinaolaritei/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def plot_number_sources(df):
    """
    Draw a bar chart showing how many rows the dataframe holds per source.

    Args:
        df (pd.DataFrame): dataframe with a 'source' column; every distinct
            value becomes one bar, ordered by the sorted source labels.

    Returns:
        None. The chart is rendered through plt.show().
    """
    bar_style = {
        'kind': 'bar',
        'title': 'Count of sources',
        'alpha': 1,
        'rot': 90,
        'color': 'green',
        'edgecolor': 'black',
        'figsize': (8, 5),
    }

    # Occurrences of each source, ordered by source label rather than by count.
    counts_by_source = df['source'].value_counts().sort_index()

    axis = counts_by_source.plot(**bar_style)
    axis.set_xlabel('Sources')
    plt.show()

In [None]:
def plot_number_gisc_sectors(df):
    """
    Draw a bar chart showing how many rows the dataframe holds per sector.

    Args:
        df (pd.DataFrame): dataframe with a 'gisc_sectors' column; every
            distinct value becomes one bar, ordered by the sorted sector labels.

    Returns:
        None. The chart is rendered through plt.show().
    """
    # Count the rows of each sector, then order the bars by sector label.
    sector_counts = df['gisc_sectors'].value_counts()
    sector_counts = sector_counts.sort_index()

    axis = sector_counts.plot(
        kind='bar',
        title='Number of GISC Sector',
        figsize=(8, 5),
        rot=90,
        alpha=1,
        color='green',
        edgecolor='black',
    )
    axis.set_xlabel('GISC Sector')
    plt.show()

In [3]:
def plot_number_articles_every_year(df):
    """
    Draw a bar chart with the number of articles published in each year.

    Args:
        df (pd.DataFrame): dataframe with a 'date' column; the column is read
            through the pandas ``.dt`` accessor, so it has to be datetime-like.

    Returns:
        None. The chart is rendered through plt.show().

    Side effects:
        A 'year' column derived from 'date' is written onto the dataframe
        that was passed in (an existing 'year' column is overwritten).
    """
    publication_year = df['date'].dt.year
    df['year'] = publication_year

    # One bar per calendar year, in chronological order.
    yearly_counts = df['year'].value_counts().sort_index()

    axis = yearly_counts.plot(
        kind='bar',
        title='Count of Articles per year',
        figsize=(8, 5),
        rot=90,
        alpha=1,
        color='green',
        edgecolor='black',
    )
    axis.set_xlabel('Year')
    axis.set_ylabel('Articles')
    plt.show()

In [4]:
def plot_monthly_articles_last_9_years(df, start_year=2016):
    """
    Plot the number of articles per month, one bar chart per year.

    Only articles dated in ``start_year`` or later are plotted. The charts are
    laid out three per row; the grid has at least three rows and grows when
    the data covers more than nine distinct years, so every year gets its own
    panel. Panels without a year are hidden.

    Args:
        df (pd.DataFrame): dataframe with a 'date' column; the column is read
            through the pandas ``.dt`` accessor, so it has to be datetime-like.
        start_year (int): first year to include. Defaults to 2016.

    Returns:
        None. The figure is rendered through plt.show().

    Side effects:
        'year' and 'month' columns derived from 'date' are written onto the
        dataframe that was passed in (existing columns are overwritten).
    """
    n_cols = 3
    min_rows = 3  # keeps the 3x3 layout whenever nine or fewer years are shown

    # Extract year and month from the 'date' column
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month

    # Keep only the recent period; bind it to a new name so the caller's
    # frame is not shadowed by the filtered copy.
    recent = df[df['year'] >= start_year]

    # Number of articles for every (year, month) pair
    monthly_counts = recent.groupby(['year', 'month']).size().reset_index(name='count')

    # Size the grid from the data: a fixed 3x3 grid raised IndexError as soon
    # as a tenth year was present.
    n_years = monthly_counts['year'].nunique()
    n_rows = max(min_rows, -(-n_years // n_cols))  # ceiling division
    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(15, 5 * n_rows))

    # axes.flat walks the grid row by row, so years fill it left to right,
    # top to bottom.
    panels = list(axes.flat)
    for ax, (year, group) in zip(panels, monthly_counts.groupby('year')):
        group.plot(x='month', y='count', kind='bar', ax=ax, legend=False, color='green', edgecolor='black')
        ax.set_title(year)
        ax.set_xlabel('Month')
        ax.set_ylabel('Number of Articles')

        # Add legend
        ax.legend(['Articles'])

    # Hide the panels that did not receive a year
    for ax in panels[n_years:]:
        ax.set_visible(False)

    # Adjust the spacing between subplots
    plt.subplots_adjust(hspace=0.25)

    # Show the plots
    plt.show()

In [None]:
def plot_number_stock_article(df):
    """
    Draw a bar chart with the number of articles per stock.

    The x tick labels are blanked out, so the chart shows the distribution of
    article counts without naming the individual stocks.

    Args:
        df (pd.DataFrame): dataframe with a 'stock' column; every distinct
            value becomes one bar, ordered by the sorted stock labels.

    Returns:
        None. The chart is rendered through plt.show().
    """
    bar_style = {
        'kind': 'bar',
        'title': 'Articles per stock',
        'alpha': 1,
        'rot': 90,
        'color': 'green',
        'edgecolor': 'black',
        'figsize': (8, 5),
    }

    articles_per_stock = df['stock'].value_counts().sort_index()
    axis = articles_per_stock.plot(**bar_style)

    axis.set_xlabel('Stocks')
    axis.set_ylabel('Number of Articles')
    # Blank out the tick labels on the x axis.
    axis.set_xticklabels([])
    plt.show()

In [None]:
# function used to compute the common words

def preprocess(text):
    """
    Split a piece of text into lowercase, purely alphabetic words.

    Args:
        text (str): text to tokenize. Anything that is not a string (for
            example None or the NaN pandas uses for a missing title) is
            treated as empty text.

    Returns:
        list[str]: the words of the text, lowercased, with every character
        other than a-z and whitespace removed. Empty when there are no words.
    """
    # Missing values would otherwise crash on .lower(); numbers contain no
    # letters, so an empty word list is the right answer for them as well.
    if not isinstance(text, str):
        return []

    # Convert text to lowercase
    text = text.lower()
    # Remove non-alphabetic characters (this includes punctuation and digits)
    text = re.sub(r'[^a-z\s]', '', text)
    # Split text into words
    return text.split()

# Create a function to get the most common words for each stock
def get_top_word_frequencies(group, top_n=20):
    """
    Count the most frequent words across the titles of a group.

    Args:
        group: object whose 'title' entry is an iterable of titles
            (for example a dataframe or one group of a groupby).
        top_n (int): how many of the most frequent words to keep.

    Returns:
        dict: word -> number of occurrences for the top_n most common words,
        ordered from most to least frequent.
    """
    # Feed every preprocessed word of every title into a single counter.
    frequencies = Counter(
        word
        for headline in group['title']
        for word in preprocess(headline)
    )
    top_words = frequencies.most_common(top_n)
    return {word: count for word, count in top_words}

In [None]:
def plot_more_2500_stock_article(df, min_articles=2500):
    """
    Draw a bar chart with the article counts of the most covered stocks.

    Only stocks with strictly more than ``min_articles`` rows in the
    dataframe are shown.

    Args:
        df (pd.DataFrame): dataframe with a 'stock' column; every row counts
            as one article for that stock.
        min_articles (int): a stock is plotted only when its article count
            exceeds this value. Defaults to 2500.

    Returns:
        None. The chart is rendered through plt.show().
    """
    # Keep the stocks above the threshold, ordered by stock label
    article_counts = df['stock'].value_counts()
    busiest_stocks = article_counts[article_counts > min_articles].sort_index()

    # Plot only the filtered data; the title follows the threshold in use
    plot_stocks = busiest_stocks.plot(kind='bar',
                                      title=f'Stocks with more than {min_articles:,} articles',
                                      alpha=1,
                                      rot=90,
                                      color='green',
                                      edgecolor='black',
                                      figsize=(8, 5))

    # Label both axes; tick labels stay visible because few stocks remain
    plot_stocks.set_xlabel('Stocks')
    plot_stocks.set_ylabel('Number of Articles')
    plt.show()

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')
import pandas as pd


def text_sentimental(df):
    """
    Compute sentiment scores for the headlines of financial news.

    Every value of the 'title' column is scored with NLTK's
    SentimentIntensityAnalyzer (VADER).

    Args:
        pd.DataFrame: dataframe which contains the headlines to analyse in
            the column 'title'

    Returns:
        pd.DataFrame: the same dataframe object as the one in the input, with
            the columns 'positivity_text', 'neutrality_text',
            'negativity_text' and 'compound_text' added (or overwritten)
    """
    # Build the analyser once: constructing it for every row repeats its
    # set-up work without changing any score.
    sid = SentimentIntensityAnalyzer()

    # One dict of pos / neu / neg / compound scores per headline, in row order
    polarity = pd.DataFrame(
        [sid.polarity_scores(title) for title in df['title']],
        columns=['pos', 'neu', 'neg', 'compound'],
        dtype='float64',
    )

    # Assign by position (plain arrays), not by index label: writing through
    # df.loc[label] would hit every row sharing a label when the index
    # contains duplicates.
    df['positivity_text'] = polarity['pos'].to_numpy()
    df['neutrality_text'] = polarity['neu'].to_numpy()
    df['negativity_text'] = polarity['neg'].to_numpy()
    df['compound_text'] = polarity['compound'].to_numpy()

    return df

In [None]:
def get_sp500_indexes():
    """
    Scrape the list of S&P 500 companies from Wikipedia.

    The first table carrying both the 'wikitable' and 'sortable' classes is
    read; its header row is skipped and the first three cells of every
    remaining row are used.
    NOTE(review): the cells are taken to be ticker symbol, company name and
    sector, in that order; confirm against the current page layout.

    Args:
        None

    Returns:
        pd.DataFrame: one row per company with the columns 'ticker'
            (lowercased), 'stock' and 'gisc_sectors', scraped from
            https://en.wikipedia.org/wiki/List_of_S%26P_500_companies

    Raises:
        requests.RequestException: the page could not be fetched (network
            error, timeout or HTTP error status).
        ValueError: the page contains no matching table.
    """
    url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'

    # A timeout keeps the notebook from hanging on a stalled connection, and
    # raise_for_status stops an error page from being parsed as data.
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()

    soup = bs4.BeautifulSoup(resp.text, 'lxml')

    # The CSS selector matches on the two classes even when the table carries
    # additional ones; an exact class-string lookup would miss such a table.
    table = soup.select_one('table.wikitable.sortable')
    if table is None:
        raise ValueError(f"No 'wikitable sortable' table found at {url}")

    records = []
    # Skip the header row, then read the first three cells of each row
    for row in table.find_all('tr')[1:]:
        cells = row.find_all('td')
        if len(cells) < 3:
            # Not a data row (e.g. a row made of header cells only)
            continue
        ticker, stock, gisc_sector = (cell.text.replace('\n', '') for cell in cells[:3])
        records.append({
            'ticker': ticker.lower(),
            'stock': stock,
            'gisc_sectors': gisc_sector,
        })

    return pd.DataFrame(records, columns=['ticker', 'stock', 'gisc_sectors'])