In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
cd drive/MyDrive/IE7500_GroupB/Notebooks

/content/drive/.shortcut-targets-by-id/1hXOWgLy2A2GiwQDqG1SudIATn6Z_m1b6/IE7500_GroupB/Notebooks


# Load libraries

In [25]:
# Load libraries

import pandas as pd
import numpy as np
import re
import os

import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px


from datetime import datetime
from wordcloud import WordCloud

import io
import base64


# Get dataset

In [4]:
df = pd.read_csv("saved_dfs/df_for_models.csv", low_memory=False)

In [5]:
# Two-column lookup saved alongside the dataset: the first column is used as
# the key and the second as the value of the dtype mapping built below.
dtype_table = pd.read_csv("saved_dfs/merged_df_dict.csv")
column_names = dtype_table.iloc[:, 0]
dtype_names = dtype_table.iloc[:, 1]
dtypes_dict = dict(zip(column_names, dtype_names))

# Re-apply the saved dtypes, since a CSV round-trip does not preserve them.
df = df.astype(dtypes_dict)

# Describe Dataset

In [6]:
df.head(3)

Unnamed: 0,headline,url,publisher,date,stock,tokens,normalized_tokens,filtered_tokens,lemmas,sentiment_score,...,sector_Industrials,sector_Miscellaneous,sector_Real Estate,sector_Technology,sector_Telecommunications,sector_Utilities,recommendation_label,publisher_label,country_label,industry_label
0,Agilent Technologies Announces Pricing of $5……...,http://www.gurufocus.com/news/1153187/agilent-...,GuruFocus,2020-06-01,A,"['Agilent', 'Technologies', 'Announces', 'Pric...","['agilent', 'technologies', 'announces', 'pric...","['agilent', 'technologies', 'announces', 'pric...","['agilent', 'technology', 'announces', 'pricin...",0.0,...,True,False,False,False,False,False,1,4,45,18
1,Agilent (A) Gears Up for Q2 Earnings: What's i...,http://www.zacks.com/stock/news/931205/agilent...,Zacks,2020-05-18,A,"['Agilent', '(', 'A', ')', 'Gears', 'Up', 'for...","['agilent', 'a', 'gears', 'up', 'for', 'q2', '...","['agilent', 'gears', 'q2', 'earnings', 'cards']","['agilent', 'gear', 'q2', 'earnings', 'card']",0.0,...,True,False,False,False,False,False,1,16,45,18
2,J.P. Morgan Asset Management Announces Liquida...,http://www.gurufocus.com/news/1138923/jp-morga...,GuruFocus,2020-05-15,A,"['J.P.', 'Morgan', 'Asset', 'Management', 'Ann...","['morgan', 'asset', 'management', 'announces',...","['morgan', 'asset', 'management', 'announces',...","['morgan', 'asset', 'management', 'announces',...",0.3612,...,True,False,False,False,False,False,1,4,45,18


In [9]:
df.dtypes

Unnamed: 0,0
headline,object
url,object
publisher,object
date,datetime64[ns]
stock,object
tokens,object
normalized_tokens,object
filtered_tokens,object
lemmas,object
sentiment_score,float64


In [11]:
df.isnull().sum()

Unnamed: 0,0
headline,0
url,0
publisher,0
date,0
stock,0
tokens,0
normalized_tokens,0
filtered_tokens,0
lemmas,0
sentiment_score,0


In [26]:
def analyze_basic_stats(df):
    """Print a quick textual overview of ``df``.

    Writes three sections to standard output, in this order: the frame's
    shape, the ``describe()`` summary, and the per-column null counts limited
    to columns that contain at least one null.

    Args:
        df: pandas DataFrame to summarise.

    Returns:
        None.
    """
    rows_and_cols = df.shape
    print(f"Dataset Shape: {rows_and_cols}")

    print("\nBasic Statistics for Numerical Columns:")
    summary = df.describe()
    print(summary)

    null_counts = df.isnull().sum()
    has_nulls = null_counts > 0
    print("\nMissing Values by Column:")
    print(null_counts.loc[has_nulls])

In [27]:
analyze_basic_stats(df)

Dataset Shape: (1845559, 46)

Basic Statistics for Numerical Columns:
                                date  sentiment_score    Market Cap  \
count                        1845559     1.845559e+06  9.992610e+05   
mean   2016-11-25 21:37:01.463524352     7.433652e-02  3.065176e+10   
min              1969-12-31 00:00:00    -9.451000e-01  0.000000e+00   
25%              2015-04-14 00:00:00     0.000000e+00  1.377662e+09   
50%              2017-02-09 00:00:00     0.000000e+00  7.318754e+09   
75%              2019-01-04 00:00:00     2.732000e-01  2.917714e+10   
max              2020-06-04 00:00:00     9.638000e-01  3.207062e+12   
std                              NaN     2.916777e-01  8.746174e+10   

            IPO Year          year         month   day_of_week  \
count  441214.000000  1.845559e+06  1.845559e+06  1.845559e+06   
mean     2004.204912  2.016427e+03  6.251557e+00  2.188561e+00   
min      1929.000000  1.969000e+03  1.000000e+00  0.000000e+00   
25%      1997.000000  2.01

In [15]:
stock_counts = df['Name'].value_counts()
print(stock_counts.head(5))

Name
Kroger Company (The) Common Stock     3314
JP Morgan Chase & Co. Common Stock    2873
China Fund Inc. (The) Common Stock    2698
Nordstrom Inc. Common Stock           2690
Ero Copper Corp. Common Shares        2669
Name: count, dtype: int64


In [16]:
ticker_counts = df['stock'].value_counts()
print(ticker_counts.head(5))

stock
KR      3314
GXC     3238
PGJ     3082
YINN    3027
JPM     2873
Name: count, dtype: int64


In [35]:
average_sentiment = df['sentiment_score'].mean()
print(f"Average Sentiment Score: {average_sentiment}")

average_word_count = df['word_count'].mean()
print(f"Average Token Count: {average_word_count}")

Average Sentiment Score: 0.07433651782468073
Average Token Count: 9.181637108323278


In [37]:
# top 10 publishers
publisher_counts = df['publisher'].value_counts()

print(publisher_counts.head(10))


publisher
Seeking Alpha                897219
Zacks                        438107
GuruFocus                    212433
Investor's Business Daily    134890
webmaster                     59669
TalkMarkets                   27209
Traders Huddle                16484
TheStreet.Com                 16023
Vetr                          10866
Small Cap Network              8059
Name: count, dtype: int64


# Visualizations

In [15]:
# Bar chart
# The higher the variance, the more sensational the news source (publisher)

# Variance of the sentiment score per publisher, highest first.
sentiment_variance = df.groupby('publisher')['sentiment_score'].var().sort_values(ascending=False)
# The series is already sorted in descending order, so the ten most variable
# publishers are simply its first ten entries (no second sort needed).
top_10_var_publishers = sentiment_variance.head(10).reset_index()


fig = px.bar(
    top_10_var_publishers,
    x="publisher",
    y="sentiment_score",
    title="Top 10 Publishers with Highest Sentiment Variance",
    labels={"publisher": "Publisher", "sentiment_score": "Sentiment Variance"},
    color="sentiment_score",
    color_continuous_scale="reds"
)

fig.show()


In [17]:
# Heatmap
# By looking at average sector sentiment scores across calendar months, we can see that:
# February has lower sentiment scores on average across sectors
# December has higher sentiment scores on average across sectors

def sentiment_heatmap(df):
    """Build an annotated heatmap of mean sentiment per sector and month.

    Args:
        df: DataFrame with 'sentiment_score', 'Sector' and 'month' columns.
            Month values 1-12 are shown as short month names; any other
            value is shown on the axis as its string form.

    Returns:
        The plotly figure (not shown here; the caller displays it).
    """
    pivot_data = df.pivot_table(
        values='sentiment_score',
        index='Sector',
        columns='month',
        aggfunc='mean'
    )

    month_names = {
        1: 'Jan', 2: 'Feb', 3: 'Mar',
        4: 'Apr', 5: 'May', 6: 'Jun',
        7: 'Jul', 8: 'Aug', 9: 'Sep',
        10: 'Oct', 11: 'Nov', 12: 'Dec'
    }
    # Readable axis labels, in the same order as the pivot's month columns.
    month_labels = [month_names.get(month, str(month)) for month in pivot_data.columns]

    fig = px.imshow(
        pivot_data,
        labels=dict(x="Month", y="Sector", color="Sentiment Score"),
        x=month_labels,
        y=pivot_data.index,
        color_continuous_scale='RdYlGn',
        aspect="auto",
        title="Average Sentiment Score by Sector and Month"
    )

    for sector, row in pivot_data.iterrows():
        for label, value in zip(month_labels, row):
            # A sector/month pair with no headlines has no mean; leave the
            # cell blank instead of printing "nan".
            if pd.isna(value):
                continue
            fig.add_annotation(
                x=label,
                y=sector,
                text=f"{value:.2f}",
                showarrow=False,
                font=dict(color="black" if abs(value) < 0.5 else "white")
            )

    return fig


In [18]:
sentiment_heatmap(df)

In [23]:
# Bubble chart
# Each bubble is a sector: x = number of headlines (coverage), y = average sentiment score
# Bubble size shows the sector's average market cap; bubble colour repeats the sentiment score

def bubble_chart(df):
    """Build a per-sector bubble chart of coverage, sentiment and market cap.

    Each sector becomes one bubble: x is the number of headlines, y and the
    colour are the mean sentiment score, and the bubble size is the mean
    market cap.

    Args:
        df: DataFrame with 'Sector', 'sentiment_score' and 'Market Cap'
            columns.

    Returns:
        The plotly figure (not shown here; the caller displays it).
    """
    # One pass over the groups: both means plus the row count per sector.
    grouped_data = (
        df.groupby('Sector')
        .agg(**{
            'sentiment_score': ('sentiment_score', 'mean'),
            'Market Cap': ('Market Cap', 'mean'),
            'headline_count': ('sentiment_score', 'size'),
        })
        .reset_index()
    )

    # Marker sizes must be real numbers: a sector whose market cap is missing
    # on every row has a NaN mean, which plotly rejects for `size`.
    grouped_data = grouped_data.dropna(subset=['Market Cap'])

    fig = px.scatter(
        grouped_data,
        x='headline_count',
        y='sentiment_score',
        size='Market Cap',
        color='sentiment_score',
        hover_name='Sector',
        text='Sector',
        color_continuous_scale='RdYlGn',
        size_max=60,
        hover_data={
            'Sector': True,
            'sentiment_score': ':.3f',
            'Market Cap': ':,.0f',
            'headline_count': True
        },
        labels={
            'sentiment_score': 'Sentiment Score',
            'Market Cap': 'Avg Market Cap ($)',
            'headline_count': 'Number of Headlines'
        },
        title='Sector Analysis: Sentiment vs. Market Cap vs. Coverage'
    )

    return fig

In [24]:
bubble_chart(df)

In [43]:
# Bar chart
# Find missing values in different columns

def missing_data_visuals(df):
    """Show a bar chart of null counts for every column that has nulls.

    Nothing is drawn when the frame contains no missing values.

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        None. The chart, if any, is displayed via ``fig.show()``.
    """
    null_counts = df.isnull().sum()
    if not null_counts.sum() > 0:
        return

    counts_table = pd.DataFrame({
        'Columns': null_counts.index,
        'Missing Values': null_counts.values
    })
    # Keep only affected columns, worst offenders first.
    affected = counts_table['Missing Values'] > 0
    counts_table = counts_table[affected].sort_values('Missing Values', ascending=False)

    chart = px.bar(
        counts_table,
        x='Columns',
        y='Missing Values',
        title='Missing Values by Column',
    )
    chart.update_layout(height=500, width=800)
    chart.show()


In [44]:
missing_data_visuals(df)

In [48]:
# Sentiment analysis
# The box plot shows the spread of sentiment scores within each market cap category
# The bar chart shows the average sentiment score for each sector

def sentiment_analysis(df):
    """Show sentiment by market-cap category (box plot) and by sector (bar chart).

    Args:
        df: DataFrame with 'Market_Cap_Category', 'Sector' and
            'sentiment_score' columns.

    Returns:
        None. Both figures are displayed via ``fig.show()``.
    """
    # Figure 1: distribution of sentiment within each market-cap category.
    cap_order = ['Nano', 'Micro', 'Small', 'Medium', 'Large', 'Mega']
    box_labels = {
        'Market_Cap_Category': 'Market Cap Category',
        'sentiment_score': 'Sentiment Score',
    }
    box_fig = px.box(
        df,
        x='Market_Cap_Category',
        y='sentiment_score',
        title='Sentiment Distribution by Market Cap',
        labels=box_labels,
        category_orders={'Market_Cap_Category': cap_order},
        color='Market_Cap_Category',
    )
    box_fig.update_layout(height=500, width=800)
    box_fig.show()

    # Figure 2: mean sentiment per sector, sorted ascending by the mean.
    sector_means = df.groupby('Sector')['sentiment_score'].mean()
    sector_sentiment = sector_means.reset_index().sort_values('sentiment_score')
    sector_order = sector_sentiment['Sector'].tolist()

    bar_fig = px.bar(
        sector_sentiment,
        x='sentiment_score',
        y='Sector',
        title='Average Sentiment Score by Sector',
        labels={'sentiment_score': 'Average Sentiment Score', 'Sector': 'Sector'},
        orientation='h',
        color='sentiment_score',
        color_continuous_scale='RdBu',
    )
    # Pin the axis to the sorted order so the bars follow the ranking.
    bar_fig.update_layout(
        height=600,
        width=800,
        yaxis={'categoryorder': 'array', 'categoryarray': sector_order},
    )
    bar_fig.show()


In [49]:
sentiment_analysis(df)

Output hidden; open in https://colab.research.google.com to view.

In [50]:
# looking at the sentiment score changes across the months and weekdays

def temporal_analysis(df):
    """Show how the mean sentiment score varies over time.

    Displays three figures:
      1. a line chart of mean sentiment per (year, month);
      2. a bar chart of mean sentiment per day of week;
      3. a heatmap of mean sentiment per month-year and day of week.

    The input frame is not modified. If it already has a 'month_year' column
    that column is used for the heatmap; otherwise a 'YYYY-MM' key is derived
    from 'year' and 'month'.

    Args:
        df: DataFrame with 'year', 'month', 'day_of_week' and
            'sentiment_score' columns. 'day_of_week' is used as an index
            into Monday..Sunday, so 0 maps to 'Monday'.

    Returns:
        None. The figures are displayed via ``fig.show()``.
    """
    day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    # Position in day_names -> name, so weekday numbers can be mapped in bulk.
    day_lookup = dict(enumerate(day_names))

    # --- 1. Monthly trend -------------------------------------------------
    monthly_sentiment = df.groupby(['year', 'month'])['sentiment_score'].mean().reset_index()
    # Assemble the first day of each month from the year/month columns.
    monthly_sentiment['date'] = pd.to_datetime(
        monthly_sentiment[['year', 'month']].astype(int).assign(day=1)
    )
    monthly_sentiment = monthly_sentiment.sort_values('date')

    fig = px.line(monthly_sentiment,
                 x='date',
                 y='sentiment_score',
                 title='Average Monthly Sentiment Score Over Time',
                 labels={'date': 'Date', 'sentiment_score': 'Average Sentiment Score'},
                 markers=True,
                 line_shape='spline')
    fig.update_layout(height=500, width=900)
    fig.show()

    # --- 2. Day-of-week averages -------------------------------------------
    day_sentiment = df.groupby('day_of_week')['sentiment_score'].mean().reset_index()
    # Cast first: a float weekday (e.g. 2.0) cannot index a list.
    day_sentiment['day_name'] = day_sentiment['day_of_week'].astype(int).map(day_lookup)

    fig = px.bar(day_sentiment,
                x='day_name',
                y='sentiment_score',
                title='Average Sentiment Score by Day of Week',
                labels={'day_name': 'Day of Week', 'sentiment_score': 'Average Sentiment Score'},
                category_orders={'day_name': day_names},
                color='sentiment_score',
                color_continuous_scale='Viridis')
    fig.update_layout(height=500, width=800)
    fig.show()

    # --- 3. Month-year x weekday heatmap -----------------------------------
    # Build the grouping key locally rather than writing a new column into
    # the caller's frame.
    if 'month_year' in df.columns:
        month_year = df['month_year']
    else:
        month_year = (
            df['year'].astype(int).astype(str)
            + '-'
            + df['month'].astype(int).astype(str).str.zfill(2)
        )

    heat_source = pd.DataFrame({
        'month_year': month_year.to_numpy(),
        'day_name': df['day_of_week'].astype(int).map(day_lookup).to_numpy(),
        'sentiment_score': df['sentiment_score'].to_numpy(),
    })
    pivot_data = heat_source.pivot_table(
        index='month_year',
        columns='day_name',
        values='sentiment_score',
        aggfunc='mean'
    )
    # Pivot columns come out alphabetically (Friday, Monday, ...); put them in
    # calendar order so every column sits under its own weekday label.
    pivot_data = pivot_data.reindex(columns=day_names)

    fig = px.imshow(pivot_data,
                  labels=dict(x="Day of Week", y="Month-Year", color="Sentiment Score"),
                  x=day_names,
                  y=pivot_data.index,
                  aspect="auto",
                  title="Sentiment Heatmap by Month and Day of Week",
                  color_continuous_scale='RdBu_r')
    fig.update_layout(height=600, width=900)
    fig.show()



In [51]:
temporal_analysis(df)

In [56]:
# bar chart
# Average score by publisher

def publisher_analysis(df, min_articles=50):
    """Show the mean sentiment score per publisher as a horizontal bar chart.

    Publishers with fewer than ``min_articles`` scored headlines are left out
    so that averages built on a handful of articles do not dominate the chart.

    Args:
        df: DataFrame with 'publisher' and 'sentiment_score' columns.
        min_articles: Minimum number of non-null sentiment scores a publisher
            needs in order to be included (inclusive). Defaults to 50.

    Returns:
        None. The figure is displayed via ``fig.show()``.
    """
    publisher_sentiment = (
        df.groupby('publisher')['sentiment_score']
        .agg(average_sentiment='mean', article_count='count')
        .reset_index()
    )
    # ">=" so the filter matches the "minimum N articles" wording in the title.
    enough_articles = publisher_sentiment['article_count'] >= min_articles
    publisher_sentiment = publisher_sentiment[enough_articles].sort_values('average_sentiment')

    fig = px.bar(publisher_sentiment,
                x='average_sentiment',
                y='publisher',
                title=f'Average Sentiment Score by Publisher (minimum {min_articles} articles)',
                labels={'average_sentiment': 'Average Sentiment Score', 'publisher': 'Publisher'},
                color='average_sentiment',
                color_continuous_scale='RdBu',
                hover_data=['article_count'])
    # Grow the figure with the number of bars so labels stay readable.
    fig.update_layout(height=max(500, len(publisher_sentiment)*20), width=900)
    fig.show()


In [57]:
publisher_analysis(df)


In [63]:
# Word cloud

def text_analysis(df):
    """Show a word cloud built from the 'tokens' column.

    Each entry of 'tokens' may be either a token list or the string form of
    one (as produced by a CSV round-trip, e.g. "['a', 'b']"). Nothing is drawn
    when the column is absent or entirely null.

    Args:
        df: DataFrame, expected to carry a 'tokens' column.

    Returns:
        None. The figure is displayed via ``fig.show()``.

    Raises:
        ValueError, SyntaxError: if a string entry is not a valid Python
            literal.
    """
    import ast  # local import: only this function needs it

    if 'tokens' not in df.columns or df['tokens'].isnull().all():
        return

    def parse_tokens(tokens):
        """Return the token list for one cell; non-list/str cells yield []."""
        if isinstance(tokens, str):
            # literal_eval only accepts Python literals, so text read from a
            # file cannot execute arbitrary code the way eval() could.
            return ast.literal_eval(tokens)
        if isinstance(tokens, (list, tuple)):
            return tokens
        return []

    def create_wordcloud_base64(tokens_list):
        """Render all tokens as a word cloud and return it as a PNG data URI."""
        all_tokens = ' '.join(
            ' '.join(str(token) for token in parse_tokens(tokens))
            for tokens in tokens_list
        )

        wordcloud = WordCloud(width=800, height=400, background_color='white', max_words=200).generate(all_tokens)

        img = wordcloud.to_image()

        # Encode the image in memory so it can be embedded in a plotly trace.
        buffer = io.BytesIO()
        img.save(buffer, format='PNG')
        buffer.seek(0)

        img_str = base64.b64encode(buffer.getvalue()).decode()

        return f"data:image/png;base64,{img_str}"

    wordcloud_img = create_wordcloud_base64(df['tokens'])

    fig = go.Figure()
    fig.add_trace(go.Image(source=wordcloud_img))
    fig.update_layout(title='Word Cloud of Most Common Terms in Headlines', height=500, width=800)
    fig.show()


In [64]:
text_analysis(df)

In [67]:

def stock_recommendation_analysis(df):
    """Analyze stock recommendations.

    Displays three figures:
      1. a bar chart of how often each recommendation occurs;
      2. a box plot of sentiment score per recommendation;
      3. a stacked bar chart of each sector's recommendation mix, in percent
         of that sector's rows.

    Args:
        df: DataFrame with 'recommendation', 'sentiment_score' and 'Sector'
            columns.

    Returns:
        None. The figures are displayed via ``fig.show()``.
    """
    # Recommendation distribution
    rec_counts = df['recommendation'].value_counts().reset_index()
    rec_counts.columns = ['recommendation', 'count']

    fig = px.bar(rec_counts,
                x='recommendation',
                y='count',
                title='Distribution of Stock Recommendations',
                labels={'recommendation': 'Recommendation', 'count': 'Count'},
                color='recommendation')
    fig.update_layout(height=500, width=800)
    fig.show()

    # Sentiment score by recommendation
    fig = px.box(df,
                x='recommendation',
                y='sentiment_score',
                title='Sentiment Score by Stock Recommendation',
                labels={'recommendation': 'Recommendation', 'sentiment_score': 'Sentiment Score'},
                color='recommendation')
    fig.update_layout(height=500, width=800)
    fig.show()

    # Recommendation by sector, in long format for plotly.
    # Grouped counts replace a Python loop that rescanned the whole frame once
    # per sector/recommendation pair. Pairs that never occur are simply absent
    # (they would be zero-height segments in the stacked chart).
    sector_totals = df.groupby('Sector', observed=True).size().reset_index(name='total')
    pair_counts = (
        df.groupby(['Sector', 'recommendation'], observed=True)
        .size()
        .reset_index(name='count')
    )
    sector_rec_df = pair_counts.merge(sector_totals, on='Sector', how='left')
    # The denominator is every row of the sector, including rows without a
    # recommendation.
    sector_rec_df['percentage'] = sector_rec_df['count'] / sector_rec_df['total'] * 100
    sector_rec_df = sector_rec_df[['Sector', 'recommendation', 'percentage']]

    fig = px.bar(sector_rec_df,
                x='Sector',
                y='percentage',
                color='recommendation',
                title='Recommendation Distribution by Sector (%)',
                labels={'Sector': 'Sector', 'percentage': 'Percentage', 'recommendation': 'Recommendation'},
                barmode='stack')
    fig.update_layout(height=600, width=1000, xaxis_tickangle=45)
    fig.show()

In [68]:
stock_recommendation_analysis(df)

Output hidden; open in https://colab.research.google.com to view.

In [71]:

def market_cap_analysis(df):
    """Show a donut chart of how many rows fall in each market-cap category.

    Args:
        df: DataFrame with a 'Market_Cap_Category' column.

    Returns:
        None. The figure is displayed via ``fig.show()``.
    """
    category_counts = df['Market_Cap_Category'].value_counts()
    cap_counts = category_counts.reset_index().set_axis(['category', 'count'], axis=1)

    donut = px.pie(
        cap_counts,
        names='category',
        values='count',
        hole=0.4,
        title='Distribution of Market Cap Categories',
    )
    donut.update_layout(height=500, width=700)
    donut.show()


In [72]:
market_cap_analysis(df)

In [75]:
# Correlation matrix

def correlation_analysis(df):
    """Analyze correlations between numerical features.

    Builds a correlation matrix over a fixed shortlist of columns, keeping
    only those that exist in ``df`` with a numeric dtype, and shows it as a
    heatmap. If fewer than two shortlisted columns qualify, a message is
    printed and nothing is drawn.

    Args:
        df: DataFrame to analyse.

    Returns:
        None. The figure, if any, is displayed via ``fig.show()``.
    """
    # np.number does not match bool columns, so the one-hot flags are already
    # excluded here and need no separate filter.
    numerical_cols = set(df.select_dtypes(include=[np.number]).columns)

    # Keep only the most relevant columns
    candidate_cols = ['sentiment_score', 'Market Cap', 'IPO Year', 'headline_length',
                      'word_count', 'recommendation_label', 'publisher_label']
    relevant_cols = [col for col in candidate_cols if col in numerical_cols]

    # A correlation matrix needs at least two variables to say anything.
    if len(relevant_cols) < 2:
        print("Correlation analysis skipped: fewer than two of the expected numeric columns are present.")
        return

    # Create correlation matrix
    corr_matrix = df[relevant_cols].corr().round(2)

    # Plot correlation heatmap
    fig = px.imshow(corr_matrix,
                   text_auto=True,
                   title='Correlation Matrix of Numerical Features',
                   color_continuous_scale='RdBu_r',
                   zmin=-1, zmax=1)
    fig.update_layout(height=600, width=800)
    fig.show()

In [76]:
correlation_analysis(df)

In [65]:
# Treemap

def sector_industry_analysis(df):
    """Show a Sector > Industry treemap sized by article count, coloured by mean sentiment.

    Industries with 10 or fewer scored articles are left out. Nothing is
    drawn when either the 'Industry' or the 'Sector' column is missing.

    Args:
        df: DataFrame, expected to carry 'Sector', 'Industry' and
            'sentiment_score' columns.

    Returns:
        None. The figure, if any, is displayed via ``fig.show()``.
    """
    if not ('Industry' in df.columns and 'Sector' in df.columns):
        return

    per_industry = (
        df.groupby(['Sector', 'Industry'])['sentiment_score']
        .agg(average_sentiment='mean', article_count='count')
        .reset_index()
    )
    well_covered = per_industry['article_count'] > 10
    industry_sentiment = per_industry[well_covered].sort_values(['Sector', 'average_sentiment'])

    # Hierarchy: one synthetic root, then sector, then industry.
    hierarchy = [px.Constant("All Sectors"), 'Sector', 'Industry']
    treemap = px.treemap(
        industry_sentiment,
        path=hierarchy,
        values='article_count',
        color='average_sentiment',
        hover_data=['article_count'],
        title='Sectors and Industries Treemap (colored by sentiment)',
        color_continuous_scale='RdBu',
    )
    treemap.update_layout(height=700, width=1000)
    treemap.show()


In [66]:
sector_industry_analysis(df)