In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
cd drive/MyDrive/IE7500_GroupB/Dataset

/content/drive/MyDrive/IE7500_GroupB/Dataset


# Install Necessary Libraries

In [3]:
!pip install gensim



In [4]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.sentiment import SentimentIntensityAnalyzer

import gensim
import numpy as np

import plotly.graph_objects as go
from plotly.subplots import make_subplots

import plotly.express as px

import os

# Download NLTK dependencies
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("vader_lexicon")
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [5]:
# Initialize components
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()
sia = SentimentIntensityAnalyzer()

# Load Dataset

In [6]:
import os
from pathlib import Path

# Load CSVs
df_analyst_processed = pd.read_csv("analyst_ratings_processed.csv", low_memory=False)
df_analyst_ratings = pd.read_csv("raw_analyst_ratings.csv", low_memory=False)
df_partner_headlines = pd.read_csv("raw_partner_headlines.csv", low_memory=False)

datasets = {
    "df_analyst_processed": df_analyst_processed,
    "df_analyst_ratings": df_analyst_ratings,
    "df_partner_headlines": df_partner_headlines
}

print("CSV files loaded successfully.")

CSV files loaded successfully.


In [7]:
print(df_analyst_processed.shape)
print(df_analyst_ratings.shape)
print(df_partner_headlines.shape)

(1400469, 4)
(1407328, 6)
(1845559, 6)


We are selecting to work with df_partner_headlines.csv dataset for our project as it has the most useful data for the purpose of our analysis.

# Dataset Preparation

In [8]:
import os
os.chdir('/content/drive/')

In [9]:
cd MyDrive/IE7500_GroupB/Notebooks

/content/drive/MyDrive/IE7500_GroupB/Notebooks


In [10]:
from helpers import *

In [11]:
# Tokenization
def tokenize(text):
    return word_tokenize(text)

# Normalize tokens (remove punctuation, symbols, etc.)
def normalize_tokens(tokens):
    return [re.sub(r"[^a-zA-Z0-9\s]", "", token.lower()) for token in tokens if token.isalnum()]

# Remove Stopwords
def remove_stopwords(tokens):
    return [word for word in tokens if word not in stop_words]

# Lemmatization
def lemmatize(tokens):
    return [lemmatizer.lemmatize(word) for word in tokens]


# Sentiment Analysis
def get_sentiment(text):
    return sia.polarity_scores(text)["compound"]

# Convert text into CountVectorizer (Bag of Words)
def apply_countvectorizer(texts):
    vectorizer = CountVectorizer()
    return vectorizer.fit_transform(texts), vectorizer

# Convert text into TF-IDF features
def apply_tfidf(texts):
    vectorizer = TfidfVectorizer()
    return vectorizer.fit_transform(texts), vectorizer

In [12]:
import spacy

# Load spaCy's English model
nlp = spacy.load("en_core_web_sm")

In [13]:
def get_pos_tags(text):
    doc = nlp(str(text))
    return [(token.text, token.pos_) for token in doc]

def get_named_entities(text):
    doc = nlp(str(text))
    return [(ent.text, ent.label_) for ent in doc.ents]


In [14]:
# Check for missing values
missing_values = df_partner_headlines.isnull().sum()

# Display only columns with missing values
missing_values[missing_values > 0]

Unnamed: 0,0


In [15]:
# Check for Duplicates
duplicate_rows = df_partner_headlines.duplicated().sum()

# Display the count of duplicate rows
print(f"Number of duplicate rows: {duplicate_rows}")

Number of duplicate rows: 0


In [16]:
# Run Tokenization First on Headlines
text_column = "headline"
df_partner_headlines["tokens"] = df_partner_headlines[text_column].astype(str).apply(tokenize)
df_partner_headlines.head(5)

Unnamed: 0.1,Unnamed: 0,headline,url,publisher,date,stock,tokens
0,2,Agilent Technologies Announces Pricing of $5……...,http://www.gurufocus.com/news/1153187/agilent-...,GuruFocus,2020-06-01 00:00:00,A,"[Agilent, Technologies, Announces, Pricing, of..."
1,3,Agilent (A) Gears Up for Q2 Earnings: What's i...,http://www.zacks.com/stock/news/931205/agilent...,Zacks,2020-05-18 00:00:00,A,"[Agilent, (, A, ), Gears, Up, for, Q2, Earning..."
2,4,J.P. Morgan Asset Management Announces Liquida...,http://www.gurufocus.com/news/1138923/jp-morga...,GuruFocus,2020-05-15 00:00:00,A,"[J.P., Morgan, Asset, Management, Announces, L..."
3,5,"Pershing Square Capital Management, L.P. Buys ...",http://www.gurufocus.com/news/1138704/pershing...,GuruFocus,2020-05-15 00:00:00,A,"[Pershing, Square, Capital, Management, ,, L.P..."
4,6,Agilent Awards Trilogy Sciences with a Golden ...,http://www.gurufocus.com/news/1134012/agilent-...,GuruFocus,2020-05-12 00:00:00,A,"[Agilent, Awards, Trilogy, Sciences, with, a, ..."


In [17]:
# Normalize Tokens
df_partner_headlines["normalized_tokens"] = df_partner_headlines["tokens"].apply(normalize_tokens)
df_partner_headlines.head(5)

Unnamed: 0.1,Unnamed: 0,headline,url,publisher,date,stock,tokens,normalized_tokens
0,2,Agilent Technologies Announces Pricing of $5……...,http://www.gurufocus.com/news/1153187/agilent-...,GuruFocus,2020-06-01 00:00:00,A,"[Agilent, Technologies, Announces, Pricing, of...","[agilent, technologies, announces, pricing, of..."
1,3,Agilent (A) Gears Up for Q2 Earnings: What's i...,http://www.zacks.com/stock/news/931205/agilent...,Zacks,2020-05-18 00:00:00,A,"[Agilent, (, A, ), Gears, Up, for, Q2, Earning...","[agilent, a, gears, up, for, q2, earnings, wha..."
2,4,J.P. Morgan Asset Management Announces Liquida...,http://www.gurufocus.com/news/1138923/jp-morga...,GuruFocus,2020-05-15 00:00:00,A,"[J.P., Morgan, Asset, Management, Announces, L...","[morgan, asset, management, announces, liquida..."
3,5,"Pershing Square Capital Management, L.P. Buys ...",http://www.gurufocus.com/news/1138704/pershing...,GuruFocus,2020-05-15 00:00:00,A,"[Pershing, Square, Capital, Management, ,, L.P...","[pershing, square, capital, management, buys, ..."
4,6,Agilent Awards Trilogy Sciences with a Golden ...,http://www.gurufocus.com/news/1134012/agilent-...,GuruFocus,2020-05-12 00:00:00,A,"[Agilent, Awards, Trilogy, Sciences, with, a, ...","[agilent, awards, trilogy, sciences, with, a, ..."


In [18]:
# Remove Stopwords
df_partner_headlines["filtered_tokens"] = df_partner_headlines["normalized_tokens"].apply(remove_stopwords)
df_partner_headlines.head(5)

Unnamed: 0.1,Unnamed: 0,headline,url,publisher,date,stock,tokens,normalized_tokens,filtered_tokens
0,2,Agilent Technologies Announces Pricing of $5……...,http://www.gurufocus.com/news/1153187/agilent-...,GuruFocus,2020-06-01 00:00:00,A,"[Agilent, Technologies, Announces, Pricing, of...","[agilent, technologies, announces, pricing, of...","[agilent, technologies, announces, pricing, mi..."
1,3,Agilent (A) Gears Up for Q2 Earnings: What's i...,http://www.zacks.com/stock/news/931205/agilent...,Zacks,2020-05-18 00:00:00,A,"[Agilent, (, A, ), Gears, Up, for, Q2, Earning...","[agilent, a, gears, up, for, q2, earnings, wha...","[agilent, gears, q2, earnings, cards]"
2,4,J.P. Morgan Asset Management Announces Liquida...,http://www.gurufocus.com/news/1138923/jp-morga...,GuruFocus,2020-05-15 00:00:00,A,"[J.P., Morgan, Asset, Management, Announces, L...","[morgan, asset, management, announces, liquida...","[morgan, asset, management, announces, liquida..."
3,5,"Pershing Square Capital Management, L.P. Buys ...",http://www.gurufocus.com/news/1138704/pershing...,GuruFocus,2020-05-15 00:00:00,A,"[Pershing, Square, Capital, Management, ,, L.P...","[pershing, square, capital, management, buys, ...","[pershing, square, capital, management, buys, ..."
4,6,Agilent Awards Trilogy Sciences with a Golden ...,http://www.gurufocus.com/news/1134012/agilent-...,GuruFocus,2020-05-12 00:00:00,A,"[Agilent, Awards, Trilogy, Sciences, with, a, ...","[agilent, awards, trilogy, sciences, with, a, ...","[agilent, awards, trilogy, sciences, golden, t..."


In [19]:
# Run Lemmatization
df_partner_headlines["lemmas"] = df_partner_headlines["filtered_tokens"].apply(lemmatize)
df_partner_headlines.head(5)

Unnamed: 0.1,Unnamed: 0,headline,url,publisher,date,stock,tokens,normalized_tokens,filtered_tokens,lemmas
0,2,Agilent Technologies Announces Pricing of $5……...,http://www.gurufocus.com/news/1153187/agilent-...,GuruFocus,2020-06-01 00:00:00,A,"[Agilent, Technologies, Announces, Pricing, of...","[agilent, technologies, announces, pricing, of...","[agilent, technologies, announces, pricing, mi...","[agilent, technology, announces, pricing, mill..."
1,3,Agilent (A) Gears Up for Q2 Earnings: What's i...,http://www.zacks.com/stock/news/931205/agilent...,Zacks,2020-05-18 00:00:00,A,"[Agilent, (, A, ), Gears, Up, for, Q2, Earning...","[agilent, a, gears, up, for, q2, earnings, wha...","[agilent, gears, q2, earnings, cards]","[agilent, gear, q2, earnings, card]"
2,4,J.P. Morgan Asset Management Announces Liquida...,http://www.gurufocus.com/news/1138923/jp-morga...,GuruFocus,2020-05-15 00:00:00,A,"[J.P., Morgan, Asset, Management, Announces, L...","[morgan, asset, management, announces, liquida...","[morgan, asset, management, announces, liquida...","[morgan, asset, management, announces, liquida..."
3,5,"Pershing Square Capital Management, L.P. Buys ...",http://www.gurufocus.com/news/1138704/pershing...,GuruFocus,2020-05-15 00:00:00,A,"[Pershing, Square, Capital, Management, ,, L.P...","[pershing, square, capital, management, buys, ...","[pershing, square, capital, management, buys, ...","[pershing, square, capital, management, buy, a..."
4,6,Agilent Awards Trilogy Sciences with a Golden ...,http://www.gurufocus.com/news/1134012/agilent-...,GuruFocus,2020-05-12 00:00:00,A,"[Agilent, Awards, Trilogy, Sciences, with, a, ...","[agilent, awards, trilogy, sciences, with, a, ...","[agilent, awards, trilogy, sciences, golden, t...","[agilent, award, trilogy, science, golden, tic..."


In [20]:
# Run Sentiment Analysis
df_partner_headlines["sentiment_score"] = df_partner_headlines[text_column].astype(str).apply(get_sentiment)
df_partner_headlines.head(5)

Unnamed: 0.1,Unnamed: 0,headline,url,publisher,date,stock,tokens,normalized_tokens,filtered_tokens,lemmas,sentiment_score
0,2,Agilent Technologies Announces Pricing of $5……...,http://www.gurufocus.com/news/1153187/agilent-...,GuruFocus,2020-06-01 00:00:00,A,"[Agilent, Technologies, Announces, Pricing, of...","[agilent, technologies, announces, pricing, of...","[agilent, technologies, announces, pricing, mi...","[agilent, technology, announces, pricing, mill...",0.0
1,3,Agilent (A) Gears Up for Q2 Earnings: What's i...,http://www.zacks.com/stock/news/931205/agilent...,Zacks,2020-05-18 00:00:00,A,"[Agilent, (, A, ), Gears, Up, for, Q2, Earning...","[agilent, a, gears, up, for, q2, earnings, wha...","[agilent, gears, q2, earnings, cards]","[agilent, gear, q2, earnings, card]",0.0
2,4,J.P. Morgan Asset Management Announces Liquida...,http://www.gurufocus.com/news/1138923/jp-morga...,GuruFocus,2020-05-15 00:00:00,A,"[J.P., Morgan, Asset, Management, Announces, L...","[morgan, asset, management, announces, liquida...","[morgan, asset, management, announces, liquida...","[morgan, asset, management, announces, liquida...",0.3612
3,5,"Pershing Square Capital Management, L.P. Buys ...",http://www.gurufocus.com/news/1138704/pershing...,GuruFocus,2020-05-15 00:00:00,A,"[Pershing, Square, Capital, Management, ,, L.P...","[pershing, square, capital, management, buys, ...","[pershing, square, capital, management, buys, ...","[pershing, square, capital, management, buy, a...",0.0
4,6,Agilent Awards Trilogy Sciences with a Golden ...,http://www.gurufocus.com/news/1134012/agilent-...,GuruFocus,2020-05-12 00:00:00,A,"[Agilent, Awards, Trilogy, Sciences, with, a, ...","[agilent, awards, trilogy, sciences, with, a, ...","[agilent, awards, trilogy, sciences, golden, t...","[agilent, award, trilogy, science, golden, tic...",0.4588


## Merge DataFrame with NASDAQ Stock Dataset

In [21]:
os.chdir('/content/drive/')

In [22]:
cd MyDrive/IE7500_GroupB/Dataset

/content/drive/MyDrive/IE7500_GroupB/Dataset


In [23]:
ticker_df = pd.read_csv('nasdaq_screener_1742264403037.csv')

In [24]:
ticker_df.head()

Unnamed: 0,Symbol,Name,Last Sale,Net Change,% Change,Market Cap,Country,IPO Year,Volume,Sector,Industry
0,A,Agilent Technologies Inc. Common Stock,$121.18,3.85,3.281%,34548750000.0,United States,1999.0,2978763,Industrials,Biotechnology: Laboratory Analytical Instruments
1,AA,Alcoa Corporation Common Stock,$33.53,1.22,3.776%,8680392000.0,United States,2016.0,3837315,Industrials,Aluminum
2,AACBU,Artius II Acquisition Inc. Units,$10.06,0.0,0.00%,0.0,United States,2025.0,6,Finance,Blank Checks
3,AACG,ATA Creativity Global American Depositary Shares,$0.9199,-0.0264,-2.79%,29436920.0,China,2008.0,23168,Real Estate,Other Consumer Services
4,AACT,Ares Acquisition Corporation II Class A Ordina...,$11.15,-0.01,-0.09%,0.0,,2023.0,135763,Finance,Blank Checks


In [25]:
# Merge the datasets on stock symbol
merged_df = df_partner_headlines.merge(
    ticker_df[['Symbol', 'Name', 'Market Cap', 'Country', 'IPO Year', 'Volume', 'Sector', 'Industry']],
    left_on='stock', right_on='Symbol', how='left'
)

In [26]:
merged_df.head()

Unnamed: 0.1,Unnamed: 0,headline,url,publisher,date,stock,tokens,normalized_tokens,filtered_tokens,lemmas,sentiment_score,Symbol,Name,Market Cap,Country,IPO Year,Volume,Sector,Industry
0,2,Agilent Technologies Announces Pricing of $5……...,http://www.gurufocus.com/news/1153187/agilent-...,GuruFocus,2020-06-01 00:00:00,A,"[Agilent, Technologies, Announces, Pricing, of...","[agilent, technologies, announces, pricing, of...","[agilent, technologies, announces, pricing, mi...","[agilent, technology, announces, pricing, mill...",0.0,A,Agilent Technologies Inc. Common Stock,34548750000.0,United States,1999.0,2978763.0,Industrials,Biotechnology: Laboratory Analytical Instruments
1,3,Agilent (A) Gears Up for Q2 Earnings: What's i...,http://www.zacks.com/stock/news/931205/agilent...,Zacks,2020-05-18 00:00:00,A,"[Agilent, (, A, ), Gears, Up, for, Q2, Earning...","[agilent, a, gears, up, for, q2, earnings, wha...","[agilent, gears, q2, earnings, cards]","[agilent, gear, q2, earnings, card]",0.0,A,Agilent Technologies Inc. Common Stock,34548750000.0,United States,1999.0,2978763.0,Industrials,Biotechnology: Laboratory Analytical Instruments
2,4,J.P. Morgan Asset Management Announces Liquida...,http://www.gurufocus.com/news/1138923/jp-morga...,GuruFocus,2020-05-15 00:00:00,A,"[J.P., Morgan, Asset, Management, Announces, L...","[morgan, asset, management, announces, liquida...","[morgan, asset, management, announces, liquida...","[morgan, asset, management, announces, liquida...",0.3612,A,Agilent Technologies Inc. Common Stock,34548750000.0,United States,1999.0,2978763.0,Industrials,Biotechnology: Laboratory Analytical Instruments
3,5,"Pershing Square Capital Management, L.P. Buys ...",http://www.gurufocus.com/news/1138704/pershing...,GuruFocus,2020-05-15 00:00:00,A,"[Pershing, Square, Capital, Management, ,, L.P...","[pershing, square, capital, management, buys, ...","[pershing, square, capital, management, buys, ...","[pershing, square, capital, management, buy, a...",0.0,A,Agilent Technologies Inc. Common Stock,34548750000.0,United States,1999.0,2978763.0,Industrials,Biotechnology: Laboratory Analytical Instruments
4,6,Agilent Awards Trilogy Sciences with a Golden ...,http://www.gurufocus.com/news/1134012/agilent-...,GuruFocus,2020-05-12 00:00:00,A,"[Agilent, Awards, Trilogy, Sciences, with, a, ...","[agilent, awards, trilogy, sciences, with, a, ...","[agilent, awards, trilogy, sciences, golden, t...","[agilent, award, trilogy, science, golden, tic...",0.4588,A,Agilent Technologies Inc. Common Stock,34548750000.0,United States,1999.0,2978763.0,Industrials,Biotechnology: Laboratory Analytical Instruments


In [27]:
# Drop redundant columns after merging
merged_df.drop(columns=['Unnamed: 0', 'Volume','Symbol'], inplace=True)

In [28]:
# Convert 'date' column to datetime
merged_df['date'] = pd.to_datetime(merged_df['date'])

# Extract Date Features
merged_df['year'] = merged_df['date'].dt.year
merged_df['month'] = merged_df['date'].dt.month
merged_df['day_of_week'] = merged_df['date'].dt.dayofweek

In [29]:
# Sentiment Classification (-1 Negative, 0 Neutral, 1 Positive)
merged_df['sentiment_label'] = merged_df['sentiment_score'].apply(lambda x: 1 if x > 0 else (-1 if x < 0 else 0))

In [30]:
# Text-Based Features
merged_df['headline_length'] = merged_df['headline'].apply(lambda x: len(str(x)))
merged_df['word_count'] = merged_df['headline'].apply(lambda x: len(str(x).split()))

In [31]:
# Apply classification
merged_df['Market_Cap_Category'] = merged_df['Market Cap'].apply(classify_market_cap)

In [32]:
merged_df.head()

Unnamed: 0,headline,url,publisher,date,stock,tokens,normalized_tokens,filtered_tokens,lemmas,sentiment_score,...,IPO Year,Sector,Industry,year,month,day_of_week,sentiment_label,headline_length,word_count,Market_Cap_Category
0,Agilent Technologies Announces Pricing of $5……...,http://www.gurufocus.com/news/1153187/agilent-...,GuruFocus,2020-06-01,A,"[Agilent, Technologies, Announces, Pricing, of...","[agilent, technologies, announces, pricing, of...","[agilent, technologies, announces, pricing, mi...","[agilent, technology, announces, pricing, mill...",0.0,...,1999.0,Industrials,Biotechnology: Laboratory Analytical Instruments,2020,6,0,0,70,10,Large
1,Agilent (A) Gears Up for Q2 Earnings: What's i...,http://www.zacks.com/stock/news/931205/agilent...,Zacks,2020-05-18,A,"[Agilent, (, A, ), Gears, Up, for, Q2, Earning...","[agilent, a, gears, up, for, q2, earnings, wha...","[agilent, gears, q2, earnings, cards]","[agilent, gear, q2, earnings, card]",0.0,...,1999.0,Industrials,Biotechnology: Laboratory Analytical Instruments,2020,5,0,0,58,11,Large
2,J.P. Morgan Asset Management Announces Liquida...,http://www.gurufocus.com/news/1138923/jp-morga...,GuruFocus,2020-05-15,A,"[J.P., Morgan, Asset, Management, Announces, L...","[morgan, asset, management, announces, liquida...","[morgan, asset, management, announces, liquida...","[morgan, asset, management, announces, liquida...",0.3612,...,1999.0,Industrials,Biotechnology: Laboratory Analytical Instruments,2020,5,4,1,79,10,Large
3,"Pershing Square Capital Management, L.P. Buys ...",http://www.gurufocus.com/news/1138704/pershing...,GuruFocus,2020-05-15,A,"[Pershing, Square, Capital, Management, ,, L.P...","[pershing, square, capital, management, buys, ...","[pershing, square, capital, management, buys, ...","[pershing, square, capital, management, buy, a...",0.0,...,1999.0,Industrials,Biotechnology: Laboratory Analytical Instruments,2020,5,4,0,99,14,Large
4,Agilent Awards Trilogy Sciences with a Golden ...,http://www.gurufocus.com/news/1134012/agilent-...,GuruFocus,2020-05-12,A,"[Agilent, Awards, Trilogy, Sciences, with, a, ...","[agilent, awards, trilogy, sciences, with, a, ...","[agilent, awards, trilogy, sciences, golden, t...","[agilent, award, trilogy, science, golden, tic...",0.4588,...,1999.0,Industrials,Biotechnology: Laboratory Analytical Instruments,2020,5,1,1,66,10,Large


score >= 0.5 → Buy

score <= -0.5 → Sell

-0.5 < score < 0.5 → Hold

In [33]:
def classify_recommendation(score):
    if score >= 0.5:
        return "Buy"
    elif score <= -0.5:
        return "Sell"
    else:
        return "Hold"

In [34]:
merged_df['recommendation'] = merged_df['sentiment_score'].apply(classify_recommendation)
merged_df.head()

Unnamed: 0,headline,url,publisher,date,stock,tokens,normalized_tokens,filtered_tokens,lemmas,sentiment_score,...,Sector,Industry,year,month,day_of_week,sentiment_label,headline_length,word_count,Market_Cap_Category,recommendation
0,Agilent Technologies Announces Pricing of $5……...,http://www.gurufocus.com/news/1153187/agilent-...,GuruFocus,2020-06-01,A,"[Agilent, Technologies, Announces, Pricing, of...","[agilent, technologies, announces, pricing, of...","[agilent, technologies, announces, pricing, mi...","[agilent, technology, announces, pricing, mill...",0.0,...,Industrials,Biotechnology: Laboratory Analytical Instruments,2020,6,0,0,70,10,Large,Hold
1,Agilent (A) Gears Up for Q2 Earnings: What's i...,http://www.zacks.com/stock/news/931205/agilent...,Zacks,2020-05-18,A,"[Agilent, (, A, ), Gears, Up, for, Q2, Earning...","[agilent, a, gears, up, for, q2, earnings, wha...","[agilent, gears, q2, earnings, cards]","[agilent, gear, q2, earnings, card]",0.0,...,Industrials,Biotechnology: Laboratory Analytical Instruments,2020,5,0,0,58,11,Large,Hold
2,J.P. Morgan Asset Management Announces Liquida...,http://www.gurufocus.com/news/1138923/jp-morga...,GuruFocus,2020-05-15,A,"[J.P., Morgan, Asset, Management, Announces, L...","[morgan, asset, management, announces, liquida...","[morgan, asset, management, announces, liquida...","[morgan, asset, management, announces, liquida...",0.3612,...,Industrials,Biotechnology: Laboratory Analytical Instruments,2020,5,4,1,79,10,Large,Hold
3,"Pershing Square Capital Management, L.P. Buys ...",http://www.gurufocus.com/news/1138704/pershing...,GuruFocus,2020-05-15,A,"[Pershing, Square, Capital, Management, ,, L.P...","[pershing, square, capital, management, buys, ...","[pershing, square, capital, management, buys, ...","[pershing, square, capital, management, buy, a...",0.0,...,Industrials,Biotechnology: Laboratory Analytical Instruments,2020,5,4,0,99,14,Large,Hold
4,Agilent Awards Trilogy Sciences with a Golden ...,http://www.gurufocus.com/news/1134012/agilent-...,GuruFocus,2020-05-12,A,"[Agilent, Awards, Trilogy, Sciences, with, a, ...","[agilent, awards, trilogy, sciences, with, a, ...","[agilent, awards, trilogy, sciences, golden, t...","[agilent, award, trilogy, science, golden, tic...",0.4588,...,Industrials,Biotechnology: Laboratory Analytical Instruments,2020,5,1,1,66,10,Large,Hold


In [35]:
import os
os.chdir('/content/drive/')

In [36]:
cd MyDrive/IE7500_GroupB/Notebooks

/content/drive/MyDrive/IE7500_GroupB/Notebooks


Save DataFrame Checkpoint 1

In [37]:
# Save the merged DataFrame
merged_df.to_csv("saved_dfs/merged_df_v1.csv", index=False)

## Apply Encoding to Categorical Columns

In [38]:
# Show all columns when printing DataFrames
pd.set_option('display.max_columns', None)

In [39]:
merged_df.head()

Unnamed: 0,headline,url,publisher,date,stock,tokens,normalized_tokens,filtered_tokens,lemmas,sentiment_score,Name,Market Cap,Country,IPO Year,Sector,Industry,year,month,day_of_week,sentiment_label,headline_length,word_count,Market_Cap_Category,recommendation
0,Agilent Technologies Announces Pricing of $5……...,http://www.gurufocus.com/news/1153187/agilent-...,GuruFocus,2020-06-01,A,"[Agilent, Technologies, Announces, Pricing, of...","[agilent, technologies, announces, pricing, of...","[agilent, technologies, announces, pricing, mi...","[agilent, technology, announces, pricing, mill...",0.0,Agilent Technologies Inc. Common Stock,34548750000.0,United States,1999.0,Industrials,Biotechnology: Laboratory Analytical Instruments,2020,6,0,0,70,10,Large,Hold
1,Agilent (A) Gears Up for Q2 Earnings: What's i...,http://www.zacks.com/stock/news/931205/agilent...,Zacks,2020-05-18,A,"[Agilent, (, A, ), Gears, Up, for, Q2, Earning...","[agilent, a, gears, up, for, q2, earnings, wha...","[agilent, gears, q2, earnings, cards]","[agilent, gear, q2, earnings, card]",0.0,Agilent Technologies Inc. Common Stock,34548750000.0,United States,1999.0,Industrials,Biotechnology: Laboratory Analytical Instruments,2020,5,0,0,58,11,Large,Hold
2,J.P. Morgan Asset Management Announces Liquida...,http://www.gurufocus.com/news/1138923/jp-morga...,GuruFocus,2020-05-15,A,"[J.P., Morgan, Asset, Management, Announces, L...","[morgan, asset, management, announces, liquida...","[morgan, asset, management, announces, liquida...","[morgan, asset, management, announces, liquida...",0.3612,Agilent Technologies Inc. Common Stock,34548750000.0,United States,1999.0,Industrials,Biotechnology: Laboratory Analytical Instruments,2020,5,4,1,79,10,Large,Hold
3,"Pershing Square Capital Management, L.P. Buys ...",http://www.gurufocus.com/news/1138704/pershing...,GuruFocus,2020-05-15,A,"[Pershing, Square, Capital, Management, ,, L.P...","[pershing, square, capital, management, buys, ...","[pershing, square, capital, management, buys, ...","[pershing, square, capital, management, buy, a...",0.0,Agilent Technologies Inc. Common Stock,34548750000.0,United States,1999.0,Industrials,Biotechnology: Laboratory Analytical Instruments,2020,5,4,0,99,14,Large,Hold
4,Agilent Awards Trilogy Sciences with a Golden ...,http://www.gurufocus.com/news/1134012/agilent-...,GuruFocus,2020-05-12,A,"[Agilent, Awards, Trilogy, Sciences, with, a, ...","[agilent, awards, trilogy, sciences, with, a, ...","[agilent, awards, trilogy, sciences, golden, t...","[agilent, award, trilogy, science, golden, tic...",0.4588,Agilent Technologies Inc. Common Stock,34548750000.0,United States,1999.0,Industrials,Biotechnology: Laboratory Analytical Instruments,2020,5,1,1,66,10,Large,Hold


Categorical Columns in Your DataFrame
Here's a breakdown of what should be treated as categorical:

Column Name	Reason
publisher	Source of news → categorical text
stock	Ticker symbol → categorical code
Symbol	Same as stock, may be redundant
Name	Company name → categorical
Country	Country → clear categorical label
Sector	Industry sector → categorical
Industry	More specific business category
Market_Cap_Category	Market cap class → categorical (ordinal-ish)
recommendation	Target label (Buy/Hold/Sell) → categorical
day_of_week	(0–6) but represents categories, not values
month	Categorical when not modeling time series
sentiment_label	-1, 0, 1 → considered a label/class

In [40]:
merged_df['publisher'].value_counts()

Unnamed: 0_level_0,count
publisher,Unnamed: 1_level_1
Seeking Alpha,897219
Zacks,438107
GuruFocus,212433
Investor's Business Daily,134890
webmaster,59669
TalkMarkets,27209
Traders Huddle,16484
TheStreet.Com,16023
Vetr,10866
Small Cap Network,8059


In [41]:
merged_df['Country'].value_counts()

Unnamed: 0_level_0,count
Country,Unnamed: 1_level_1
United States,828757
Canada,44948
China,17062
Bermuda,10527
Israel,8529
United Kingdom,8127
Brazil,7546
Ireland,6320
Switzerland,5421
Netherlands,5269


In [42]:
merged_df['Sector'].value_counts()

Unnamed: 0_level_0,count
Sector,Unnamed: 1_level_1
Consumer Discretionary,239511
Finance,176245
Industrials,123234
Technology,109439
Health Care,105079
Real Estate,59387
Energy,54333
Utilities,48253
Consumer Staples,37869
Basic Materials,25812


In [43]:
merged_df['Industry'].value_counts()

Unnamed: 0_level_0,count
Industry,Unnamed: 1_level_1
Real Estate Investment Trusts,54927
Major Banks,42785
Biotechnology: Pharmaceutical Preparations,38394
Industrial Machinery/Components,37265
Oil & Gas Production,32588
...,...
Wholesale Distributors,161
Diversified Financial Services,115
Building operators,82
Professional and commerical equipment,52


In [44]:
merged_df['Market_Cap_Category'].value_counts()

Unnamed: 0_level_0,count
Market_Cap_Category,Unnamed: 1_level_1
Nano,896614
Large,426935
Medium,265592
Small,175382
Micro,66424
Mega,14612


In [45]:
merged_df['sentiment_label'].value_counts()

Unnamed: 0_level_0,count
sentiment_label,Unnamed: 1_level_1
0,949171
1,600611
-1,295777


In [46]:
merged_df['recommendation'].value_counts()

Unnamed: 0_level_0,count
recommendation,Unnamed: 1_level_1
Hold,1588489
Buy,184563
Sell,72507


In [47]:
# One Hot Encode 'Market_Cap_Category' column
market_cap_ohe = pd.get_dummies(merged_df['Market_Cap_Category'], prefix="cap")

# Concatenate back to main DataFrame
merged_df = pd.concat([merged_df, market_cap_ohe], axis=1)


In [48]:
# One-hot encode 'Sector' column
sector_dummies = pd.get_dummies(merged_df['Sector'], prefix='sector')

# Concatenate the one-hot columns with the original DataFrame
merged_df = pd.concat([merged_df, sector_dummies], axis=1)


In [49]:
# Label Encode 'recommendation' column
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
merged_df['recommendation_label'] = le.fit_transform(merged_df['recommendation'])

# Mapping
recommendation_label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(recommendation_label_mapping)


{'Buy': 0, 'Hold': 1, 'Sell': 2}


In [50]:
# Label encode 'Publisher' column
from sklearn.preprocessing import LabelEncoder


le = LabelEncoder()
merged_df['publisher_label'] = le.fit_transform(merged_df['publisher'])
# Mapping
publisher_label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(publisher_label_mapping)

{'Accesswire': 0, 'BayStreet': 1, 'Born2Invest': 2, 'Fox Business': 3, 'GuruFocus': 4, 'Investopedia': 5, "Investor's Business Daily": 6, 'Marketfy': 7, 'Marketfy Insights': 8, 'Seeking Alpha': 9, 'Small Cap Network': 10, 'TalkMarkets': 11, 'TheStreet.Com': 12, 'Traders Huddle': 13, 'Unknown': 14, 'Vetr': 15, 'Zacks': 16, 'moneyshow': 17, 'msnmoney': 18, 'webmaster': 19, 'ycharts': 20}


In [51]:
# Label encode 'Country' column
from sklearn.preprocessing import LabelEncoder


le = LabelEncoder()
merged_df['country_label'] = le.fit_transform(merged_df['Country'])
# Mapping
country_label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(country_label_mapping)

{'Argentina': 0, 'Australia': 1, 'Belgium': 2, 'Bermuda': 3, 'Brazil': 4, 'Canada': 5, 'Cayman Islands': 6, 'Chile': 7, 'China': 8, 'Colombia': 9, 'Curacao': 10, 'Denmark': 11, 'Finland': 12, 'France': 13, 'Germany': 14, 'Greece': 15, 'Guernsey': 16, 'Hong Kong': 17, 'India': 18, 'Indonesia': 19, 'Ireland': 20, 'Israel': 21, 'Italy': 22, 'Japan': 23, 'Jersey': 24, 'Luxembourg': 25, 'Macau': 26, 'Malaysia': 27, 'Marshall Islands': 28, 'Mexico': 29, 'Monaco': 30, 'Netherlands': 31, 'Panama': 32, 'Peru': 33, 'Philippines': 34, 'Puerto Rico': 35, 'Singapore': 36, 'South Africa': 37, 'South Korea': 38, 'Spain': 39, 'Sweden': 40, 'Switzerland': 41, 'Taiwan': 42, 'Turkey': 43, 'United Kingdom': 44, 'United States': 45, nan: 46}


In [52]:
# Label encode 'Industry' column
from sklearn.preprocessing import LabelEncoder


le = LabelEncoder()
merged_df['industry_label'] = le.fit_transform(merged_df['Industry'])
# Mapping
industry_label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(industry_label_mapping)

{' Medicinal Chemicals and Botanical Products ': 0, 'Accident &Health Insurance': 1, 'Advertising': 2, 'Aerospace': 3, 'Agricultural Chemicals': 4, 'Air Freight/Delivery Services': 5, 'Aluminum': 6, 'Apparel': 7, 'Auto & Home Supply Stores': 8, 'Auto Manufacturing': 9, 'Auto Parts:O.E.M.': 10, 'Automotive Aftermarket': 11, 'Banks': 12, 'Beverages (Production/Distribution)': 13, 'Biotechnology: Biological Products (No Diagnostic Substances)': 14, 'Biotechnology: Commercial Physical & Biological Resarch': 15, 'Biotechnology: Electromedical & Electrotherapeutic Apparatus': 16, 'Biotechnology: In Vitro & In Vivo Diagnostic Substances': 17, 'Biotechnology: Laboratory Analytical Instruments': 18, 'Biotechnology: Pharmaceutical Preparations': 19, 'Blank Checks': 20, 'Books': 21, 'Broadcasting': 22, 'Building Materials': 23, 'Building Products': 24, 'Building operators': 25, 'Business Services': 26, 'Cable & Other Pay Television Services': 27, 'Catalog/Specialty Distribution': 28, 'Clothing/Sh

In [53]:
merged_df.shape

(1845559, 46)

In [56]:
# Save the merged DataFrame
merged_df.to_csv("saved_dfs/df_for_models.csv", index=False)

In [57]:
merged_df.dtypes

Unnamed: 0,0
headline,object
url,object
publisher,object
date,datetime64[ns]
stock,object
tokens,object
normalized_tokens,object
filtered_tokens,object
lemmas,object
sentiment_score,float64


In [58]:
# Convert data types to dictionary
dtypes_dict = merged_df.dtypes.apply(lambda x: x.name).to_dict()

# View or print the result
print(dtypes_dict)

{'headline': 'object', 'url': 'object', 'publisher': 'object', 'date': 'datetime64[ns]', 'stock': 'object', 'tokens': 'object', 'normalized_tokens': 'object', 'filtered_tokens': 'object', 'lemmas': 'object', 'sentiment_score': 'float64', 'Name': 'object', 'Market Cap': 'float64', 'Country': 'object', 'IPO Year': 'float64', 'Sector': 'object', 'Industry': 'object', 'year': 'int32', 'month': 'int32', 'day_of_week': 'int32', 'sentiment_label': 'int64', 'headline_length': 'int64', 'word_count': 'int64', 'Market_Cap_Category': 'object', 'recommendation': 'object', 'cap_Large': 'bool', 'cap_Medium': 'bool', 'cap_Mega': 'bool', 'cap_Micro': 'bool', 'cap_Nano': 'bool', 'cap_Small': 'bool', 'sector_Basic Materials': 'bool', 'sector_Consumer Discretionary': 'bool', 'sector_Consumer Staples': 'bool', 'sector_Energy': 'bool', 'sector_Finance': 'bool', 'sector_Health Care': 'bool', 'sector_Industrials': 'bool', 'sector_Miscellaneous': 'bool', 'sector_Real Estate': 'bool', 'sector_Technology': 'bool

In [59]:
# save df datatypes to ensure data integrity
merged_df.dtypes.to_csv('saved_dfs/merged_df_dict.csv')