In [1]:
#import libraries
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import numpy as np

Read financials news dataset and merge into one csv file

In [2]:
#add company label in dataset
years = ['2021', '2022', '2023']
quarters = ['Q1', 'Q2', 'Q3', 'Q4']
companies = ['AAPL', 'AMZN', 'MSFT', 'META', 'TSLA']

for year in years:
    for quarter in quarters:
        for company in companies:
            df_fn = pd.read_csv(f'data/{year}/{quarter}_News/{company}_{year}{quarter}_Financial_News.csv')
            df_fn['company'] = company
            df_fn.to_csv(f'data/{year}/{quarter}_News/{company}_{year}{quarter}_Financial_News.csv', index=False)


In [3]:
#merge csv files and group it into quarter

for i in years:
    for j in quarters:
        files = [
            f'data/{i}/{j}_News/AAPL_{i}{j}_Financial_News.csv', 
            f'data/{i}/{j}_News/AMZN_{i}{j}_Financial_News.csv',
            f'data/{i}/{j}_News/TSLA_{i}{j}_Financial_News.csv',
            f'data/{i}/{j}_News/MSFT_{i}{j}_Financial_News.csv',
            f'data/{i}/{j}_News/META_{i}{j}_Financial_News.csv'
        ]
        df_fn = pd.concat( 
            map(pd.read_csv, files), ignore_index=True)
        df_fn.to_csv(f'Financial News/{i}_{j}_Financial_News.csv', index=False)
 

In [4]:
#merge and group according to years
for i in years:
    files = [
            f'Financial News/{i}_Q1_Financial_News.csv', 
            f'Financial News/{i}_Q2_Financial_News.csv',
            f'Financial News/{i}_Q3_Financial_News.csv',
            f'Financial News/{i}_Q4_Financial_News.csv'
        ]
    df_fn = pd.concat( 
        map(pd.read_csv, files), ignore_index=True)
    df_fn.to_csv(f'Financial News/{i}_Financial_News.csv', index=False)

In [5]:
#merge all csv files into one csv file
files = [
            'Financial News/2021_Financial_News.csv', 
            'Financial News/2022_Financial_News.csv',
            'Financial News/2023_Financial_News.csv',
        ]
df_fn = pd.concat( 
        map(pd.read_csv, files), ignore_index=True)
df_fn.to_csv('FinancialNews.csv', index=False)  

Read financial historical stock data and merge into one csv file

In [6]:
#add company label in dataset
for company in companies:
    df = pd.read_csv(f'Historical stock prices/{company}_historical_data.csv')
    df['company'] = company
    df.to_csv(f'Historical stock prices/{company}_historical_data.csv', index=False)

In [7]:
files = ['Historical stock prices/AAPL_historical_data.csv', 
         'Historical stock prices/AMZN_historical_data.csv',
         'Historical stock prices/TSLA_historical_data.csv',
         'Historical stock prices/MSFT_historical_data.csv',
         'Historical stock prices/META_historical_data.csv'
        ]
df_sp = pd.concat(map(pd.read_csv, files), ignore_index=True)
df_sp.to_csv('Stock_Prices.csv', index=False)  

## Data Cleaning of Financial News

In [8]:
df_fn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4622 entries, 0 to 4621
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      4622 non-null   int64  
 1   Unnamed: 0.1    4622 non-null   int64  
 2   title           4622 non-null   object 
 3   description     4622 non-null   object 
 4   published date  4622 non-null   object 
 5   url             4622 non-null   object 
 6   publisher       4622 non-null   object 
 7   company         4622 non-null   object 
 8   Unnamed: 0.1.1  985 non-null    float64
dtypes: float64(1), int64(2), object(6)
memory usage: 325.1+ KB


In [9]:
df_fn.head(5)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,title,description,published date,url,publisher,company,Unnamed: 0.1.1
0,0,0,Apple (AAPL) to Report Q1 Earnings: What's in ...,Apple (AAPL) to Report Q1 Earnings: What's in ...,"Fri, 22 Jan 2021 08:00:00 GMT",https://news.google.com/rss/articles/CBMiS2h0d...,"{'href': 'https://finance.yahoo.com', 'title':...",AAPL,
1,1,1,How to Trade Apple (AAPL) in the First Half of...,How to Trade Apple (AAPL) in the First Half of...,"Thu, 07 Jan 2021 08:00:00 GMT",https://news.google.com/rss/articles/CBMiUWh0d...,"{'href': 'https://www.investopedia.com', 'titl...",AAPL,
2,2,2,"What Facebook (FB), Twitter (TWTR), Apple (AAP...","What Facebook (FB), Twitter (TWTR), Apple (AAP...","Mon, 08 Feb 2021 08:00:00 GMT",https://news.google.com/rss/articles/CBMidGh0d...,"{'href': 'https://www.bloomberg.com', 'title':...",AAPL,
3,3,3,AAPL After Hours: Share Price Slides On Tech W...,AAPL After Hours: Share Price Slides On Tech W...,"Wed, 24 Mar 2021 07:00:00 GMT",https://news.google.com/rss/articles/CBMiSmh0d...,"{'href': 'https://www.thestreet.com', 'title':...",AAPL,
4,4,4,"Apple reports blowout quarter, booking more th...","Apple reports blowout quarter, booking more th...","Wed, 27 Jan 2021 08:00:00 GMT",https://news.google.com/rss/articles/CBMiQGh0d...,"{'href': 'https://www.cnbc.com', 'title': 'CNBC'}",AAPL,


In [10]:
df_fn.tail(5)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,title,description,published date,url,publisher,company,Unnamed: 0.1.1
4617,84,84,Meta Quest 3: The Holiday Gift That Transforms...,Meta Quest 3: The Holiday Gift That Transforms...,"Wed, 29 Nov 2023 08:00:00 GMT",https://news.google.com/rss/articles/CBMiVmh0d...,"{'href': 'https://about.fb.com', 'title': 'Meta'}",META,
4618,85,85,noyb files GDPR complaint against Meta over “P...,noyb files GDPR complaint against Meta over “P...,"Tue, 28 Nov 2023 08:00:00 GMT",https://news.google.com/rss/articles/CBMiSmh0d...,"{'href': 'https://noyb.eu', 'title': 'NOYB'}",META,
4619,86,86,"Two years later, Facebook's rebrand as Meta lo...","Two years later, Facebook's rebrand as Meta lo...","Sat, 28 Oct 2023 07:00:00 GMT",https://news.google.com/rss/articles/CBMiS2h0d...,"{'href': 'https://www.fastcompany.com', 'title...",META,
4620,87,87,Meta smart glasses—large language models and t...,Meta smart glasses—large language models and t...,"Mon, 04 Dec 2023 08:00:00 GMT",https://news.google.com/rss/articles/CBMiMmh0d...,"{'href': 'https://www.nature.com', 'title': 'N...",META,
4621,88,88,Meta-reinforcement learning via orbitofrontal ...,Meta-reinforcement learning via orbitofrontal ...,"Mon, 13 Nov 2023 08:00:00 GMT",https://news.google.com/rss/articles/CBMiMmh0d...,"{'href': 'https://www.nature.com', 'title': 'N...",META,


In [11]:
#removed unwanted column
df_fn = df_fn.drop(['Unnamed: 0', 'Unnamed: 0.1', 'description', 'Unnamed: 0.1.1'], axis='columns')

In [12]:
#check if there is any duplication
df_fn.duplicated().sum()

10

In [13]:
#drop duplicated value
df_fn.drop_duplicates(inplace=True)

In [14]:
#check if there is any missing value
df_fn.isna().sum()

title             0
published date    0
url               0
publisher         0
company           0
dtype: int64

In [15]:
df_fn.isnull().sum()

title             0
published date    0
url               0
publisher         0
company           0
dtype: int64

In [16]:
#rearrange columns name of financial news csv file
df_fn = df_fn.iloc[:,[1, -1, 0, 3, 2]]

In [17]:
#format date
from datetime import datetime

# covert to datetime
df_fn['published date'] = pd.to_datetime(df_fn['published date'])

# date in MM-DD-YYYY format
df_fn['published date'] = df_fn['published date'].dt.strftime('%Y-%m-%d')

In [18]:
#extract the publisher instead of url
import ast

df_fn['publisher'] = pd.Series(df_fn['publisher'], dtype="string")

def extract_title(dict_string):
    try:
        # Convert string to dictionary
        dict_obj = ast.literal_eval(dict_string)
        
        # Extract the title
        return dict_obj.get('title')
    
    except (ValueError, SyntaxError):
        return None
    
df_fn['publisher'] = df_fn['publisher'].apply(extract_title)

In [19]:
df_fn.to_csv('FinancialNews.csv', index=False) 

## Data Cleaning for Stock Prices

In [20]:
df_sp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3765 entries, 0 to 3764
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       3765 non-null   object 
 1   Open       3765 non-null   float64
 2   High       3765 non-null   float64
 3   Low        3765 non-null   float64
 4   Close      3765 non-null   float64
 5   Adj Close  3765 non-null   float64
 6   Volume     3765 non-null   int64  
 7   company    3765 non-null   object 
dtypes: float64(5), int64(1), object(2)
memory usage: 235.4+ KB


In [21]:
df_sp.head(5)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,company
0,2021-01-04T00:00:00.000,133.520004,133.610001,126.760002,129.410004,126.830078,143301900,AAPL
1,2021-01-05T00:00:00.000,128.889999,131.740005,128.429993,131.009995,128.398163,97664900,AAPL
2,2021-01-06T00:00:00.000,127.720001,131.050003,126.379997,126.599998,124.076096,155088000,AAPL
3,2021-01-07T00:00:00.000,128.360001,131.630005,127.860001,130.919998,128.309982,109578200,AAPL
4,2021-01-08T00:00:00.000,132.429993,132.630005,130.229996,132.050003,129.417465,105158200,AAPL


In [22]:
df_sp.tail(5)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,company
3760,2023-12-22T00:00:00.000,355.579987,357.200012,351.220001,353.390015,353.015472,11764200,META
3761,2023-12-26T00:00:00.000,354.98999,356.980011,353.450012,354.829987,354.453918,9898600,META
3762,2023-12-27T00:00:00.000,356.070007,359.0,355.309998,357.829987,357.450714,13207900,META
3763,2023-12-28T00:00:00.000,359.700012,361.899994,357.809998,358.320007,357.940216,11798800,META
3764,2023-12-29T00:00:00.000,358.98999,360.0,351.820007,353.959991,353.584839,14980500,META


In [23]:
#check if there is any duplication
df_sp.duplicated().sum()

0

In [24]:
#check if there is any missing value
df_sp.isna().sum()

Date         0
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
company      0
dtype: int64

In [25]:
df_sp.isnull().sum()

Date         0
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
company      0
dtype: int64

In [26]:
#format date
from datetime import datetime

# covert to datetime
df_sp['Date'] = pd.to_datetime(df_sp['Date'])

# date in MM-DD-YYYY format
df_sp['Date'] = df_sp['Date'].dt.strftime('%Y-%m-%d')

In [27]:
df_sp = df_sp.iloc[:,[0, 7, 1, 2, 3, 4, 5, 6]]
df_sp

Unnamed: 0,Date,company,Open,High,Low,Close,Adj Close,Volume
0,2021-01-04,AAPL,133.520004,133.610001,126.760002,129.410004,126.830078,143301900
1,2021-01-05,AAPL,128.889999,131.740005,128.429993,131.009995,128.398163,97664900
2,2021-01-06,AAPL,127.720001,131.050003,126.379997,126.599998,124.076096,155088000
3,2021-01-07,AAPL,128.360001,131.630005,127.860001,130.919998,128.309982,109578200
4,2021-01-08,AAPL,132.429993,132.630005,130.229996,132.050003,129.417465,105158200
...,...,...,...,...,...,...,...,...
3760,2023-12-22,META,355.579987,357.200012,351.220001,353.390015,353.015472,11764200
3761,2023-12-26,META,354.989990,356.980011,353.450012,354.829987,354.453918,9898600
3762,2023-12-27,META,356.070007,359.000000,355.309998,357.829987,357.450714,13207900
3763,2023-12-28,META,359.700012,361.899994,357.809998,358.320007,357.940216,11798800


In [28]:
df_sp.to_csv('Stock_Prices.csv', index=False)

## Data Labelling

Label sentiment score of each financial news based on Loughran and McDonald Financial Sentiment Dictionaries

In [29]:
!pip install pysentiment2
!pip install vaderSentiment



In [30]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

def get_sentiment_score(title):
 
    sid_obj = SentimentIntensityAnalyzer()
 
    sentiment_score = sid_obj.polarity_scores(title)
 
    # decide sentiment as positive, negative and neutral
    if sentiment_score['compound'] >= 0.05 :
        return "Positive"
 
    elif sentiment_score['compound'] <= - 0.05 :
        return "Negative"
 
    else :
        return "Neutral"
    
df_fn['sentiment_score'] = df_fn['title'].apply(get_sentiment_score)
df_fn

Unnamed: 0,published date,company,title,publisher,url,sentiment_score
0,2021-01-22,AAPL,Apple (AAPL) to Report Q1 Earnings: What's in ...,Yahoo Finance,https://news.google.com/rss/articles/CBMiS2h0d...,Neutral
1,2021-01-07,AAPL,How to Trade Apple (AAPL) in the First Half of...,Investopedia,https://news.google.com/rss/articles/CBMiUWh0d...,Neutral
2,2021-02-08,AAPL,"What Facebook (FB), Twitter (TWTR), Apple (AAP...",Bloomberg,https://news.google.com/rss/articles/CBMidGh0d...,Neutral
3,2021-03-24,AAPL,AAPL After Hours: Share Price Slides On Tech W...,TheStreet,https://news.google.com/rss/articles/CBMiSmh0d...,Negative
4,2021-01-27,AAPL,"Apple reports blowout quarter, booking more th...",CNBC,https://news.google.com/rss/articles/CBMiQGh0d...,Neutral
...,...,...,...,...,...,...
4617,2023-11-29,META,Meta Quest 3: The Holiday Gift That Transforms...,Meta,https://news.google.com/rss/articles/CBMiVmh0d...,Positive
4618,2023-11-28,META,noyb files GDPR complaint against Meta over “P...,NOYB,https://news.google.com/rss/articles/CBMiSmh0d...,Negative
4619,2023-10-28,META,"Two years later, Facebook's rebrand as Meta lo...",Fast Company,https://news.google.com/rss/articles/CBMiS2h0d...,Neutral
4620,2023-12-04,META,Meta smart glasses—large language models and t...,Nature.com,https://news.google.com/rss/articles/CBMiMmh0d...,Positive


In [31]:
# import pysentiment2 as ps

# lm = ps.LM()

# def get_sentiment_score(title):
#     tokens = lm.tokenize(title)
#     score = lm.get_score(tokens)
#     return score

# df_fn['sentiment_score'] = df_fn['title'].apply(get_sentiment_score)
# df_fn

In [32]:
#filter whether it is positive, negative or neutral sentiments

# df_fn['sentiment_score'] = pd.Series(df_fn['sentiment_score'], dtype="string")

# def extract_sentiment(sentiment_string):
#     try:
#         # Convert string to dictionary
#         dict_obj = ast.literal_eval(sentiment_string)
        
#         # Extract the sentiment
#         positive = dict_obj.get('Positive')
#         negative = dict_obj.get('Negative')
        
#         if positive != 0:
#             return 'positive'
#         elif negative != 0:
#             return 'negative'
#         else:
#             return 'neutral'
    
#     except (ValueError, SyntaxError):
#         return None
    
# df_fn['sentiment_score'] = df_fn['sentiment_score'].apply(extract_sentiment)

In [33]:
df_fn = df_fn.iloc[:,[0, 1, 2, 3, 5, 4]]
df_fn

Unnamed: 0,published date,company,title,publisher,sentiment_score,url
0,2021-01-22,AAPL,Apple (AAPL) to Report Q1 Earnings: What's in ...,Yahoo Finance,Neutral,https://news.google.com/rss/articles/CBMiS2h0d...
1,2021-01-07,AAPL,How to Trade Apple (AAPL) in the First Half of...,Investopedia,Neutral,https://news.google.com/rss/articles/CBMiUWh0d...
2,2021-02-08,AAPL,"What Facebook (FB), Twitter (TWTR), Apple (AAP...",Bloomberg,Neutral,https://news.google.com/rss/articles/CBMidGh0d...
3,2021-03-24,AAPL,AAPL After Hours: Share Price Slides On Tech W...,TheStreet,Negative,https://news.google.com/rss/articles/CBMiSmh0d...
4,2021-01-27,AAPL,"Apple reports blowout quarter, booking more th...",CNBC,Neutral,https://news.google.com/rss/articles/CBMiQGh0d...
...,...,...,...,...,...,...
4617,2023-11-29,META,Meta Quest 3: The Holiday Gift That Transforms...,Meta,Positive,https://news.google.com/rss/articles/CBMiVmh0d...
4618,2023-11-28,META,noyb files GDPR complaint against Meta over “P...,NOYB,Negative,https://news.google.com/rss/articles/CBMiSmh0d...
4619,2023-10-28,META,"Two years later, Facebook's rebrand as Meta lo...",Fast Company,Neutral,https://news.google.com/rss/articles/CBMiS2h0d...
4620,2023-12-04,META,Meta smart glasses—large language models and t...,Nature.com,Positive,https://news.google.com/rss/articles/CBMiMmh0d...


In [34]:
df_fn.to_csv('Financial_News.csv', index=False)

In [35]:
#change delimiter for importing dataset into MySQL
# FinancialNews.csv is for MySQL

import csv

input_file = 'Financial_News.csv'  # Replace with your input CSV file name
output_file = 'FinancialNews.csv'  # Replace with your desired output CSV file name

def add_quotes(value):
    return f'{value}'

with open(input_file, mode='r', newline='', encoding='utf-8') as infile, \
     open(output_file, mode='w', newline='', encoding='utf-8') as outfile:
    
    reader = csv.reader(infile)
    header = next(reader)  # Read the header row
    writer = csv.writer(outfile, delimiter='*', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(header)
    
    for row in reader:
        cleaned_row = [col.strip() for col in row if col.strip() != '']  # Remove empty columns
        writer.writerow(cleaned_row)

## Tokenization

In [36]:
# !pip install --upgrade pip setuptools
# !pip install transformers --ignore-installed TBB
# !python -m venv myenv
# !myenv\Scripts\activate
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# !conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia

Looking in indexes: https://download.pytorch.org/whl/cu118


In [37]:
!pip install -U spacy
!python -m spacy download en_core_web_sm
!pip install transformers

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB 1.3 MB/s eta 0:00:10
     ---------------------------------------- 0.1/12.8 MB 1.3 MB/s eta 0:00:10
      --------------------------------------- 0.3/12.8 MB 2.7 MB/s eta 0:00:05
     -- ------------------------------------- 0.6/12.8 MB 3.7 MB/s eta 0:00:04
     --- ------------------------------------ 1.0/12.8 MB 4.6 MB/s eta 0:00:03
     ---- ----------------------------------- 1.3/12.8 MB 5.0 MB/s eta 0:00:03
     ----- ---------------------------------- 1.7/12.8 MB 5.3 MB/s eta 0:00:03
     ------ --------------------------------- 2.0/12.8 MB 5.6 MB/s eta 0:00:02
     ------- -------------------------------- 2.3/12.8 MB 5.6 MB/s eta 0:00:02
     -------- ---------------------------

In [38]:
#need to download the stopwords dataset in order to import stopwords
# import nltk
# nltk.download('stopwords')

import spacy
from nltk.corpus import stopwords
import string
from sklearn.model_selection import train_test_split
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import pipeline

In [39]:
# Load spaCy model
nlp = spacy.load('en_core_web_sm')
stop_words = set(stopwords.words('english'))

def preprocess_text(title):
    
    title = title.lower() # Convert to lowercase

    # Remove punctuation
    title = title.translate(str.maketrans('', '', string.punctuation))

    # Tokenize and lemmatize
    doc = nlp(title)
    tokens = [token.lemma_ for token in doc if token.text not in stop_words]
    return ' '.join(tokens)

df_fn.loc[:, 'cleaned_title'] = df_fn.loc[:,'title'].apply(preprocess_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fn.loc[:, 'cleaned_title'] = df_fn.loc[:,'title'].apply(preprocess_text)


In [40]:
# Map labels to integers
label_mapping = {'Neutral': 0, 'Negative': 1, 'Positive': 2}
df_fn.loc[:,'label'] = df_fn.loc[:,'sentiment_score'].map(label_mapping)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fn.loc[:,'label'] = df_fn.loc[:,'sentiment_score'].map(label_mapping)


In [41]:
df_fn = df_fn.sort_values(by='published date')
df_fn

Unnamed: 0,published date,company,title,publisher,sentiment_score,url,cleaned_title,label
358,2021-01-01,META,From ATOM to GradiATOM: Cortical gradients sup...,ScienceDirect.com,Positive,https://news.google.com/rss/articles/CBMiQ2h0d...,atom gradiatom cortical gradient support time ...,2
346,2021-01-01,META,A global meta-analysis of greenhouse gases emi...,ScienceDirect.com,Neutral,https://news.google.com/rss/articles/CBMiQ2h0d...,global metaanalysis greenhouse gas emission cr...,0
162,2021-01-02,TSLA,"Tesla reports 499,550 vehicle deliveries for 2...",CNBC,Negative,https://news.google.com/rss/articles/CBMiX2h0d...,tesla report 499550 vehicle delivery 2020 slig...,1
143,2021-01-04,TSLA,Tesla (TSLA) Breaks Out After Beating Delivery...,Investopedia,Negative,https://news.google.com/rss/articles/CBMiWWh0d...,tesla tsla break beat delivery target invest...,1
363,2021-01-04,META,Differential and spatial expression meta-analy...,Nature.com,Negative,https://news.google.com/rss/articles/CBMiMmh0d...,differential spatial expression metaanalysis g...,1
...,...,...,...,...,...,...,...,...
4402,2023-12-28,TSLA,Tesla Forecast: Why 2024 Will Be Decisive for ...,InvestorPlace,Positive,https://news.google.com/rss/articles/CBMiWmh0d...,tesla forecast 2024 decisive tsla stock inve...,2
4307,2023-12-29,AMZN,3 Reasons Why Amazon (AMZN) Is a Great Growth ...,Yahoo Finance,Positive,https://news.google.com/rss/articles/CBMiR2h0d...,3 reason amazon amzn great growth stock yaho...,2
4530,2023-12-29,MSFT,Is It Too Late to Buy Microsoft? - The Motley ...,The Motley Fool,Negative,https://news.google.com/rss/articles/CBMiSmh0d...,late buy microsoft motley fool,1
4315,2023-12-29,AMZN,Should You Buy Amazon Stock in 2024? - The Mot...,The Motley Fool,Negative,https://news.google.com/rss/articles/CBMiTmh0d...,buy amazon stock 2024 motley fool,1


In [42]:
#Tokenize and encode the data using the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the data
def tokenize_function(text):
    return tokenizer(text, padding='max_length', truncation=True, max_length=128)

df_fn['tokenized'] = df_fn['cleaned_title'].apply(tokenize_function)

In [43]:
class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

## Splitting Datatset

In [44]:
# Split the dataset into train(70%) and test dataset(30%)
x_train, x_temp, y_train, y_temp = train_test_split(df_fn['tokenized'].to_list(), df_fn['label'].to_list(), test_size=0.3, random_state=42)

#Split the remaining 30% of test dataset into validation and test data
x_val, x_test, y_val, y_test = train_test_split(x_temp, y_temp, test_size=0.5, random_state=42)

In [45]:
def convert_to_dicts(tokenized_texts):
    input_ids = [d['input_ids'] for d in tokenized_texts]
    attention_masks = [d['attention_mask'] for d in tokenized_texts]
    return {'input_ids': input_ids, 'attention_mask': attention_masks}

# Convert to lists of dictionaries
train_encodings = convert_to_dicts(x_train)
val_encodings = convert_to_dicts(x_val)
test_encodings = convert_to_dicts(x_test)

# Create three dataset objects using the SentimentDataset
train_dataset = SentimentDataset(train_encodings, y_train)
val_dataset = SentimentDataset(val_encodings, y_val)
test_dataset = SentimentDataset(test_encodings, y_test)

## Data Modelling

In [46]:
#BERT Model
model_BERT= BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [47]:
# !pip install transformers[torch]
!pip install accelerate -U



In [48]:
training_args = TrainingArguments(
    output_dir='./results', # All files generated during training will be stored here
    num_train_epochs=3, # The model will be trained for 3 full epochs unless the step limit (max_steps) is reached first
    per_device_train_batch_size=5, # Training batch size per device (GPU or CPU).
    per_device_eval_batch_size=5, # Evaluation batch size per device (GPU or CPU).
    warmup_steps=10, # Number of warm-up steps during which the learning rate gradually increases to its initial value
    weight_decay=0.01, # Weight decay rate: this technique helps to avoid overfitting, penalizing large weights in the neural network
    logging_dir='./logs', # Directory where training logs will be stored
    max_steps=10,  # Maximum number of training steps to be performed
    save_steps=2,  # Range of steps after which the model will be saved
    logging_steps=2,  # Range of steps after which log information will be recorded
)

In [49]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }

In [50]:
# def compute_metrics(p):
#     preds = p.predictions.argmax(-1)
#     return precision_recall_fscore_support(p.label_ids, preds, average='binary', zero_division=0)[:3]

In [51]:
trainer_bert = Trainer(
    model=model_BERT, # The pre-trained model that you want to fine-tune or train
    args=training_args, # The training arguments that specify the configurations for the training process
    train_dataset=train_dataset, # The dataset used for training the model
    eval_dataset=val_dataset # The dataset used for evaluating the model during training
    # compute_metrics=compute_metrics # custom metrics function
)

# Start training
trainer_bert.train()

max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
2,1.0831
4,1.0394
6,1.103
8,1.1102
10,1.0397


TrainOutput(global_step=10, training_loss=1.0750924348831177, metrics={'train_runtime': 63.0402, 'train_samples_per_second': 0.793, 'train_steps_per_second': 0.159, 'total_flos': 3288917721600.0, 'train_loss': 1.0750924348831177, 'epoch': 0.015479876160990712})

In [52]:
# FinBERT model
model_FinBERT = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone', num_labels=3, output_hidden_states=True)
tokenizer_finbert=  BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

In [53]:
# nlp_finbert = pipeline('sentiment-analysis', model=finbert, tokenizer=tokenizer_finbert)

In [54]:
trainer_finbert = Trainer(
    model=model_FinBERT, # The pre-trained model that you want to fine-tune or train
    args=training_args, # The training arguments that specify the configurations for the training process
    train_dataset=train_dataset, # The dataset used for training the model
    eval_dataset=val_dataset # The dataset used for evaluating the model during training
    # compute_metrics=compute_metrics # custom metrics function
)

# Start training
trainer_finbert.train()

max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
2,6.6341
4,4.9886
6,2.7311
8,3.6476
10,3.2489


TrainOutput(global_step=10, training_loss=4.250070285797119, metrics={'train_runtime': 70.2735, 'train_samples_per_second': 0.712, 'train_steps_per_second': 0.142, 'total_flos': 3288917721600.0, 'train_loss': 4.250070285797119, 'epoch': 0.015479876160990712})

## Evaluation

In [55]:
# Evaluate the Model
results_bert = trainer_bert.evaluate(test_dataset)

print("Evaluation Results:")
print('***** BERT model*****')
print(f"  - Loss: {results_bert['eval_loss']:.4f}")
print(f"  - Runtime: {results_bert['eval_runtime']:.2f} seconds")
print(f"  - Samples per Second: {results_bert['eval_samples_per_second']:.2f}")
print(f"  - Steps per Second: {results_bert['eval_steps_per_second']:.2f}")
print(f"  - Epoch: {results_bert['epoch']:.4f}")

results_finbert = trainer_finbert.evaluate(test_dataset)
print('\n')
print('***** FinBERT model*****')
print(f"  - Loss: {results_finbert['eval_loss']:.4f}")
print(f"  - Runtime: {results_finbert['eval_runtime']:.2f} seconds")
print(f"  - Samples per Second: {results_finbert['eval_samples_per_second']:.2f}")
print(f"  - Steps per Second: {results_finbert['eval_steps_per_second']:.2f}")
print(f"  - Epoch: {results_finbert['epoch']:.4f}")

Evaluation Results:
***** BERT model*****
  - Loss: 1.0539
  - Runtime: 282.89 seconds
  - Samples per Second: 2.45
  - Steps per Second: 0.49
  - Epoch: 0.0155




***** FinBERT model*****
  - Loss: 3.4681
  - Runtime: 218.75 seconds
  - Samples per Second: 3.16
  - Steps per Second: 0.64
  - Epoch: 0.0155


## Prediction

In [65]:
def predict_sentiment_BERT(text):
    '''Function to predict the sentiment of a given text using a pre-trained BERT model.
    Args: the input text for sentiment prediction.
    Returns: the predicted sentiment ('negative', 'neutral', 'positive').
    '''

    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
    outputs = model_BERT(**inputs)
    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
    predicted_class = torch.argmax(predictions, dim=1).item()
    sentiment = {0: 'Neutral', 1: 'Negative', 2: 'Positive'}
    return sentiment[predicted_class]

# Example prediction
example_text = "Apple (AAPL) Stock Significantly Outperforms S&P 500: A Strong Bullish Trend in 2024"
predicted_sentiment = predict_sentiment_BERT(example_text)
print(f"Predicted Sentiment: {predicted_sentiment}")

Predicted Sentiment: Neutral


In [57]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [69]:
# y_pred_bert = df_fn['title'].apply(predict_sentiment_BERT)
y_pred_bert = trainer_bert.predict(test_dataset)
print(y_pred_bert)

PredictionOutput(predictions=array([[ 0.04500972, -0.5775828 , -0.11419939],
       [ 0.03457282, -0.46477443, -0.22406778],
       [-0.02685158, -0.48780283, -0.18719189],
       ...,
       [-0.09495665, -0.4814614 , -0.20129615],
       [ 0.2462766 , -0.66415673,  0.04879946],
       [-0.026008  , -0.50426316, -0.18429132]], dtype=float32), label_ids=array([2, 0, 1, 0, 2, 0, 0, 2, 2, 0, 1, 2, 0, 0, 0, 2, 2, 2, 1, 0, 2, 0,
       0, 2, 1, 1, 0, 0, 2, 2, 0, 0, 0, 0, 1, 2, 2, 2, 0, 2, 2, 2, 2, 0,
       2, 0, 2, 2, 2, 1, 2, 1, 0, 1, 0, 0, 1, 2, 0, 0, 2, 0, 0, 2, 0, 2,
       0, 1, 0, 1, 0, 2, 0, 1, 2, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0,
       2, 0, 1, 2, 0, 2, 0, 0, 0, 0, 0, 1, 2, 0, 2, 2, 0, 2, 2, 2, 1, 0,
       2, 2, 2, 2, 0, 2, 0, 2, 1, 0, 0, 2, 1, 0, 1, 0, 0, 2, 0, 2, 0, 2,
       0, 2, 2, 2, 0, 0, 0, 2, 2, 2, 1, 0, 2, 2, 1, 0, 2, 1, 0, 2, 1, 1,
       2, 1, 0, 0, 1, 2, 2, 2, 2, 0, 2, 2, 0, 2, 1, 2, 2, 0, 1, 0, 2, 2,
       0, 2, 0, 2, 2, 2, 2, 1, 0, 2, 0, 1, 2, 2, 1, 2, 1, 0,

In [70]:
y_pred_bert = np.argmax(y_pred_bert.predictions, axis=-1) 
y_test_array = np.array(y_test)  # Ensure y_test is a numpy array for comparison
y_pred_bert_array = np.array(y_pred_bert)

#Calculate accuracy
accuracy = accuracy_score(y_test_array, y_pred_bert_array)
print(f"Accuracy: {accuracy:.2f}")

# Calculate precision
precision = precision_score(y_test_array, y_pred_bert_array, average='macro', zero_division=0)
print(f"Precision: {precision:.2f}")

# Calculate recall
recall = recall_score(y_test_array, y_pred_bert_array, average='macro')
print(f"Recall: {recall:.2f}")

# Calculate F1 score
f1 = f1_score(y_test_array, y_pred_bert_array, average='macro')
print(f"F1 Score: {f1:.2f}")

Accuracy: 0.38
Precision: 0.33
Recall: 0.34
F1 Score: 0.20


In [71]:
# y_pred_finbert = predict_sentiment_FinBERT(df_fn['title'])
y_pred_finbert = trainer_finbert.predict(test_dataset)
print(y_pred_finbert)

PredictionOutput(predictions=(array([[-4.384128  ,  3.2145245 ,  4.8502445 ],
       [-0.1293976 , -1.2346987 ,  3.0320313 ],
       [-1.4541361 ,  0.1105828 ,  4.7190027 ],
       ...,
       [-2.0768683 , -0.13518861,  5.0314827 ],
       [-1.7531585 ,  0.22667807,  5.0115347 ],
       [ 0.09588408, -1.4778402 ,  4.3294544 ]], dtype=float32), (array([[[-0.11336291,  0.62980473, -0.63674456, ..., -0.19307211,
          0.79837334, -0.65452325],
        [ 0.44589773, -0.6437525 , -0.62973684, ...,  0.4240526 ,
         -0.87949216,  0.37748742],
        [-0.05306801,  0.05092329, -0.57968676, ...,  0.23610501,
         -3.0624576 , -0.17268822],
        ...,
        [-0.10040909, -0.5457087 ,  1.8801994 , ..., -0.01570695,
         -0.6761691 , -2.5509245 ],
        [ 0.7200433 , -0.76889235,  1.4903662 , ..., -0.35075784,
         -0.44159454, -2.283993  ],
        [ 0.36061978, -0.9319785 ,  0.89066267, ..., -0.05087802,
         -0.67117   , -1.886979  ]],

       [[-0.11336291,  0.

In [73]:
y_pred_finbert = np.argmax(y_pred_finbert.predictions[0], axis=-1) 
# y_test_array = np.array(y_test)  # Ensure y_test is a numpy array for comparison
y_pred_finbert_array = np.array(y_pred_finbert)

accuracy = accuracy_score(y_test_array, y_pred_finbert_array)
print(f"Accuracy: {accuracy:.2f}")

# Calculate precision
precision = precision_score(y_test_array, y_pred_finbert_array, average='macro')
print(f"Precision: {precision:.2f}")

# # Calculate recall
recall = recall_score(y_test_array, y_pred_finbert_array, average='macro')
print(f"Recall: {recall:.2f}")

# Calculate F1 score
f1 = f1_score(y_test_array, y_pred_finbert_array, average='macro')
print(f"F1 Score: {f1:.2f}")

Accuracy: 0.40
Precision: 0.51
Recall: 0.32
F1 Score: 0.21


## Save Model

In [74]:
model_save_path = "./model"
trainer_finbert.save_model(model_save_path)
tokenizer_finbert.save_pretrained(model_save_path)

('./model\\tokenizer_config.json',
 './model\\special_tokens_map.json',
 './model\\vocab.txt',
 './model\\added_tokens.json')