In [1]:
#import libraries
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import numpy as np

In [2]:
#add company label in dataset
years = ['2021', '2022', '2023']
quarters = ['Q1', 'Q2', 'Q3', 'Q4']
companies = ['AAPL', 'AMZN', 'MSFT', 'META', 'TSLA']

for year in years:
    for quarter in quarters:
        for company in companies:
            df = pd.read_csv(f'data/{year}/{quarter}_News/{company}_{year}{quarter}_Financial_News.csv')
            df['company'] = company
            df.to_csv(f'data/{year}/{quarter}_News/{company}_{year}{quarter}_Financial_News.csv', index=False)


In [3]:
#merge csv files and group it into quarter

for i in years:
    for j in quarters:
        files = [
            f'data/{i}/{j}_News/AAPL_{i}{j}_Financial_News.csv', 
            f'data/{i}/{j}_News/AMZN_{i}{j}_Financial_News.csv',
            f'data/{i}/{j}_News/TSLA_{i}{j}_Financial_News.csv',
            f'data/{i}/{j}_News/MSFT_{i}{j}_Financial_News.csv',
            f'data/{i}/{j}_News/META_{i}{j}_Financial_News.csv'
        ]
        df = pd.concat( 
            map(pd.read_csv, files), ignore_index=True)
        df.to_csv(f'Financial News/{i}_{j}_Financial_News.csv', index=False)
 

In [4]:
#merge and group according to years
for i in years:
    files = [
            f'Financial News/{i}_Q1_Financial_News.csv', 
            f'Financial News/{i}_Q2_Financial_News.csv',
            f'Financial News/{i}_Q3_Financial_News.csv',
            f'Financial News/{i}_Q4_Financial_News.csv'
        ]
    df = pd.concat( 
        map(pd.read_csv, files), ignore_index=True)
    df.to_csv(f'Financial News/{i}_Financial_News.csv', index=False)

In [5]:
#merge all csv files into one csv file
files = [
            'Financial News/2021_Financial_News.csv', 
            'Financial News/2022_Financial_News.csv',
            'Financial News/2023_Financial_News.csv',
        ]
df = pd.concat( 
        map(pd.read_csv, files), ignore_index=True)
df.to_csv('Financial_News.csv', index=False)  

## Data Cleaning

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4622 entries, 0 to 4621
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      4622 non-null   int64  
 1   Unnamed: 0.1    4622 non-null   int64  
 2   title           4622 non-null   object 
 3   description     4622 non-null   object 
 4   published date  4622 non-null   object 
 5   url             4622 non-null   object 
 6   publisher       4622 non-null   object 
 7   company         4622 non-null   object 
 8   Unnamed: 0.1.1  985 non-null    float64
dtypes: float64(1), int64(2), object(6)
memory usage: 325.1+ KB


In [7]:
df.head(8)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,title,description,published date,url,publisher,company,Unnamed: 0.1.1
0,0,0,Apple (AAPL) to Report Q1 Earnings: What's in ...,Apple (AAPL) to Report Q1 Earnings: What's in ...,"Fri, 22 Jan 2021 08:00:00 GMT",https://news.google.com/rss/articles/CBMiS2h0d...,"{'href': 'https://finance.yahoo.com', 'title':...",AAPL,
1,1,1,How to Trade Apple (AAPL) in the First Half of...,How to Trade Apple (AAPL) in the First Half of...,"Thu, 07 Jan 2021 08:00:00 GMT",https://news.google.com/rss/articles/CBMiUWh0d...,"{'href': 'https://www.investopedia.com', 'titl...",AAPL,
2,2,2,"What Facebook (FB), Twitter (TWTR), Apple (AAP...","What Facebook (FB), Twitter (TWTR), Apple (AAP...","Mon, 08 Feb 2021 08:00:00 GMT",https://news.google.com/rss/articles/CBMidGh0d...,"{'href': 'https://www.bloomberg.com', 'title':...",AAPL,
3,3,3,AAPL After Hours: Share Price Slides On Tech W...,AAPL After Hours: Share Price Slides On Tech W...,"Wed, 24 Mar 2021 07:00:00 GMT",https://news.google.com/rss/articles/CBMiSmh0d...,"{'href': 'https://www.thestreet.com', 'title':...",AAPL,
4,4,4,"Apple reports blowout quarter, booking more th...","Apple reports blowout quarter, booking more th...","Wed, 27 Jan 2021 08:00:00 GMT",https://news.google.com/rss/articles/CBMiQGh0d...,"{'href': 'https://www.cnbc.com', 'title': 'CNBC'}",AAPL,
5,5,5,AAPL: Why Apple Stock Will Continue to Outperf...,AAPL: Why Apple Stock Will Continue to Outperf...,"Wed, 06 Jan 2021 08:00:00 GMT",https://news.google.com/rss/articles/CBMiYWh0d...,"{'href': 'https://stocknews.com', 'title': 'St...",AAPL,
6,6,6,Apple (AAPL) Q1 2021 Earnings Call Transcript ...,Apple (AAPL) Q1 2021 Earnings Call Transcript ...,"Thu, 28 Jan 2021 08:00:00 GMT",https://news.google.com/rss/articles/CBMiZmh0d...,"{'href': 'https://www.fool.com', 'title': 'The...",AAPL,
7,7,7,Apple reportedly plans iPhone production cuts ...,Apple reportedly plans iPhone production cuts ...,"Wed, 10 Mar 2021 08:00:00 GMT",https://news.google.com/rss/articles/CBMioAFod...,"{'href': 'https://www.marketwatch.com', 'title...",AAPL,


In [8]:
df.tail(8)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,title,description,published date,url,publisher,company,Unnamed: 0.1.1
4614,81,81,"Authors sue Meta, Microsoft, Bloomberg in late...","Authors sue Meta, Microsoft, Bloomberg in late...","Wed, 18 Oct 2023 07:00:00 GMT",https://news.google.com/rss/articles/CBMic2h0d...,"{'href': 'https://www.reuters.com', 'title': '...",META,
4615,82,82,Meta makes end-to-end encryption a default on ...,Meta makes end-to-end encryption a default on ...,"Thu, 07 Dec 2023 08:00:00 GMT",https://news.google.com/rss/articles/CBMiWmh0d...,"{'href': 'https://apnews.com', 'title': 'The A...",META,
4616,83,83,"Lawsuit against Meta: States say Facebook, Ins...","Lawsuit against Meta: States say Facebook, Ins...","Tue, 24 Oct 2023 07:00:00 GMT",https://news.google.com/rss/articles/CBMia2h0d...,"{'href': 'https://www.usatoday.com', 'title': ...",META,
4617,84,84,Meta Quest 3: The Holiday Gift That Transforms...,Meta Quest 3: The Holiday Gift That Transforms...,"Wed, 29 Nov 2023 08:00:00 GMT",https://news.google.com/rss/articles/CBMiVmh0d...,"{'href': 'https://about.fb.com', 'title': 'Meta'}",META,
4618,85,85,noyb files GDPR complaint against Meta over “P...,noyb files GDPR complaint against Meta over “P...,"Tue, 28 Nov 2023 08:00:00 GMT",https://news.google.com/rss/articles/CBMiSmh0d...,"{'href': 'https://noyb.eu', 'title': 'NOYB'}",META,
4619,86,86,"Two years later, Facebook's rebrand as Meta lo...","Two years later, Facebook's rebrand as Meta lo...","Sat, 28 Oct 2023 07:00:00 GMT",https://news.google.com/rss/articles/CBMiS2h0d...,"{'href': 'https://www.fastcompany.com', 'title...",META,
4620,87,87,Meta smart glasses—large language models and t...,Meta smart glasses—large language models and t...,"Mon, 04 Dec 2023 08:00:00 GMT",https://news.google.com/rss/articles/CBMiMmh0d...,"{'href': 'https://www.nature.com', 'title': 'N...",META,
4621,88,88,Meta-reinforcement learning via orbitofrontal ...,Meta-reinforcement learning via orbitofrontal ...,"Mon, 13 Nov 2023 08:00:00 GMT",https://news.google.com/rss/articles/CBMiMmh0d...,"{'href': 'https://www.nature.com', 'title': 'N...",META,


In [9]:
#removed unwanted column
df = df.drop(['Unnamed: 0', 'Unnamed: 0.1', 'description', 'Unnamed: 0.1.1'], axis='columns')
df

Unnamed: 0,title,published date,url,publisher,company
0,Apple (AAPL) to Report Q1 Earnings: What's in ...,"Fri, 22 Jan 2021 08:00:00 GMT",https://news.google.com/rss/articles/CBMiS2h0d...,"{'href': 'https://finance.yahoo.com', 'title':...",AAPL
1,How to Trade Apple (AAPL) in the First Half of...,"Thu, 07 Jan 2021 08:00:00 GMT",https://news.google.com/rss/articles/CBMiUWh0d...,"{'href': 'https://www.investopedia.com', 'titl...",AAPL
2,"What Facebook (FB), Twitter (TWTR), Apple (AAP...","Mon, 08 Feb 2021 08:00:00 GMT",https://news.google.com/rss/articles/CBMidGh0d...,"{'href': 'https://www.bloomberg.com', 'title':...",AAPL
3,AAPL After Hours: Share Price Slides On Tech W...,"Wed, 24 Mar 2021 07:00:00 GMT",https://news.google.com/rss/articles/CBMiSmh0d...,"{'href': 'https://www.thestreet.com', 'title':...",AAPL
4,"Apple reports blowout quarter, booking more th...","Wed, 27 Jan 2021 08:00:00 GMT",https://news.google.com/rss/articles/CBMiQGh0d...,"{'href': 'https://www.cnbc.com', 'title': 'CNBC'}",AAPL
...,...,...,...,...,...
4617,Meta Quest 3: The Holiday Gift That Transforms...,"Wed, 29 Nov 2023 08:00:00 GMT",https://news.google.com/rss/articles/CBMiVmh0d...,"{'href': 'https://about.fb.com', 'title': 'Meta'}",META
4618,noyb files GDPR complaint against Meta over “P...,"Tue, 28 Nov 2023 08:00:00 GMT",https://news.google.com/rss/articles/CBMiSmh0d...,"{'href': 'https://noyb.eu', 'title': 'NOYB'}",META
4619,"Two years later, Facebook's rebrand as Meta lo...","Sat, 28 Oct 2023 07:00:00 GMT",https://news.google.com/rss/articles/CBMiS2h0d...,"{'href': 'https://www.fastcompany.com', 'title...",META
4620,Meta smart glasses—large language models and t...,"Mon, 04 Dec 2023 08:00:00 GMT",https://news.google.com/rss/articles/CBMiMmh0d...,"{'href': 'https://www.nature.com', 'title': 'N...",META


In [10]:
#check if there is any duplication
df.duplicated().sum()

10

In [11]:
#drop duplicated value
df.drop_duplicates(inplace=True)

In [12]:
#check if there is any missing value
df.isna().sum()

title             0
published date    0
url               0
publisher         0
company           0
dtype: int64

In [13]:
df.isnull().sum()

title             0
published date    0
url               0
publisher         0
company           0
dtype: int64

In [14]:
#rearrange columns name
df = df.iloc[:,[1, -1, 0, 3, 2]]

In [15]:
#format date

from datetime import datetime

# covert to datetime
df['published date'] = pd.to_datetime(df['published date'])

# date in MM-DD-YYYY format
df['published date'] = df['published date'].dt.strftime('%m/%d/%Y')
df

Unnamed: 0,published date,company,title,publisher,url
0,01/22/2021,AAPL,Apple (AAPL) to Report Q1 Earnings: What's in ...,"{'href': 'https://finance.yahoo.com', 'title':...",https://news.google.com/rss/articles/CBMiS2h0d...
1,01/07/2021,AAPL,How to Trade Apple (AAPL) in the First Half of...,"{'href': 'https://www.investopedia.com', 'titl...",https://news.google.com/rss/articles/CBMiUWh0d...
2,02/08/2021,AAPL,"What Facebook (FB), Twitter (TWTR), Apple (AAP...","{'href': 'https://www.bloomberg.com', 'title':...",https://news.google.com/rss/articles/CBMidGh0d...
3,03/24/2021,AAPL,AAPL After Hours: Share Price Slides On Tech W...,"{'href': 'https://www.thestreet.com', 'title':...",https://news.google.com/rss/articles/CBMiSmh0d...
4,01/27/2021,AAPL,"Apple reports blowout quarter, booking more th...","{'href': 'https://www.cnbc.com', 'title': 'CNBC'}",https://news.google.com/rss/articles/CBMiQGh0d...
...,...,...,...,...,...
4617,11/29/2023,META,Meta Quest 3: The Holiday Gift That Transforms...,"{'href': 'https://about.fb.com', 'title': 'Meta'}",https://news.google.com/rss/articles/CBMiVmh0d...
4618,11/28/2023,META,noyb files GDPR complaint against Meta over “P...,"{'href': 'https://noyb.eu', 'title': 'NOYB'}",https://news.google.com/rss/articles/CBMiSmh0d...
4619,10/28/2023,META,"Two years later, Facebook's rebrand as Meta lo...","{'href': 'https://www.fastcompany.com', 'title...",https://news.google.com/rss/articles/CBMiS2h0d...
4620,12/04/2023,META,Meta smart glasses—large language models and t...,"{'href': 'https://www.nature.com', 'title': 'N...",https://news.google.com/rss/articles/CBMiMmh0d...


In [22]:
#format publisher
df['publisher'] = pd.Series(df['publisher'], dtype="string")
df.info()
df['publisher'] = str(df['publisher'].str.split("title': '")[1].split('\''))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4612 entries, 0 to 4621
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   published date  4612 non-null   object
 1   company         4612 non-null   object
 2   title           4612 non-null   object
 3   publisher       4612 non-null   string
 4   url             4612 non-null   object
dtypes: object(4), string(1)
memory usage: 345.2+ KB


AttributeError: 'list' object has no attribute 'split'

## Data Preprocessing