# APPLE

In [None]:
import pandas as pd

# Load the re-uploaded CSV file
file_path = "AAPL_Stock_News_Data.csv"
df_news = pd.read_csv(file_path, encoding='ISO-8859-1')

# Review the unique news sources
unique_sources = df_news['Site'].value_counts().reset_index()
unique_sources.columns = ['Site', 'Article Count']
unique_sources


Unnamed: 0,Site,Article Count
0,fool.com,2333
1,zacks.com,1138
2,cnbc.com,1096
3,investorplace.com,1039
4,seekingalpha.com,978
...,...,...
106,headlinesoftoday.com,1
107,https://www.reuters.com,1
108,prnewswire.com,1
109,newsfilecorp.com,1


In [None]:
df_news.isnull().sum()

Unnamed: 0,0
Published Date,0
Title,0
Text,31
URL,1
Site,1


In [None]:
df_news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14567 entries, 0 to 14566
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Published Date  14567 non-null  object
 1   Title           14567 non-null  object
 2   Text            14536 non-null  object
 3   URL             14566 non-null  object
 4   Site            14566 non-null  object
dtypes: object(5)
memory usage: 569.2+ KB


In [None]:
import pandas as pd

# Load dataset
file_path = "AAPL_Stock_News_Data.csv"
df_news = pd.read_csv(file_path, encoding='ISO-8859-1')

# Fill null values
df_news['Text'] = df_news['Text'].fillna(df_news['Title'])
df_news['Site'] = df_news['Site'].fillna(df_news['URL'].str.extract(r'https?://([^/]+)')[0])

# Save cleaned data to CSV
output_cleaned_path = "AAPL_Stock_News_Data_Cleaned.csv"
df_news.to_csv(output_cleaned_path, index=False)

output_cleaned_path


'AAPL_Stock_News_Data_Cleaned.csv'

In [None]:
import pandas as pd

# Load the dataset
file_path = "AAPL_Stock_News_Data_Cleaned.csv"
df_news = pd.read_csv(file_path, encoding='ISO-8859-1')

# Fill missing values
df_news['Text'] = df_news['Text'].fillna(df_news['Title'])
df_news['Site'] = df_news['Site'].fillna(df_news['URL'].str.extract(r'https?://([^/]+)')[0])

# Updated credibility score mapping
credibility_scores = {
    "forbes.com": 8,
    "zacks.com": 2,
    "benzinga.com": 1,
    "investorplace.com": 1,
    "cnbc.com": 9,
    "247wallst.com": 1,
    "etftrends.com": 1,
    "cnet.com": 1,
    "investors.com": 2,
    "fool.com": 2,
    "reuters.com": 10,
    "seekingalpha.com": 2,
    "marketwatch.com": 3,
    "cnn.com": 4,
    "fastcompany.com": 2,
    "gurufocus.com": 2,
    "nypost.com": 1,
    "theguardian.com": 7,
    "wsj.com": 10,
    "investing.com": 2,
    "investopedia.com": 7,
    "invezz.com": 2,
    "indianexpress.com": 5,
    "indiatvnews.com": 5,
    "venturebeat.com": 3,
    "businessinsider.com": 4,
    "insidermonkey.com": 2,
    "barrons.com": 9,
    "pulse2.com": 1,
    "pymnts.com": 3,
    "proactiveinvestors.co.uk": 1,
    "techcrunch.com": 3,
    "foxbusiness.com": 3,
    "schaeffersresearch.com": 2,
    "techxplore.com": 1,
    "nytimes.com": 10,
    "markets.businessinsider.com": 4,
    "businesswire.com": 8,
    "marketbeat.com": 2,
    "finbold.com": 1,
    "news.sky.com": 4,
    "newsfilecorp.com": 3,
    "proactiveinvestors.com": 1,
    "stockmarket.com": 1,
    "geekwire.com": 2,
    "globenewswire.com": 2,
    "youtube.com": 3,
    "prnewswire.com": 5,
    "kiplinger.com": 5,
    "https://www.proactiveinvestors.com": 1,
    "fxempire.com": 2
}

# Apply the credibility mapping
df_news['Credibility Score'] = df_news['Site'].map(credibility_scores).fillna(5)

# Save the updated DataFrame to a new CSV
output_path = "AAPL_Stock_News_Data_With_Updated_Credibility.csv"
df_news.to_csv(output_path, index=False)

# Display a preview
print(df_news[['Site', 'Credibility Score']].head())


               Site  Credibility Score
0  seekingalpha.com                2.0
1     247wallst.com                1.0
2          fool.com                2.0
3          cnbc.com                9.0
4          cnbc.com                9.0


In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab') # Download the punkt_tab data package

# Load dataset from Colab (ensure the file is uploaded first)
df = pd.read_csv('AAPL_Stock_News_Data_With_Updated_Credibility.csv', encoding='latin-1')

# Check dataset columns
print("Dataset Columns:", df.columns)

# Specify the column containing the text to clean
text_column = 'Text'  # Update if the actual column name differs

# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

# Function to clean text
def clean_text(text):
    if isinstance(text, str):
        text = text.lower()  # Convert to lowercase
        text = re.sub(r'\[.*?\]', '', text)  # Remove text inside brackets
        text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
        text = re.sub(r'<.*?>+', '', text)  # Remove HTML tags
        text = re.sub(r'[^a-z\s]', '', text)  # Keep only alphabets
        text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces

        # Tokenization
        tokens = word_tokenize(text)

        # Remove stopwords and lemmatize
        cleaned_text = " ".join(lemmatizer.lemmatize(word) for word in tokens if word not in stop_words)
        return cleaned_text
    return ""

# Apply cleaning function to the "Text" column
df['Cleaned_Text'] = df[text_column].apply(clean_text)

# Save cleaned data
cleaned_file_path = "AAPL_Stock_News_Data_Cleaned_text.csv"
df.to_csv(cleaned_file_path, index=False)

print(f"Cleaned dataset saved as: {cleaned_file_path}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Dataset Columns: Index(['Published Date', 'Title', 'Text', 'URL', 'Site', 'Credibility Score'], dtype='object')
Cleaned dataset saved as: AAPL_Stock_News_Data_Cleaned_text.csv


In [None]:
import pandas as pd
import numpy as np

df = pd.read_csv('AAPL_Stock_News_Data_Cleaned_text.csv', encoding='latin-1')
df.head()

Unnamed: 0,Published Date,Title,Text,URL,Site,Credibility Score,Cleaned_Text
0,01-01-2020,Expect Major Stock Market Challenges In 2020,The stock market has completely rebounded from...,https://seekingalpha.com/article/4314786-expec...,seekingalpha.com,2.0,stock market completely rebounded last year se...
1,01-01-2020,"Apple Rises 86%, Pulls The Entire Market Higher","Apple is part of the Dow 30 (DJIA), S&P 500, a...",https://247wallst.com/consumer-products/2020/0...,247wallst.com,1.0,apple part dow djia sp nasdaq composite share ...
2,01-01-2020,3 Ways to Prepare Your Stock Portfolio for a R...,While no one can predict when a recession will...,https://www.fool.com/investing/2020/01/01/3-wa...,fool.com,2.0,one predict recession hit always good investme...
3,01-01-2020,Apple's stock could be worth $100 more in 2020...,"If you missed Apple's 2019 record rally, there...",https://www.cnbc.com/2020/01/01/apple-stock-co...,cnbc.com,9.0,missed apple record rally may still time make ...
4,02-01-2020,Apple revives relationship with Imagination Te...,Imagination Technologies announced a new licen...,https://www.cnbc.com/2020/01/02/apple-agrees-n...,cnbc.com,9.0,imagination technology announced new license a...


In [None]:
df.isnull().sum()

Unnamed: 0,0
Published Date,0
Title,0
Text,0
URL,0
Site,0
Credibility Score,0
Cleaned_Text,5


In [None]:
# Remove rows where 'Cleaned_Text' has missing values
apple_news_df_cleaned = df.dropna(subset=['Cleaned_Text']).reset_index(drop=True)

# Save the cleaned dataset without missing values
cleaned_file_path = "AAPL_Stock_News_Data_Final.csv"
apple_news_df_cleaned.to_csv(cleaned_file_path, index=False)

# Confirm the number of missing values after removal
missing_values_after = apple_news_df_cleaned['Cleaned_Text'].isnull().sum()
missing_values_after


np.int64(0)

In [None]:
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Download VADER sentiment analyzer
nltk.download('vader_lexicon')

# Load the cleaned dataset (Upload to Google Colab first)
file_path = "AAPL_Stock_News_Data_Final.csv"  # Update path if needed
df = pd.read_csv(file_path)

# Initialize sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Function to compute sentiment score
def get_sentiment_score(text):
    if isinstance(text, str):
        return sia.polarity_scores(text)['compound']  # VADER compound score (-1 to 1)
    return 0  # Default neutral score for missing text

# Apply sentiment analysis
df['Sentiment_Score'] = df['Cleaned_Text'].apply(get_sentiment_score)

# Save the dataset with sentiment scores
output_file_path = "AAPL_Stock_News_Data_With_Sentiment.csv"
df.to_csv(output_file_path, index=False)

print(f"✅ Updated dataset with sentiment scores saved as: {output_file_path}")


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


✅ Updated dataset with sentiment scores saved as: AAPL_Stock_News_Data_With_Sentiment.csv


In [None]:
df

Unnamed: 0,Published Date,Title,Text,URL,Site,Credibility Score,Cleaned_Text,Sentiment_Score
0,01-01-2020,Expect Major Stock Market Challenges In 2020,The stock market has completely rebounded from...,https://seekingalpha.com/article/4314786-expec...,seekingalpha.com,2.0,stock market completely rebounded last year se...,0.1027
1,01-01-2020,"Apple Rises 86%, Pulls The Entire Market Higher","Apple is part of the Dow 30 (DJIA), S&P 500, a...",https://247wallst.com/consumer-products/2020/0...,247wallst.com,1.0,apple part dow djia sp nasdaq composite share ...,0.5423
2,01-01-2020,3 Ways to Prepare Your Stock Portfolio for a R...,While no one can predict when a recession will...,https://www.fool.com/investing/2020/01/01/3-wa...,fool.com,2.0,one predict recession hit always good investme...,0.5423
3,01-01-2020,Apple's stock could be worth $100 more in 2020...,"If you missed Apple's 2019 record rally, there...",https://www.cnbc.com/2020/01/01/apple-stock-co...,cnbc.com,9.0,missed apple record rally may still time make ...,0.1779
4,02-01-2020,Apple revives relationship with Imagination Te...,Imagination Technologies announced a new licen...,https://www.cnbc.com/2020/01/02/apple-agrees-n...,cnbc.com,9.0,imagination technology announced new license a...,0.8402
...,...,...,...,...,...,...,...,...
22825,13-02-2025,"TikTok returns to Apple, Google app stores",TikTok is back in the United States after a te...,https://www.cnbc.com/2025/02/13/tiktok-returns...,cnbc.com,9.0,tiktok back united state temporary shutdown av...,0.7184
22826,13-02-2025,Why Alibaba Won't Solve Apple's China Problems...,Alibaba's chairman says Apple's iPhones will u...,https://www.youtube.com/watch?v=GosMhfeq0WQ,youtube.com,3.0,alibabas chairman say apple iphones use ai tec...,-0.0516
22827,13-02-2025,Apple plans to launch AI features in China fro...,Apple aims to launch its artificial intelligen...,https://www.reuters.com/technology/artificial-...,reuters.com,10.0,apple aim launch artificial intelligence featu...,0.4939
22828,13-02-2025,TikTok Available Once Again For Download In Ap...,"TikTok is back, kids. The popular app, which i...",https://deadline.com/2025/02/tiktok-download-a...,deadline.com,5.0,tiktok back kid popular app owned china byteda...,0.6369


In [None]:
# ✅ Create Weighted Sentiment column
df['Weighted_Sentiment'] = df['Sentiment_Score'] * (df['Credibility Score'] / 10)

# Save final dataset
output_file_path = "AAPL_Stock_News_Data_With_Weighted_Sentiment.csv"
df.to_csv(output_file_path, index=False)

print(f"✅ Updated dataset with Sentiment_Score and Weighted_Sentiment saved as: {output_file_path}")

✅ Updated dataset with Sentiment_Score and Weighted_Sentiment saved as: AAPL_Stock_News_Data_With_Weighted_Sentiment.csv


In [None]:
import pandas as pd

# Load the updated dataset
df = pd.read_csv("AAPL_Stock_News_Data_With_Weighted_Sentiment.csv")

# Assuming 'Published Date' is the actual column with date information
# Rename 'Published Date' to 'Date'
df.rename(columns={'Published Date': 'Date'}, inplace=True)

# Ensure Date column is in datetime format, specifying the correct format
df['Date'] = pd.to_datetime(df['Date'], format='%d-%m-%Y', errors='coerce')
# format='%d-%m-%Y' specifies the day-month-year format
# errors='coerce' handles any invalid dates by setting them to NaT (Not a Time)

# Sort by date to apply lag/rolling correctly
df = df.sort_values('Date')

# Create lag features
df['Lag_1'] = df['Weighted_Sentiment'].shift(1)
df['Lag_2'] = df['Weighted_Sentiment'].shift(2)

# Create rolling average features
df['Rolling_3'] = df['Weighted_Sentiment'].rolling(window=3).mean()
df['Rolling_7'] = df['Weighted_Sentiment'].rolling(window=7).mean()

# Save final dataset with engineered features
df.to_csv("AAPL_Sentiment_With_Lag_Rolling.csv", index=False)
print("✅ Lag and rolling average features added and saved to: AAPL_Sentiment_With_Lag_Rolling.csv")

✅ Lag and rolling average features added and saved to: AAPL_Sentiment_With_Lag_Rolling.csv


In [None]:
import pandas as pd

# Load dataset
file_path = "AAPL_Sentiment_With_Lag_Rolling.csv"
df = pd.read_csv(file_path)

# Convert 'Date' column to datetime (handles multiple formats)
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Drop rows where Date conversion failed
df = df.dropna(subset=['Date'])

# Keep only the date part
df['Date'] = df['Date'].dt.date

# Compute daily average weighted sentiment
daily_avg_sentiment = (
    df.groupby('Date')['Weighted_Sentiment']
    .mean()
    .reset_index()
    .rename(columns={'Weighted_Sentiment': 'Avg_Weighted_Sentiment'})
)

# Merge back with original dataframe
df = pd.merge(df, daily_avg_sentiment, on='Date', how='left')

# Calculate deviation from daily average
df['Impact'] = df['Weighted_Sentiment'] - df['Avg_Weighted_Sentiment']

# Label the direction of the impact
df['Impact_Direction'] = df['Impact'].apply(lambda x: 'Positive Impact' if x > 0 else 'Negative Impact')

# Get the most impactful news per day by absolute deviation
most_important_news = df.loc[df.groupby('Date')['Impact'].apply(lambda x: x.abs().idxmax())]

# Save result
output_file_path = "Most_Important_Apple_News_Per_Day.csv"
most_important_news.to_csv(output_file_path, index=False)

# Show sample
print(most_important_news[['Date', 'Title', 'Impact', 'Impact_Direction', 'Weighted_Sentiment', 'Avg_Weighted_Sentiment']].head())
print(f"✅ Saved as: {output_file_path}")


          Date                                              Title    Impact  \
3   2020-01-01  Apple's stock could be worth $100 more in 2020...  0.074275   
23  2020-01-02  Apple revives relationship with Imagination Te...  0.672334   
32  2020-01-03  Apple CEO Tim Cook's total pay dropped last ye...  0.429802   
48  2020-01-04            Mark Your Calendar for Apple's Earnings -0.088120   
53  2020-01-05                     2 Top 5G Stocks to Buy in 2020 -0.104513   

   Impact_Direction  Weighted_Sentiment  Avg_Weighted_Sentiment  
3   Positive Impact             0.16011                0.085835  
23  Positive Impact             0.75618                0.083846  
32  Positive Impact             0.48807                0.058268  
48  Negative Impact             0.00000                0.088120  
53  Negative Impact            -0.05000                0.054513  
✅ Saved as: Most_Important_Apple_News_Per_Day.csv


# AMAZON

In [None]:
import pandas as pd

# Load the re-uploaded CSV file
file_path = "AMZN_Stock_News_Data.csv"
df_news = pd.read_csv(file_path, encoding='ISO-8859-1')

# Review the unique news sources
unique_sources = df_news['Site'].value_counts().reset_index()
unique_sources.columns = ['Site', 'Article Count']
unique_sources


Unnamed: 0,Site,Article Count
0,fool.com,4833
1,businesswire.com,1867
2,zacks.com,1739
3,investorplace.com,1711
4,cnbc.com,1454
...,...,...
113,hindustantimes.com,1
114,etf.com,1
115,https://finbold.com,1
116,https://thefly.com,1


In [None]:
df_news.isnull().sum()

Unnamed: 0,0
Published Date,0
Title,0
Text,20
URL,19
Site,24


In [None]:
df_news.isnull().sum()

Unnamed: 0,0
Published Date,0
Title,0
Text,0
URL,0
Site,0


In [None]:
df_news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24095 entries, 0 to 24094
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Published Date  24095 non-null  object
 1   Title           24095 non-null  object
 2   Text            24075 non-null  object
 3   URL             24076 non-null  object
 4   Site            24071 non-null  object
dtypes: object(5)
memory usage: 941.3+ KB


In [None]:
import pandas as pd

# Load dataset
file_path = "AMZN_Stock_News_Data.csv"
df_news = pd.read_csv(file_path, encoding='ISO-8859-1')

# Fill null values
df_news['Text'] = df_news['Text'].fillna(df_news['Title'])
df_news['Site'] = df_news['Site'].fillna(df_news['URL'].str.extract(r'https?://([^/]+)')[0])

# Save cleaned data to CSV
output_cleaned_path = "AMZN_Stock_News_Data_Cleaned.csv"
df_news.to_csv(output_cleaned_path, index=False)

output_cleaned_path


'AMZN_Stock_News_Data_Cleaned.csv'

In [None]:
import pandas as pd

# Load the dataset
file_path = "AMZN_Stock_News_Data_Cleaned.csv"
df_news = pd.read_csv(file_path, encoding='ISO-8859-1')

# Fill missing values
df_news['Text'] = df_news['Text'].fillna(df_news['Title'])
df_news['Site'] = df_news['Site'].fillna(df_news['URL'].str.extract(r'https?://([^/]+)')[0])

# Assign credibility scores
credibility_scores = {
    'fool.com': 7,
    'zacks.com': 8,
    'investorplace.com': 6,
    'cnbc.com': 9,
    'seekingalpha.com': 6,
    'marketwatch.com': 8,
    'finance.yahoo.com': 8,
    'yahoo.com': 7,
    'reuters.com': 10,
    'bloomberg.com': 9,
    'businessinsider.com': 7,
    'thestreet.com': 7,
    '247wallst.com': 5,
    'forbes.com': 8,
    'money.cnn.com': 8,
    'barrons.com': 9,
    'nasdaq.com': 8,
    'wsj.com': 9
}
df_news['Credibility Score'] = df_news['Site'].map(credibility_scores).fillna(5)

# Save the updated DataFrame to a new CSV file (credibility only)
output_path = "AMZN_Stock_News_Data_With_Credibility.csv"
df_news.to_csv(output_path, index=False)

# Display first few rows
print(df_news.head())


  Published Date                                           Title  \
0     01-01-2020    Expect Major Stock Market Challenges In 2020   
1     01-01-2020    Warren Buffett Bought These 9 Stocks in 2019   
2     01-01-2020     1 FAANG Stock to Buy and 1 to Avoid in 2020   
3     01-01-2020  3 Stocks to Buy and Hold for the Next 50 Years   
4     01-01-2020   5 Stocks Analysts Recommend Heading Into 2020   

                                                Text  \
0  The stock market has completely rebounded from...   
1  The greatest long-term investor of our generat...   
2  Among these stocks -- Facebook, Amazon, Apple,...   
3  These companies will continue to lead for deca...   
4  The S&P 500 closed out 2019 with another stron...   

                                                 URL              Site  \
0  https://seekingalpha.com/article/4314786-expec...  seekingalpha.com   
1  https://www.fool.com/investing/2020/01/01/warr...          fool.com   
2  https://www.fool.com/investin

In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab') # Download the punkt_tab data package

# Load dataset from Colab (ensure the file is uploaded first)
df = pd.read_csv('AMZN_Stock_News_Data_With_Credibility.csv', encoding='latin-1')

# Check dataset columns
print("Dataset Columns:", df.columns)

# Specify the column containing the text to clean
text_column = 'Text'  # Update if the actual column name differs

# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

# Function to clean text
def clean_text(text):
    if isinstance(text, str):
        text = text.lower()  # Convert to lowercase
        text = re.sub(r'\[.*?\]', '', text)  # Remove text inside brackets
        text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
        text = re.sub(r'<.*?>+', '', text)  # Remove HTML tags
        text = re.sub(r'[^a-z\s]', '', text)  # Keep only alphabets
        text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces

        # Tokenization
        tokens = word_tokenize(text)

        # Remove stopwords and lemmatize
        cleaned_text = " ".join(lemmatizer.lemmatize(word) for word in tokens if word not in stop_words)
        return cleaned_text
    return ""

# Apply cleaning function to the "Text" column
df['Cleaned_Text'] = df[text_column].apply(clean_text)

# Save cleaned data
cleaned_file_path = "AMZN_Stock_News_Data_Cleaned_text.csv"
df.to_csv(cleaned_file_path, index=False)

print(f"Cleaned dataset saved as: {cleaned_file_path}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Dataset Columns: Index(['Published Date', 'Title', 'Text', 'URL', 'Site', 'Credibility Score'], dtype='object')
Cleaned dataset saved as: AMZN_Stock_News_Data_Cleaned_text.csv


In [None]:
import pandas as pd
import numpy as np

df = pd.read_csv('AMZN_Stock_News_Data_Cleaned_text.csv', encoding='latin-1')
df.head()

Unnamed: 0,Published Date,Title,Text,URL,Site,Credibility Score,Cleaned_Text
0,01-01-2020,Expect Major Stock Market Challenges In 2020,The stock market has completely rebounded from...,https://seekingalpha.com/article/4314786-expec...,seekingalpha.com,6.0,stock market completely rebounded last year se...
1,01-01-2020,Warren Buffett Bought These 9 Stocks in 2019,The greatest long-term investor of our generat...,https://www.fool.com/investing/2020/01/01/warr...,fool.com,7.0,greatest longterm investor generation shopping...
2,01-01-2020,1 FAANG Stock to Buy and 1 to Avoid in 2020,"Among these stocks -- Facebook, Amazon, Apple,...",https://www.fool.com/investing/2020/01/01/1-fa...,fool.com,7.0,among stock facebook amazon apple netflix goog...
3,01-01-2020,3 Stocks to Buy and Hold for the Next 50 Years,These companies will continue to lead for deca...,https://www.fool.com/investing/2020/01/01/3-st...,fool.com,7.0,company continue lead decade come
4,01-01-2020,5 Stocks Analysts Recommend Heading Into 2020,The S&P 500 closed out 2019 with another stron...,https://www.benzinga.com/trading-ideas/long-id...,benzinga.com,5.0,sp closed another strong month december howeve...


In [None]:
df.isnull().sum()

Unnamed: 0,0
Published Date,0
Title,0
Text,0
URL,19
Site,19
Credibility Score,0
Cleaned_Text,3


In [None]:
# Remove rows where 'Cleaned_Text' has missing values
df_cleaned = df.dropna(subset=['Cleaned_Text']).reset_index(drop=True)

# Save the cleaned dataset without missing values
cleaned_file_path = "AMZN_Stock_News_Data_Final.csv"
df_cleaned.to_csv(cleaned_file_path, index=False)

# Confirm the number of missing values after removal
missing_values_after = df_cleaned['Cleaned_Text'].isnull().sum()
missing_values_after


np.int64(0)

In [None]:
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Download VADER sentiment analyzer
nltk.download('vader_lexicon')

# Load the cleaned dataset (Upload to Google Colab first)
file_path = "AMZN_Stock_News_Data_Final.csv"  # Update path if needed
df = pd.read_csv(file_path)

# Initialize sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Function to compute sentiment score
def get_sentiment_score(text):
    if isinstance(text, str):
        return sia.polarity_scores(text)['compound']  # VADER compound score (-1 to 1)
    return 0  # Default neutral score for missing text

# Apply sentiment analysis
df['Sentiment_Score'] = df['Cleaned_Text'].apply(get_sentiment_score)

# Save the dataset with sentiment scores
output_file_path = "AMZN_Stock_News_Data_With_Sentiment.csv"
df.to_csv(output_file_path, index=False)

print(f"✅ Updated dataset with sentiment scores saved as: {output_file_path}")


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


✅ Updated dataset with sentiment scores saved as: AMZN_Stock_News_Data_With_Sentiment.csv


In [None]:
df

Unnamed: 0,Published Date,Title,Text,URL,Site,Credibility Score,Cleaned_Text,Sentiment_Score
0,01-01-2020,Expect Major Stock Market Challenges In 2020,The stock market has completely rebounded from...,https://seekingalpha.com/article/4314786-expec...,seekingalpha.com,6.0,stock market completely rebounded last year se...,0.1027
1,01-01-2020,Warren Buffett Bought These 9 Stocks in 2019,The greatest long-term investor of our generat...,https://www.fool.com/investing/2020/01/01/warr...,fool.com,7.0,greatest longterm investor generation shopping...,0.6369
2,01-01-2020,1 FAANG Stock to Buy and 1 to Avoid in 2020,"Among these stocks -- Facebook, Amazon, Apple,...",https://www.fool.com/investing/2020/01/01/1-fa...,fool.com,7.0,among stock facebook amazon apple netflix goog...,-0.2263
3,01-01-2020,3 Stocks to Buy and Hold for the Next 50 Years,These companies will continue to lead for deca...,https://www.fool.com/investing/2020/01/01/3-st...,fool.com,7.0,company continue lead decade come,0.0000
4,01-01-2020,5 Stocks Analysts Recommend Heading Into 2020,The S&P 500 closed out 2019 with another stron...,https://www.benzinga.com/trading-ideas/long-id...,benzinga.com,5.0,sp closed another strong month december howeve...,0.7506
...,...,...,...,...,...,...,...,...
24087,13-02-2025,Help Scout Launches Availability in AWS Market...,"BOSTON--(BUSINESS WIRE)--Help Scout, the custo...",https://www.businesswire.com/news/home/2025021...,businesswire.com,5.0,bostonbusiness wirehelp scout customer support...,0.9501
24088,13-02-2025,"Amazon looks to hire 2,000 new workers in Inla...",Amazon is gearing up for a hiring spree in Cal...,https://techxplore.com/news/2025-02-amazon-hir...,techxplore.com,5.0,amazon gearing hiring spree california inland ...,0.1779
24089,13-02-2025,Amazon: 2025 Deep-Dive Reveals A Conglomerate ...,"Amazon's diverse operations, including e-comme...",https://seekingalpha.com/article/4757854-amazo...,seekingalpha.com,6.0,amazon diverse operation including ecommerce p...,0.9826
24090,13-02-2025,Amazon: Top Value For Growth Investors,Amazon exceeded Q4 expectations due to strong ...,https://seekingalpha.com/article/4757871-amazo...,seekingalpha.com,6.0,amazon exceeded q expectation due strong ecomm...,0.9866


In [None]:
# ✅ Create Weighted Sentiment column
df['Weighted_Sentiment'] = df['Sentiment_Score'] * (df['Credibility Score'] / 10)

# Save final dataset
output_file_path = "AMZN_Stock_News_Data_With_Weighted_Sentiment.csv"
df.to_csv(output_file_path, index=False)

print(f"✅ Updated dataset with Sentiment_Score and Weighted_Sentiment saved as: {output_file_path}")

✅ Updated dataset with Sentiment_Score and Weighted_Sentiment saved as: AMZN_Stock_News_Data_With_Weighted_Sentiment.csv


In [None]:
import pandas as pd

# Load dataset
file_path = "AMZN_Stock_News_Data_With_Weighted_Sentiment.csv"
df = pd.read_csv(file_path)

# Convert 'Published Date' to datetime
df['Published Date'] = pd.to_datetime(df['Published Date'], format='%d-%m-%Y', errors='coerce')

# Extract just the date for grouping
df['Date'] = df['Published Date'].dt.date

# Compute average weighted sentiment per day
daily_avg_sentiment = df.groupby('Date')['Weighted_Sentiment'].mean().reset_index()
daily_avg_sentiment.rename(columns={'Weighted_Sentiment': 'Avg_Weighted_Sentiment'}, inplace=True)

# Merge back
df = pd.merge(df, daily_avg_sentiment, on='Date', how='left')

# Calculate directional impact
df['Impact'] = df['Weighted_Sentiment'] - df['Avg_Weighted_Sentiment']

# Label the direction
df['Impact_Direction'] = df['Impact'].apply(lambda x: 'Positive Impact' if x > 0 else 'Negative Impact')

# Get most impactful news per day (by absolute deviation)
most_important_news = df.loc[df.groupby('Date')['Impact'].apply(lambda x: x.abs().idxmax())]

# Drop the 'Date' column as requested
most_important_news.drop(columns=['Date'], inplace=True)

# Save result
output_file_path = "AMZN_Most_Important_News_Per_Day_With_Direction.csv"
most_important_news.to_csv(output_file_path, index=False)

# Show sample
print(most_important_news[['Published Date', 'Title', 'Impact', 'Impact_Direction', 'Weighted_Sentiment', 'Avg_Weighted_Sentiment']].head())
print(f"✅ Saved as: {output_file_path}")


   Published Date                                              Title  \
1      2020-01-01       Warren Buffett Bought These 9 Stocks in 2019   
16     2020-01-02  Amazon workers group says members were threate...   
21     2020-01-03               3 Warren Buffett Stocks Worth Buying   
28     2020-01-04  Wall Street analysts expect a big 2020 from th...   
37     2020-01-05  Amazon is generating lots of free-cash flow, t...   

      Impact Impact_Direction  Weighted_Sentiment  Avg_Weighted_Sentiment  
1   0.325107  Positive Impact             0.44583                0.120723  
16  0.472279  Positive Impact             0.43384               -0.038439  
21  0.452044  Positive Impact             0.60823                0.156186  
28 -0.332358  Negative Impact             0.00000                0.332358  
37  0.441728  Positive Impact             0.78450                0.342772  
✅ Saved as: AMZN_Most_Important_News_Per_Day_With_Direction.csv


# Boeing

In [None]:
import pandas as pd

# Load the re-uploaded CSV file
file_path = "BA_Stock_News_Data.csv"
df_news = pd.read_csv(file_path, encoding='ISO-8859-1')

# Review the unique news sources
unique_sources = df_news['Site'].value_counts().reset_index()
unique_sources.columns = ['Site', 'Article Count']
unique_sources


Unnamed: 0,Site,Article Count
0,reuters.com,1483
1,fool.com,787
2,zacks.com,641
3,seekingalpha.com,537
4,cnbc.com,466
...,...,...
95,insidertrades.com,1
96,easternherald.com,1
97,http://feeds.benzinga.com,1
98,https://invezz.com,1


In [None]:
df_news.isnull().sum()

Unnamed: 0,0
Published Date,0
Title,0
Text,41
URL,0
Site,0


In [None]:
df_news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8786 entries, 0 to 8785
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Published Date  8786 non-null   object
 1   Title           8786 non-null   object
 2   Text            8745 non-null   object
 3   URL             8786 non-null   object
 4   Site            8786 non-null   object
dtypes: object(5)
memory usage: 343.3+ KB


In [None]:
import pandas as pd

# Load dataset
file_path = "BA_Stock_News_Data.csv"
df_news = pd.read_csv(file_path, encoding='ISO-8859-1')

# Fill null values
df_news['Text'] = df_news['Text'].fillna(df_news['Title'])
df_news['Site'] = df_news['Site'].fillna(df_news['URL'].str.extract(r'https?://([^/]+)')[0])

# Save cleaned data to CSV
output_cleaned_path = "BA_Stock_News_Data_Cleaned.csv"
df_news.to_csv(output_cleaned_path, index=False)

output_cleaned_path


'BA_Stock_News_Data_Cleaned.csv'

In [None]:
import pandas as pd

# Load the dataset
file_path = "BA_Stock_News_Data_Cleaned.csv"
df_news = pd.read_csv(file_path, encoding='ISO-8859-1')

# Fill missing values
df_news['Text'] = df_news['Text'].fillna(df_news['Title'])
df_news['Site'] = df_news['Site'].fillna(df_news['URL'].str.extract(r'https?://([^/]+)')[0])

# Assign credibility scores
credibility_scores = {
    'fool.com': 7,
    'zacks.com': 8,
    'investorplace.com': 6,
    'cnbc.com': 9,
    'seekingalpha.com': 6,
    'marketwatch.com': 8,
    'finance.yahoo.com': 8,
    'yahoo.com': 7,
    'reuters.com': 10,
    'bloomberg.com': 9,
    'businessinsider.com': 7,
    'thestreet.com': 7,
    '247wallst.com': 5,
    'forbes.com': 8,
    'money.cnn.com': 8,
    'barrons.com': 9,
    'nasdaq.com': 8,
    'wsj.com': 9
}
df_news['Credibility Score'] = df_news['Site'].map(credibility_scores).fillna(5)

# Save the updated DataFrame to a new CSV file (credibility only)
output_path = "BA_Stock_News_Data_With_Credibility.csv"
df_news.to_csv(output_path, index=False)

# Display first few rows
print(df_news.head())


  Published Date                                              Title  \
0     02-01-2020  Airbus bumps Boeing from top spot in 2019 with...   
1     02-01-2020  2019 Was A Safe Year For Air Travel Despite 73...   
2     02-01-2020  Boeing Could Spin Off Defense Unit to Salvage ...   
3     02-01-2020  Bell-Boeing JV Wins Deal to Support V-22 Fligh...   
4     02-01-2020  2019 WasnÂt BoeingÂs Worst Year. Not Even Cl...   

                                                Text  \
0  Airbus has become the world's largest planemak...   
1  Despite a high-profile accident involving the ...   
2  Boeing Co. (NYSE: BA) shares rose only 1% last...   
3  Bell-Boeing, a joint venture between Boeing (B...   
4  There's a reason why Boeing has survived for o...   

                                                 URL           Site  \
0  https://www.cnbc.com/2020/01/01/reuters-americ...       cnbc.com   
1  https://www.forbes.com/sites/niallmccarthy/202...     forbes.com   
2  https://247wallst.co

In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab') # Download the punkt_tab data package

# Load dataset from Colab (ensure the file is uploaded first)
df = pd.read_csv('BA_Stock_News_Data_With_Credibility.csv', encoding='latin-1')

# Check dataset columns
print("Dataset Columns:", df.columns)

# Specify the column containing the text to clean
text_column = 'Text'  # Update if the actual column name differs

# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

# Function to clean text
def clean_text(text):
    if isinstance(text, str):
        text = text.lower()  # Convert to lowercase
        text = re.sub(r'\[.*?\]', '', text)  # Remove text inside brackets
        text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
        text = re.sub(r'<.*?>+', '', text)  # Remove HTML tags
        text = re.sub(r'[^a-z\s]', '', text)  # Keep only alphabets
        text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces

        # Tokenization
        tokens = word_tokenize(text)

        # Remove stopwords and lemmatize
        cleaned_text = " ".join(lemmatizer.lemmatize(word) for word in tokens if word not in stop_words)
        return cleaned_text
    return ""

# Apply cleaning function to the "Text" column
df['Cleaned_Text'] = df[text_column].apply(clean_text)

# Save cleaned data
cleaned_file_path = "BA_Stock_News_Data_Cleaned_text.csv"
df.to_csv(cleaned_file_path, index=False)

print(f"Cleaned dataset saved as: {cleaned_file_path}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Dataset Columns: Index(['Published Date', 'Title', 'Text', 'URL', 'Site', 'Credibility Score'], dtype='object')
Cleaned dataset saved as: BA_Stock_News_Data_Cleaned_text.csv


In [None]:
import pandas as pd
import numpy as np

df = pd.read_csv('BA_Stock_News_Data_Cleaned_text.csv', encoding='latin-1')
df.head()

Unnamed: 0,Published Date,Title,Text,URL,Site,Credibility Score,Cleaned_Text
0,02-01-2020,Airbus bumps Boeing from top spot in 2019 with...,Airbus has become the world's largest planemak...,https://www.cnbc.com/2020/01/01/reuters-americ...,cnbc.com,9.0,airbus become world largest planemaker first t...
1,02-01-2020,2019 Was A Safe Year For Air Travel Despite 73...,Despite a high-profile accident involving the ...,https://www.forbes.com/sites/niallmccarthy/202...,forbes.com,8.0,despite highprofile accident involving boeing ...
2,02-01-2020,Boeing Could Spin Off Defense Unit to Salvage ...,Boeing Co. (NYSE: BA) shares rose only 1% last...,https://247wallst.com/aerospace-defense/2020/0...,247wallst.com,5.0,boeing co nyse ba share rose last year compare...
3,02-01-2020,Bell-Boeing JV Wins Deal to Support V-22 Fligh...,"Bell-Boeing, a joint venture between Boeing (B...",https://www.zacks.com/stock/news/698991/bell-b...,zacks.com,8.0,bellboeing joint venture boeing ba textrons tx...
4,02-01-2020,2019 WasnÃÂÃÂt BoeingÃÂÃÂs Worst Year....,There's a reason why Boeing has survived for o...,https://www.forbes.com/sites/lorenthompson/202...,forbes.com,8.0,there reason boeing survived century rival dis...


In [None]:
df.isnull().sum()

Unnamed: 0,0
Published Date,0
Title,0
Text,0
URL,0
Site,0
Credibility Score,0
Cleaned_Text,5


In [None]:
# Remove rows where 'Cleaned_Text' has missing values
df_cleaned = df.dropna(subset=['Cleaned_Text']).reset_index(drop=True)

# Save the cleaned dataset without missing values
cleaned_file_path = "BA_Stock_News_Data_Final.csv"
df_cleaned.to_csv(cleaned_file_path, index=False)

# Confirm the number of missing values after removal
missing_values_after = df_cleaned['Cleaned_Text'].isnull().sum()
missing_values_after


np.int64(0)

In [None]:
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Download VADER sentiment analyzer
nltk.download('vader_lexicon')

# Load the cleaned dataset (Upload to Google Colab first)
file_path = "BA_Stock_News_Data_Final.csv"  # Update path if needed
df = pd.read_csv(file_path)

# Initialize sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Function to compute sentiment score
def get_sentiment_score(text):
    if isinstance(text, str):
        return sia.polarity_scores(text)['compound']  # VADER compound score (-1 to 1)
    return 0  # Default neutral score for missing text

# Apply sentiment analysis
df['Sentiment_Score'] = df['Cleaned_Text'].apply(get_sentiment_score)

# Save the dataset with sentiment scores
output_file_path = "BA_Stock_News_Data_With_Sentiment.csv"
df.to_csv(output_file_path, index=False)

print(f"✅ Updated dataset with sentiment scores saved as: {output_file_path}")


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


✅ Updated dataset with sentiment scores saved as: BA_Stock_News_Data_With_Sentiment.csv


In [None]:
# ✅ Create Weighted Sentiment column
df['Weighted_Sentiment'] = df['Sentiment_Score'] * (df['Credibility Score'] / 10)

# Save final dataset
output_file_path = "BA_Stock_News_Data_With_Weighted_Sentiment.csv"
df.to_csv(output_file_path, index=False)

print(f"✅ Updated dataset with Sentiment_Score and Weighted_Sentiment saved as: {output_file_path}")

✅ Updated dataset with Sentiment_Score and Weighted_Sentiment saved as: BA_Stock_News_Data_With_Weighted_Sentiment.csv


In [None]:
import pandas as pd

# Load dataset
file_path = "BA_Stock_News_Data_With_Weighted_Sentiment.csv"
df = pd.read_csv(file_path)

# Convert 'Published Date' to datetime
df['Published Date'] = pd.to_datetime(df['Published Date'], format='%d-%m-%Y', errors='coerce')

# Extract just the date for grouping
df['Date'] = df['Published Date'].dt.date

# Compute average weighted sentiment per day
daily_avg_sentiment = df.groupby('Date')['Weighted_Sentiment'].mean().reset_index()
daily_avg_sentiment.rename(columns={'Weighted_Sentiment': 'Avg_Weighted_Sentiment'}, inplace=True)

# Merge back
df = pd.merge(df, daily_avg_sentiment, on='Date', how='left')

# Calculate directional impact
df['Impact'] = df['Weighted_Sentiment'] - df['Avg_Weighted_Sentiment']

# Label the direction
df['Impact_Direction'] = df['Impact'].apply(lambda x: 'Positive Impact' if x > 0 else 'Negative Impact')

# Get most impactful news per day (by absolute deviation)
most_important_news = df.loc[df.groupby('Date')['Impact'].apply(lambda x: x.abs().idxmax())]

# Drop the 'Date' column as requested
most_important_news.drop(columns=['Date'], inplace=True)

# Save result
output_file_path = "BA_Most_Important_News_Per_Day_With_Direction.csv"
most_important_news.to_csv(output_file_path, index=False)

# Show sample
print(most_important_news[['Published Date', 'Title', 'Impact', 'Impact_Direction', 'Weighted_Sentiment', 'Avg_Weighted_Sentiment']].head())
print(f"✅ Saved as: {output_file_path}")


   Published Date                                              Title  \
2      2020-01-02  Boeing Could Spin Off Defense Unit to Salvage ...   
10     2020-01-03                             10 Stocks For 40 Years   
12     2020-01-05  Boeing's New Plan for Moon Landing Sounds a Lo...   
23     2020-01-06  Mexico's Aeromexico reaches compensation agree...   
37     2020-01-07  Boeing recommending simulator training for pil...   

      Impact Impact_Direction  Weighted_Sentiment  Avg_Weighted_Sentiment  
2   0.175486  Positive Impact             0.34040                0.164914  
10  0.064100  Positive Impact             0.00000               -0.064100  
12  0.381898  Positive Impact             0.49021                0.108312  
23  0.528415  Positive Impact             0.55740                0.028985  
37 -0.910725  Negative Impact            -0.89340                0.017325  
✅ Saved as: BA_Most_Important_News_Per_Day_With_Direction.csv


# J.P.Morgan

In [None]:
import pandas as pd

# Load the re-uploaded CSV file
file_path = "JPM_Stock_News_Data.csv"
df_news = pd.read_csv(file_path, encoding='ISO-8859-1')

# Review the unique news sources
unique_sources = df_news['Site'].value_counts().reset_index()
unique_sources.columns = ['Site', 'Article Count']
unique_sources


Unnamed: 0,Site,Article Count
0,zacks.com,1006
1,businesswire.com,830
2,seekingalpha.com,597
3,fool.com,587
4,reuters.com,452
...,...,...
62,sky.com,1
63,https://www.tipranks.com,1
64,https://www.businesswire.com,1
65,https://247wallst.com,1


In [None]:
df_news.isnull().sum()

Unnamed: 0,0
Published Date,0
Title,0
Text,4
URL,0
Site,3


In [None]:
df_news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6486 entries, 0 to 6485
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Published Date  6486 non-null   object
 1   Title           6486 non-null   object
 2   Text            6482 non-null   object
 3   URL             6486 non-null   object
 4   Site            6483 non-null   object
dtypes: object(5)
memory usage: 253.5+ KB


In [None]:
import pandas as pd

# Load dataset
file_path = "JPM_Stock_News_Data.csv"
df_news = pd.read_csv(file_path, encoding='ISO-8859-1')

# Fill null values
df_news['Text'] = df_news['Text'].fillna(df_news['Title'])
df_news['Site'] = df_news['Site'].fillna(df_news['URL'].str.extract(r'https?://([^/]+)')[0])

# Save cleaned data to CSV
output_cleaned_path = "JPM_Stock_News_Data_Cleaned.csv"
df_news.to_csv(output_cleaned_path, index=False)

output_cleaned_path


'JPM_Stock_News_Data_Cleaned.csv'

In [None]:
import pandas as pd

# Load the dataset
file_path = "JPM_Stock_News_Data_Cleaned.csv"
df_news = pd.read_csv(file_path, encoding='ISO-8859-1')

# Fill missing values
df_news['Text'] = df_news['Text'].fillna(df_news['Title'])
df_news['Site'] = df_news['Site'].fillna(df_news['URL'].str.extract(r'https?://([^/]+)')[0])

# Assign credibility scores
credibility_scores = {
    'fool.com': 7,
    'zacks.com': 8,
    'investorplace.com': 6,
    'cnbc.com': 9,
    'seekingalpha.com': 6,
    'marketwatch.com': 8,
    'finance.yahoo.com': 8,
    'yahoo.com': 7,
    'reuters.com': 10,
    'bloomberg.com': 9,
    'businessinsider.com': 7,
    'thestreet.com': 7,
    '247wallst.com': 5,
    'forbes.com': 8,
    'money.cnn.com': 8,
    'barrons.com': 9,
    'nasdaq.com': 8,
    'wsj.com': 9
}
df_news['Credibility Score'] = df_news['Site'].map(credibility_scores).fillna(5)

# Save the updated DataFrame to a new CSV file (credibility only)
output_path = "JPM_Stock_News_Data_With_Credibility.csv"
df_news.to_csv(output_path, index=False)

# Display first few rows
print(df_news.head())


  Published Date                                              Title  \
0     03-01-2020  JPMorgan Chase to Host Fourth-Quarter and Full...   
1     03-01-2020  JPMorgan Chase to Host Fourth-Quarter and Full...   
2     06-01-2020  Amid Global Complexity, Businesses Continue to...   
3     07-01-2020  Chase Partners with DoorDash to Deliver New Be...   
4     08-01-2020  Chase Introduces New Cardmember Benefits With ...   

                                                Text  \
0  NEW YORK--(BUSINESS WIRE)--As previously annou...   
1  NEW YORK--(BUSINESS WIRE)--As previously annou...   
2  NEW YORK--(BUSINESS WIRE)--Most small and mids...   
3  WILMINGTON, Del.--(BUSINESS WIRE)--Today Chase...   
4  WILMINGTON, Del.--(BUSINESS WIRE)--Chase is pa...   

                                                 URL              Site  \
0  https://www.businesswire.com/news/home/2020010...  businesswire.com   
1  http://www.businesswire.com/news/home/20200103...  businesswire.com   
2  https://www

In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab') # Download the punkt_tab data package

# Load dataset from Colab (ensure the file is uploaded first)
df = pd.read_csv('JPM_Stock_News_Data_With_Credibility.csv', encoding='latin-1')

# Check dataset columns
print("Dataset Columns:", df.columns)

# Specify the column containing the text to clean
text_column = 'Text'  # Update if the actual column name differs

# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

# Function to clean text
def clean_text(text):
    if isinstance(text, str):
        text = text.lower()  # Convert to lowercase
        text = re.sub(r'\[.*?\]', '', text)  # Remove text inside brackets
        text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
        text = re.sub(r'<.*?>+', '', text)  # Remove HTML tags
        text = re.sub(r'[^a-z\s]', '', text)  # Keep only alphabets
        text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces

        # Tokenization
        tokens = word_tokenize(text)

        # Remove stopwords and lemmatize
        cleaned_text = " ".join(lemmatizer.lemmatize(word) for word in tokens if word not in stop_words)
        return cleaned_text
    return ""

# Apply cleaning function to the "Text" column
df['Cleaned_Text'] = df[text_column].apply(clean_text)

# Save cleaned data
cleaned_file_path = "JPM_Stock_News_Data_Cleaned_text.csv"
df.to_csv(cleaned_file_path, index=False)

print(f"Cleaned dataset saved as: {cleaned_file_path}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Dataset Columns: Index(['Published Date', 'Title', 'Text', 'URL', 'Site', 'Credibility Score'], dtype='object')
Cleaned dataset saved as: JPM_Stock_News_Data_Cleaned_text.csv


In [None]:
import pandas as pd
import numpy as np

df = pd.read_csv('JPM_Stock_News_Data_Cleaned_text.csv', encoding='latin-1')
df.head()

Unnamed: 0,Published Date,Title,Text,URL,Site,Credibility Score,Cleaned_Text
0,03-01-2020,JPMorgan Chase to Host Fourth-Quarter and Full...,NEW YORK--(BUSINESS WIRE)--As previously annou...,https://www.businesswire.com/news/home/2020010...,businesswire.com,5.0,new yorkbusiness wireas previously announced j...
1,03-01-2020,JPMorgan Chase to Host Fourth-Quarter and Full...,NEW YORK--(BUSINESS WIRE)--As previously annou...,http://www.businesswire.com/news/home/20200103...,businesswire.com,5.0,new yorkbusiness wireas previously announced j...
2,06-01-2020,"Amid Global Complexity, Businesses Continue to...",NEW YORK--(BUSINESS WIRE)--Most small and mids...,https://www.businesswire.com/news/home/2020010...,businesswire.com,5.0,new yorkbusiness wiremost small midsize u busi...
3,07-01-2020,Chase Partners with DoorDash to Deliver New Be...,"WILMINGTON, Del.--(BUSINESS WIRE)--Today Chase...",https://www.businesswire.com/news/home/2020010...,businesswire.com,5.0,wilmington delbusiness wiretoday chase announc...
4,08-01-2020,Chase Introduces New Cardmember Benefits With ...,"WILMINGTON, Del.--(BUSINESS WIRE)--Chase is pa...",https://www.businesswire.com/news/home/2020010...,businesswire.com,5.0,wilmington delbusiness wirechase partnering ly...


In [None]:
df.isnull().sum()

Unnamed: 0,0
Published Date,0
Title,0
Text,0
URL,0
Site,0
Credibility Score,0
Cleaned_Text,1


In [None]:
# Remove rows where 'Cleaned_Text' has missing values
df_cleaned = df.dropna(subset=['Cleaned_Text']).reset_index(drop=True)

# Save the cleaned dataset without missing values
cleaned_file_path = "JPM_Stock_News_Data_Final.csv"
df_cleaned.to_csv(cleaned_file_path, index=False)

# Confirm the number of missing values after removal
missing_values_after = df_cleaned['Cleaned_Text'].isnull().sum()
missing_values_after


np.int64(0)

In [None]:
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Download VADER sentiment analyzer
nltk.download('vader_lexicon')

# Load the cleaned dataset (Upload to Google Colab first)
file_path = "JPM_Stock_News_Data_Final.csv"  # Update path if needed
df = pd.read_csv(file_path)

# Initialize sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Function to compute sentiment score
def get_sentiment_score(text):
    if isinstance(text, str):
        return sia.polarity_scores(text)['compound']  # VADER compound score (-1 to 1)
    return 0  # Default neutral score for missing text

# Apply sentiment analysis
df['Sentiment_Score'] = df['Cleaned_Text'].apply(get_sentiment_score)

# Save the dataset with sentiment scores
output_file_path = "JPM_Stock_News_Data_With_Sentiment.csv"
df.to_csv(output_file_path, index=False)

print(f"✅ Updated dataset with sentiment scores saved as: {output_file_path}")


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


✅ Updated dataset with sentiment scores saved as: JPM_Stock_News_Data_With_Sentiment.csv


In [None]:
# ✅ Create Weighted Sentiment column
df['Weighted_Sentiment'] = df['Sentiment_Score'] * (df['Credibility Score'] / 10)

# Save final dataset
output_file_path = "JPM_Stock_News_Data_With_Weighted_Sentiment.csv"
df.to_csv(output_file_path, index=False)

print(f"✅ Updated dataset with Sentiment_Score and Weighted_Sentiment saved as: {output_file_path}")

✅ Updated dataset with Sentiment_Score and Weighted_Sentiment saved as: JPM_Stock_News_Data_With_Weighted_Sentiment.csv


In [None]:
import pandas as pd

# Load dataset
file_path = "JPM_Stock_News_Data_With_Weighted_Sentiment.csv"
df = pd.read_csv(file_path)

# Convert 'Published Date' to datetime
df['Published Date'] = pd.to_datetime(df['Published Date'], format='%d-%m-%Y', errors='coerce')

# Extract just the date for grouping
df['Date'] = df['Published Date'].dt.date

# Compute average weighted sentiment per day
daily_avg_sentiment = df.groupby('Date')['Weighted_Sentiment'].mean().reset_index()
daily_avg_sentiment.rename(columns={'Weighted_Sentiment': 'Avg_Weighted_Sentiment'}, inplace=True)

# Merge back
df = pd.merge(df, daily_avg_sentiment, on='Date', how='left')

# Calculate directional impact
df['Impact'] = df['Weighted_Sentiment'] - df['Avg_Weighted_Sentiment']

# Label the direction
df['Impact_Direction'] = df['Impact'].apply(lambda x: 'Positive Impact' if x > 0 else 'Negative Impact')

# Get most impactful news per day (by absolute deviation)
most_important_news = df.loc[df.groupby('Date')['Impact'].apply(lambda x: x.abs().idxmax())]

# Drop the 'Date' column as requested
most_important_news.drop(columns=['Date'], inplace=True)

# Save result
output_file_path = "JPM_Most_Important_News_Per_Day_With_Direction.csv"
most_important_news.to_csv(output_file_path, index=False)

# Show sample
print(most_important_news[['Published Date', 'Title', 'Impact', 'Impact_Direction', 'Weighted_Sentiment', 'Avg_Weighted_Sentiment']].head())
print(f"✅ Saved as: {output_file_path}")


  Published Date                                              Title  Impact  \
0     2020-01-03  JPMorgan Chase to Host Fourth-Quarter and Full...     0.0   
2     2020-01-06  Amid Global Complexity, Businesses Continue to...     0.0   
3     2020-01-07  Chase Partners with DoorDash to Deliver New Be...     0.0   
4     2020-01-08  Chase Introduces New Cardmember Benefits With ...     0.0   
5     2020-01-11  All the 2020 Pre-JPM News Healthcare Investors...     0.0   

  Impact_Direction  Weighted_Sentiment  Avg_Weighted_Sentiment  
0  Negative Impact             0.46565                 0.46565  
2  Negative Impact             0.49895                 0.49895  
3  Negative Impact             0.49970                 0.49970  
4  Negative Impact             0.49990                 0.49990  
5  Negative Impact             0.41958                 0.41958  
✅ Saved as: JPM_Most_Important_News_Per_Day_With_Direction.csv


# META

In [None]:
import pandas as pd

# Load the re-uploaded CSV file
file_path = "META_Stock_News_Data.csv"
df_news = pd.read_csv(file_path, encoding='ISO-8859-1')

# Review the unique news sources
unique_sources = df_news['Site'].value_counts().reset_index()
unique_sources.columns = ['Site', 'Article Count']
unique_sources


Unnamed: 0,Site,Article Count
0,fool.com,2168
1,investorplace.com,1331
2,zacks.com,1249
3,reuters.com,1140
4,seekingalpha.com,1046
...,...,...
115,otcprwire.com,1
116,headlinesoftoday.com,1
117,https://www.defenseworld.net,1
118,radarr.africa,1


In [None]:
df_news.isnull().sum()

Unnamed: 0,0
Published Date,0
Title,0
Text,18
URL,0
Site,14


In [None]:
import pandas as pd

# Load dataset
file_path = "META_Stock_News_Data.csv"
df_news = pd.read_csv(file_path, encoding='ISO-8859-1')

# Fill null values
df_news['Text'] = df_news['Text'].fillna(df_news['Title'])
df_news['Site'] = df_news['Site'].fillna(df_news['URL'].str.extract(r'https?://([^/]+)')[0])

# Save cleaned data to CSV
output_cleaned_path = "META_Stock_News_Data_Cleaned.csv"
df_news.to_csv(output_cleaned_path, index=False)

output_cleaned_path


'META_Stock_News_Data_Cleaned.csv'

In [None]:
import pandas as pd

# Load the dataset
file_path = "META_Stock_News_Data_Cleaned.csv"
df_news = pd.read_csv(file_path, encoding='ISO-8859-1')

# Fill missing values
df_news['Text'] = df_news['Text'].fillna(df_news['Title'])
df_news['Site'] = df_news['Site'].fillna(df_news['URL'].str.extract(r'https?://([^/]+)')[0])

# Assign credibility scores
credibility_scores = {
    'fool.com': 7,
    'zacks.com': 8,
    'investorplace.com': 6,
    'cnbc.com': 9,
    'seekingalpha.com': 6,
    'marketwatch.com': 8,
    'finance.yahoo.com': 8,
    'yahoo.com': 7,
    'reuters.com': 10,
    'bloomberg.com': 9,
    'businessinsider.com': 7,
    'thestreet.com': 7,
    '247wallst.com': 5,
    'forbes.com': 8,
    'money.cnn.com': 8,
    'barrons.com': 9,
    'nasdaq.com': 8,
    'wsj.com': 9
}
df_news['Credibility Score'] = df_news['Site'].map(credibility_scores).fillna(5)

# Save the updated DataFrame to a new CSV file (credibility only)
output_path = "META_Stock_News_Data_With_Credibility.csv"
df_news.to_csv(output_path, index=False)

# Display first few rows
print(df_news.head())


  Published Date                                              Title  \
0     02-01-2020  FANG Stocks Face Regulatory Scrutiny In 2020 Â...   
1     02-01-2020       6 Metrics Behind Facebook's 54% Gain in 2019   
2     03-01-2020   Facebook Had the 4 Top Mobile Apps of the Decade   
3     03-01-2020         Does Instagram Have a User Growth Problem?   
4     04-01-2020  Wall Street analysts expect a big 2020 from th...   

                                                Text  \
0  All five FANG stocks grew by double digits in ...   
1  The social network impressed investors with st...   
2  Two were acquired and subsequently scaled to m...   
3  The photo-sharing app is having trouble adding...   
4  Wall Street analysts said they expect internet...   

                                                 URL           Site  \
0  https://www.investors.com/news/technology/fang...  investors.com   
1  https://www.fool.com/investing/2020/01/02/6-me...       fool.com   
2  https://www.fool.com

In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab') # Download the punkt_tab data package

# Load dataset from Colab (ensure the file is uploaded first)
df = pd.read_csv('META_Stock_News_Data_With_Credibility.csv', encoding='latin-1')

# Check dataset columns
print("Dataset Columns:", df.columns)

# Specify the column containing the text to clean
text_column = 'Text'  # Update if the actual column name differs

# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

# Function to clean text
def clean_text(text):
    if isinstance(text, str):
        text = text.lower()  # Convert to lowercase
        text = re.sub(r'\[.*?\]', '', text)  # Remove text inside brackets
        text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
        text = re.sub(r'<.*?>+', '', text)  # Remove HTML tags
        text = re.sub(r'[^a-z\s]', '', text)  # Keep only alphabets
        text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces

        # Tokenization
        tokens = word_tokenize(text)

        # Remove stopwords and lemmatize
        cleaned_text = " ".join(lemmatizer.lemmatize(word) for word in tokens if word not in stop_words)
        return cleaned_text
    return ""

# Apply cleaning function to the "Text" column
df['Cleaned_Text'] = df[text_column].apply(clean_text)

# Save cleaned data
cleaned_file_path = "META_Stock_News_Data_Cleaned_text.csv"
df.to_csv(cleaned_file_path, index=False)

print(f"Cleaned dataset saved as: {cleaned_file_path}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Dataset Columns: Index(['Published Date', 'Title', 'Text', 'URL', 'Site', 'Credibility Score'], dtype='object')
Cleaned dataset saved as: META_Stock_News_Data_Cleaned_text.csv


In [None]:
import pandas as pd
import numpy as np

df = pd.read_csv('META_Stock_News_Data_Cleaned_text.csv', encoding='latin-1')
df.head()

Unnamed: 0,Published Date,Title,Text,URL,Site,Credibility Score,Cleaned_Text
0,02-01-2020,FANG Stocks Face Regulatory Scrutiny In 2020 Ã...,All five FANG stocks grew by double digits in ...,https://www.investors.com/news/technology/fang...,investors.com,5.0,five fang stock grew double digit enter possib...
1,02-01-2020,6 Metrics Behind Facebook's 54% Gain in 2019,The social network impressed investors with st...,https://www.fool.com/investing/2020/01/02/6-me...,fool.com,7.0,social network impressed investor strong reven...
2,03-01-2020,Facebook Had the 4 Top Mobile Apps of the Decade,Two were acquired and subsequently scaled to m...,https://www.fool.com/investing/2020/01/03/face...,fool.com,7.0,two acquired subsequently scaled billion user
3,03-01-2020,Does Instagram Have a User Growth Problem?,The photo-sharing app is having trouble adding...,https://www.fool.com/investing/2020/01/03/does...,fool.com,7.0,photosharing app trouble adding new user affec...
4,04-01-2020,Wall Street analysts expect a big 2020 from th...,Wall Street analysts said they expect internet...,https://www.cnbc.com/2020/01/04/these-internet...,cnbc.com,9.0,wall street analyst said expect internet stock...


In [None]:
df.isnull().sum()

Unnamed: 0,0
Published Date,0
Title,0
Text,0
URL,0
Site,0
Credibility Score,0
Cleaned_Text,0


In [None]:
# Remove rows where 'Cleaned_Text' has missing values
df_cleaned = df.dropna(subset=['Cleaned_Text']).reset_index(drop=True)

# Save the cleaned dataset without missing values
cleaned_file_path = "META_Stock_News_Data_Final.csv"
df_cleaned.to_csv(cleaned_file_path, index=False)

# Confirm the number of missing values after removal
missing_values_after = df_cleaned['Cleaned_Text'].isnull().sum()
missing_values_after


np.int64(0)

In [None]:
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Download VADER sentiment analyzer
nltk.download('vader_lexicon')

# Load the cleaned dataset (Upload to Google Colab first)
file_path = "META_Stock_News_Data_Final.csv"  # Update path if needed
df = pd.read_csv(file_path)

# Initialize sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Function to compute sentiment score
def get_sentiment_score(text):
    if isinstance(text, str):
        return sia.polarity_scores(text)['compound']  # VADER compound score (-1 to 1)
    return 0  # Default neutral score for missing text

# Apply sentiment analysis
df['Sentiment_Score'] = df['Cleaned_Text'].apply(get_sentiment_score)

# Save the dataset with sentiment scores
output_file_path = "META_Stock_News_Data_With_Sentiment.csv"
df.to_csv(output_file_path, index=False)

print(f"✅ Updated dataset with sentiment scores saved as: {output_file_path}")


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


✅ Updated dataset with sentiment scores saved as: META_Stock_News_Data_With_Sentiment.csv


In [None]:
# ✅ Create Weighted Sentiment column
df['Weighted_Sentiment'] = df['Sentiment_Score'] * (df['Credibility Score'] / 10)

# Save final dataset
output_file_path = "META_Stock_News_Data_With_Weighted_Sentiment.csv"
df.to_csv(output_file_path, index=False)

print(f"✅ Updated dataset with Sentiment_Score and Weighted_Sentiment saved as: {output_file_path}")

✅ Updated dataset with Sentiment_Score and Weighted_Sentiment saved as: META_Stock_News_Data_With_Weighted_Sentiment.csv


In [None]:
import pandas as pd

# Load dataset
file_path = "META_Stock_News_Data_With_Weighted_Sentiment.csv"
df = pd.read_csv(file_path)

# Convert 'Published Date' to datetime
df['Published Date'] = pd.to_datetime(df['Published Date'], format='%d-%m-%Y', errors='coerce')

# Extract just the date for grouping
df['Date'] = df['Published Date'].dt.date

# Compute average weighted sentiment per day
daily_avg_sentiment = df.groupby('Date')['Weighted_Sentiment'].mean().reset_index()
daily_avg_sentiment.rename(columns={'Weighted_Sentiment': 'Avg_Weighted_Sentiment'}, inplace=True)

# Merge back
df = pd.merge(df, daily_avg_sentiment, on='Date', how='left')

# Calculate directional impact
df['Impact'] = df['Weighted_Sentiment'] - df['Avg_Weighted_Sentiment']

# Label the direction
df['Impact_Direction'] = df['Impact'].apply(lambda x: 'Positive Impact' if x > 0 else 'Negative Impact')

# Get most impactful news per day (by absolute deviation)
most_important_news = df.loc[df.groupby('Date')['Impact'].apply(lambda x: x.abs().idxmax())]

# Drop the 'Date' column as requested
most_important_news.drop(columns=['Date'], inplace=True)

# Save result
output_file_path = "META_Most_Important_News_Per_Day_With_Direction.csv"
most_important_news.to_csv(output_file_path, index=False)

# Show sample
print(most_important_news[['Published Date', 'Title', 'Impact', 'Impact_Direction', 'Weighted_Sentiment', 'Avg_Weighted_Sentiment']].head())
print(f"✅ Saved as: {output_file_path}")


  Published Date                                              Title    Impact  \
0     2020-01-02  FANG Stocks Face Regulatory Scrutiny In 2020 Ã... -0.225770   
2     2020-01-03   Facebook Had the 4 Top Mobile Apps of the Decade  0.009030   
4     2020-01-04  Wall Street analysts expect a big 2020 from th...  0.000000   
5     2020-01-05             Better Buy: MercadoLibre vs. Facebook.  0.195090   
9     2020-01-06  Facebook data misuse and voter manipulation ba... -0.342587   

  Impact_Direction  Weighted_Sentiment  Avg_Weighted_Sentiment  
0  Negative Impact             0.13660                0.362370  
2  Positive Impact             0.00000               -0.009030  
4  Negative Impact             0.00000                0.000000  
5  Positive Impact             0.39018                0.195090  
9  Negative Impact            -0.22940                0.113187  
✅ Saved as: META_Most_Important_News_Per_Day_With_Direction.csv


# PG

In [None]:
import pandas as pd

# Load the re-uploaded CSV file
file_path = "PG_Stock_News_Data.csv"
df_news = pd.read_csv(file_path, encoding='ISO-8859-1')

# Review the unique news sources
unique_sources = df_news['Site'].value_counts().reset_index()
unique_sources.columns = ['Site', 'Article Count']
unique_sources


Unnamed: 0,Site,Article Count
0,businesswire.com,673
1,fool.com,508
2,zacks.com,427
3,investorplace.com,327
4,seekingalpha.com,289
5,marketwatch.com,62
6,forbes.com,60
7,gurufocus.com,53
8,247wallst.com,51
9,reuters.com,48


In [None]:
df_news.isnull().sum()

Unnamed: 0,0
Published Date,0
Title,0
Text,0
URL,3
Site,3


In [None]:
import pandas as pd

# Load dataset
file_path = "PG_Stock_News_Data.csv"
df_news = pd.read_csv(file_path, encoding='ISO-8859-1')

# Fill null values
df_news['Text'] = df_news['Text'].fillna(df_news['Title'])
df_news['Site'] = df_news['Site'].fillna(df_news['URL'].str.extract(r'https?://([^/]+)')[0])

# Save cleaned data to CSV
output_cleaned_path = "PG_Stock_News_Data_Cleaned.csv"
df_news.to_csv(output_cleaned_path, index=False)

output_cleaned_path


'PG_Stock_News_Data_Cleaned.csv'

In [None]:
import pandas as pd

# Load the dataset
file_path = "PG_Stock_News_Data_Cleaned.csv"
df_news = pd.read_csv(file_path, encoding='ISO-8859-1')

# Fill missing values
df_news['Text'] = df_news['Text'].fillna(df_news['Title'])
df_news['Site'] = df_news['Site'].fillna(df_news['URL'].str.extract(r'https?://([^/]+)')[0])

# Assign credibility scores
credibility_scores = {
    'fool.com': 7,
    'zacks.com': 8,
    'investorplace.com': 6,
    'cnbc.com': 9,
    'seekingalpha.com': 6,
    'marketwatch.com': 8,
    'finance.yahoo.com': 8,
    'yahoo.com': 7,
    'reuters.com': 10,
    'bloomberg.com': 9,
    'businessinsider.com': 7,
    'thestreet.com': 7,
    '247wallst.com': 5,
    'forbes.com': 8,
    'money.cnn.com': 8,
    'barrons.com': 9,
    'nasdaq.com': 8,
    'wsj.com': 9
}
df_news['Credibility Score'] = df_news['Site'].map(credibility_scores).fillna(5)

# Save the updated DataFrame to a new CSV file (credibility only)
output_path = "PG_Stock_News_Data_With_Credibility.csv"
df_news.to_csv(output_path, index=False)

# Display first few rows
print(df_news.head())


  Published Date                                              Title  \
0     02-01-2020  P&G to Webcast Discussion of Second Quarter 20...   
1     02-01-2020  SecretÂ® Deodorant Canada Joins the PWHPA Drea...   
2     02-01-2020  LA MARQUE SECRETÂ® DÃODORANT CANADA SE JOINT ...   
3     03-01-2020  3 Big Stock Charts for Friday: P&G, Walgreens,...   
4     03-01-2020                             10 Stocks For 40 Years   

                                                Text  \
0  CINCINNATI--(BUSINESS WIRE)--The Procter & Gam...   
1  TORONTO--(BUSINESS WIRE)--The Professional Wom...   
2  TORONTO--(BUSINESS WIRE)--La tournÃ©e Dream Ga...   
3  Friday's big stock charts feature 3 names tryi...   
4                             10 Stocks For 40 Years   

                                                 URL               Site  \
0  https://www.businesswire.com/news/home/2020010...   businesswire.com   
1  https://www.businesswire.com/news/home/2020010...   businesswire.com   
2  https://

In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab') # Download the punkt_tab data package

# Load dataset from Colab (ensure the file is uploaded first)
df = pd.read_csv('PG_Stock_News_Data_With_Credibility.csv', encoding='latin-1')

# Check dataset columns
print("Dataset Columns:", df.columns)

# Specify the column containing the text to clean
text_column = 'Text'  # Update if the actual column name differs

# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

# Function to clean text
def clean_text(text):
    if isinstance(text, str):
        text = text.lower()  # Convert to lowercase
        text = re.sub(r'\[.*?\]', '', text)  # Remove text inside brackets
        text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
        text = re.sub(r'<.*?>+', '', text)  # Remove HTML tags
        text = re.sub(r'[^a-z\s]', '', text)  # Keep only alphabets
        text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces

        # Tokenization
        tokens = word_tokenize(text)

        # Remove stopwords and lemmatize
        cleaned_text = " ".join(lemmatizer.lemmatize(word) for word in tokens if word not in stop_words)
        return cleaned_text
    return ""

# Apply cleaning function to the "Text" column
df['Cleaned_Text'] = df[text_column].apply(clean_text)

# Save cleaned data
cleaned_file_path = "PG_Stock_News_Data_Cleaned_text.csv"
df.to_csv(cleaned_file_path, index=False)

print(f"Cleaned dataset saved as: {cleaned_file_path}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Dataset Columns: Index(['Published Date', 'Title', 'Text', 'URL', 'Site', 'Credibility Score'], dtype='object')
Cleaned dataset saved as: PG_Stock_News_Data_Cleaned_text.csv


In [None]:
import pandas as pd
import numpy as np

df = pd.read_csv('PG_Stock_News_Data_Cleaned_text.csv', encoding='latin-1')
df.head()

Unnamed: 0,Published Date,Title,Text,URL,Site,Credibility Score,Cleaned_Text
0,02-01-2020,P&G to Webcast Discussion of Second Quarter 20...,CINCINNATI--(BUSINESS WIRE)--The Procter & Gam...,https://www.businesswire.com/news/home/2020010...,businesswire.com,5.0,cincinnatibusiness wirethe procter gamble comp...
1,02-01-2020,SecretÃÂÃÂ® Deodorant Canada Joins the PWHP...,TORONTO--(BUSINESS WIRE)--The Professional Wom...,https://www.businesswire.com/news/home/2020010...,businesswire.com,5.0,torontobusiness wirethe professional womens ho...
2,02-01-2020,LA MARQUE SECRETÃÂÃÂ® DÃÂÃÂODORANT CANA...,TORONTO--(BUSINESS WIRE)--La tournÃÂÃÂ©e Dr...,https://www.businesswire.com/news/home/2020010...,businesswire.com,5.0,torontobusiness wirela tourne dream gap de las...
3,03-01-2020,"3 Big Stock Charts for Friday: P&G, Walgreens,...",Friday's big stock charts feature 3 names tryi...,https://investorplace.com/2020/01/3-big-stock-...,investorplace.com,6.0,friday big stock chart feature name trying avo...
4,03-01-2020,10 Stocks For 40 Years,10 Stocks For 40 Years,https://seekingalpha.com/article/4315118-10-st...,seekingalpha.com,6.0,stock year


In [None]:
df.isnull().sum()

Unnamed: 0,0
Published Date,0
Title,0
Text,0
URL,3
Site,3
Credibility Score,0
Cleaned_Text,0


In [None]:
# Remove rows where 'Cleaned_Text' has missing values
df_cleaned = df.dropna(subset=['Cleaned_Text', 'URL','Site']).reset_index(drop=True)

# Save the cleaned dataset without missing values
cleaned_file_path = "PG_Stock_News_Data_Final.csv"
df_cleaned.to_csv(cleaned_file_path, index=False)

# Confirm the number of missing values after removal
missing_values_after = df_cleaned['Cleaned_Text'].isnull().sum()
missing_values_after


np.int64(0)

In [None]:
df_cleaned.isnull().sum()

Unnamed: 0,0
Published Date,0
Title,0
Text,0
URL,0
Site,0
Credibility Score,0
Cleaned_Text,0


In [None]:
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Download VADER sentiment analyzer
nltk.download('vader_lexicon')

# Load the cleaned dataset (Upload to Google Colab first)
file_path = "PG_Stock_News_Data_Final.csv"  # Update path if needed
df = pd.read_csv(file_path)

# Initialize sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Function to compute sentiment score
def get_sentiment_score(text):
    if isinstance(text, str):
        return sia.polarity_scores(text)['compound']  # VADER compound score (-1 to 1)
    return 0  # Default neutral score for missing text

# Apply sentiment analysis
df['Sentiment_Score'] = df['Cleaned_Text'].apply(get_sentiment_score)

# Save the dataset with sentiment scores
output_file_path = "PG_Stock_News_Data_With_Sentiment.csv"
df.to_csv(output_file_path, index=False)

print(f"✅ Updated dataset with sentiment scores saved as: {output_file_path}")


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


✅ Updated dataset with sentiment scores saved as: PG_Stock_News_Data_With_Sentiment.csv


In [None]:
# ✅ Create Weighted Sentiment column
df['Weighted_Sentiment'] = df['Sentiment_Score'] * (df['Credibility Score'] / 10)

# Save final dataset
output_file_path = "PG_Stock_News_Data_With_Weighted_Sentiment.csv"
df.to_csv(output_file_path, index=False)

print(f"✅ Updated dataset with Sentiment_Score and Weighted_Sentiment saved as: {output_file_path}")

✅ Updated dataset with Sentiment_Score and Weighted_Sentiment saved as: PG_Stock_News_Data_With_Weighted_Sentiment.csv


In [None]:
import pandas as pd

# Load dataset
file_path = "PG_Stock_News_Data_With_Weighted_Sentiment.csv"
df = pd.read_csv(file_path)

# Convert 'Published Date' to datetime
df['Published Date'] = pd.to_datetime(df['Published Date'], format='%d-%m-%Y', errors='coerce')

# Extract just the date for grouping
df['Date'] = df['Published Date'].dt.date

# Compute average weighted sentiment per day
daily_avg_sentiment = df.groupby('Date')['Weighted_Sentiment'].mean().reset_index()
daily_avg_sentiment.rename(columns={'Weighted_Sentiment': 'Avg_Weighted_Sentiment'}, inplace=True)

# Merge back
df = pd.merge(df, daily_avg_sentiment, on='Date', how='left')

# Calculate directional impact
df['Impact'] = df['Weighted_Sentiment'] - df['Avg_Weighted_Sentiment']

# Label the direction
df['Impact_Direction'] = df['Impact'].apply(lambda x: 'Positive Impact' if x > 0 else 'Negative Impact')

# Get most impactful news per day (by absolute deviation)
most_important_news = df.loc[df.groupby('Date')['Impact'].apply(lambda x: x.abs().idxmax())]

# Drop the 'Date' column as requested
most_important_news.drop(columns=['Date'], inplace=True)

# Save result
output_file_path = "PG_Most_Important_News_Per_Day_With_Direction.csv"
most_important_news.to_csv(output_file_path, index=False)

# Show sample
print(most_important_news[['Published Date', 'Title', 'Impact', 'Impact_Direction', 'Weighted_Sentiment', 'Avg_Weighted_Sentiment']].head())
print(f"✅ Saved as: {output_file_path}")


   Published Date                                              Title  \
0      2020-01-02  P&G to Webcast Discussion of Second Quarter 20...   
5      2020-01-03  Secret Deodorant Casts a Power Line-Up for New...   
8      2020-01-05  Baby Monitoring Takes a Quantum Leap: Lumi by ...   
10     2020-01-07  Buy 5 Blue-Chip Stocks on Best Intra-Day Recov...   
14     2020-01-08  CES Day Two: P&G Spots Future Entrepreneurs, A...   

      Impact Impact_Direction  Weighted_Sentiment  Avg_Weighted_Sentiment  
0  -0.028067  Negative Impact             0.44670                0.474767  
5   0.392267  Positive Impact             0.49960                0.107333  
8  -0.005400  Negative Impact             0.49180                0.497200  
10 -0.762220  Negative Impact            -0.52776                0.234460  
14 -0.000025  Negative Impact             0.49895                0.498975  
✅ Saved as: PG_Most_Important_News_Per_Day_With_Direction.csv


# TESLA

In [None]:
import pandas as pd

# Load the re-uploaded CSV file
file_path = "TSLA_Stock_News_Data.csv"
df_news = pd.read_csv(file_path, encoding='ISO-8859-1')

# Review the unique news sources
unique_sources = df_news['Site'].value_counts().reset_index()
unique_sources.columns = ['Site', 'Article Count']
unique_sources


Unnamed: 0,Site,Article Count
0,fool.com,3058
1,investorplace.com,2415
2,reuters.com,1690
3,benzinga.com,1650
4,marketwatch.com,1419
...,...,...
86,prismmediawire.com,1
87,https://www.proactiveinvestors.co.uk/companies...,1
88,https://www.gurufocus.com,1
89,https://www.businesswire.com,1


In [None]:
df_news.isnull().sum()

Unnamed: 0,0
Published Date,0
Title,0
Text,7
URL,0
Site,25


In [None]:
import pandas as pd

# Load dataset
file_path = "TSLA_Stock_News_Data.csv"
df_news = pd.read_csv(file_path, encoding='ISO-8859-1')

# Fill null values
df_news['Text'] = df_news['Text'].fillna(df_news['Title'])
df_news['Site'] = df_news['Site'].fillna(df_news['URL'].str.extract(r'https?://([^/]+)')[0])

# Save cleaned data to CSV
output_cleaned_path = "TSLA_Stock_News_Data_Cleaned.csv"
df_news.to_csv(output_cleaned_path, index=False)

output_cleaned_path


'TSLA_Stock_News_Data_Cleaned.csv'

In [None]:
import pandas as pd

# Load the dataset
file_path = "TSLA_Stock_News_Data_Cleaned.csv"
df_news = pd.read_csv(file_path, encoding='ISO-8859-1')

# Fill missing values
df_news['Text'] = df_news['Text'].fillna(df_news['Title'])
df_news['Site'] = df_news['Site'].fillna(df_news['URL'].str.extract(r'https?://([^/]+)')[0])

# Assign credibility scores
credibility_scores = {
    'fool.com': 7,
    'zacks.com': 8,
    'investorplace.com': 6,
    'cnbc.com': 9,
    'seekingalpha.com': 6,
    'marketwatch.com': 8,
    'finance.yahoo.com': 8,
    'yahoo.com': 7,
    'reuters.com': 10,
    'bloomberg.com': 9,
    'businessinsider.com': 7,
    'thestreet.com': 7,
    '247wallst.com': 5,
    'forbes.com': 8,
    'money.cnn.com': 8,
    'barrons.com': 9,
    'nasdaq.com': 8,
    'wsj.com': 9
}
df_news['Credibility Score'] = df_news['Site'].map(credibility_scores).fillna(5)

# Save the updated DataFrame to a new CSV file (credibility only)
output_path = "TSLA_Stock_News_Data_With_Credibility.csv"
df_news.to_csv(output_path, index=False)

# Display first few rows
print(df_news.head())


  Published Date                                              Title  \
0     01-01-2020  US traffic safety agency launches probe of fat...   
1     01-01-2020  Tesla was on Autopilot in California crash whi...   
2     01-01-2020     Tesla: Just The Price Charts, None Of The Hype   
3     02-01-2020  Tesla says will start delivering China-made Mo...   
4     02-01-2020  Tesla To Begin Delivering China-Made Model 3 T...   

                                                Text  \
0  NHTSA, the US traffic safety agency, launches ...   
1  The US National Highway Traffic Safety Adminis...   
2  Tesla is one of those once-in-a-generation sto...   
3  Tesla Inc will deliver its first Chinese made ...   
4  Tesla Inc (NASDAQ: TSLA) said it would start d...   

                                                 URL             Site  \
0  https://www.cnbc.com/2020/01/01/nhtsa-will-pro...         cnbc.com   
1  https://www.theguardian.com/technology/2020/ja...  theguardian.com   
2  https://www.fo

In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab') # Download the punkt_tab data package

# Load dataset from Colab (ensure the file is uploaded first)
df = pd.read_csv('TSLA_Stock_News_Data_With_Credibility.csv', encoding='latin-1')

# Check dataset columns
print("Dataset Columns:", df.columns)

# Specify the column containing the text to clean
text_column = 'Text'  # Update if the actual column name differs

# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

# Function to clean text
def clean_text(text):
    if isinstance(text, str):
        text = text.lower()  # Convert to lowercase
        text = re.sub(r'\[.*?\]', '', text)  # Remove text inside brackets
        text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
        text = re.sub(r'<.*?>+', '', text)  # Remove HTML tags
        text = re.sub(r'[^a-z\s]', '', text)  # Keep only alphabets
        text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces

        # Tokenization
        tokens = word_tokenize(text)

        # Remove stopwords and lemmatize
        cleaned_text = " ".join(lemmatizer.lemmatize(word) for word in tokens if word not in stop_words)
        return cleaned_text
    return ""

# Apply cleaning function to the "Text" column
df['Cleaned_Text'] = df[text_column].apply(clean_text)

# Save cleaned data
cleaned_file_path = "TSLA_Stock_News_Data_Cleaned_text.csv"
df.to_csv(cleaned_file_path, index=False)

print(f"Cleaned dataset saved as: {cleaned_file_path}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Dataset Columns: Index(['Published Date', 'Title', 'Text', 'URL', 'Site', 'Credibility Score'], dtype='object')
Cleaned dataset saved as: TSLA_Stock_News_Data_Cleaned_text.csv


In [None]:
import pandas as pd
import numpy as np

df = pd.read_csv('TSLA_Stock_News_Data_Cleaned_text.csv', encoding='latin-1')
df.head()

Unnamed: 0,Published Date,Title,Text,URL,Site,Credibility Score,Cleaned_Text
0,01-01-2020,US traffic safety agency launches probe of fat...,"NHTSA, the US traffic safety agency, launches ...",https://www.cnbc.com/2020/01/01/nhtsa-will-pro...,cnbc.com,9.0,nhtsa u traffic safety agency launch probe tes...
1,01-01-2020,Tesla was on Autopilot in California crash whi...,The US National Highway Traffic Safety Adminis...,https://www.theguardian.com/technology/2020/ja...,theguardian.com,5.0,u national highway traffic safety administrati...
2,01-01-2020,"Tesla: Just The Price Charts, None Of The Hype",Tesla is one of those once-in-a-generation sto...,https://www.forbes.com/sites/johnnavin/2020/01...,forbes.com,8.0,tesla one onceinageneration stock drive people...
3,02-01-2020,Tesla says will start delivering China-made Mo...,Tesla Inc will deliver its first Chinese made ...,https://www.reuters.com/article/us-tesla-china...,reuters.com,10.0,tesla inc deliver first chinese made model sed...
4,02-01-2020,Tesla To Begin Delivering China-Made Model 3 T...,Tesla Inc (NASDAQ: TSLA) said it would start d...,https://www.benzinga.com/news/20/01/15059892/t...,benzinga.com,5.0,tesla inc nasdaq tsla said would start deliver...


In [None]:
df.isnull().sum()

Unnamed: 0,0
Published Date,0
Title,0
Text,0
URL,0
Site,0
Credibility Score,0
Cleaned_Text,2


In [None]:
# Remove rows where 'Cleaned_Text' has missing values
df_cleaned = df.dropna(subset=['Cleaned_Text']).reset_index(drop=True)

# Save the cleaned dataset without missing values
cleaned_file_path = "TSLA_Stock_News_Data_Final.csv"
df_cleaned.to_csv(cleaned_file_path, index=False)

# Confirm the number of missing values after removal
missing_values_after = df_cleaned['Cleaned_Text'].isnull().sum()
missing_values_after


np.int64(0)

In [None]:
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Download VADER sentiment analyzer
nltk.download('vader_lexicon')

# Load the cleaned dataset (Upload to Google Colab first)
file_path = "TSLA_Stock_News_Data_Final.csv"  # Update path if needed
df = pd.read_csv(file_path)

# Initialize sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Function to compute sentiment score
def get_sentiment_score(text):
    if isinstance(text, str):
        return sia.polarity_scores(text)['compound']  # VADER compound score (-1 to 1)
    return 0  # Default neutral score for missing text

# Apply sentiment analysis
df['Sentiment_Score'] = df['Cleaned_Text'].apply(get_sentiment_score)

# Save the dataset with sentiment scores
output_file_path = "TSLA_Stock_News_Data_With_Sentiment.csv"
df.to_csv(output_file_path, index=False)

print(f"✅ Updated dataset with sentiment scores saved as: {output_file_path}")


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


✅ Updated dataset with sentiment scores saved as: TSLA_Stock_News_Data_With_Sentiment.csv


In [None]:
# ✅ Create Weighted Sentiment column
df['Weighted_Sentiment'] = df['Sentiment_Score'] * (df['Credibility Score'] / 10)

# Save final dataset
output_file_path = "TSLA_Stock_News_Data_With_Weighted_Sentiment.csv"
df.to_csv(output_file_path, index=False)

print(f"✅ Updated dataset with Sentiment_Score and Weighted_Sentiment saved as: {output_file_path}")

✅ Updated dataset with Sentiment_Score and Weighted_Sentiment saved as: TSLA_Stock_News_Data_With_Weighted_Sentiment.csv


In [None]:
import pandas as pd

# Load dataset
file_path = "TSLA_Stock_News_Data_With_Weighted_Sentiment.csv"
df = pd.read_csv(file_path)

# Convert 'Published Date' to datetime
df['Published Date'] = pd.to_datetime(df['Published Date'], format='%d-%m-%Y', errors='coerce')

# Extract just the date for grouping
df['Date'] = df['Published Date'].dt.date

# Compute average weighted sentiment per day
daily_avg_sentiment = df.groupby('Date')['Weighted_Sentiment'].mean().reset_index()
daily_avg_sentiment.rename(columns={'Weighted_Sentiment': 'Avg_Weighted_Sentiment'}, inplace=True)

# Merge back
df = pd.merge(df, daily_avg_sentiment, on='Date', how='left')

# Calculate directional impact
df['Impact'] = df['Weighted_Sentiment'] - df['Avg_Weighted_Sentiment']

# Label the direction
df['Impact_Direction'] = df['Impact'].apply(lambda x: 'Positive Impact' if x > 0 else 'Negative Impact')

# Get most impactful news per day (by absolute deviation)
most_important_news = df.loc[df.groupby('Date')['Impact'].apply(lambda x: x.abs().idxmax())]

# Drop the 'Date' column as requested
most_important_news.drop(columns=['Date'], inplace=True)

# Save result
output_file_path = "TSLA_Most_Important_News_Per_Day_With_Direction.csv"
most_important_news.to_csv(output_file_path, index=False)

# Show sample
print(most_important_news[['Published Date', 'Title', 'Impact', 'Impact_Direction', 'Weighted_Sentiment', 'Avg_Weighted_Sentiment']].head())
print(f"✅ Saved as: {output_file_path}")


   Published Date                                              Title  \
0      2020-01-01  US traffic safety agency launches probe of fat...   
12     2020-01-02  Tesla cuts starting price for China-made Model...   
33     2020-01-03  Tesla rally leaves short sellers down $3 billi...   
38     2020-01-04           Tesla short sellers have lost $8 billion   
39     2020-01-05  Should Tesla Take The Initiative To Better Mon...   

      Impact Impact_Direction  Weighted_Sentiment  Avg_Weighted_Sentiment  
0  -0.195203  Negative Impact            -0.59373               -0.398527  
12 -0.341490  Negative Impact            -0.27320                0.068290  
33 -0.772735  Negative Impact            -0.67050                0.102235  
38  0.185800  Positive Impact             0.27870                0.092900  
39  0.411720  Positive Impact             0.74400                0.332280  
✅ Saved as: TSLA_Most_Important_News_Per_Day_With_Direction.csv


In [None]:
import yfinance as yf

# Define the ticker symbol and date range
ticker = 'GSPC'
start_date = '2020-01-01'
end_date = '2025-03-31'

# Fetch the data
netflix_data = yf.download(ticker, start=start_date, end=end_date)

# Save to CSV
netflix_data.to_csv('S&P500_Stock_Data_2020_to_2025.csv')


ERROR:yfinance:404 Client Error: Not Found for url: https://query2.finance.yahoo.com/v10/finance/quoteSummary/GSPC?modules=financialData%2CquoteType%2CdefaultKeyStatistics%2CassetProfile%2CsummaryDetail&corsDomain=finance.yahoo.com&formatted=false&symbol=GSPC&crumb=di0Mdse.zEw
[*********************100%***********************]  1 of 1 completed
ERROR:yfinance:
1 Failed download:
ERROR:yfinance:['GSPC']: AttributeError("'NoneType' object has no attribute 'update'")


In [None]:
import yfinance as yf

# Define the ticker symbol for S&P 500 and date range
ticker = "^GSPC"
start_date = "2020-01-01"
end_date = "2025-03-31"
import yfinance as yf

# Define the ticker symbol for S&P 500 and date range
ticker = "^GSPC"
start_date = "2020-01-01"
end_date = "2025-03-31"

# Fetch the data
sp500_data = yf.download(ticker, start=start_date, end=end_date, interval='1mo')
df = sp500_data[['Close']].reset_index()

# Save to CSV
file_path = "SP500_Closing_Price_2020_to_2025.csv"
sp500_data.to_csv(file_path)

file_path
# Save to CSV
file_path = "SP500_Closing_Price_2020_to_2025.csv"
sp500_data.to_csv(file_path)

file_path


[*********************100%***********************]  1 of 1 completed


'SP500_Closing_Price_2020_to_2025.csv'

In [None]:
!pip install yfinance --quiet

import yfinance as yf
import pandas as pd

# Download monthly close prices for Netflix
start_date = '2020-01-01'
end_date = '2025-03-31'
symbol = 'GSPC'

df = yf.download(symbol, start=start_date, end=end_date, interval='1mo')
df = df[['Close']].reset_index()

# Adjust the date to the 1st of each month
df['Date'] = df['Date'].dt.to_period('M').dt.to_timestamp('M') + pd.offsets.Day(1)

# Save to CSV
df[['Date', 'Close']].to_csv("SP500_Monthly_Close.csv", index=False)
print("✅ Netflix monthly close prices saved as: Netflix_Monthly_Close.csv")


[*********************100%***********************]  1 of 1 completed
ERROR:yfinance:
1 Failed download:
ERROR:yfinance:['GSPC']: YFTzMissingError('possibly delisted; no timezone found')


✅ Netflix monthly close prices saved as: Netflix_Monthly_Close.csv
