# Load and Merge Data

In [7]:
import pandas as pd
import yfinance as yf

# Load the news data (news_data.csv)
news_data = pd.read_csv('news_data.csv')

# Load the stock data (stock_data.csv)
stock_data = pd.read_csv('stock_data.csv')

# Load the covid data (owid-covid-data.csv)
df_covid = pd.read_csv("owid-covid-data.csv")


# Merge stock and news data on "Date" and "Ticker". pd.merge defaults to an
# inner join, so only Date/Ticker pairs present in both files are kept.
merged_data = pd.merge(stock_data, news_data, on=["Date", "Ticker"])



# Keep only the rows whose iso_code is 'USA'
df_covid = df_covid[df_covid['iso_code'] == 'USA']

# Keep only dates up to and including 2023-04-28. This compares the raw
# 'date' strings, which orders correctly only for ISO "YYYY-MM-DD" values
# (assumed for this file -- TODO confirm).
df_covid = df_covid[df_covid["date"] <= "2023-04-28"]

# Drop the Saturday and Sunday rows. The weekday name is computed as a
# standalone Series instead of being written into the filtered frame: it is
# only needed for this mask, and assigning a column to a filtered frame
# triggers pandas' SettingWithCopyWarning.
day_of_week = pd.to_datetime(df_covid['date']).dt.day_name()
df_covid = df_covid[~day_of_week.isin(['Saturday', 'Sunday'])]

# Keep only 'date' and 'new_cases', renaming 'new_cases' to 'New_Covid_Cases'
df_covid = df_covid[['date', 'new_cases']].rename(columns={"new_cases": "New_Covid_Cases"})

# In one chain:
#   1. rename the 'Sent' column to 'Sentiment_Score',
#   2. left-join the covid rows, matching merged_data's "Date" against
#      df_covid's "date" (rows without a match get NaN in the covid columns),
#   3. drop the redundant lowercase "date" key column left behind by the join.
merged_data = (
    merged_data
    .rename(columns={"Sent": "Sentiment_Score"})
    .merge(df_covid, how="left", left_on="Date", right_on="date")
    .drop(columns=["date"])
)

# Unique tickers present in the merged data
tickers = merged_data["Ticker"].unique().tolist()

# Fetch the market cap for each ticker through yfinance. The lookup is
# best-effort: if it fails for a ticker (e.g. the 'marketCap' key is missing),
# None is stored so the ticker still gets a row below. Only Exception is
# caught -- a bare `except:` would also swallow KeyboardInterrupt/SystemExit
# and make the loop impossible to interrupt.
market_caps = {}
for ticker in tickers:
    stock = yf.Ticker(ticker)
    try:
        market_caps[ticker] = stock.info['marketCap']
    except Exception:
        market_caps[ticker] = None

# Convert the market_caps dictionary to a two-column DataFrame
df_market_caps = pd.DataFrame(list(market_caps.items()), columns=["Ticker", "MarketCap"])

# Left-join the market caps onto merged_data by "Ticker" so every existing row is kept
merged_data = pd.merge(merged_data, df_market_caps, on="Ticker", how="left")

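# Uncomment to write the merged dataset to disk; the preprocessing step below reads merged_data.csv
# merged_data.to_csv("merged_data.csv", index=False)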

# Preprocessing

## How to deal with missing values

In [15]:
import pandas as pd

# Load the merged dataset
df = pd.read_csv("merged_data.csv")

# Count the missing values in each column
missing_values = df.isnull().sum()

# Sentiment_Score: 30,397 missing values
# New_Covid_Cases: 150,833 missing values

# Fill missing values in 'New_Covid_Cases' with 0. The result is assigned back
# rather than calling fillna(inplace=True) on the selected column: that chained
# in-place form is deprecated and does not update df under pandas Copy-on-Write.
df['New_Covid_Cases'] = df['New_Covid_Cases'].fillna(0)

# Drop rows where the Ticker is either "PEP" or "ATVI" because the two companies do not have sentiment score.
# .copy() makes the filtered result an independent frame, so the column
# assignment below does not raise SettingWithCopyWarning.
df = df[~df['Ticker'].isin(["PEP", "ATVI"])].copy()

# Fill missing values in 'Sentiment_Score' with 0 (same assignment form as above)
df['Sentiment_Score'] = df['Sentiment_Score'].fillna(0)

# df.to_csv("After_preprocessing.csv", index=False)