# Preprocessing

## How to deal with missing values

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Read the merged CSV into a DataFrame; the relative path is resolved against
# the kernel's current working directory.
df = pd.read_csv(filepath_or_buffer="merged_data.csv")

# Preview just the first two rows instead of rendering the whole frame
df.head(n=2)


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Ticker,Sentiment_Score,New_Covid_Cases,MarketCap
0,2013-04-30,32.560001,33.110001,32.279999,33.099998,27.390682,75165200,MSFT,5.6,,2396829515776
1,2013-05-01,32.93,33.080002,32.599998,32.720001,27.076227,54330900,MSFT,0.04,,2396829515776


In [2]:
# Per-column count of missing entries: isna() flags each missing cell and
# sum() tallies the flags column by column.
missing_values = df.isna().sum()

missing_values


Date                    0
Open                    0
High                    0
Low                     0
Close                   0
Adj Close               0
Volume                  0
Ticker                  0
Sentiment_Score     30397
New_Covid_Cases    150833
MarketCap               0
dtype: int64

- Sentiment_Score: 30,397 missing values
- New_Covid_Cases: 150,833 missing values

In [None]:
# Count the rows with a missing 'Sentiment_Score' for each ticker.
# The frame loaded at the top of the notebook is named `df`; no `data` variable
# is defined anywhere, so the filter must be applied to `df`.
missing_values_by_ticker = df[df['Sentiment_Score'].isna()].groupby('Ticker').size()

# Only tickers with at least one missing score show up in this Series
missing_values_by_ticker.head()


In [3]:
# Replace missing values in 'New_Covid_Cases' with 0.
# NOTE(review): presumably the gaps are dates without reported cases, which is
# why 0 is used as the fill value - confirm against the data source.
# The filled column is assigned back explicitly: calling
# fillna(..., inplace=True) on df['New_Covid_Cases'] operates on an intermediate
# object, is deprecated in pandas 2.x and leaves `df` unchanged under
# Copy-on-Write.
df = df.assign(New_Covid_Cases=df['New_Covid_Cases'].fillna(0))

# Recount the missing values per column to verify the fill took effect
missing_values_updated = df.isnull().sum()

missing_values_updated


Date                   0
Open                   0
High                   0
Low                    0
Close                  0
Adj Close              0
Volume                 0
Ticker                 0
Sentiment_Score    30397
New_Covid_Cases        0
MarketCap              0
dtype: int64

In [4]:
# Order tickers by their number of missing 'Sentiment_Score' rows (largest
# first) and keep the first 20 entries.
top_20_missing = missing_values_by_ticker.sort_values(ascending=False).iloc[:20]

# Horizontal bar chart drawn on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(
    x=top_20_missing.values,
    y=top_20_missing.index,
    palette="viridis",
    ax=ax,
)
ax.set_title('Top 20 Tickers with Most Missing Values in Sentiment_Score')
ax.set_xlabel('Number of Missing Values')
ax.set_ylabel('Ticker')
fig.tight_layout()

# Write the chart to disk before showing it inline
fig.savefig("Top20_missing_values.png", format='png')
plt.show()

NameError: name 'missing_values_by_ticker' is not defined

In [11]:
# Look up the missing 'Sentiment_Score' counts for "PEP" and "ATVI" by label.
# A KeyError here would mean one of them is not among the top 20 tickers.
pep_atvi_missing = top_20_missing.loc[["PEP", "ATVI"]]

pep_atvi_missing


Ticker
PEP     2518
ATVI    2518
dtype: int64

In [13]:
# Drop every row whose Ticker is "PEP" or "ATVI".
# .copy() makes the filtered result an independent frame, so later cells can
# modify it without pandas raising SettingWithCopyWarning.
df = df[~df['Ticker'].isin(["PEP", "ATVI"])].copy()

# Recount the missing sentiment scores per ticker on the filtered frame
remaining_missing_values = df[df['Sentiment_Score'].isnull()].groupby('Ticker').size()

# Confirm that neither ticker is still listed among those with missing scores
"PEP" in remaining_missing_values.index, "ATVI" in remaining_missing_values.index


(False, False)

In [14]:
# Replace the remaining missing values in 'Sentiment_Score' with 0.
# NOTE(review): 0 is also a possible real score, so filled rows cannot be told
# apart from genuine zero scores afterwards - confirm this is intended.
# The filled column is assigned back explicitly: calling
# fillna(..., inplace=True) on df['Sentiment_Score'] operates on an intermediate
# object taken from a filtered frame, which is deprecated in pandas 2.x and
# leaves `df` unchanged under Copy-on-Write.
df = df.assign(Sentiment_Score=df['Sentiment_Score'].fillna(0))

# Recount the missing values per column to verify nothing is left unfilled
remaining_missing_values_updated = df.isnull().sum()

remaining_missing_values_updated


Date               0
Open               0
High               0
Low                0
Close              0
Adj Close          0
Volume             0
Ticker             0
Sentiment_Score    0
New_Covid_Cases    0
MarketCap          0
dtype: int64

In [15]:
# Persist the cleaned frame; index=False keeps the row index out of the file
df.to_csv(path_or_buf="After_preprocessing.csv", index=False)

In [19]:
# Distinct ticker symbols left in the frame, in order of first appearance
df.loc[:, "Ticker"].unique()

array(['MSFT', 'AAPL', 'NVDA', 'AMZN', 'META', 'TSLA', 'GOOGL', 'GOOG',
       'AVGO', 'COST', 'ADBE', 'CSCO', 'NFLX', 'AMD', 'CMCSA', 'TMUS',
       'TXN', 'INTC', 'HON', 'INTU', 'QCOM', 'ISRG', 'AMGN', 'AMAT',
       'SBUX', 'BKNG', 'ADI', 'MDLZ', 'GILD', 'ADP', 'VRTX', 'LRCX',
       'PYPL', 'REGN', 'PANW', 'MU', 'CSX', 'SNPS', 'KLAC', 'ASML',
       'CDNS', 'FTNT', 'ORLY', 'MNST', 'MAR', 'CHTR', 'MELI', 'ABNB',
       'NXPI', 'MRVL', 'DXCM', 'CTAS', 'MCHP', 'MRNA', 'LULU', 'ADSK',
       'PDD', 'WDAY', 'PCAR', 'AEP', 'KDP', 'KHC', 'IDXX', 'CPRT', 'PAYX',
       'ON', 'EXC', 'ODFL', 'BIIB', 'AZN', 'ROST', 'GEHC', 'EA', 'SGEN',
       'CSGP', 'GFS', 'XEL', 'BKR', 'CTSH', 'FAST', 'VRSK', 'CRWD',
       'DLTR', 'WBD', 'DDOG', 'CEG', 'ILMN', 'ANSS', 'ALGN', 'TEAM',
       'WBA', 'FANG', 'ENPH', 'EBAY', 'ZS', 'SIRI', 'ZM', 'JD', 'LCID'],
      dtype=object)