In [46]:
# Import statements
import numpy as np
import pandas as pd
from google.colab import files

# Load in the data as a pandas dataframe
data = pd.read_csv('stock_news.csv', index_col=0)
data.head()

Unnamed: 0,headline,label
0,"Markets Close Mostly Lower Again; ROST, PANW R...",Negative
1,"Gap plummets on earnings miss, cuts full-year ...",Negative
2,Billionaire Ken Fisher is Selling These 10 Stocks,Negative
3,"Corning net income drops 13%, shares fall",Negative
4,Internet Explorer shutdown to cause Japan prob...,Negative


In [47]:
# Fill any missing headlines with an empty string
data['headline'] = data['headline'].fillna('')

# View unique labels
print("Unique labels:", data['label'].unique())

# Map string labels to integers
label_mapping = {'Negative': 0, 'Neutral': 1, 'Positive': 2}
data['label'] = data['label'].map(label_mapping)

# Drop any rows with labels that couldn't be mapped (e.g., typos)
data = data.dropna(subset=['label'])

# Convert label to integer type
data['label'] = data['label'].astype(int)

Unique labels: ['Negative' 'Neutral' 'Positive']


In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Use TfidfVectorizer to convert headlines into numeric features
vectorizer = TfidfVectorizer(stop_words='english')

# Fit and transform the headlines into a feature matrix
X_tfidf = vectorizer.fit_transform(data['headline'])

# Get the target labels
y = data['label']


In [49]:
from sklearn.model_selection import train_test_split

# Split the data into training and test sets (80% train, 20% test)
X_tfidf_train, X_tfidf_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)


In [57]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

# Create a Naive Bayes model
nb_tfidf = GaussianNB()

# Train the model
nb_tfidf.fit(X_tfidf_train.toarray(), y_train)

# Predict on the test set
y_pred_nb_tfidf = nb_tfidf.predict(X_tfidf_test.toarray())

# Evaluate the model
nb_tfidf_accuracy = accuracy_score(y_test, y_pred_nb_tfidf)
print(f"Naive Bayes Accuracy: {nb_tfidf_accuracy:.4f}")

Naive Bayes Accuracy: 0.4067


In [58]:
from sklearn.linear_model import LogisticRegression

# Create a Logistic Regression model
lr_tfidf = LogisticRegression(max_iter=100000)

# Train the model
lr_tfidf.fit(X_tfidf_train, y_train)

# Predict on the test set
y_pred_lr_tfidf = lr_tfidf.predict(X_tfidf_test)

# Evaluate the model
lr_tfidf_accuracy = accuracy_score(y_test, y_pred_lr_tfidf)
print(f"Logistic Regression Accuracy: {lr_tfidf_accuracy:.4f}")

Logistic Regression Accuracy: 0.6998


In [59]:
from sklearn.ensemble import RandomForestClassifier

# Create a Random Forest model
rfc_tfidf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rfc_tfidf.fit(X_tfidf_train, y_train)

# Predict on the test set
y_pred_rfc_tfidf = rfc_tfidf.predict(X_tfidf_test)

# Evaluate the model
rfc_tfidf_accuracy = accuracy_score(y_test, y_pred_rfc_tfidf)
print(f"Random Forest Accuracy: {rfc_tfidf_accuracy:.4f}")

Random Forest Accuracy: 0.6960


In [53]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the CountVectorizer (BoW)
vectorizer_bow = CountVectorizer(stop_words='english')

# Fit and transform the headlines into a feature matrix
X_bow = vectorizer_bow.fit_transform(data['headline'])

# Split the data into training and test sets (80% train, 20% test)
X_bow_train, X_bow_test, y_train, y_test = train_test_split(X_bow, y, test_size=0.2, random_state=42)

In [60]:
# Create a Naive Bayes model
nb_bow = GaussianNB()

# Train the model
nb_bow.fit(X_bow_train.toarray(), y_train)

# Predict on the test set
y_pred_nb_bow = nb_bow.predict(X_bow_test.toarray())

# Evaluate the model
nb_bow_accuracy = accuracy_score(y_test, y_pred_nb_bow)
print(f"Naive Bayes Accuracy: {nb_bow_accuracy:.4f}")

Naive Bayes Accuracy: 0.4106


In [61]:
# Create a Logistic Regression model
lr_bow = LogisticRegression(max_iter=100000)

# Train the model
lr_bow.fit(X_bow_train, y_train)

# Predict on the test set
y_pred_lr_bow = lr_bow.predict(X_bow_test)

# Evaluate the model
lr_bow_accuracy = accuracy_score(y_test, y_pred_lr_bow)
print(f"Logistic Regression Accuracy: {lr_bow_accuracy:.4f}")

Logistic Regression Accuracy: 0.7023


In [62]:
# Create a Random Forest model
rfc_bow = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rfc_bow.fit(X_bow_train, y_train)

# Predict on the test set
y_pred_rfc_bow = rfc_bow.predict(X_bow_test)

# Evaluate the model
rfc_bow_accuracy = accuracy_score(y_test, y_pred_rfc_bow)
print(f"Random Forest Accuracy: {rfc_bow_accuracy:.4f}")

Random Forest Accuracy: 0.7087
