In [15]:
# Import statements
import numpy as np
import pandas as pd
from google.colab import files
uploaded = files.upload()

# Load in the data as a pandas dataframe
data = pd.read_csv('stock_news.csv', index_col=0)
data.head()

Unnamed: 0,headline,label
0,"Markets Close Mostly Lower Again; ROST, PANW R...",Negative
1,"Gap plummets on earnings miss, cuts full-year ...",Negative
2,Billionaire Ken Fisher is Selling These 10 Stocks,Negative
3,"Corning net income drops 13%, shares fall",Negative
4,Internet Explorer shutdown to cause Japan prob...,Negative


In [16]:
# Fill any missing headlines with an empty string
data['headline'] = data['headline'].fillna('')

# View unique labels
print("Unique labels:", data['label'].unique())

# Map string labels to integers
label_mapping = {'Negative': 0, 'Neutral': 1, 'Positive': 2}
data['label'] = data['label'].map(label_mapping)

# Drop any rows with labels that couldn't be mapped (e.g., typos)
data = data.dropna(subset=['label'])

# Convert label to integer type
data['label'] = data['label'].astype(int)

Unique labels: ['Negative' 'Neutral' 'Positive']


In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Doing TFIDF first, initialize
vectorizer = TfidfVectorizer(stop_words='english')
X_tfidf = vectorizer.fit_transform(data['headline'])

# Get the target labels
y = data['label']


In [18]:
from sklearn.model_selection import train_test_split

# Split the data into training and test sets (80% train, 20% test) for TFIDF extraction
X_tfidf_train, X_tfidf_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)


In [19]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

# Create a Naive Bayes model for TFIDF
nb_tfidf = GaussianNB()

# Train the model and predict
nb_tfidf.fit(X_tfidf_train.toarray(), y_train)
y_pred_nb_tfidf = nb_tfidf.predict(X_tfidf_test.toarray())

# Get accuracy
nb_tfidf_accuracy = accuracy_score(y_test, y_pred_nb_tfidf)
print(f"Naive Bayes TFIDF Accuracy: {nb_tfidf_accuracy:.4f}")

Naive Bayes TFIDF Accuracy: 0.4067


In [20]:
from sklearn.linear_model import LogisticRegression

# Create a Logistic Regression model for TFIDF
lr_tfidf = LogisticRegression(max_iter=100000)

# Train the model and predict
lr_tfidf.fit(X_tfidf_train, y_train)
y_pred_lr_tfidf = lr_tfidf.predict(X_tfidf_test)

# Get accuracy
lr_tfidf_accuracy = accuracy_score(y_test, y_pred_lr_tfidf)
print(f"Logistic Regression TFIDF Accuracy: {lr_tfidf_accuracy:.4f}")

Logistic Regression TFIDF Accuracy: 0.6998


In [21]:
from sklearn.ensemble import RandomForestClassifier

# Create a Random Forest model for TFIDF
rfc_tfidf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model and predict
rfc_tfidf.fit(X_tfidf_train, y_train)
y_pred_rfc_tfidf = rfc_tfidf.predict(X_tfidf_test)

# Get accuracy
rfc_tfidf_accuracy = accuracy_score(y_test, y_pred_rfc_tfidf)
print(f"Random Forest TFIDF Accuracy: {rfc_tfidf_accuracy:.4f}")

Random Forest TFIDF Accuracy: 0.6960


In [22]:
from sklearn.svm import LinearSVC

# Create a SVM model for TFIDF
svm_tfidf = LinearSVC()

# Train the model and predict
svm_tfidf.fit(X_tfidf_train, y_train)
y_pred_svm_tfidf = svm_tfidf.predict(X_tfidf_test)

# Get accuracy
svm_tfidf_accuracy = accuracy_score(y_test, y_pred_svm_tfidf)
print(f"SVM TFIDF Accuracy: {svm_tfidf_accuracy:.4f}")

SVM TFIDF Accuracy: 0.7031


In [23]:
from sklearn.feature_extraction.text import CountVectorizer

# Now do it for BoW, initialize
vectorizer_bow = CountVectorizer(stop_words='english')
X_bow = vectorizer_bow.fit_transform(data['headline'])

# Split the data into training and test sets (80% train, 20% test) for BoW extraction
X_bow_train, X_bow_test, y_train, y_test = train_test_split(X_bow, y, test_size=0.2, random_state=42)

In [24]:
# Create a Naive Bayes model for BoW
nb_bow = GaussianNB()

# Train the model and predict
nb_bow.fit(X_bow_train.toarray(), y_train)
y_pred_nb_bow = nb_bow.predict(X_bow_test.toarray())

# Get accuracy
nb_bow_accuracy = accuracy_score(y_test, y_pred_nb_bow)
print(f"Naive Bayes BoW Accuracy: {nb_bow_accuracy:.4f}")

Naive Bayes BoW Accuracy: 0.4106


In [25]:
# Create a Logistic Regression model for BoW
lr_bow = LogisticRegression(max_iter=100000)

# Train the model and predict
lr_bow.fit(X_bow_train, y_train)
y_pred_lr_bow = lr_bow.predict(X_bow_test)

# Get accuracy
lr_bow_accuracy = accuracy_score(y_test, y_pred_lr_bow)
print(f"Logistic Regression BoW Accuracy: {lr_bow_accuracy:.4f}")

Logistic Regression BoW Accuracy: 0.7023


In [26]:
# Create a Random Forest model for BoW
rfc_bow = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model and predict
rfc_bow.fit(X_bow_train, y_train)
y_pred_rfc_bow = rfc_bow.predict(X_bow_test)

# Get accuracy
rfc_bow_accuracy = accuracy_score(y_test, y_pred_rfc_bow)
print(f"Random Forest BoW Accuracy: {rfc_bow_accuracy:.4f}")

Random Forest BoW Accuracy: 0.7087


In [27]:
# Create a SVM model for BoW
svm_bow = LinearSVC()

# Train the model and predict
svm_bow.fit(X_bow_train, y_train)
y_pred_svm_bow = svm_bow.predict(X_bow_test)

# Get accuracy
svm_bow_accuracy = accuracy_score(y_test, y_pred_svm_bow)
print(f"SVM BoW Accuracy: {svm_bow_accuracy:.4f}")

SVM BoW Accuracy: 0.6885
