Possible datasets
 - https://data.world/crowdflower/apple-twitter-sentiment
 - https://paperswithcode.com/dataset/stocknet-1
 - https://www.kaggle.com/datasets/equinxx/stock-tweets-for-sentiment-analysis-and-prediction
 - https://www.kaggle.com/datasets/thedevastator/tweet-sentiment-s-impact-on-stock-returns
 - https://ieee-dataport.org/open-access/stock-market-tweets-data
 - https://www.kaggle.com/datasets/yash612/stockmarket-sentiment-dataset
 - https://www.kaggle.com/datasets/kazanova/sentiment140

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from multiprocessing import Pool
import multiprocessing

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, accuracy_score, f1_score

import kaggle

## Configuration and data loading

In [2]:
# Notebook-wide configuration constants.
RANDOM_STATE = 2023  # NOTE(review): presumably the experiment seed; not referenced by the cells visible here
DATASET_ENCODING = "ISO-8859-1"  # encoding handed to pd.read_csv when loading the CSV files
TRAIN_SIZE = 0.8  # NOTE(review): presumably the train fraction; confirm where it is consumed
# Raw string so that `\S` reaches the regex engine as written instead of being an
# invalid string escape (SyntaxWarning on Python 3.12+). The pattern text itself is
# unchanged: @-prefixed tokens, http(s): tokens, and runs of non-alphanumerics.
TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"
SEQUENCE_LENGTH = 300
CURRENT_DIRECTORY = os.getcwd()  # working directory at the time this cell runs
W2V_SIZE = 300  # W2V_* look like Word2Vec hyperparameters for later cells - TODO confirm
NUM_CORE = multiprocessing.cpu_count()  # number of CPUs reported by the OS
W2V_WINDOW = 7
W2V_EPOCH = 32
W2V_MIN_COUNT = 10


In [3]:
# Kaggle datasets this notebook relies on, one record per CSV file:
#   name     - CSV file expected inside `location`
#   api      - Kaggle "<owner>/<dataset>" slug handed to the download call
#   location - local directory the file is looked up in / downloaded to
#   url      - dataset page: the Kaggle datasets base URL followed by the slug
DATASETS = [
    {
        'name': csv_name,
        'api': slug,
        'location': folder,
        'url': "https://www.kaggle.com/datasets/" + slug,
    }
    for csv_name, slug, folder in (
        (
            "training.1600000.processed.noemoticon.csv",
            "kazanova/sentiment140",
            "data",
        ),
        (
            "reddit_wsb.csv",
            "gpreda/reddit-wallstreetsbets-posts",
            "data",
        ),
        (
            "stock_data.csv",
            "yash612/stockmarket-sentiment-dataset",
            "data",
        ),
        (
            "stock_tweets.csv",
            "equinxx/stock-tweets-for-sentiment-analysis-and-prediction",
            "data/unorganized/Stock Tweets for Sentiment Analysis and Prediction",
        ),
        (
            "Company_Tweet.csv",
            "omermetinn/tweets-about-the-top-companies-from-2015-to-2020",
            "data/unorganized/Tweets about the Top Companies from 2015 to 2020",
        ),
        (
            "stockerbot-export.csv",
            "davidwallach/financial-tweets",
            "data/unorganized/Financial Tweets",
        ),
    )
]

# Fetch every dataset whose CSV is not already present locally.
for dataset_info in DATASETS:
    dataset_name, dataset_location = dataset_info['name'], dataset_info['location']

    # Skip the download when the expected file is already on disk.
    if os.path.exists(os.path.join(dataset_location, dataset_name)):
        continue

    print(f"Downloading {dataset_name} from {dataset_info['url']} to {dataset_location}...")
    kaggle.api.dataset_download_files(dataset_info['api'], path=dataset_location, unzip=True)


In [4]:
# File name and column names for each CSV: '0' is the training corpus, '1' the
# stock-market set that is loaded as `test_df`.
dataset_filename = {
    '0': ("training.1600000.processed.noemoticon.csv", ["target", "ids", "date", "flag", "user", "text"]),
    '1': ("stock_data.csv", ["text", "target"])
}

# Training corpus: column names are supplied explicitly via `names`.
train_file, train_columns = dataset_filename["0"]
dataset_path = os.path.join("", "data", train_file)
df = pd.read_csv(dataset_path, names=train_columns, encoding=DATASET_ENCODING)
print(df['target'].value_counts())

# Stock-market set: skip its first row and apply our own column names instead.
test_file, test_columns = dataset_filename["1"]
test_dataset_path = os.path.join("", "data", test_file)
test_df = pd.read_csv(test_dataset_path, names=test_columns, skiprows=1, encoding=DATASET_ENCODING)
# Re-code the labels (-1 -> 0, 1 -> 4) so they use the same values as df['target'].
test_df['target'] = test_df['target'].replace({-1: 0, 1: 4})
print(test_df['target'].value_counts())

target
0    800000
4    800000
Name: count, dtype: int64
target
4    3685
0    2106
Name: count, dtype: int64


In [5]:
# Preview the first ten rows of the training frame.
df.head(10)

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
5,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
6,0,1467811592,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,mybirch,Need a hug
7,0,1467811594,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,coZZ,@LOLTrish hey long time no see! Yes.. Rains a...
8,0,1467811795,Mon Apr 06 22:20:05 PDT 2009,NO_QUERY,2Hood4Hollywood,@Tatiana_K nope they didn't have it
9,0,1467812025,Mon Apr 06 22:20:09 PDT 2009,NO_QUERY,mimismo,@twittera que me muera ?


In [6]:
# Preview the first ten rows of the stock-market frame.
test_df.head(10)

Unnamed: 0,text,target
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,4
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,4
2,user I'd be afraid to short AMZN - they are lo...,4
3,MNTA Over 12.00,4
4,OI Over 21.37,4
5,PGNX Over 3.04,4
6,AAP - user if so then the current downtrend wi...,0
7,Monday's relative weakness. NYX WIN TIE TAP IC...,0
8,GOOG - ower trend line channel test & volume s...,4
9,AAP will watch tomorrow for ONG entry.,4


## 1. Naive Bayes classifier

In [7]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Lowercase the tweet text in place; every later use of df['text'] sees the lowercased form.
df['text'] = df['text'].str.lower()

# Hold out 20% of the rows for evaluation, with a fixed seed so the split is repeatable.
X, y = df['text'], df['target']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# TF-IDF features: the vocabulary (at most 5000 terms) is fitted on the training
# split only and then reused unchanged for the held-out split.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Multinomial Naive Bayes on the TF-IDF matrix; fit() returns the fitted estimator.
clf = MultinomialNB().fit(X_train_tfidf, y_train)
y_pred = clf.predict(X_test_tfidf)

# Score the held-out predictions.
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Classification Report:\n", classification_rep)


Accuracy: 76.89%
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.78      0.77    159494
           4       0.78      0.76      0.77    160506

    accuracy                           0.77    320000
   macro avg       0.77      0.77      0.77    320000
weighted avg       0.77      0.77      0.77    320000



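Evaluate the trained classifier on the separate stock-market sentiment dataset (`stock_data.csv`, loaded above as `test_df`), which was not used for training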

In [8]:
# Lowercase the stock-market texts in place, as was done for the training text.
test_df['text'] = test_df['text'].str.lower()

# Reuse the already-fitted vectoriser and classifier; nothing is re-fitted here.
# Note: this rebinds X_test_tfidf, accuracy and classification_rep.
X_test_tfidf = tfidf_vectorizer.transform(test_df['text'])
y_test_pred = clf.predict(X_test_tfidf)

# Compare the predictions with the re-coded labels stored in test_df['target'].
external_labels = test_df['target']
accuracy = accuracy_score(external_labels, y_test_pred)
classification_rep = classification_report(external_labels, y_test_pred)

print("Accuracy on the test dataset: {:.2f}%".format(accuracy * 100))
print("Classification Report on the test dataset:\n", classification_rep)


Accuracy on the test dataset: 54.96%
Classification Report on the test dataset:
               precision    recall  f1-score   support

           0       0.41      0.54      0.47      2106
           4       0.68      0.55      0.61      3685

    accuracy                           0.55      5791
   macro avg       0.54      0.55      0.54      5791
weighted avg       0.58      0.55      0.56      5791

