### Import Libraries, Download NLTK Resources, and Load Dataset

In [6]:
import os
import re
import string
import pickle
from pathlib import Path

import pandas as pd
import numpy as np

# NLP & ML
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, roc_auc_score

# Gensim for Word2Vec embeddings
from gensim.models import Word2Vec

# Ensure NLTK resources are available
nltk.download('punkt')
nltk.download('stopwords')

# Load dataset
DATA_PATH = Path('stock_data.csv')
assert DATA_PATH.exists(), f"Dataset not found at {DATA_PATH}. Please check path."

df = pd.read_csv(DATA_PATH, encoding='utf-8')

# Rename columns consistently (safe-guard)
if list(df.columns)[:2] != ['review', 'sentiment']:
    # Try to detect common names
    cols = df.columns.tolist()
    # If first column looks textual rename to review, second to sentiment
    df = df.rename(columns={cols[0]: 'review', cols[1]: 'sentiment'})

print("Loaded dataset shape:", df.shape)
df.head()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\FLH\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\FLH\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Loaded dataset shape: (5791, 2)


Unnamed: 0,review,sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1


### Text Cleaning, Tokenization, and Stopword Removal

In [None]:
# Removes HTML, URLs, punctuation, extra whitespace, lowercases, and removes stopwords.
# Keeps the tokenized words in a helper column for Word2Vec training. 

STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """Remove HTML, URLs, punctuation; lowercase; strip repeated whitespace."""
    if not isinstance(text, str):
        text = str(text)
    # Remove HTML
    text = BeautifulSoup(text, "html.parser").get_text(separator=" ")
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove non-ascii characters
    text = text.encode('ascii', 'ignore').decode('ascii')
    # Remove punctuation (keep periods replaced by space)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', text)
    # Lowercase and collapse whitespace
    text = re.sub(r'\s+', ' ', text).strip().lower()
    return text

def tokenize_and_remove_stopwords(text):
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t.isalpha() and t not in STOPWORDS]
    return tokens

# Apply cleaning and tokenization
df['review'] = df['review'].astype(str).apply(clean_text)
df['tokens'] = df['review'].apply(tokenize_and_remove_stopwords)

# Display quick checks
print("Example cleaned review:", df['review'].iloc[0])
print("Example tokens:", df['tokens'].iloc[0])
df.head()


Example cleaned review: kickers on my watchlist xide tit soq pnk cpw bpz aj trade method 1 or method 2 see prev posts
Example tokens: ['kickers', 'watchlist', 'xide', 'tit', 'soq', 'pnk', 'cpw', 'bpz', 'aj', 'trade', 'method', 'method', 'see', 'prev', 'posts']


Unnamed: 0,review,sentiment,tokens
0,kickers on my watchlist xide tit soq pnk cpw b...,1,"[kickers, watchlist, xide, tit, soq, pnk, cpw,..."
1,user aap movie 55 return for the fea geed indi...,1,"[user, aap, movie, return, fea, geed, indicato..."
2,user i d be afraid to short amzn they are look...,1,"[user, afraid, short, amzn, looking, like, nea..."
3,mnta over 12 00,1,[mnta]
4,oi over 21 37,1,[oi]


### Normalize Sentiment Labels and Show Class Distribution

In [4]:
# Map original numeric labels to 'Positive'/'Negative' if necessary and show counts.

# Map if numeric labels exist (1 / -1)
if df['sentiment'].dtype.kind in 'biufc':  # numeric
    df['sentiment'] = df['sentiment'].replace({1: 'Positive', -1: 'Negative'})

# Ensure labels are strings
df['sentiment'] = df['sentiment'].astype(str)

print("Class distribution:")
print(df['sentiment'].value_counts())
print("\nTotal examples:", len(df))
df.shape


Class distribution:
sentiment
Positive    3685
Negative    2106
Name: count, dtype: int64

Total examples: 5791


(5791, 3)

### Feature Extraction: TF-IDF and Word2Vec Embeddings

In [5]:
# Create TF-IDF features (for deployment) and train a small Word2Vec model to create averaged embeddings
# (for research/comparison). We use ngram_range=(1,2) by default for TF-IDF.

# Prepare raw text for TF-IDF (joined tokens)
df['clean_text'] = df['tokens'].apply(lambda toks: ' '.join(toks))

# TF-IDF vectorizer (this will be saved for the Flask app)
tfidf_vectorizer = TfidfVectorizer(min_df=2, max_df=0.9, ngram_range=(1,2), norm='l2')

# Fit TF-IDF on the full dataset (so deploy vectorizer knows full vocab)
X_tfidf = tfidf_vectorizer.fit_transform(df['clean_text'])
print("TF-IDF feature matrix shape:", X_tfidf.shape)

# Train a Word2Vec model on token lists (small dimensions so it's lightweight)
w2v_size = 100
w2v_window = 5
w2v_min_count = 2

w2v_model = Word2Vec(sentences=df['tokens'].tolist(),
                     vector_size=w2v_size,
                     window=w2v_window,
                     min_count=w2v_min_count,
                     workers=2,
                     epochs=25)

# Function to get average Word2Vec vector for a document
def document_vector(tokens, w2v_model, size=w2v_size):
    vecs = []
    for t in tokens:
        if t in w2v_model.wv:
            vecs.append(w2v_model.wv[t])
    if len(vecs) == 0:
        return np.zeros(size, dtype=np.float32)
    return np.mean(vecs, axis=0)

# Build matrix of averaged embeddings
X_w2v = np.vstack(df['tokens'].apply(lambda toks: document_vector(toks, w2v_model)).values)
print("Word2Vec averaged matrix shape:", X_w2v.shape)


TF-IDF feature matrix shape: (5791, 7525)
Word2Vec averaged matrix shape: (5791, 100)


### Train/Test Split and Model Evaluation Function

In [8]:
# Create consistent train/test splits (stratified) and a small evaluation util to print metrics.

# Features and target
y = df['sentiment']

# We'll make two different feature sets and share the same train/test split indices for fair comparison
X_tfidf_full = X_tfidf
X_w2v_full = X_w2v

# Stratified split
X_tfidf_train, X_tfidf_test, X_w2v_train, X_w2v_test, y_train, y_test = train_test_split(
    X_tfidf_full, X_w2v_full, y, test_size=0.20, random_state=42, stratify=y)

# Note: X_tfidf_train/test are sparse matrices; X_w2v_train/test are dense numpy arrays

def evaluate_model(name, y_true, y_pred, y_prob=None, labels=None):
    acc = accuracy_score(y_true, y_pred)
    p, r, f, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted', zero_division=0)
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    print(f"--- {name} ---")
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision (weighted): {p:.4f}")
    print(f"Recall (weighted):    {r:.4f}")
    print(f"F1-score (weighted):  {f:.4f}")
    print("Confusion Matrix:")
    print(cm)
    # Try ROC AUC if binary
    if y_prob is not None:
        try:
            # binarize labels if necessary
            lb = LabelBinarizer()
            y_bin = lb.fit_transform(y_test)
            if y_bin.shape[1] == 1:
                # binary classifier
                auc = roc_auc_score(y_bin, y_prob[:,1] if y_prob.ndim>1 else y_prob)
                print("ROC AUC:", auc)
        except Exception as e:
            pass
    print()


### Train and Evaluate Models Using TF-IDF Features

In [9]:
# Train LogisticRegression, MultinomialNB, and LinearSVC on TF-IDF features.
# For speed, we use reasonable defaults. We print report outputs for each model and keep them in a dictionary.

models_tfidf = {}

# Logistic Regression
lr = LogisticRegression(penalty='l2', C=1.0, max_iter=1000, random_state=42)
lr.fit(X_tfidf_train, y_train)
y_pred_lr = lr.predict(X_tfidf_test)
# Obtain probability if available (LogisticRegression supports predict_proba)
y_prob_lr = lr.predict_proba(X_tfidf_test) if hasattr(lr, "predict_proba") else None
evaluate_model("LogisticRegression (TF-IDF)", y_test, y_pred_lr, y_prob_lr)
models_tfidf['logistic'] = lr

# Multinomial Naive Bayes
mnb = MultinomialNB()
# MultinomialNB requires non-negative input; TF-IDF is non-negative
mnb.fit(X_tfidf_train, y_train)
y_pred_mnb = mnb.predict(X_tfidf_test)
y_prob_mnb = mnb.predict_proba(X_tfidf_test) if hasattr(mnb, "predict_proba") else None
evaluate_model("MultinomialNB (TF-IDF)", y_test, y_pred_mnb, y_prob_mnb)
models_tfidf['mnb'] = mnb

# Linear SVM (LinearSVC)
svc = LinearSVC(max_iter=5000, random_state=42)
svc.fit(X_tfidf_train, y_train)
y_pred_svc = svc.predict(X_tfidf_test)
# LinearSVC does not have predict_proba by default
evaluate_model("LinearSVC (TF-IDF)", y_test, y_pred_svc)
models_tfidf['svc'] = svc

# Summarize weighted F1-scores to pick best
from sklearn.metrics import f1_score
f1_scores = {
    'logistic': f1_score(y_test, y_pred_lr, average='weighted'),
    'mnb': f1_score(y_test, y_pred_mnb, average='weighted'),
    'svc': f1_score(y_test, y_pred_svc, average='weighted'),
}
print("TF-IDF models F1 scores (weighted):", f1_scores)
best_tfidf_key = max(f1_scores, key=f1_scores.get)
print("Best TF-IDF model:", best_tfidf_key)


--- LogisticRegression (TF-IDF) ---
Accuracy: 0.7791
Precision (weighted): 0.7936
Recall (weighted):    0.7791
F1-score (weighted):  0.7602
Confusion Matrix:
[[200 221]
 [ 35 703]]
ROC AUC: 0.8662785083907846

--- MultinomialNB (TF-IDF) ---
Accuracy: 0.7739
Precision (weighted): 0.7898
Recall (weighted):    0.7739
F1-score (weighted):  0.7532
Confusion Matrix:
[[193 228]
 [ 34 704]]
ROC AUC: 0.8636038854450303

--- LinearSVC (TF-IDF) ---
Accuracy: 0.7929
Precision (weighted): 0.7898
Recall (weighted):    0.7929
F1-score (weighted):  0.7884
Confusion Matrix:
[[267 154]
 [ 86 652]]

TF-IDF models F1 scores (weighted): {'logistic': 0.7601658557553608, 'mnb': 0.7532345369019734, 'svc': 0.7883885577518115}
Best TF-IDF model: svc


### Train and Evaluate Models Using Averaged Word2Vec Embeddings

In [10]:
# Same model types as TF-IDF, but on the averaged embedding features.

models_w2v = {}

# Logistic Regression on embeddings (dense input)
lr_e = LogisticRegression(penalty='l2', C=1.0, max_iter=1000, random_state=42)
lr_e.fit(X_w2v_train, y_train)
y_pred_lr_e = lr_e.predict(X_w2v_test)
evaluate_model("LogisticRegression (Word2Vec avg)", y_test, y_pred_lr_e)
models_w2v['logistic'] = lr_e

# MultinomialNB is not ideal for dense embeddings (expects counts) — skip MNB for embeddings.
# Use LinearSVC instead
svc_e = LinearSVC(max_iter=5000, random_state=42)
svc_e.fit(X_w2v_train, y_train)
y_pred_svc_e = svc_e.predict(X_w2v_test)
evaluate_model("LinearSVC (Word2Vec avg)", y_test, y_pred_svc_e)
models_w2v['svc'] = svc_e

# Summarize F1 scores
f1_scores_w2v = {
    'logistic': f1_score(y_test, y_pred_lr_e, average='weighted'),
    'svc': f1_score(y_test, y_pred_svc_e, average='weighted'),
}
print("Word2Vec models F1 scores (weighted):", f1_scores_w2v)
best_w2v_key = max(f1_scores_w2v, key=f1_scores_w2v.get)
print("Best Word2Vec model:", best_w2v_key)


--- LogisticRegression (Word2Vec avg) ---
Accuracy: 0.7058
Precision (weighted): 0.7003
Recall (weighted):    0.7058
F1-score (weighted):  0.6777
Confusion Matrix:
[[151 270]
 [ 71 667]]

--- LinearSVC (Word2Vec avg) ---
Accuracy: 0.7248
Precision (weighted): 0.7258
Recall (weighted):    0.7248
F1-score (weighted):  0.6985
Confusion Matrix:
[[162 259]
 [ 60 678]]

Word2Vec models F1 scores (weighted): {'logistic': 0.6777298721298297, 'svc': 0.6985214932827438}
Best Word2Vec model: svc


### Select and Save Best Model Pipeline for Deployment

In [11]:
# We will pick the best TF-IDF model (based on F1) for deployment with Flask.
# Additionally, we save the TF-IDF vectorizer, the chosen model, and a README meta file in webapp/

WEBAPP_DIR = Path('webapp')
WEBAPP_DIR.mkdir(parents=True, exist_ok=True)

# Choose by comparing best TF-IDF F1 vs best W2V F1
best_overall = None
if max(f1_scores.values()) >= max(f1_scores_w2v.values()):
    # Deploy TF-IDF best model
    best_overall = ('tfidf', best_tfidf_key, models_tfidf[best_tfidf_key])
    # Save vectorizer and model
    with open(WEBAPP_DIR / 'vectorizer_tfidf.pkl', 'wb') as f:
        pickle.dump(tfidf_vectorizer, f)
    with open(WEBAPP_DIR / 'model_tfidf.pkl', 'wb') as f:
        pickle.dump(models_tfidf[best_tfidf_key], f)
    print(f"Selected TF-IDF pipeline for deployment: {best_tfidf_key}")
else:
    # Deploy Word2Vec embedding pipeline (slower at runtime but available)
    best_overall = ('w2v', best_w2v_key, models_w2v[best_w2v_key])
    # Save W2V model and classifier
    w2v_model.save(str(WEBAPP_DIR / 'w2v_model.model'))
    with open(WEBAPP_DIR / 'model_w2v.pkl', 'wb') as f:
        pickle.dump(models_w2v[best_w2v_key], f)
    print(f"Selected Word2Vec pipeline for deployment: {best_w2v_key}")

# Save a small meta file describing which pipeline was chosen
meta = {
    'deployed_pipeline': best_overall[0],
    'deployed_model_key': best_overall[1],
    'notes': "TF-IDF pipeline is recommended for fast web deployment. Word2Vec is available for research comparison."
}
with open(WEBAPP_DIR / 'deployment_meta.pkl', 'wb') as f:
    pickle.dump(meta, f)

print("Saved deployment artifacts to", WEBAPP_DIR)


Selected TF-IDF pipeline for deployment: svc
Saved deployment artifacts to webapp


### Sanity Check: Local Prediction Test with Saved Pipeline

In [12]:
# Demonstrates how the Flask app will call model + vectorizer. We do it here for confirmation.

def predict_text_tf(text, vectorizer, model):
    txt = clean_text(text)
    tokens = tokenize_and_remove_stopwords(txt)
    joined = ' '.join(tokens)
    Xv = vectorizer.transform([joined])
    pred = model.predict(Xv)[0]
    prob = model.predict_proba(Xv)[0] if hasattr(model, "predict_proba") else None
    return pred, prob

# Load saved artifacts for demo (choose whichever exists)
if (WEBAPP_DIR / 'vectorizer_tfidf.pkl').exists():
    with open(WEBAPP_DIR / 'vectorizer_tfidf.pkl', 'rb') as f:
        loaded_vectorizer = pickle.load(f)
    with open(WEBAPP_DIR / 'model_tfidf.pkl', 'rb') as f:
        loaded_model = pickle.load(f)
    demo_sent = "Shares surge after the company reports record profits and strong guidance."
    pred, prob = predict_text_tf(demo_sent, loaded_vectorizer, loaded_model)
    print("Demo sentence:", demo_sent)
    print("Predicted sentiment:", pred)
    if prob is not None:
        print("Prediction probabilities:", prob)
else:
    print("TF-IDF deployment artifacts not found — check previous cell outputs.")


Demo sentence: Shares surge after the company reports record profits and strong guidance.
Predicted sentiment: Positive
