In [49]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
from gensim.models.phrases import Phrases, Phraser
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import numpy as np
import tensorflow as tf
np.random.seed(999)
tf.random.set_seed(999)

# Load DataFrame (replace with your actual data, e.g., BloombergNews100.csv)
# Load the dataset
df = pd.read_csv('data/News Articles/Kraggle Datasets/Labeled-headlines (Kraggle).csv', encoding='ISO-8859-1', on_bad_lines='skip', header=None)
df = df.dropna()

# Define a function to preprocess the text
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text.lower())  # Tokenize and convert to lowercase
    tokens = [word for word in tokens if word.isalpha()]  # Remove non-alphabetic characters
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    return tokens

# Apply the function to preprocess the 'data' column
df['processed_content'] = df[1].apply(preprocess_text)

df = pd.DataFrame({
    'processed_content': df['processed_content'],
    'sentiment': df[0]
})

df.head()

# Apply bigram/trigram preprocessing
bigram_model = Phrases(df['processed_content'], min_count=1, threshold=10)
trigram_model = Phrases(bigram_model[df['processed_content']], min_count=1, threshold=10)
bigram_phraser = Phraser(bigram_model)
trigram_phraser = Phraser(trigram_model)
df['processed_content_phrased'] = df['processed_content'].apply(
    lambda x: trigram_phraser[bigram_phraser[x]]
)

# Convert sentiment labels to numerical (if not already)
label_map = {'neutral': 1, 'none': 1, 'positive': 2, 'negative': 0}
if df['sentiment'].dtype == 'object':
    df['sentiment'] = df['sentiment'].map(label_map)

# Split data
X = df['processed_content_phrased']
y = df['sentiment']
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.33, random_state=42)
print(f"Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")

Train: 3392, Val: 974, Test: 480


In [51]:
# 1. Normal ANN

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding

model = Sequential()
model.add(Dense(128, input_dim=(X_train.shape[1]), activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(3, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_train, y_train, epochs=20, batch_size=32)

IndexError: tuple index out of range

In [None]:
# 2. ANN + TF-IDF Model with Custom Vocabulary

from tensorflow.keras.utils import to_categorical
from sklearn.feature_extraction.text import TfidfVectorizer

positive_terms = [
    'profit', 'rose', 'increased', 'increase', 'agreement', 'signed', 'contract',
    'growth', 'grew', 'awarded', 'positive', 'operating profit', 'net profit',
    'profit rose', 'period increased', 'sales increased', 'signed agreement',
    'increased respectively', 'loss narrowed', 'pretax profit', 'profit period',
    'sales rose', 'profit net', 'grew percent', 'awarded contract', 'sales increase',
    'profit increased', 'cost savings', 'expected increase', 'profit totalled',
    'operating profit rose', 'net profit rose', 'period increased revenue',
    'increased respectively compared', 'net sales increased', 'sales period increased',
    'operating profit net', 'profit net sales', 'today net profit', 'sales rose net',
    'operating profit increased', 'net sales rose', 'net sales increase',
    'said net profit', 'operating profit totalled', 'operating profit period',
    'grew percent million'
]

negative_terms = [
    'loss', 'decreased', 'fell', 'lower', 'cut', 'dropped', 'negative',
    'declined', 'warning', 'laid', 'decreased eur', 'operating loss',
    'net loss', 'profit fell', 'sales decreased', 'profit decreased',
    'profit warning', 'loss totalled', 'pct lower', 'period decreased',
    'pretax loss', 'today slipped', 'sales fell', 'slipped net',
    'personnel reductions', 'fell percent', 'loss totaled', 'decreased net',
    'laid off', 'operating profit fell', 'profit period decreased',
    'operating loss totalled', 'period decreased net', 'net sales decreased',
    'said today slipped', 'decreased net sales', 'net profit decreased',
    'slipped net loss', 'issued profit warning', 'net profit fell',
    'operating profit decreased', 'today slipped net'
]

vocabulary = positive_terms + negative_terms

# Normalize terms for TF-IDF (replace underscores with spaces if needed)
vocabulary = [term.replace('_', ' ') for term in vocabulary]

# Using your exact TF-IDF setup
vectorizer = TfidfVectorizer(
    vocabulary=vocabulary,
    ngram_range=(1, 3),  # Changed from (0,3) to (1,3) as (0,3) is invalid
    lowercase=False
)

# Convert text to TF-IDF features using your phrased content
X_train_tfidf = vectorizer.fit_transform([' '.join(x) for x in X_train])
X_val_tfidf = vectorizer.transform([' '.join(x) for x in X_val])
X_test_tfidf = vectorizer.transform([' '.join(x) for x in X_test])
print("TF-IDF shape:", X_train_tfidf.shape)

# Convert labels to numerical format using your label_map
# Note: Using your label_map {'neutral':1, 'none':1, 'positive':2, 'negative':0}
y_train_enc = y_train.values
y_val_enc = y_val.values
y_test_enc = y_test.values

# One-hot encode labels
num_classes = 3  # For your 3 classes (0:negative, 1:neutral, 2:positive)
y_train_onehot = to_categorical(y_train_enc, num_classes=num_classes)
y_val_onehot = to_categorical(y_val_enc, num_classes=num_classes)
y_test_onehot = to_categorical(y_test_enc, num_classes=num_classes)

# ANN with Your Custom TF-IDF Features

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Embedding, SpatialDropout1D
from tensorflow.keras.callbacks import EarlyStopping

def build_ann_tfidf(input_dim):
    model = Sequential([
        Dense(256, activation='relu', input_dim=input_dim),
        Dropout(0.5),
        Dense(128, activation='relu'),
        Dense(64, activation='relu'),
        Dense(16, activation='relu'),
        Dense(8, activation='relu'),
        Dropout(0.3),
        Dense(num_classes, activation='softmax')
    ])

    model.compile(optimizer='adam',
                 loss='categorical_crossentropy',
                 metrics=['accuracy'])
    return model

ann_tfidf = build_ann_tfidf(X_train_tfidf.shape[1])
ann_tfidf.summary()

# Train ANN
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history_ann = ann_tfidf.fit(
    X_train_tfidf.toarray(), y_train_onehot,
    validation_data=(X_val_tfidf.toarray(), y_val_onehot),
    epochs=20,
    batch_size=32,
    # callbacks=[early_stop],
    verbose=1
)