In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem; force_remount=True is passed
# so an already-mounted drive is mounted again.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/drive


In [None]:
cd drive/MyDrive/IE7500_GroupB/Notebooks

/content/drive/.shortcut-targets-by-id/1hXOWgLy2A2GiwQDqG1SudIATn6Z_m1b6/IE7500_GroupB/Notebooks


In [None]:
# load necessary libraries
import numpy as np
import pandas as pd

In [None]:
# Column -> dtype mapping handed to pd.read_csv when loading the modelling
# dataset. Columns are grouped by dtype, in the order they are declared.
_DTYPE_GROUPS = [
    ('object', ['headline', 'url', 'publisher', 'stock', 'tokens',
                'normalized_tokens', 'filtered_tokens', 'lemmas']),
    ('float64', ['sentiment_score']),
    ('object', ['Name']),
    ('float64', ['Market Cap']),
    ('object', ['Country']),
    ('float64', ['IPO Year']),
    ('object', ['Sector', 'Industry']),
    ('int32', ['year', 'month', 'day_of_week']),
    ('int64', ['sentiment_label', 'headline_length', 'word_count']),
    ('object', ['Market_Cap_Category', 'recommendation']),
    # One-hot indicator columns for market-cap bucket and sector.
    ('bool', ['cap_Large', 'cap_Medium', 'cap_Mega', 'cap_Micro',
              'cap_Nano', 'cap_Small',
              'sector_Basic Materials', 'sector_Consumer Discretionary',
              'sector_Consumer Staples', 'sector_Energy', 'sector_Finance',
              'sector_Health Care', 'sector_Industrials',
              'sector_Miscellaneous', 'sector_Real Estate',
              'sector_Technology', 'sector_Telecommunications',
              'sector_Utilities']),
    # Integer-encoded label columns.
    ('int64', ['recommendation_label', 'publisher_label', 'country_label',
               'industry_label']),
]

dtypes_dict = {}
for _dtype_name, _column_names in _DTYPE_GROUPS:
    for _column_name in _column_names:
        dtypes_dict[_column_name] = _dtype_name

In [None]:
# Read the modelling dataset with the explicit column dtypes declared in
# dtypes_dict; the 'date' column is parsed as datetimes.
df_main = pd.read_csv(
    "saved_dfs/df_for_models.csv",
    parse_dates=['date'],
    dtype=dtypes_dict,
)

In [None]:
# Preview the first five rows of the loaded frame.
df_main.head(n=5)

Unnamed: 0,headline,url,publisher,date,stock,tokens,normalized_tokens,filtered_tokens,lemmas,sentiment_score,...,sector_Industrials,sector_Miscellaneous,sector_Real Estate,sector_Technology,sector_Telecommunications,sector_Utilities,recommendation_label,publisher_label,country_label,industry_label
0,Agilent Technologies Announces Pricing of $5……...,http://www.gurufocus.com/news/1153187/agilent-...,GuruFocus,2020-06-01,A,"['Agilent', 'Technologies', 'Announces', 'Pric...","['agilent', 'technologies', 'announces', 'pric...","['agilent', 'technologies', 'announces', 'pric...","['agilent', 'technology', 'announces', 'pricin...",0.0,...,True,False,False,False,False,False,1,4,45,18
1,Agilent (A) Gears Up for Q2 Earnings: What's i...,http://www.zacks.com/stock/news/931205/agilent...,Zacks,2020-05-18,A,"['Agilent', '(', 'A', ')', 'Gears', 'Up', 'for...","['agilent', 'a', 'gears', 'up', 'for', 'q2', '...","['agilent', 'gears', 'q2', 'earnings', 'cards']","['agilent', 'gear', 'q2', 'earnings', 'card']",0.0,...,True,False,False,False,False,False,1,16,45,18
2,J.P. Morgan Asset Management Announces Liquida...,http://www.gurufocus.com/news/1138923/jp-morga...,GuruFocus,2020-05-15,A,"['J.P.', 'Morgan', 'Asset', 'Management', 'Ann...","['morgan', 'asset', 'management', 'announces',...","['morgan', 'asset', 'management', 'announces',...","['morgan', 'asset', 'management', 'announces',...",0.3612,...,True,False,False,False,False,False,1,4,45,18
3,"Pershing Square Capital Management, L.P. Buys ...",http://www.gurufocus.com/news/1138704/pershing...,GuruFocus,2020-05-15,A,"['Pershing', 'Square', 'Capital', 'Management'...","['pershing', 'square', 'capital', 'management'...","['pershing', 'square', 'capital', 'management'...","['pershing', 'square', 'capital', 'management'...",0.0,...,True,False,False,False,False,False,1,4,45,18
4,Agilent Awards Trilogy Sciences with a Golden ...,http://www.gurufocus.com/news/1134012/agilent-...,GuruFocus,2020-05-12,A,"['Agilent', 'Awards', 'Trilogy', 'Sciences', '...","['agilent', 'awards', 'trilogy', 'sciences', '...","['agilent', 'awards', 'trilogy', 'sciences', '...","['agilent', 'award', 'trilogy', 'science', 'go...",0.4588,...,True,False,False,False,False,False,1,4,45,18


Create Vectors from Headlines (Vectorization Models)

To generate vectors for the headline column, you can use different NLP vectorization techniques, including:

TF-IDF (Term Frequency - Inverse Document Frequency)

Count Vectorizer (Bag of Words)

Word2Vec (Pre-trained Word Embeddings)

Doc2Vec (Sentence-Level Embeddings)

BERT Embeddings (Transformer-based Representation)

# Goal 1: Predict Buy / Hold / Sell

## Method 1: Logistic Regression

In [None]:
from sklearn.model_selection import train_test_split

# Column groups used as model inputs, plus the prediction target.
text_features = ["headline", "lemmas"]
numerical_features = ["sentiment_score"]
categorical_features = ["publisher_label", "country_label", "industry_label"]  # integer-encoded ids
# NOTE(review): dtypes_dict declares twelve sector_* indicator columns but only
# these six are used here -- confirm the omission is intentional.
sector_features = [
    "sector_Industrials",
    "sector_Miscellaneous",
    "sector_Real Estate",
    "sector_Technology",
    "sector_Telecommunications",
    "sector_Utilities",
]
target = "recommendation_label"

# 80/20 train/test split with a fixed seed.
feature_columns = text_features + numerical_features + categorical_features + sector_features
X_train, X_test, y_train, y_test = train_test_split(
    df_main[feature_columns], df_main[target], test_size=0.2, random_state=42
)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Define Preprocessing
# TF-IDF over unigrams and bigrams; terms in >90% of documents or in fewer
# than 2 documents are dropped.
text_transformer = TfidfVectorizer(stop_words='english', ngram_range=(1,2), max_df=0.9, min_df=2)
num_transformer = StandardScaler()
# publisher/country/industry labels are nominal integer ids. Passing them
# through unchanged would make the linear model treat them as magnitudes on a
# very different scale from the TF-IDF features, so they are one-hot encoded.
# handle_unknown="ignore" lets ids that only occur at predict time encode as
# all zeros instead of raising.
cat_transformer = OneHotEncoder(handle_unknown="ignore")

# Full Column Transformer
preprocessor = ColumnTransformer(
    transformers=[
        ("text", text_transformer, "headline"),
        ("lemmas", text_transformer, "lemmas"),
        ("num", num_transformer, numerical_features),
        ("sector", "passthrough", sector_features),  # Already binary
        ("cat", cat_transformer, categorical_features)  # Integer ids -> one-hot
    ]
)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier, allowed up to 1000 solver iterations.
log_reg = LogisticRegression(max_iter=1000)

# Chain preprocessing and the classifier into a single estimator.
pipeline_steps = [("preprocessor", preprocessor), ("classifier", log_reg)]
pipeline = Pipeline(steps=pipeline_steps)

# Fit on the training split.
pipeline.fit(X_train, y_train)


In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated macro-F1 on the training split.
cv_scores = cross_val_score(
    estimator=pipeline,
    X=X_train,
    y=y_train,
    scoring="f1_macro",
    cv=5,
)
print(f"Logistic Regression Mean F1-score: {cv_scores.mean():.4f}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for the classifier's C parameter.
param_grid = {"classifier__C": [0.01, 0.1, 1, 10]}

# 3-fold grid search scored by macro-F1, using all available cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="f1_macro",
    cv=3,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Keep the refit best pipeline and report its parameters.
best_log_reg = grid_search.best_estimator_
print(f"Best Logistic Regression Params: {grid_search.best_params_}")

In [None]:
from sklearn.metrics import classification_report

# Predict on the held-out test split with the tuned pipeline.
y_pred = best_log_reg.predict(X_test)

# Per-class precision / recall / F1.
print("Logistic Regression Performance:")
logreg_report = classification_report(y_test, y_pred)
print(logreg_report)

In [None]:
import joblib

# Persist the tuned pipeline to the current working directory.
logreg_model_path = "best_logistic_regression.pkl"
joblib.dump(best_log_reg, logreg_model_path)
print("Best Logistic Regression Model Saved!")

## Method 2: RandomForest

In [None]:
from sklearn.model_selection import train_test_split

# Input column groups and target for the random-forest experiment.
text_features = ["headline", "lemmas"]
numerical_features = ["sentiment_score"]
categorical_features = ["publisher_label", "country_label", "industry_label"]  # integer-encoded ids
# NOTE(review): only six of the twelve sector_* columns declared in
# dtypes_dict are listed -- confirm this is intended.
sector_features = ["sector_Industrials", "sector_Miscellaneous",
                   "sector_Real Estate", "sector_Technology",
                   "sector_Telecommunications", "sector_Utilities"]
target = "recommendation_label"

# 80/20 split with a fixed seed.
rf_input_columns = text_features + numerical_features + categorical_features + sector_features
rf_inputs = df_main[rf_input_columns]
rf_labels = df_main[target]
X_train, X_test, y_train, y_test = train_test_split(
    rf_inputs, rf_labels, test_size=0.2, random_state=42
)


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

# TF-IDF over unigrams and bigrams with document-frequency cut-offs, and a
# standard scaler for the numeric column.
text_transformer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_df=0.9,
    min_df=2,
)
num_transformer = StandardScaler()

# One (name, transformer, columns) entry per column group.
rf_column_steps = [
    ("text", text_transformer, "headline"),
    ("lemmas", text_transformer, "lemmas"),
    ("num", num_transformer, numerical_features),
    ("sector", "passthrough", sector_features),   # already binary
    ("cat", "passthrough", categorical_features), # already integer-encoded
]
preprocessor = ColumnTransformer(transformers=rf_column_steps)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest: 200 trees, depth capped at 20, fixed seed.
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    random_state=42,
)

# Preprocessing followed by the forest.
rf_steps = [("preprocessor", preprocessor), ("classifier", rf_model)]
pipeline_rf = Pipeline(steps=rf_steps)

# Fit on the training split.
pipeline_rf.fit(X_train, y_train)

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated macro-F1 on the training split.
cv_scores = cross_val_score(
    estimator=pipeline_rf,
    X=X_train,
    y=y_train,
    scoring="f1_macro",
    cv=5,
)
print(f"Random Forest Mean F1-score: {cv_scores.mean():.4f}")

Random Forest Mean F1-score: 0.3084


In [None]:
from sklearn.model_selection import GridSearchCV

# Search space: 3 x 3 x 3 combinations of forest size, depth and split size.
param_grid_rf = dict([
    ("classifier__n_estimators", [100, 200, 300]),
    ("classifier__max_depth", [10, 20, None]),
    ("classifier__min_samples_split", [2, 5, 10]),
])

# 3-fold grid search scored by macro-F1, using all available cores.
grid_search_rf = GridSearchCV(
    estimator=pipeline_rf,
    param_grid=param_grid_rf,
    scoring="f1_macro",
    cv=3,
    n_jobs=-1,
)
grid_search_rf.fit(X_train, y_train)

# Keep the refit best pipeline and report its parameters.
best_rf_model = grid_search_rf.best_estimator_
print(f"Best Random Forest Params: {grid_search_rf.best_params_}")



In [None]:
from sklearn.metrics import classification_report

# Predict on the held-out test split with the tuned forest pipeline.
y_pred_rf = best_rf_model.predict(X_test)

# Per-class precision / recall / F1.
print("Random Forest Performance:")
rf_report = classification_report(y_test, y_pred_rf)
print(rf_report)

In [None]:
import joblib

# Persist the tuned pipeline to the current working directory.
rf_model_path = "best_random_forest.pkl"
joblib.dump(best_rf_model, rf_model_path)
print("Best Random Forest Model Saved!")

## Method 3: XGBoost

In [None]:
from sklearn.model_selection import train_test_split

# Input column groups and target for the XGBoost experiment.
text_features = ["headline", "lemmas"]
numerical_features = ["sentiment_score"]
categorical_features = ["publisher_label", "country_label", "industry_label"]  # integer-encoded ids
# NOTE(review): only six of the twelve sector_* columns declared in
# dtypes_dict are listed -- confirm this is intended.
sector_features = [
    "sector_Industrials", "sector_Miscellaneous", "sector_Real Estate",
    "sector_Technology", "sector_Telecommunications", "sector_Utilities",
]
target = "recommendation_label"

# 80/20 split with a fixed seed.
xgb_input_columns = text_features + numerical_features + categorical_features + sector_features
split_parts = train_test_split(
    df_main[xgb_input_columns],
    df_main[target],
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

# Text vectoriser (unigrams + bigrams, document-frequency cut-offs) and
# numeric scaler.
tfidf_settings = dict(stop_words='english', ngram_range=(1, 2), max_df=0.9, min_df=2)
text_transformer = TfidfVectorizer(**tfidf_settings)
num_transformer = StandardScaler()

# Assemble the per-column-group transformer.
preprocessor = ColumnTransformer(transformers=[
    ("text", text_transformer, "headline"),
    ("lemmas", text_transformer, "lemmas"),
    ("num", num_transformer, numerical_features),
    ("sector", "passthrough", sector_features),    # already binary
    ("cat", "passthrough", categorical_features),  # already integer-encoded
])

In [None]:
from xgboost import XGBClassifier

# XGBoost classifier evaluated with multi-class log-loss, fixed seed.
# NOTE(review): `use_label_encoder` is reported as deprecated/unused by
# recent XGBoost releases -- confirm against the installed version.
xgb_model = XGBClassifier(
    use_label_encoder=False,
    eval_metric="mlogloss",
    random_state=42,
)

# Preprocessing followed by the booster.
xgb_steps = [("preprocessor", preprocessor), ("classifier", xgb_model)]
pipeline_xgb = Pipeline(steps=xgb_steps)

# Fit on the training split.
pipeline_xgb.fit(X_train, y_train)

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated macro-F1 on the training split.
cv_scores = cross_val_score(
    estimator=pipeline_xgb,
    X=X_train,
    y=y_train,
    scoring="f1_macro",
    cv=5,
)
print(f"XGBoost Mean F1-score: {cv_scores.mean():.4f}")

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space: 3 x 3 x 3 combinations of learning rate, depth and rounds.
param_grid_xgb = dict([
    ("classifier__learning_rate", [0.01, 0.1, 0.2]),
    ("classifier__max_depth", [3, 6, 9]),
    ("classifier__n_estimators", [100, 200, 300]),
])

# 3-fold grid search scored by macro-F1, using all available cores.
grid_search_xgb = GridSearchCV(
    estimator=pipeline_xgb,
    param_grid=param_grid_xgb,
    scoring="f1_macro",
    cv=3,
    n_jobs=-1,
)
grid_search_xgb.fit(X_train, y_train)

# Keep the refit best pipeline and report its parameters.
best_xgb_model = grid_search_xgb.best_estimator_
print(f"Best XGBoost Params: {grid_search_xgb.best_params_}")

In [None]:
from sklearn.metrics import classification_report

# Predict on the held-out test split with the tuned XGBoost pipeline.
y_pred_xgb = best_xgb_model.predict(X_test)

# Per-class precision / recall / F1.
print("XGBoost Performance:")
xgb_report = classification_report(y_test, y_pred_xgb)
print(xgb_report)

In [None]:
import joblib

# Persist the tuned pipeline to the current working directory.
xgb_model_path = "best_xgboost.pkl"
joblib.dump(best_xgb_model, xgb_model_path)
print("Best XGBoost Model Saved!")

## Method 4: SVM (Support Vector Machine)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Non-text inputs for the SVM: the sentiment score first, then the
# integer-encoded ids, then the sector indicator columns.
# NOTE(review): only six of the twelve sector_* columns declared in
# dtypes_dict are listed -- confirm this is intended.
features = [
    "sentiment_score",
    "publisher_label", "country_label", "industry_label",
    "sector_Industrials", "sector_Miscellaneous", "sector_Real Estate",
    "sector_Technology", "sector_Telecommunications", "sector_Utilities",
]
target = "recommendation_label"

# 80/20 split with a fixed seed.
svm_inputs = df_main[features]
svm_labels = df_main[target]
X_train, X_test, y_train, y_test = train_test_split(
    svm_inputs, svm_labels, test_size=0.2, random_state=42
)

In [None]:
# Standardise sentiment_score; every other feature column is passed through
# unchanged.
scaled_columns = ["sentiment_score"]
passthrough_columns = [col for col in features if col not in scaled_columns]
preprocessor = ColumnTransformer(
    [
        ("scale", StandardScaler(), scaled_columns),
        ("pass", "passthrough", passthrough_columns),
    ]
)

# SVM classifier (one-vs-rest decision function, probability estimates
# enabled) behind the preprocessing step.
svm_classifier = SVC(decision_function_shape="ovr", probability=True)
svm_pipeline = Pipeline([("prep", preprocessor), ("clf", svm_classifier)])

In [None]:
from sklearn.model_selection import cross_val_score

# Macro-F1 averaged over 5 cross-validation folds of the training split.
cv_scores = cross_val_score(
    estimator=svm_pipeline,
    X=X_train,
    y=y_train,
    scoring="f1_macro",
    cv=5,
)
print(f"Cross-Validated F1 Score (SVM): {cv_scores.mean():.4f}")


In [None]:
from sklearn.model_selection import GridSearchCV

# Search space over regularisation strength, kernel and gamma.
# NOTE(review): gamma should not affect the linear kernel, so those
# combinations are presumably fitted redundantly -- confirm.
param_grid = dict([
    ("clf__C", [0.1, 1, 10]),
    ("clf__kernel", ["linear", "rbf"]),
    ("clf__gamma", ["scale", "auto"]),
])

# 3-fold grid search scored by macro-F1, using all cores, with progress output.
grid_search = GridSearchCV(
    estimator=svm_pipeline,
    param_grid=param_grid,
    scoring="f1_macro",
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Keep the refit best pipeline and report its parameters.
best_svm_model = grid_search.best_estimator_
print(f"Best SVM Parameters: {grid_search.best_params_}")

In [None]:
# Predict on the held-out test split with the tuned SVM pipeline.
y_pred = best_svm_model.predict(X_test)

# Per-class precision / recall / F1.
print("SVM Model Performance on Test Set:")
svm_report = classification_report(y_test, y_pred)
print(svm_report)

In [None]:
import joblib

# Persist the tuned SVM pipeline to the current working directory.
svm_model_path = "svm_recommendation_model.pkl"
joblib.dump(best_svm_model, svm_model_path)
print("Model saved as svm_recommendation_model.pkl")

## Method 5: MLP

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report

In [None]:
# Features and target
# NOTE(review): only six of the twelve sector_* columns declared in
# dtypes_dict are listed -- confirm this is intended.
features = ["sentiment_score", "publisher_label", "country_label", "industry_label",
            "sector_Industrials", "sector_Miscellaneous", "sector_Real Estate",
            "sector_Technology", "sector_Telecommunications", "sector_Utilities"]
target = "recommendation_label"

# Feature matrix (missing values replaced by 0) and label vector
X = df_main[features].fillna(0)
y = df_main[target]

# Train/test split
X_train, X_test, y_train_raw, y_test_raw = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features: statistics are fitted on the training split only and
# then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert labels to one-hot for softmax classification.
# The class count is fixed from the full label column and passed explicitly so
# the train and test matrices always have the same width, even when one split
# does not contain the highest label id.
num_classes = int(y.max()) + 1
y_train = to_categorical(y_train_raw, num_classes=num_classes)
y_test = to_categorical(y_test_raw, num_classes=num_classes)

In [None]:
# Feed-forward network: two hidden ReLU layers (128 and 64 units), each
# followed by 30% dropout, and a softmax output with one unit per class.
n_input_features = X_train_scaled.shape[1]

model = Sequential()
model.add(Dense(128, activation="relu", input_shape=(n_input_features,)))
model.add(Dropout(0.3))
model.add(Dense(64, activation="relu"))
model.add(Dropout(0.3))
model.add(Dense(num_classes, activation="softmax"))

model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
model.summary()

In [None]:
# Train for 20 epochs in mini-batches of 32.
# The held-out test split is what is passed as validation data here.
mlp_fit_options = dict(
    validation_data=(X_test_scaled, y_test),
    epochs=20,
    batch_size=32,
    verbose=1,
)
history = model.fit(X_train_scaled, y_train, **mlp_fit_options)

In [None]:
# Class probabilities for the test split; the predicted / true class is the
# index of the largest entry in each row.
y_pred_probs = model.predict(X_test_scaled)
y_pred_classes = y_pred_probs.argmax(axis=1)
y_true_classes = y_test.argmax(axis=1)

# Per-class precision / recall / F1.
print("MLP Keras Model Performance:")
mlp_report = classification_report(y_true_classes, y_pred_classes)
print(mlp_report)

In [None]:
# Write the trained network to an HDF5 file in the working directory.
mlp_model_path = "keras_mlp_recommendation_model.h5"
model.save(mlp_model_path)
print("Model saved as keras_mlp_recommendation_model.h5")

## Method 6: LSTM for Text-Based Prediction

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.metrics import classification_report

In [None]:
import ast


def _lemmas_to_text(value):
    """Return one row's lemma tokens joined by single spaces.

    The `lemmas` column is read from CSV with dtype 'object', so each cell is
    the *string form* of a list (e.g. "['agilent', 'gear']"), not a list.
    Checking `isinstance(value, list)` alone therefore turns every row into an
    empty string. This helper parses the string form first.

    Parameters
    ----------
    value : str | list | tuple | Any
        A cell of the `lemmas` column.

    Returns
    -------
    str
        Space-joined tokens; the raw string if it is not a Python literal;
        "" for anything else (including missing values).
    """
    if isinstance(value, str):
        try:
            value = ast.literal_eval(value)
        except (ValueError, SyntaxError):
            # Not a literal: treat the cell as already-plain text.
            return value
    if isinstance(value, (list, tuple)):
        return " ".join(str(token) for token in value)
    return ""


# Convert lemmas (stringified lists) to space-separated strings
df_main["lemmas_str"] = df_main["lemmas"].apply(_lemmas_to_text)

# Define text and target
texts = df_main["lemmas_str"].fillna("")
labels = df_main["recommendation_label"]

# Train/test split
X_train_text, X_test_text, y_train_raw, y_test_raw = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# One-hot encode target. The class count comes from the full label column and
# is passed explicitly so both splits get the same number of columns.
num_classes = int(labels.max()) + 1
y_train = to_categorical(y_train_raw, num_classes=num_classes)
y_test = to_categorical(y_test_raw, num_classes=num_classes)

In [None]:
# Fit the vocabulary on the training texts only; out-of-vocabulary words map
# to the "<OOV>" token.
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train_text)

# Integer-encode both splits with the fitted vocabulary.
X_train_seq = tokenizer.texts_to_sequences(X_train_text)
X_test_seq = tokenizer.texts_to_sequences(X_test_text)

# Pad / truncate every sequence at the end to a fixed length of 30 tokens.
max_len = 30
pad_options = dict(maxlen=max_len, padding="post", truncating="post")
X_train_pad = pad_sequences(X_train_seq, **pad_options)
X_test_pad = pad_sequences(X_test_seq, **pad_options)

In [None]:
# Embedding (10000-word vocabulary, 100-d vectors) -> LSTM(64) -> dropout ->
# Dense(32, ReLU) -> softmax over the classes.
embedding_dim = 100

model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=embedding_dim, input_length=max_len))
model.add(LSTM(64, return_sequences=False))
model.add(Dropout(0.3))
model.add(Dense(32, activation="relu"))
model.add(Dense(num_classes, activation="softmax"))

model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
model.summary()

In [None]:
# Train for 10 epochs in mini-batches of 32.
# The held-out test split is what is passed as validation data here.
lstm_fit_options = dict(
    validation_data=(X_test_pad, y_test),
    epochs=10,
    batch_size=32,
)
history = model.fit(X_train_pad, y_train, **lstm_fit_options)

In [None]:
# Class probabilities for the test split; the predicted / true class is the
# index of the largest entry in each row.
y_pred_probs = model.predict(X_test_pad)
y_pred_classes = y_pred_probs.argmax(axis=1)
y_true_classes = y_test.argmax(axis=1)

# Per-class precision / recall / F1.
print("LSTM Model Performance on Lemmas:")
lstm_report = classification_report(y_true_classes, y_pred_classes)
print(lstm_report)

In [None]:
# Write the trained network to an HDF5 file in the working directory.
lstm_model_path = "lstm_text_recommendation_model.h5"
model.save(lstm_model_path)
print("Model saved as lstm_text_recommendation_model.h5")

## Method 7: BERT + MLP (Transformer-Based)

In [None]:
!pip install transformers datasets

In [None]:
import ast

from transformers import BertTokenizer, BertModel
import torch
import numpy as np

# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')


def _lemmas_to_text(value):
    """Return one row's lemma tokens joined by single spaces.

    The `lemmas` column is read from CSV with dtype 'object', so each cell is
    the *string form* of a list (e.g. "['agilent', 'gear']"), not a list.
    Checking `isinstance(value, list)` alone therefore turns every row into an
    empty string, and every headline would receive the same embedding.

    Returns the space-joined tokens, the raw string if it is not a Python
    literal, or "" for anything else (including missing values).
    """
    if isinstance(value, str):
        try:
            value = ast.literal_eval(value)
        except (ValueError, SyntaxError):
            # Not a literal: treat the cell as already-plain text.
            return value
    if isinstance(value, (list, tuple)):
        return " ".join(str(token) for token in value)
    return ""


# Convert lemmas (stringified lists) to joined strings
df_main["lemmas_str"] = df_main["lemmas"].apply(_lemmas_to_text)


# Define BERT vectorization function (mean-pooled)
def get_bert_embedding(text):
    """Embed one text as the mean of BERT's last-layer token vectors.

    Parameters
    ----------
    text : str
        Text to embed; it is truncated to at most 64 tokens.

    Returns
    -------
    numpy.ndarray
        The last hidden state averaged over the token dimension, squeezed and
        converted to a NumPy array.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=64)
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = bert_model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()


# Generate BERT embeddings, one forward pass per row (may take time)
df_main["bert_vector"] = df_main["lemmas_str"].apply(get_bert_embedding)

In [None]:
from sklearn.model_selection import train_test_split

# Stack the per-row embedding vectors into a 2-D feature matrix.
bert_vectors = df_main["bert_vector"].tolist()
X = np.vstack(bert_vectors)
y = df_main["recommendation_label"]

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

# scikit-learn MLP (hidden layers of 128 and 64 units, up to 300 iterations,
# fixed seed) trained on the BERT embedding matrix.
clf = MLPClassifier(
    hidden_layer_sizes=(128, 64),
    max_iter=300,
    random_state=42,
)
clf.fit(X_train, y_train)

# Predict on the held-out test split.
y_pred = clf.predict(X_test)

# Per-class precision / recall / F1.
print("BERT Embeddings + MLP Performance:")
bert_mlp_report = classification_report(y_test, y_pred)
print(bert_mlp_report)