In [1]:
# from google.colab import drive
# drive.mount('/content/drive', force_remount=True)

In [2]:
# cd drive/MyDrive/IE7500_GroupB/Notebooks

In [3]:
# load necesary libraries
import numpy as np
import pandas as pd

In [4]:
dtypes_dict = {'headline': 'object',
               'url': 'object',
               'publisher': 'object',
               'stock': 'object',
               'tokens': 'object',
               'normalized_tokens': 'object',
               'filtered_tokens': 'object',
               'lemmas': 'object',
               'sentiment_score': 'float64',
               'Name': 'object',
               'Market Cap': 'float64',
               'Country': 'object',
               'IPO Year': 'float64',
               'Sector': 'object',
               'Industry': 'object',
               'year': 'int32',
               'month': 'int32',
               'day_of_week': 'int32',
               'sentiment_label': 'int64',
               'headline_length': 'int64',
               'word_count': 'int64',
               'Market_Cap_Category': 'object',
               'recommendation': 'object',
               'cap_Large': 'bool',
               'cap_Medium': 'bool',
               'cap_Mega': 'bool',
               'cap_Micro': 'bool',
               'cap_Nano': 'bool',
               'cap_Small': 'bool',
               'sector_Basic Materials': 'bool',
               'sector_Consumer Discretionary': 'bool',
               'sector_Consumer Staples': 'bool',
               'sector_Energy': 'bool',
               'sector_Finance': 'bool',
               'sector_Health Care': 'bool',
               'sector_Industrials': 'bool',
               'sector_Miscellaneous': 'bool',
               'sector_Real Estate': 'bool',
               'sector_Technology': 'bool',
               'sector_Telecommunications': 'bool',
               'sector_Utilities': 'bool', 'recommendation_label': 'int64',
               'publisher_label': 'int64', 'country_label': 'int64',
               'industry_label': 'int64'}

In [5]:
# load dataframes to use
df_main = pd.read_csv("saved_dfs/df_for_models.csv", dtype=dtypes_dict,
                      parse_dates=['date'])

In [6]:
df_main.head()

Unnamed: 0,headline,url,publisher,date,stock,tokens,normalized_tokens,filtered_tokens,lemmas,sentiment_score,...,sector_Industrials,sector_Miscellaneous,sector_Real Estate,sector_Technology,sector_Telecommunications,sector_Utilities,recommendation_label,publisher_label,country_label,industry_label
0,Agilent Technologies Announces Pricing of $5……...,http://www.gurufocus.com/news/1153187/agilent-...,GuruFocus,2020-06-01,A,"['Agilent', 'Technologies', 'Announces', 'Pric...","['agilent', 'technologies', 'announces', 'pric...","['agilent', 'technologies', 'announces', 'pric...","['agilent', 'technology', 'announces', 'pricin...",0.0,...,True,False,False,False,False,False,1,4,45,18
1,Agilent (A) Gears Up for Q2 Earnings: What's i...,http://www.zacks.com/stock/news/931205/agilent...,Zacks,2020-05-18,A,"['Agilent', '(', 'A', ')', 'Gears', 'Up', 'for...","['agilent', 'a', 'gears', 'up', 'for', 'q2', '...","['agilent', 'gears', 'q2', 'earnings', 'cards']","['agilent', 'gear', 'q2', 'earnings', 'card']",0.0,...,True,False,False,False,False,False,1,16,45,18
2,J.P. Morgan Asset Management Announces Liquida...,http://www.gurufocus.com/news/1138923/jp-morga...,GuruFocus,2020-05-15,A,"['J.P.', 'Morgan', 'Asset', 'Management', 'Ann...","['morgan', 'asset', 'management', 'announces',...","['morgan', 'asset', 'management', 'announces',...","['morgan', 'asset', 'management', 'announces',...",0.3612,...,True,False,False,False,False,False,1,4,45,18
3,"Pershing Square Capital Management, L.P. Buys ...",http://www.gurufocus.com/news/1138704/pershing...,GuruFocus,2020-05-15,A,"['Pershing', 'Square', 'Capital', 'Management'...","['pershing', 'square', 'capital', 'management'...","['pershing', 'square', 'capital', 'management'...","['pershing', 'square', 'capital', 'management'...",0.0,...,True,False,False,False,False,False,1,4,45,18
4,Agilent Awards Trilogy Sciences with a Golden ...,http://www.gurufocus.com/news/1134012/agilent-...,GuruFocus,2020-05-12,A,"['Agilent', 'Awards', 'Trilogy', 'Sciences', '...","['agilent', 'awards', 'trilogy', 'sciences', '...","['agilent', 'awards', 'trilogy', 'sciences', '...","['agilent', 'award', 'trilogy', 'science', 'go...",0.4588,...,True,False,False,False,False,False,1,4,45,18


## Load and Prepare Data

In [7]:
# Define Features
stock_features = ["stock", "industry_label"]  # Stock-related
sector_features = ["sector_Industrials", "sector_Miscellaneous", "sector_Real Estate",
                   "sector_Technology", "sector_Telecommunications", "sector_Utilities"]  # Sector indicators
sentiment_features = ["sentiment_score"]  # Aggregated sentiment
publisher_country = ["publisher_label", "country_label"]
text_features = ["headline", "lemmas"]

# Create DataFrame for Recommendation
recommendation_df = df_main[stock_features + sector_features + sentiment_features + publisher_country + text_features]

# Aggregate sentiment scores per stock (if multiple entries exist)
recommendation_df = recommendation_df.groupby("stock").agg({
    "industry_label": "first",
    **{col: "first" for col in sector_features},  # Keep first occurrence
    "sentiment_score": "mean",  # Aggregate sentiment
    "publisher_label": "first",
    "country_label": "first",
    "headline": "last",  # Most recent headline
    "lemmas": "last"  # Most recent processed text
}).reset_index()

print(recommendation_df.head())

  stock  industry_label  sector_Industrials  sector_Miscellaneous  \
0     A              18                True                 False   
1    AA               6                True                 False   
2   AAC             143               False                 False   
3  AADR             143               False                 False   
4   AAL               5               False                 False   

   sector_Real Estate  sector_Technology  sector_Telecommunications  \
0               False              False                      False   
1               False              False                      False   
2               False              False                      False   
3               False              False                      False   
4               False              False                      False   

   sector_Utilities  sentiment_score  publisher_label  country_label  \
0             False         0.067877                4             45   
1             

# Method 1: Content-Based Filtering (TF-IDF + Cosine Similarity)

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Convert 'lemmas' into TF-IDF vectors (text feature extraction)
tfidf_vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1,2), max_df=0.9, min_df=2)
X_tfidf = tfidf_vectorizer.fit_transform(recommendation_df["lemmas"])


In [9]:
# Compute Cosine Similarity Matrix
cosine_sim = cosine_similarity(X_tfidf, X_tfidf)

# Function to recommend similar stocks based on text similarity
def recommend_stocks_content(stock_name, top_n=5):
    stock_idx = recommendation_df.index[recommendation_df["stock"] == stock_name][0]  # Get stock index
    sim_scores = list(enumerate(cosine_sim[stock_idx]))  # Retrieve similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]  # Sort & exclude self
    stock_indices = [i[0] for i in sim_scores]
    return recommendation_df.iloc[stock_indices][["stock", "industry_label", "headline"]]  # Return recommended stocks

# Example: Recommend stocks similar to stock "A"
print(recommend_stocks_content("A"))


     stock  industry_label                                           headline
790   BSJK             143                         BBBs: Beyond The Headlines
3243  JBHT             138                     JPMorgan: Tomorrow's Headlines
1986    EV             143                  CPI Report: Wednesday's Headlines
3395  KEYS              65  Danaher Upgraded As Analyst Pushes Agilent Buyout
3055  IMPV             143  Security Firm Imperva Follows Groupon With Hot...


In [11]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Compute cosine similarity matrix if not already computed
cosine_sim = cosine_similarity(X_tfidf, X_tfidf)

# Function to compute average cosine similarity (excluding diagonal/self-similarity)
def average_cosine_similarity(sim_matrix):
    sim_matrix_no_diag = sim_matrix.copy()
    np.fill_diagonal(sim_matrix_no_diag, 0)  # Exclude self-similarity
    return np.mean(sim_matrix_no_diag)

# Evaluate
avg_cos_sim = average_cosine_similarity(cosine_sim)
print(f"Average Cosine Similarity (excluding self-similarity): {avg_cos_sim:.4f}")


Average Cosine Similarity (excluding self-similarity): 0.0068


The average cosine similarity of 0.0068 indicates that the headlines across different stocks have very low textual similarity. This suggests the content-based recommender is working with highly diverse information, minimizing redundancy. However, such low similarity may reduce the effectiveness of recommendations, as meaningful patterns between stocks are harder to detect.

# Method 2: KNN Similarity-Based Recommendation

In [12]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

# Select numerical features for similarity search
knn_features = sector_features + ["sentiment_score", "publisher_label", "country_label"]

# Scale the features to ensure uniformity
scaler = StandardScaler()
X_knn = scaler.fit_transform(recommendation_df[knn_features])

# Train the KNN model for similarity search
knn_model = NearestNeighbors(n_neighbors=5, metric="cosine")
knn_model.fit(X_knn)


In [13]:
# Function to find similar stocks using KNN
def recommend_stocks_knn(stock_name, top_n=5):
    stock_idx = recommendation_df.index[recommendation_df["stock"] == stock_name][0]  # Find stock index
    distances, indices = knn_model.kneighbors([X_knn[stock_idx]], n_neighbors=top_n+1)  # Retrieve nearest stocks
    stock_indices = indices[0][1:]  # Exclude the stock itself
    return recommendation_df.iloc[stock_indices][["stock", "industry_label", "sentiment_score"]]

# Example: Recommend stocks similar to "A"
print(recommend_stocks_knn("A"))


     stock  industry_label  sentiment_score
3243  JBHT             138         0.067907
394    ASA             109         0.067978
2884    HY              37         0.067775
990     CF               4         0.067628
2490   GHM              65         0.068211


In [14]:
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
import numpy as np
from sklearn.metrics.pairwise import cosine_distances

# 1. Compute average cosine distance to nearest neighbors (excluding self)
def average_knn_distance(model, data):
    distances, indices = model.kneighbors(data, n_neighbors=6)  # 6 to include self
    avg_distances = [np.mean(dist[1:]) for dist in distances]  # exclude self-distance
    return np.mean(avg_distances)

# 2. Compute Silhouette Score with KMeans clustering
kmeans = KMeans(n_clusters=2, random_state=42)
cluster_labels = kmeans.fit_predict(X_knn)
sil_score = silhouette_score(X_knn, labels=cluster_labels, metric='cosine')

# Evaluate
avg_knn_dist = average_knn_distance(knn_model, X_knn)
print(f"Average KNN Cosine Distance (excluding self): {avg_knn_dist:.4f}")
print(f"Silhouette Score (cosine): {sil_score:.4f}")


Average KNN Cosine Distance (excluding self): 0.0008
Silhouette Score (cosine): 0.5297


The average KNN cosine distance of 0.0008 indicates that the recommended stocks are extremely close in the feature space, suggesting strong similarity in sentiment, sector, and metadata.
The silhouette score of 0.5297 reflects moderately well-separated clusters, indicating meaningful structure in the data.
Overall, the model is producing tightly grouped, contextually relevant stock recommendations.

# Method 3: Clustering-Based Stock Recommendation (K-Means)

In [15]:
from sklearn.cluster import KMeans

# Train K-Means model to create stock groups
kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
recommendation_df["cluster"] = kmeans.fit_predict(X_knn)

# Print stocks with their assigned clusters
print(recommendation_df[["stock", "industry_label", "cluster"]].head(10))


  stock  industry_label  cluster
0     A              18        0
1    AA               6        0
2   AAC             143        2
3  AADR             143        2
4   AAL               5        2
5  AAMC             143        2
6  AAME              70        2
7   AAN             143        2
8  AAOI             127        3
9  AAON              65        0


In [16]:
# Function to recommend stocks from the same cluster
def recommend_stocks_cluster(stock_name, top_n=5):
    cluster_label = recommendation_df.loc[recommendation_df["stock"] == stock_name, "cluster"].values[0]
    cluster_stocks = recommendation_df[recommendation_df["cluster"] == cluster_label]  # Get stocks in the same cluster
    return cluster_stocks.sample(min(top_n, len(cluster_stocks)))

# Example: Recommend stocks in the same cluster as "A"
print(recommend_stocks_cluster("A"))

     stock  industry_label  sector_Industrials  sector_Miscellaneous  \
1161   CNI             117                True                 False   
4191   NPO              79                True                 False   
2782    HL              82                True                 False   
4910   RBC              79                True                 False   
3550   LII              65                True                 False   

      sector_Real Estate  sector_Technology  sector_Telecommunications  \
1161               False              False                      False   
4191               False              False                      False   
2782               False              False                      False   
4910               False              False                      False   
3550               False              False                      False   

      sector_Utilities  sentiment_score  publisher_label  country_label  \
1161             False         0.075402        

In [17]:
from sklearn.metrics import silhouette_score

# Evaluate Silhouette Score (cosine distance between points and clusters)
sil_score = silhouette_score(X_knn, labels=recommendation_df["cluster"], metric="cosine")

# Evaluate Inertia (how tight the clusters are)
inertia = kmeans.inertia_

print(f"Silhouette Score (cosine): {sil_score:.4f}")
print(f"Inertia (within-cluster sum of squares): {inertia:.4f}")


Silhouette Score (cosine): 0.2926
Inertia (within-cluster sum of squares): 32462.4885


The silhouette score of 0.2926 suggests moderate cluster separation, indicating that stocks within the same cluster share some similarity but there's room for clearer grouping.
The inertia value of 32,462.49 reflects the overall compactness of clusters — lower values would indicate tighter groupings.
Overall, the clustering model identifies stock groups with reasonable cohesion, suitable for broad similarity-based recommendations.

# Method 4: Collaborative Filtering (Deep Learning)

In [20]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate

# Define Inputs (Investor & Stock Embeddings)
input_investor = Input(shape=(1,))
input_stock = Input(shape=(1,))

# Define Embeddings
embedding_investor = Embedding(input_dim=100, output_dim=10)(input_investor)
embedding_stock = Embedding(input_dim=1000, output_dim=10)(input_stock)

# Flatten Layers
vector_investor = Flatten()(embedding_investor)
vector_stock = Flatten()(embedding_stock)

# Concatenate & Predict Interaction Score
concat = Concatenate()([vector_investor, vector_stock])
dense = Dense(128, activation="relu")(concat)
output = Dense(1, activation="sigmoid")(dense)

# Compile Model
model = Model(inputs=[input_investor, input_stock], outputs=output)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Summary
model.summary()


I0000 00:00:1743979004.004332  931925 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 11437 MB memory:  -> device: 0, name: Tesla P100-PCIE-12GB, pci bus id: 0000:03:00.0, compute capability: 6.0


In [23]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

# Step 1: Simulate Interaction Data
num_investors = 50  
num_interactions = 500
np.random.seed(42)

# Create interaction samples
interactions_df = pd.DataFrame({
    "investor_id": np.random.randint(0, num_investors, size=num_interactions),
    "stock_id": np.random.randint(0, len(recommendation_df), size=num_interactions),
    "interaction": np.random.randint(0, 2, size=num_interactions)
})

# Step 2: Train-Test Split
X_train_inv, X_test_inv, X_train_stock, X_test_stock, y_train, y_test = train_test_split(
    interactions_df["investor_id"].values,
    interactions_df["stock_id"].values,
    interactions_df["interaction"].values,
    test_size=0.2,
    random_state=42
)

X_train = [X_train_inv, X_train_stock]
X_test = [X_test_inv, X_test_stock]

# Step 3: Train the Model
model.fit(X_train, y_train, epochs=5, batch_size=32, verbose=0)

# Step 4: Evaluate
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Loss (Binary Cross-Entropy): {loss:.4f}")

# Step 5: Classification Report
y_pred_probs = model.predict(X_test)
y_pred = (y_pred_probs > 0.5).astype(int)

print(classification_report(y_test, y_pred))


I0000 00:00:1743979226.386413  983380 service.cc:152] XLA service 0x153ed4004ec0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1743979226.386456  983380 service.cc:160]   StreamExecutor device (0): Tesla P100-PCIE-12GB, Compute Capability 6.0
2025-04-06 18:40:26.458732: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1743979226.862976  983380 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1743979228.071418  983380 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Test Accuracy: 0.4200
Test Loss (Binary Cross-Entropy): 0.7036
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 170ms/step
              precision    recall  f1-score   support

           0       0.43      0.72      0.54        47
           1       0.38      0.15      0.22        53

    accuracy                           0.42       100
   macro avg       0.41      0.44      0.38       100
weighted avg       0.40      0.42      0.37       100

