In [1]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

In [2]:
import spacy
from nltk.tokenize import word_tokenize
import nltk
from nltk import pos_tag

In [3]:
import time
import pandas as pd
from textblob import TextBlob
from sklearn.model_selection import train_test_split
from nltk.sentiment import SentimentIntensityAnalyzer

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import MultiLabelBinarizer
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import numpy as np
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.cluster import KMeans

In [5]:
lemmatizer = WordNetLemmatizer()
# Function to clean text: Tokenization, Lemmatization, and Stopword removal
def clean_text(text):
    if isinstance(text, str):  # Check if the text is a string
        text = text.lower()  # Convert to lowercase
        
        # Remove URLs using a regular expression
        text = re.sub(r'http\S+|www\S+', '', text)
        
        tokens = word_tokenize(text)  # Tokenize the text

        # Lemmatize each token
        lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]

        # Additional preprocessing (like removing stopwords)
        stop_words = set(stopwords.words('english'))
        tokens_without_stopwords = [token for token in lemmatized_tokens if token not in stop_words]

        # Rejoin tokens into a single string before passing to TextBlob
        cleaned_text = " ".join(tokens_without_stopwords)
        return cleaned_text
    else:
        # If the text is not a string (e.g., NaN or float), return an empty string or a default value
        return ""  # Or return a string like 'Invalid Text' if needed


In [6]:
nlp = spacy.load("en_core_web_lg")

def extract_entities(text):
    # Step 1: Use spaCy to extract entities (like companies, products, etc.)
    doc = nlp(text)
    entities = [ent.text for ent in doc.ents if ent.label_ == "ORG"]  # Extracting organizations

    # Step 2: Apply POS tagging to the tokens in the text
    tokens = word_tokenize(text)  # Tokenize the text
    pos_tags = pos_tag(tokens)  # Get POS tags for the tokens

    # Step 3: Extract proper nouns (NNP) or important nouns (NN) using POS tagging
    nouns = [word for word, tag in pos_tags if tag in ["NNP","NN"]]  # Extracting proper nouns and common nouns

    # Combine NER and POS results
    refined_entities = list(set(entities + nouns))  # Combine and remove duplicates
    return refined_entities

In [7]:
  # Function to load data from Excel
def load_data_from_excel(file_path):
    # Read the Excel file
    df = pd.read_csv(file_path)
    
    # Assuming the Excel file has columns 'text' for the comments
    texts = df['comment_body'].tolist()  # List of texts (Reddit posts)
    return df, texts

In [8]:
# Function to perform sentiment analysis with TextBlob
def analyze_sentiment_with_textblob(texts):
    subjectivity = []
    polarity = []
    for text in texts:
        blob = TextBlob(text)
        subjectivity.append(blob.sentiment.subjectivity)
        polarity.append(blob.sentiment.polarity)
    return subjectivity, polarity


In [9]:
# Load data from Excel sheet
file_path = "/Users/jaredog/Downloads/Scrapedv2.csv"  # Replace with your actual Excel file path
df, texts = load_data_from_excel(file_path)

# --- 1. Preprocess the text ---
start_time = time.time()
cleaned_texts = [clean_text(text) for text in texts]
df['cleaned_comment'] = cleaned_texts
preprocessing_time = time.time() - start_time
print(f"Preprocessing Time: {preprocessing_time:.4f} seconds")


Preprocessing Time: 6.1746 seconds


In [10]:
# --- 2. Sentiment Analysis with TextBlob ---
start_time = time.time()
subjectivity, polarity = analyze_sentiment_with_textblob(cleaned_texts)
sentiment_analysis_time = time.time() - start_time
print(f"Sentiment Analysis Time: {sentiment_analysis_time:.4f} seconds")

# Append the sentiment analysis results to the dataframe
df['subjectivity'] = subjectivity
df['polarity'] = polarity

# Determine the overall sentiment (positive, negative, neutral)
sentiment_labels = []
for polarity_score in polarity:
    if polarity_score > 0:
        sentiment_labels.append('positive')
    elif polarity_score < 0:
        sentiment_labels.append('negative')
    else:
        sentiment_labels.append('neutral')

df['sentiment'] = sentiment_labels

Sentiment Analysis Time: 2.4401 seconds


In [11]:
# --- 3. Remove neutral sentiment rows ---
df = df[df['sentiment'] != 'neutral']  # Drop all rows where sentiment is 'neutral'


In [12]:
# --- 4. Named Entity Recognition ---
start_time = time.time()
# Apply NER only to the remaining (non-neutral) rows
# make sure no NaNs, and everything is a str

remaining_texts = df['comment_body'].tolist()  # Only the rows with non-neutral sentiment
entities = [extract_entities(text) for text in remaining_texts]
entity_extraction_time = time.time() - start_time
print(f"Entity Extraction Time: {entity_extraction_time:.4f} seconds")

# Append extracted entities to the dataframe
df['entities'] = entities
df.head()

Entity Extraction Time: 229.3215 seconds


Unnamed: 0,subreddit,title,post_id,post_url,post_content,comment_id,comment_body,comment_author,comment_score,created,created_iso,readable_date,cleaned_comment,subjectivity,polarity,sentiment,entities
0,tigerbrokers_official,TigerGPT Upgrades with DeepSeek-R1 and looks s...,1isb5q5,https://www.reddit.com/gallery/1isb5q5,[No text content],mdz914f,"yeah, i saw the launch news [from Reuters, Tig...",Passionjason,1,1740141702,2025-02-21 12:41:42,21 February 2025,"yeah , saw launch news [ reuters , tiger broke...",0.453333,0.223333,positive,"[https, news, Tiger Brokers, Reuters, DeepSeek..."
1,webull,Official referral thread,yvxdse,https://www.reddit.com/r/Webull/comments/yvxds...,Get your referral code or share one for someon...,lfv8ptv,Do you like money? That’s so weird so do I! We...,SirDouchebagTheThird,1,1722456593,2024-07-31 20:09:53,31 July 2024,like money ? ’ weird ! much common . kissed re...,0.75,-0.4625,negative,"[//a.webull.com/NwcjDmTx8qsTj6ALhZ, https, 😚, ..."
2,webull,Official referral thread,yvxdse,https://www.reddit.com/r/Webull/comments/yvxds...,Get your referral code or share one for someon...,kuld53j,"If anyone is still looking for a webull offer,...",Solid_Subject,1,1710283740,2024-03-12 22:49:00,12 March 2024,"anyone still looking webull offer , one pretty...",0.8,0.475,positive,"[offer, https, anyone, //a.webull.com/TfjQ9yCQ..."
3,webull,Official referral thread,yvxdse,https://www.reddit.com/r/Webull/comments/yvxds...,Get your referral code or share one for someon...,kt15bq7,Not only can you get 75 free fractional shares...,taegha,1,1709402773,2024-03-02 18:06:13,02 March 2024,"get 75 free fractional share , also best frien...",0.475,0.375,positive,"[friend, https, //a.webull.com/TfjynhN7H8LJpyt..."
4,webull,Official referral thread,yvxdse,https://www.reddit.com/r/Webull/comments/yvxds...,Get your referral code or share one for someon...,mksi2wj,Unlock the stock market’s hidden treasure 👋👋👋 ...,nuddermado,1,1743473047,2025-04-01 2:04:07,01 April 2025,unlock stock market ’ hidden treasure 👋👋👋 sign...,0.558333,0.136458,positive,"[’, https, referral, house, market, Street, Si..."


In [75]:

sc = df
sc = sc.drop(['title','post_url','post_content','comment_id','comment_author','created','created_iso','readable_date', ], axis=1)
sc.iloc[100:110]  # Shows rows 100 to 109


Unnamed: 0,subreddit,post_id,comment_body,comment_score,cleaned_comment,subjectivity,polarity,sentiment,entities
106,webull,1jy0o2j,That’s one thing about Fidelity that stands ou...,5,"’ one thing fidelity stand , top notch custome...",0.404762,0.309524,positive,"[customer, time, Fidelity, help, service, some..."
107,webull,1jy0o2j,I have Fidelity and Webull and I prefer Fideli...,1,fidelity webull prefer fidelity . educational ...,0.358333,0.25,positive,"[customer, Fidelity, corp, planning, Webull, r..."
108,webull,1jy0o2j,"It could use some twerking, but it's far from ...",3,"could use twerking , 's far bad imo . although...",0.622222,-0.133333,negative,"[twerking, IMO, Fidelity, curve, app, learning..."
109,webull,1jy0o2j,Schwab was worse than both at one point,2,schwab wa worse one point,0.6,-0.4,negative,"[point, Schwab]"
110,webull,1jy0o2j,"Facts. But like someone else said, their mobil...",1,"fact . like someone else said , mobile ui blee...",0.6,0.5,positive,"[someone, Webull, UI, app]"
111,webull,1jy0o2j,Agree to disagree. Even with the huge improvem...,2,agree disagree . even huge improvement ’ made ...,0.494444,0.2,positive,"[use, twerking, ease, Tasty, Agree, year, Webu..."
112,webull,1jy0o2j,Definitely agree to disagree as we have differ...,2,definitely agree disagree different preference...,0.5275,0.193333,positive,"[week, performance, multi-use, chain, RH, brok..."
113,webull,1jxvayh,Got to love futures you never have this issue ...,1,got love future never issue instant deposit time,0.463889,0.166667,positive,"[time, deposit, issue, Got]"
114,webull,1jy2dyw,A lot of us have seen the same absurd posts ag...,12,lot u seen absurd post . daily assault webull ...,0.440146,-0.107738,negative,"[customer, week, life, Please, support, minute..."
115,webull,1jy2dyw,"I specifically stated that I ""recently"" just s...",2,specifically stated `` recently '' started usi...,0.505303,-0.082955,negative,"[output, time, lot, amount, hand, paragraph, d..."


In [13]:
# --- 5. Save the modified data back to CSV ---
output_file_path = "Sentimentedv2.csv"
df.to_csv(output_file_path, index=False)
print(f"Modified data saved to {output_file_path}")

# --- Performance Metrics ---
total_time = preprocessing_time + sentiment_analysis_time + entity_extraction_time
records_per_second = len(df) / total_time  # records classified per second

print(f"\n--- Performance Metrics ---")
print(f"Total Time for Preprocessing, Sentiment Analysis, and Entity Extraction: {total_time:.4f} seconds")
print(f"Records Classified per Second: {records_per_second:.2f} records/second")

Modified data saved to Sentimentedv2.csv

--- Performance Metrics ---
Total Time for Preprocessing, Sentiment Analysis, and Entity Extraction: 237.9361 seconds
Records Classified per Second: 43.70 records/second


In [35]:
#Combining the labelled data 
# Load the datasets
df1 = pd.read_csv('/Users/jaredog/Downloads/git code/SC4021-Info-retrieval/ClassificationNew/JupyterNotebook/Sentimentedv2.csv')  # Replace with your dataset file paths
df2 = pd.read_csv('/Users/jaredog/Downloads/git code/SC4021-Info-retrieval/ClassificationNew/JupyterNotebook/Labelledv2.csv')

df1['sentiment'] = df1['sentiment'].replace({'positive': 1.0, 'negative': -1.0})
# Merge the datasets based on 'post_id' column
merged_df = pd.merge(df1, df2[['post_id', 'label']], on='post_id', how='inner')

# If you want to append the 'label' column from df2 to df1
df1['label'] = merged_df['label']
df1['label'] = pd.to_numeric(df1['label'], errors='coerce')  # Converts to float, turns non-numeric to NaN
df1 = df1.dropna(subset=['label']) #changed
df1 = df1[df1['label'] != 0.0]




# Save the resulting dataframe to a new CSV file
df1.to_csv('Sentimented+Labelledv2.csv', index=False)
df1.head()

  df1['sentiment'] = df1['sentiment'].replace({'positive': 1.0, 'negative': -1.0})


Unnamed: 0,subreddit,title,post_id,post_url,post_content,comment_id,comment_body,comment_author,comment_score,created,created_iso,readable_date,cleaned_comment,subjectivity,polarity,sentiment,entities,label
340,webull,Tax lots on Webull,1ipge6z,https://www.reddit.com/r/Webull/comments/1ipge...,Since Webull and Robinhood use Apex clearing f...,md26tdl,"Idk the answer to this, but I hope as well the...",DragonfruitLopsided,2,1739707315,2025-02-16 12:01:55,16 February 2025,"idk answer , hope well . tax lot standard brok...",0.45,-0.25,-1.0,"['cost', 'lot', 'answer', 'Tax', 'Idk', 'broke...",-1.0
341,webull,Has anyone had this woe up this morning with 5...,1iohei6,https://i.redd.it/qfdcftkwawie1.jpeg,[No text content],mcjcvrm,I posted about a similar issue yesterday eveni...,Difficult_Poetry_259,2,1739448730,2025-02-13 12:12:10,13 February 2025,posted similar issue yesterday evening . accou...,0.4,-0.05,-1.0,"['customer', 'switch', 'yesterday', 'solution'...",-1.0
342,webull,Has anyone had this woe up this morning with 5...,1iohei6,https://i.redd.it/qfdcftkwawie1.jpeg,[No text content],mcjgop2,So you have two limits. There is a Day Trade l...,GunsouBono,9,1739450432,2025-02-13 12:40:32,13 February 2025,two limit . day trade limit ( 5k ) must close ...,0.490357,0.181071,1.0,"['line', 'day', 'limit', 'MSFT', 'fall', 'Day'...",-1.0
343,webull,Is this a bug?,1iohjsh,https://www.reddit.com/gallery/1iohjsh,[No text content],mcjebpz,This makes three of us who have complained abo...,Difficult_Poetry_259,0,1739449393,2025-02-13 12:23:13,13 February 2025,make three u complained issue webull . please ...,0.2,-0.3,-1.0,"['customer', 'r/Webull', 'Please', '’', 'Trade...",-1.0
344,webull,I'm puzzled.,1io8c7g,https://i.redd.it/6devokbpdtie1.png,I'm trying to add my debit card and this pops ...,mchjvct,Exactly the same thing happen for me as well. ...,Siqk-,2,1739415535,2025-02-13 2:58:55,13 February 2025,exactly thing happen well . reached first time...,0.277976,0.094296,1.0,"['customer', 'support', 'us ach/', 'happen', '...",-1.0


In [59]:
data = df1

# Step 1: Vectorize the cleaned text using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
# make sure no NaNs, and everything is a str

X_text = vectorizer.fit_transform(data['comment_body'])
X_subreddit = vectorizer.fit_transform(data['subreddit'])

In [60]:
# Step 2: Add subjectivity and polarity as features
X_features = data[['subjectivity', 'polarity','comment_score']].values

# Combine the text features and sentiment features
from scipy.sparse import hstack
X_combined = hstack([X_subreddit, X_text, X_features])

In [61]:
# Step 4: Convert 'entities' column into binary features (one-hot encoding)
mlb = MultiLabelBinarizer()
entity_features = mlb.fit_transform(data['entities'].apply(eval))  # Convert string lists into actual lists

# Combine entity features with the other features
X_combined_with_entities = hstack([X_combined, entity_features])

# Apply PCA for dimensionality reduction
pca = PCA(n_components=50)
X_combined_with_entities_pca = pca.fit_transform(X_combined_with_entities.toarray())

In [73]:
#Testing with logistic regression
# Filter only manually labeled rows
labeled_data = data[~data['label'].isna()]
X_labeled = X_combined_with_entities_pca[~data['label'].isna()]
y_labeled = labeled_data['label'].astype(int)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_labeled, y_labeled, test_size=0.2, random_state=42)

# Train a supervised classifier
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(class_weight='balanced', max_iter=1000)
clf.fit(X_train, y_train)

# Predict and evaluate
y_pred = clf.predict(X_test)

print("Precision:", precision_score(y_test, y_pred, average='weighted', zero_division=1))
print("Recall:", recall_score(y_test, y_pred, average='weighted', zero_division=1))
print("F1 Score:", f1_score(y_test, y_pred, average='weighted', zero_division=1))
print("Accuracy:", accuracy_score(y_test, y_pred))


Precision: 0.6661016810941848
Recall: 0.6499442586399108
F1 Score: 0.6533954379136725
Accuracy: 0.6499442586399108


In [63]:
# Step 5: Apply KMeans clustering
start_time = time.time()

kmeans_with_entities = KMeans(n_clusters=3, random_state=42) #best 3
kmeans_with_entities.fit(X_combined_with_entities_pca)

# Add cluster labels to the dataframe
predicted_labels = kmeans_with_entities.labels_
predicted_labels = predicted_labels.astype(int)



end_time = time.time()
total_time = end_time - start_time
records_classified_per_second = len(data) / total_time  # Assuming 'data' contains all records

# Step 6: Map clusters to sentiment labels
# Create a mapping based on the majority sentiment in each cluster
cluster_sentiment_mapping = {}

for cluster in range(kmeans_with_entities.n_clusters):
    # Find the rows that belong to this cluster
    cluster_rows = data[predicted_labels == cluster]
    # Majority sentiment in the cluster
    majority_sentiment = cluster_rows['label'].mode()[0]
    # Map this cluster to the majority sentiment
    cluster_sentiment_mapping[cluster] = majority_sentiment

# Map predicted labels to sentiment labels
mapped_sentiment_labels = [cluster_sentiment_mapping[label] for label in predicted_labels]

In [64]:
# Step 7: Get the true labels for the remaining rows
true_labels = data['label']
true_labels = np.array(true_labels, dtype=float).astype(int)

# Step 8: Evaluate clustering performance using Precision, Recall, F1-Score
precision = precision_score(true_labels, mapped_sentiment_labels, average='weighted', zero_division=1)  # Handling zero divisions
recall = recall_score(true_labels, mapped_sentiment_labels, average='weighted', zero_division=1)
f1 = f1_score(true_labels, mapped_sentiment_labels, average='weighted', zero_division=1)
accuracy = accuracy_score(true_labels, mapped_sentiment_labels)

# Print evaluation metrics
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Records Classified per Second: {records_classified_per_second:.2f}")

Precision: 0.7603
Recall: 0.6016
F1-Score: 0.4520
Accuracy: 0.6016
Records Classified per Second: 837142.82


In [67]:
# 1. Calculate Centroid Distances (Euclidean distance between centroids)
centroids = kmeans_with_entities.cluster_centers_
inter_cluster_distances = np.linalg.norm(centroids[:, np.newaxis] - centroids, axis=2)
print("Inter-cluster centroid distances:\n", inter_cluster_distances)

# 2. Calculate Silhouette Score
silhouette_avg = silhouette_score(X_combined_with_entities_pca, predicted_labels)
print(f"Silhouette Score: {silhouette_avg:.4f}")

# 3. Calculate Davies-Bouldin Index
davies_bouldin = davies_bouldin_score(X_combined_with_entities_pca, predicted_labels)
print(f"Davies-Bouldin Index: {davies_bouldin:.4f}")

# 4. Calculate Average Intra-cluster Similarity (average distance within clusters)
# Intra-cluster distance (average distance within each cluster)
intra_cluster_similarity = []
for cluster_id in range(kmeans_with_entities.n_clusters):
    cluster_points = X_combined_with_entities_pca[predicted_labels == cluster_id]
    cluster_center = centroids[cluster_id]
    intra_cluster_similarity.append(np.mean(np.linalg.norm(cluster_points - cluster_center, axis=1)))

average_intra_cluster_similarity = np.mean(intra_cluster_similarity)
print(f"Average Intra-cluster Similarity: {average_intra_cluster_similarity:.4f}")

Inter-cluster centroid distances:
 [[ 0.         35.60791418  7.47917706]
 [35.60791418  0.         28.13704553]
 [ 7.47917706 28.13704553  0.        ]]
Silhouette Score: 0.5512
Davies-Bouldin Index: 0.6609
Average Intra-cluster Similarity: 4.9168


In [72]:
#random test
random_labels = np.random.choice([1, -1], size=len(data), replace=True)

# Step 2: Get the predicted labels from your trained KMeans model
predicted_labels = kmeans_with_entities.labels_

# Step 3: Evaluate the clustering performance by comparing predicted labels to random labels
accuracy = accuracy_score(random_labels, mapped_sentiment_labels)
precision = precision_score(random_labels, mapped_sentiment_labels, average='weighted', zero_division=1)
recall = recall_score(random_labels, mapped_sentiment_labels, average='weighted', zero_division=1)
f1 = f1_score(random_labels, mapped_sentiment_labels, average='weighted', zero_division=1)

# Step 4: Print the evaluation metrics
print(f"Random Accuracy Test Results:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

Random Accuracy Test Results:
Accuracy: 0.5097
Precision: 0.7501
Recall: 0.5097
F1-Score: 0.3442
