In [1]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
import spacy
from nltk.tokenize import word_tokenize
import nltk
from nltk import pos_tag

In [3]:
import time
import pandas as pd
from textblob import TextBlob
from sklearn.model_selection import train_test_split
from nltk.sentiment import SentimentIntensityAnalyzer

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import MultiLabelBinarizer
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import numpy as np
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.cluster import KMeans

In [5]:
lemmatizer = WordNetLemmatizer()
# Function to clean text: Tokenization, Lemmatization, and Stopword removal
def clean_text(text):
    if isinstance(text, str):  # Check if the text is a string
        text = text.lower()  # Convert to lowercase
        tokens = word_tokenize(text)  # Tokenize the text

        # Lemmatize each token
        lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]

        # Additional preprocessing (like removing stopwords)
        stop_words = set(stopwords.words('english'))
        tokens_without_stopwords = [token for token in lemmatized_tokens if token not in stop_words]

        # Rejoin tokens into a single string before passing to TextBlob
        cleaned_text = " ".join(tokens_without_stopwords)
        return cleaned_text
    else:
        # If the text is not a string (e.g., NaN or float), return an empty string or a default value
        return ""  # Or return a string like 'Invalid Text' if needed

In [6]:
nlp = spacy.load("en_core_web_lg")

def extract_entities(text):
    # Step 1: Use spaCy to extract entities (like companies, products, etc.)
    doc = nlp(text)
    entities = [ent.text for ent in doc.ents if ent.label_ == "ORG"]  # Extracting organizations

    # Step 2: Apply POS tagging to the tokens in the text
    tokens = word_tokenize(text)  # Tokenize the text
    pos_tags = pos_tag(tokens)  # Get POS tags for the tokens

    # Step 3: Extract proper nouns (NNP) or important nouns (NN) using POS tagging
    nouns = [word for word, tag in pos_tags if tag in ["NNP","NN"]]  # Extracting proper nouns and common nouns

    # Combine NER and POS results
    refined_entities = list(set(entities + nouns))  # Combine and remove duplicates
    return refined_entities

In [7]:
  # Function to load data from Excel
def load_data_from_excel(file_path):
    # Read the Excel file
    df = pd.read_csv(file_path)
    
    # Assuming the Excel file has columns 'text' for the comments
    texts = df['cleaned_body'].tolist()  # List of texts (Reddit posts)
    return df, texts

def load_val_data_from_excel(file_path):
    # Read the Excel file
    df = pd.read_csv(file_path)
    
    # Assuming the Excel file has columns 'text' for the comments
    texts = df['comment_body'].tolist()  # List of texts (Reddit posts)
    return df, texts

In [8]:
# Function to perform sentiment analysis with TextBlob
def analyze_sentiment_with_textblob(texts):
    subjectivity = []
    polarity = []
    for text in texts:
        blob = TextBlob(text)
        subjectivity.append(blob.sentiment.subjectivity)
        polarity.append(blob.sentiment.polarity)
    return subjectivity, polarity


In [9]:
# Load data from Excel sheet
train_file_path = "SC4021 Data Records_train_dataset.csv"  # Replace with your actual Excel file path
val_file_path = "SC4021 Data Records_evaluation_dataset.csv"  # Replace with your actual Excel file path
df_train, train_texts = load_data_from_excel(train_file_path)
df_val, val_texts = load_val_data_from_excel(val_file_path)

# --- 1. Preprocess the text ---
start_time = time.time()
train_cleaned_texts = [clean_text(text) for text in train_texts]
val_cleaned_texts = [clean_text(text) for text in val_texts]

preprocessing_time = time.time() - start_time
print(f"Preprocessing Time: {preprocessing_time:.4f} seconds")


Preprocessing Time: 1.8210 seconds


In [10]:
# --- 2. Sentiment Analysis with TextBlob ---
start_time = time.time()
train_subjectivity, train_polarity = analyze_sentiment_with_textblob(train_cleaned_texts)
val_subjectivity, val_polarity = analyze_sentiment_with_textblob(val_cleaned_texts)
sentiment_analysis_time = time.time() - start_time
print(f"Sentiment Analysis Time: {sentiment_analysis_time:.4f} seconds")

# Append the sentiment analysis results to the dataframe
df_train['subjectivity'] = train_subjectivity
df_train['polarity'] = train_polarity

df_val['subjectivity'] = val_subjectivity
df_val['polarity'] = val_polarity
# --- 3. Entity Extraction ---

# Determine the overall sentiment (positive, negative, neutral)
train_sentiment_labels = []
for polarity_score in train_polarity:
    if polarity_score > 0:
        train_sentiment_labels.append('positive')
    elif polarity_score < 0:
        train_sentiment_labels.append('negative')
    else:
        train_sentiment_labels.append('neutral')

val_sentiment_labels = []
for polarity_score in val_polarity:
    if polarity_score > 0:
        val_sentiment_labels.append('positive')
    elif polarity_score < 0:
        val_sentiment_labels.append('negative')
    else:
        val_sentiment_labels.append('neutral')

df_val['sentiment'] = val_sentiment_labels
df_train['sentiment'] = train_sentiment_labels

Sentiment Analysis Time: 0.2916 seconds


In [14]:
# --- 3. Remove neutral sentiment rows ---
# df_train_filtered = df_train[df_train['sentiment'] != 'neutral']  # Drop all rows where sentiment is 'neutral'
# df_val_filtered = df_val[df_val['sentiment'] != 'neutral']  # Drop all rows where sentiment is 'neutral'
# len(df_train_filtered), len(df_val_filtered)
print(df_train.columns)
print(df_val.columns)
df_train_filtered = df_train[df_train['Label'] != 0.0]  # Drop all rows where sentiment is 'neutral'
df_val_filtered = df_val[df_val['label'] != 0.0]  # Drop all rows where sentiment is 'neutral'
len(df_train_filtered), len(df_val_filtered)

Index(['type', 'datetime', 'post_id', 'subreddit', 'title', 'author', 'url',
       'upvotes', 'downvotes', 'upvote_ratio', 'body', 'cleaned_body', 'Label',
       'subjectivity', 'polarity', 'sentiment'],
      dtype='object')
Index(['title', 'post_id', 'post_url', 'post_content', 'post_content.1',
       'subreddit', 'comment_body', 'label', 'comment_author', 'comment_score',
       'created', 'created_iso', 'readable_date', 'subjectivity', 'polarity',
       'sentiment'],
      dtype='object')


(499, 1001)

In [15]:
# --- 4. Named Entity Recognition ---
start_time = time.time()
# Apply NER only to the remaining (non-neutral) rows
train_remaining_texts = df_train_filtered['cleaned_body'].tolist()  # Only the rows with non-neutral sentiment
val_remaining_texts = df_val_filtered['comment_body'].tolist()  # Only the rows with non-neutral sentiment
train_entities = [extract_entities(text) for text in train_remaining_texts]
val_entities = [extract_entities(text) for text in val_remaining_texts]

entity_extraction_time = time.time() - start_time
print(f"Entity Extraction Time: {entity_extraction_time:.4f} seconds")

# Append extracted entities to the dataframe
df_train_filtered['entities'] = train_entities
df_val_filtered['entities'] = val_entities

Entity Extraction Time: 16.1714 seconds


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train_filtered['entities'] = train_entities


In [16]:
# --- 5. Save the modified data back to CSV ---
output_file_path = "SentimentedTrain.csv"
to_df_train_filtered = df_train_filtered.rename(columns={'cleaned_body': 'text', 'Label': 'label'})
to_df_train_filtered = to_df_train_filtered[['text', 'label']]
to_df_train_filtered.to_csv(output_file_path, index=False)
print(f"Modified data saved to {output_file_path}")

output_file_path = "SentimentedVal.csv"
to_df_val_filtered = df_val_filtered.rename(columns={'comment_body': 'text', 'Label': 'label'})
to_df_val_filtered = to_df_val_filtered[['text', 'label']]
to_df_val_filtered.to_csv(output_file_path, index=False)
print(f"Modified data saved to {output_file_path}")

# --- Performance Metrics ---
total_time = preprocessing_time + sentiment_analysis_time + entity_extraction_time
records_per_second = (len(df_train_filtered) + len(df_val_filtered)) / total_time  # records classified per second

print(f"\n--- Performance Metrics ---")
print(f"Total Time for Preprocessing, Sentiment Analysis, and Entity Extraction: {total_time:.4f} seconds")
print(f"Records Classified per Second: {records_per_second:.2f} records/second")

Modified data saved to SentimentedTrain.csv
Modified data saved to SentimentedVal.csv

--- Performance Metrics ---
Total Time for Preprocessing, Sentiment Analysis, and Entity Extraction: 18.2840 seconds
Records Classified per Second: 82.04 records/second


In [None]:
# --- 5. Save the modified data back to CSV ---
from collections import Counter
output_file_path = "train.csv"
to_df_train_filtered = df_train_filtered.rename(columns={'cleaned_body': 'text', 'Label': 'label'})
to_df_train_filtered = to_df_train_filtered[['text', 'label']]
original_train_size = len(df_train_filtered)
df_msk = (to_df_train_filtered['label'] == 1.0) & (to_df_train_filtered['label'] == -1.0)
to_df_train_filtered = to_df_train_filtered[df_msk]
print(Counter(to_df_train_filtered['label']))
def map_labels(label):
    if label == 1.0:
        return 1
    elif label == -1.0:
        return 0
def map_val_labels(label):
    if label == '-1':
        return 0
    elif label == '1':
        return 1
to_df_train_filtered['label'] = to_df_train_filtered['label'].apply(lambda x: map_labels(x))
print(Counter(to_df_train_filtered['label']))
to_df_train_filtered.to_csv(output_file_path, index=False)
print(f"Modified data saved to {output_file_path}. From {original_train_size} to {len(to_df_train_filtered)} records.")

output_file_path = "val.csv"
to_df_val_filtered = df_val_filtered.rename(columns={'comment_body': 'text', 'Label': 'label'})
to_df_val_filtered = to_df_val_filtered[['text', 'label']]
original_val_size = len(to_df_val_filtered)
df_msk = (to_df_val_filtered['label'] == '1' & to_df_val_filtered['label'] == '-1')
to_df_val_filtered = to_df_val_filtered[df_msk]
print(Counter(to_df_val_filtered['label']))
to_df_val_filtered ['label'] = to_df_val_filtered ['label'].apply(lambda x: map_val_labels(x))
print(Counter(to_df_val_filtered['label']))
to_df_val_filtered.to_csv(output_file_path, index=False)
print(f"Modified data saved to {output_file_path}. From {original_val_size} to {len(to_df_val_filtered)} records.")

# --- Performance Metrics ---
total_time = preprocessing_time + sentiment_analysis_time + entity_extraction_time
records_per_second = (len(df_train_filtered) + len(df_val_filtered)) / total_time  # records classified per second

print(f"\n--- Performance Metrics ---")
print(f"Total Time for Preprocessing, Sentiment Analysis, and Entity Extraction: {total_time:.4f} seconds")
print(f"Records Classified per Second: {records_per_second:.2f} records/second")

TypeError: Cannot perform 'rand_' with a dtyped [float64] array and scalar of type [bool]

In [33]:
#Combining the labelled data 
# Load the datasets
df1 = pd.read_csv('/Users/jaredog/Downloads/git code/SC4021-Info-retrieval/ClassificationNew/JupyterNotebook/Sentimented.csv')  # Replace with your dataset file paths
df2 = pd.read_csv('/Users/jaredog/Downloads/git code/SC4021-Info-retrieval/ClassificationNew/JupyterNotebook/Labelled.csv')

df1['sentiment'] = df1['sentiment'].replace({'positive': 1.0, 'negative': -1.0})
# Merge the datasets based on 'post_id' column
merged_df = pd.merge(df1, df2[['post_id', ' Label']], on='post_id', how='inner')

# If you want to append the 'label' column from df2 to df1
df1['label'] = merged_df[' Label']
df1 = df1[df1['label'] != 0.0]
df1 = df1.dropna(subset=['label'])

# Save the resulting dataframe to a new CSV file
df1.to_csv('Sentimented+Labelled.csv', index=False)

  df1['sentiment'] = df1['sentiment'].replace({'positive': 1.0, 'negative': -1.0})


In [34]:
data = df1

# Step 1: Vectorize the cleaned text using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
X_text = vectorizer.fit_transform(data['cleaned_body'])

In [35]:
# Step 2: Add subjectivity and polarity as features
X_features = data[['subjectivity', 'polarity']].values

# Combine the text features and sentiment features
from scipy.sparse import hstack
X_combined = hstack([X_text, X_features])

In [36]:
# Step 4: Convert 'entities' column into binary features (one-hot encoding)
mlb = MultiLabelBinarizer()
entity_features = mlb.fit_transform(data['entities'].apply(eval))  # Convert string lists into actual lists

# Combine entity features with the other features
X_combined_with_entities = hstack([X_combined, entity_features])

# Apply PCA for dimensionality reduction
pca = PCA(n_components=50)
X_combined_with_entities_pca = pca.fit_transform(X_combined_with_entities.toarray())

In [37]:
# Step 5: Apply KMeans clustering
start_time = time.time()

kmeans_with_entities = KMeans(n_clusters=6, random_state=42)
kmeans_with_entities.fit(X_combined_with_entities_pca)

# Add cluster labels to the dataframe
predicted_labels = kmeans_with_entities.labels_
predicted_labels = predicted_labels.astype(int)

end_time = time.time()
total_time = end_time - start_time
records_classified_per_second = len(data) / total_time  # Assuming 'data' contains all records

# Step 6: Map clusters to sentiment labels
# Create a mapping based on the majority sentiment in each cluster
cluster_sentiment_mapping = {}

for cluster in range(kmeans_with_entities.n_clusters):
    # Find the rows that belong to this cluster
    cluster_rows = data[predicted_labels == cluster]
    # Majority sentiment in the cluster
    majority_sentiment = cluster_rows['label'].mode()[0]
    # Map this cluster to the majority sentiment
    cluster_sentiment_mapping[cluster] = majority_sentiment

# Map predicted labels to sentiment labels
mapped_sentiment_labels = [cluster_sentiment_mapping[label] for label in predicted_labels]

In [38]:
# Step 7: Get the true labels for the remaining rows
true_labels = data['label']

# Step 8: Evaluate clustering performance using Precision, Recall, F1-Score
precision = precision_score(true_labels, mapped_sentiment_labels, average='weighted', zero_division=1)  # Handling zero divisions
recall = recall_score(true_labels, mapped_sentiment_labels, average='weighted', zero_division=1)
f1 = f1_score(true_labels, mapped_sentiment_labels, average='weighted', zero_division=1)
accuracy = accuracy_score(true_labels, mapped_sentiment_labels)

# Print evaluation metrics
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Records Classified per Second: {records_classified_per_second:.2f}")

Precision: 0.8037
Recall: 0.7313
F1-Score: 0.6206
Accuracy: 0.7313
Records Classified per Second: 12287.63


In [31]:
# 1. Calculate Centroid Distances (Euclidean distance between centroids)
centroids = kmeans_with_entities.cluster_centers_
inter_cluster_distances = np.linalg.norm(centroids[:, np.newaxis] - centroids, axis=2)
print("Inter-cluster centroid distances:\n", inter_cluster_distances)

# 2. Calculate Silhouette Score
silhouette_avg = silhouette_score(X_combined_with_entities_pca, predicted_labels)
print(f"Silhouette Score: {silhouette_avg:.4f}")

# 3. Calculate Davies-Bouldin Index
davies_bouldin = davies_bouldin_score(X_combined_with_entities_pca, predicted_labels)
print(f"Davies-Bouldin Index: {davies_bouldin:.4f}")

# 4. Calculate Average Intra-cluster Similarity (average distance within clusters)
# Intra-cluster distance (average distance within each cluster)
intra_cluster_similarity = []
for cluster_id in range(kmeans_with_entities.n_clusters):
    cluster_points = X_combined_with_entities_pca[predicted_labels == cluster_id]
    cluster_center = centroids[cluster_id]
    intra_cluster_similarity.append(np.mean(np.linalg.norm(cluster_points - cluster_center, axis=1)))

average_intra_cluster_similarity = np.mean(intra_cluster_similarity)
print(f"Average Intra-cluster Similarity: {average_intra_cluster_similarity:.4f}")

Inter-cluster centroid distances:
 [[ 0.          2.33667353  2.40180738  8.56946932  6.94779514  7.28260922]
 [ 2.33667353  0.          1.14731826  8.92107859  6.72770616  7.26697858]
 [ 2.40180738  1.14731826  0.          8.84242428  6.63620762  7.23532672]
 [ 8.56946932  8.92107859  8.84242428  0.         10.72656551 10.56230164]
 [ 6.94779514  6.72770616  6.63620762 10.72656551  0.          9.80764445]
 [ 7.28260922  7.26697858  7.23532672 10.56230164  9.80764445  0.        ]]
Silhouette Score: 0.2730
Davies-Bouldin Index: 1.6681
Average Intra-cluster Similarity: 1.1800


In [32]:
#random test
random_labels = np.random.choice([1, -1], size=len(data), replace=True)

# Step 2: Get the predicted labels from your trained KMeans model
predicted_labels = kmeans_with_entities.labels_

# Step 3: Evaluate the clustering performance by comparing predicted labels to random labels
accuracy = accuracy_score(random_labels, predicted_labels)
precision = precision_score(random_labels, predicted_labels, average='weighted', zero_division=1)
recall = recall_score(random_labels, predicted_labels, average='weighted', zero_division=1)
f1 = f1_score(random_labels, predicted_labels, average='weighted', zero_division=1)

# Step 4: Print the evaluation metrics
print(f"Random Accuracy Test Results:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

Random Accuracy Test Results:
Accuracy: 0.3906
Precision: 0.7497
Recall: 0.3906
F1-Score: 0.2865


In [58]:
from transformers import BertTokenizer, BertModel
import torchfrom transformers 
import BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset


In [42]:
# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to get BERT embeddings for a given text
def get_bert_embeddings(texts):
    embeddings = []
    for text in texts:
        # Tokenize and encode the text
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
        
        # Get the embeddings from BERT (the output of the last hidden layer)
        with torch.no_grad():
            outputs = model(**inputs)
        
        # Extract the last hidden state (embeddings)
        last_hidden_states = outputs.last_hidden_state
        # Get the mean of all token embeddings to get the sentence embedding
        sentence_embedding = torch.mean(last_hidden_states, dim=1).squeeze().numpy()
        embeddings.append(sentence_embedding)
    
    return np.array(embeddings)

# Example usage to get BERT embeddings for your dataset
texts = data['cleaned_body'].tolist()  # Assuming you have a column 'cleaned_body'
bert_embeddings = get_bert_embeddings(texts)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [43]:
pca = PCA(n_components=50)
bert_embeddings_reduced = pca.fit_transform(bert_embeddings)

# Step 5: Apply KMeans clustering using BERT embeddings
kmeans_with_bert = KMeans(n_clusters=6, random_state=42)
kmeans_with_bert.fit(bert_embeddings_reduced)

# Add cluster labels to the dataframe
predicted_labels_bert = kmeans_with_bert.labels_

In [56]:
# Prepare the dataset for fine-tuning

dataset = Dataset.from_pandas(data)  # Assuming your data is in a Pandas DataFrame
dataset = dataset.map(lambda x: tokenizer(x['cleaned_body'], truncation=True, padding=True, max_length=512))

# Fine-tune BERT for sentiment analysis
model_for_sentiment = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)  # 2 labels (positive/negative)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir='./logs',
    logging_steps=200,
)

trainer = Trainer(
    model=model_for_sentiment,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,  # Or use a separate validation dataset
)

trainer.train()

Map: 100%|███████████████████████████| 361/361 [00:00<00:00, 2023.24 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`

In [55]:
import torch

if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using MPS backend")
else:
    device = torch.device("cpu")
    print("Using CPU")

Using MPS backend


In [57]:
import torch
print(torch.__version__)

2.6.0
