In [126]:
# Package and data loading
from transformers import BertTokenizer, BertModel
from bertopic.representation import MaximalMarginalRelevance
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
import pandas as pd
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel
import torch
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired
from sentence_transformers import SentenceTransformer
from nltk.corpus import stopwords
import nltk
from sklearn.model_selection import train_test_split
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
import re

In [3]:
import pandas as pd
from datetime import datetime

df = pd.read_csv('raw_dataApr7.csv')


def convert_timestamp_to_datetime(timestamp):
    try:
        # Assuming the timestamp is in seconds
        return datetime.fromtimestamp(int(timestamp))
    except ValueError:
        return None

datetimes = [convert_timestamp_to_datetime(ts) for ts in df['comment_datetime']]
post_datetimes = [convert_timestamp_to_datetime(ts) for ts in df['post_datetime']]
# Get the range of datetimes
datetime_range = (min(datetimes), max(datetimes)) if datetimes else (None, None)
datetime_range
post_datetime_range = (min(post_datetimes), max(post_datetimes)) if datetimes else (None, None)
post_datetime_range

unique_post_id_count = df['post_id'].nunique()
unique_comment_id_count = df['comment_id'].nunique()
unique_comment_text_count = df['comment_text'].nunique()
post_votes_range = (min(df['post_score']), max(df['post_score']))
comment_votes_range = (min(df['comment_score']), max(df['comment_score']))

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [214]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def text_preprocess(dataset):
    def remove_urls(text):
        url_pattern = r"http[s]?://\S+"
        return re.sub(url_pattern, "", text)
    def remove_html_exp(text):
        exp = r"&nbsp;"
        return re.sub(exp, "", text)
    def convert_to_string(x):
        if isinstance(x, str):
            x = remove_urls(x)
            return remove_html_exp(x)
        else:
            return ""
    def remove_stopwords(x):
        return " ".join([word for word in x.split() if word not in stop_words])
    filtered_text = dataset[(dataset["comment_text"].str.len() > 10) | (dataset["post_body"].str.len() > 10)]
    # removing any non-ASCII characters
    filtered_text["comment_text"] = filtered_text["comment_text"].apply(convert_to_string)
    filtered_text["post_body"] = filtered_text["post_body"].apply(convert_to_string)
    text_appended = pd.concat([filtered_text["comment_text"], filtered_text["post_body"]])
    # filtered_text["merged_text"] = filtered_text["comment_text"] + " " + filtered_text["post_body"]
    text_appended = text_appended.apply(
      lambda x: ''.join(char for char in x if ord(char) < 128))
    text_appended = text_appended.apply(remove_stopwords)
    unique_text = text_appended.drop_duplicates(keep="first")
    return unique_text

df = pd.read_csv("raw_dataApr7.csv")
full_text = text_preprocess(df)
train_set, test_set = train_test_split(full_text, test_size=0.25, random_state=2)
train_text = train_set
test_text = test_set

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Administrator/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [297]:
# Sampling for thematic analysis
def text_preprocess_stopword_retained(dataset):
    def remove_urls(text):
        url_pattern = r"http[s]?://\S+"
        return re.sub(url_pattern, "", text)
    def remove_html_exp(text):
        exp = r"&nbsp;"
        return re.sub(exp, "", text)
    def convert_to_string(x):
        if isinstance(x, str):
            x = remove_urls(x)
            return remove_html_exp(x)
        else:
            return ""
    filtered_text = dataset[(dataset["comment_text"].str.len() > 10) | (dataset["post_body"].str.len() > 10)]
    # removing any non-ASCII characters
    filtered_text["comment_text"] = filtered_text["comment_text"].apply(convert_to_string)
    filtered_text["post_body"] = filtered_text["post_body"].apply(convert_to_string)
    text_appended = pd.concat([filtered_text["comment_text"], filtered_text["post_body"]])
    # filtered_text["merged_text"] = filtered_text["comment_text"] + " " + filtered_text["post_body"]
    text_appended = text_appended.apply(
      lambda x: ''.join(char for char in x if ord(char) < 128))
    unique_text = text_appended.drop_duplicates(keep="first")
    return unique_text

df_longer_than_10 = text_preprocess_stopword_retained(df)
thematic_samples = df_longer_than_10.sample(150)

In [298]:
thematic_samples.to_csv("thematic_samples.csv")

## BERTopic Model Construction 1: based on Classed-based TF-IDF Transformer
Description: https://maartengr.github.io/BERTopic/api/ctfidf.html#bertopic.vectorizers._ctfidf.ClassTfidfTransformer


In [287]:
from transformers.pipelines import pipeline 

ctfidf_model = ClassTfidfTransformer(
    seed_words=["jss", "strategy","contract", "proposal","rate","badge","Fiverr","scam","recommendation","proposal"], 
    seed_multiplier=2
)
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

umap_model=UMAP(n_neighbors=10, n_components=10, min_dist=0, random_state=121)
topic_model = BERTopic(n_gram_range=(1,3),
                       ctfidf_model=ctfidf_model,
                       representation_model=MaximalMarginalRelevance(diversity=0.3),
                       embedding_model=embedding_model,
                       umap_model=umap_model,
                       #hdbscan_model = hdbscan_model,
                       min_topic_size=20,
                       calculate_probabilities=True)
topics, probs = topic_model.fit_transform(train_text)


# Explore and label the topics
print(topic_model.get_topic_info())
topic_model.visualize_term_rank()

loading configuration file config.json from cache at /Users/Administrator/.cache/huggingface/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/config.json
Model config BertConfig {
  "_name_or_path": "sentence-transformers/all-MiniLM-L6-v2",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.38.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file model.safetensors from cache at /Users/Administrator/.cache/huggingface/hub/models--sentence-tra

    Topic  Count                                               Name  \
0      -1   1918                         -1_upwork_work_clients_job   
1       0    384                             0_you_lol_it_thank you   
2       1    217              1_upwork_freelancers_clients_business   
3       2    153                    2_proposal_proposals_client_job   
4       3    143                      3_rate_clients_contract_start   
5       4    122             4_jss_feedback_review_private feedback   
6       5    116                    5_scam_upwork_scammers_contract   
7       6    102                             6_ai_they_content_jobs   
8       7     93                              7_us_hire_rate_russia   
9       8     88                      8_clients_client_strategy_get   
10      9     85                          9_year_upwork_make_income   
11     10     81                10_freelancers_bid_rate_50 connects   
12     11     75                       11_people_toxic_upwork_shill   
13    

In [296]:
topic_model.get_topic_info().to_csv("apr11res.csv", index=False)

In [279]:
def get_coherence(model,textset):
    topics = model.get_topics()
    topic_keywords = [[word for word, _ in topics[topic]] for topic in topics if topic != -1]
    # Prepare a Gensim dictionary and corpus
    gensim_dictionary = Dictionary([text.split() for text in textset])
    corpus = [gensim_dictionary.doc2bow(text.split()) for text in textset]
    texts_for_coherence = [text.split() for text in textset]

    # Convert the documents to BOW format using the same dictionary
    if not topic_keywords or all(len(topic) == 0 for topic in topic_keywords):
        print("Topic keywords list is empty or topics contain too few words. Can't calculate coherence.")
    else:
        # Calculate Coherence
        cm = CoherenceModel(topics=topic_keywords, texts=texts_for_coherence, corpus=corpus, dictionary=gensim_dictionary, coherence='c_v')
        coherence_score = cm.get_coherence()
        return coherence_score

def get_topic_diversity(model):
    topics = model.get_topics()
    top_words_per_topic = [[word[0] for word in topics[i]] for i in topics]
    # format top 3 words into a dictionary with topic IDs as keys, diversity calculated on topic 3 words within a theme
    model_output = {'topics': top_words_per_topic}
    metric = TopicDiversity(topk=3) # Initialize metric
    topic_diversity_score = metric.score(model_output)
    return topic_diversity_score

In [289]:
# get the scores
coh_cv_ctf = get_coherence(topic_model, train_text)
print(coh_cv_ctf)

0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to di

0.43265496251676533


In [290]:
div_ctf = get_topic_diversity(topic_model)

In [299]:
# validate model performance on the test hold-out set.

if not all(isinstance(text, str) for text in test_text):
    raise ValueError("All items in test_text must be strings.")
try:
    test_topics, test_probs = topic_model.transform(test_text)
except Exception as e:
    print(f"An error occurred during topic transformation: {e}")
train_topics = topic_model.get_topics()
dictionary = Dictionary([doc.split() for doc in test_text])
corpus = [dictionary.doc2bow(doc.split()) for doc in test_text]
# use the top n words for each topic from the trained model
top_n_words = topic_model.get_topic_freq().head(len(train_topics))
topic_words = [[word for word, _ in topic_model.get_topic(topic)] 
               for topic, _ in top_n_words.itertuples(index=False)]

# Calculate coherence
coherence_model = CoherenceModel(topics=topic_words, texts=[doc.split() for doc in test_text], 
                                 dictionary=dictionary, corpus=corpus, coherence='c_v')

# Get the coherence score
test_coherence = coherence_model.get_coherence()

An error occurred during topic transformation: 701


0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to di

In [292]:
# model visualisations: similarity heatmap
topic_model.visualize_heatmap(title = "Topic Similarity Matrix: How interconnected are they?",width=950)

In [294]:
topic_model.visualize_barchart()


In [266]:
topic_model.visualize_topics()

In [293]:
topic_model.visualize_hierarchy() # 