In [1]:
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
import re

import nltk
from nltk.corpus import stopwords


import openai
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, PartOfSpeech
from bertopic.representation import OpenAI
# nltk.download('stopwords')

stop_words = stopwords.words('english')

def contains_non_stop_words(tweet):
    # Check if the tweet contains any non-stop words
    words = tweet.lower().split()
    for word in words:
        if word not in stop_words:
            return True
    return False

def preprocess_tweet(tweet):
    # Remove hashtags, usernames starting with '@', and hyperlinks starting with 'http' or 'https'
    tweet = re.sub(r'#\w+|@\w+|http\S+|https\S+', '', tweet)

    return tweet.strip()

def process_tweets(tweets):
    processed_tweets = []
    for tweet in tweets:
        processed_tweet = preprocess_tweet(tweet)
        words = processed_tweet.lower().split()
        if any(word not in stop_words for word in words):
            processed_tweets.append(processed_tweet)
    return processed_tweets


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
docs = pd.read_csv(r"D:\gitRepo\Projects\Projects\Topic Modelling\Dataset\Corona_NLP_train.csv")
display(docs)

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative
...,...,...,...,...,...,...
41152,44951,89903,"Wellington City, New Zealand",14-04-2020,Airline pilots offering to stock supermarket s...,Neutral
41153,44952,89904,,14-04-2020,Response to complaint not provided citing COVI...,Extremely Negative
41154,44953,89905,,14-04-2020,You know itÂ’s getting tough when @KameronWild...,Positive
41155,44954,89906,,14-04-2020,Is it wrong that the smell of hand sanitizer i...,Neutral


In [3]:
docs_text = list(docs['OriginalTweet'][0:100])
processed_tweets = process_tweets(docs_text)
for i in processed_tweets[0:10]:
    print(i + "\n")

advice Talk to your neighbours family to exchange phone numbers create contact list with phone numbers of neighbours schools employer chemist GP set up online shopping accounts if poss adequate supplies of regular meds but not over order

Coronavirus Australia: Woolworths to give elderly, disabled dedicated shopping hours amid COVID-19 outbreak

My food stock is not the only one which is empty...



PLEASE, don't panic, THERE WILL BE ENOUGH FOOD FOR EVERYONE if you do not take more than you need. 

Stay calm, stay safe.

Me, ready to go at supermarket during the  outbreak.



Not because I'm paranoid, but because my food stock is litteraly empty. The  is a serious thing, but please, don't panic. It causes shortage...

As news of the regionÂ’s first confirmed COVID-19 case came out of Sullivan County last week, people flocked to area stores to purchase cleaning supplies, hand sanitizer, food, toilet paper and other goods,  reports

Cashier at grocery store was sharing his insi

## Precalculate Embeddings

In [4]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(processed_tweets, show_progress_bar=True)

Batches: 100%|██████████| 4/4 [00:02<00:00,  1.94it/s]


## Preventing Stochastic Behavior

In [5]:
from umap import UMAP
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)

## Controlling Number of Topics

In [6]:
from hdbscan import HDBSCAN
hdbscan_model = HDBSCAN(min_cluster_size=2, metric='cityblock', cluster_selection_method='eom', prediction_data=True)

## Improving Default Representation

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer_model = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 2))

## Additional Representations

In [22]:
from transformers.pipelines import pipeline
from bertopic.representation import TextGeneration

prompt = """
I have a topic that contains the following documents: 
[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a short topic label in the following format:
topic: <topic label>
"""


# KeyBERT
keybert_model = KeyBERTInspired()

# Create your representation model
generator = pipeline('text2text-generation', model='google/flan-t5-base')
gpt_like = TextGeneration(generator)

# Create your representation model
openai.api_key = ''
openai_model = OpenAI(model="gpt-3.5-turbo-16k", delay_in_seconds=10, chat=True, prompt = prompt)

## Training

In [24]:
topic_model = BERTopic(

  # Pipeline models
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  vectorizer_model=vectorizer_model,
  representation_model=openai_model,

  # Hyperparameters
  top_n_words=5,
  verbose=True
)

# topic_model = BERTopic(
#   representation_model=openai_model)

#   # Hyperparameters
#   top_n_words=5,
#   verbose=True
# )

# Train modele
topics, probs = topic_model.fit_transform(processed_tweets, embeddings)

# Show topics
topic_model.get_topic_info()

2023-08-06 20:34:03,252 - BERTopic - Reduced dimensionality
2023-08-06 20:34:03,263 - BERTopic - Clustered reduced embeddings


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,4,-1_Cheap deals and help during an emergency,[Cheap deals and help during an emergency],[My work is capitalizing on the demand for pa...
1,0,41,0_Panic buying at grocery stores during COVID-...,[Panic buying at grocery stores during COVID-1...,[People posting and sharing photos of of half ...
2,1,14,1_COVID-19 consumer behavior and precautions,[COVID-19 consumer behavior and precautions],[Yeah my parents are risky people to the covid...
3,2,8,2_Impact of COVID-19 on Consumer Purchase Beha...,[Impact of COVID-19 on Consumer Purchase Behav...,"[Yes, buy only what you need.\r\r\n\r\r\nBut w..."
4,3,7,3_COVID-19 impact on finances and online shopping,[COVID-19 impact on finances and online shopping],"[For corona prevention,we should stop to buy t..."
5,4,6,4_Helping Elderly with Online Shopping During ...,[Helping Elderly with Online Shopping During C...,[Please Share Know someone who s 65 Living on...
6,5,4,5_Lack of COVID-19 precautions and paid sick l...,[Lack of COVID-19 precautions and paid sick le...,[As news of the regionÂ’s first confirmed COVI...
7,6,4,6_Panic buying of toilet paper during COVID-19...,[Panic buying of toilet paper during COVID-19 ...,[Sadly those are the misinformed thinking that...
8,7,4,7_Supporting a Healthy Community with Online S...,[Supporting a Healthy Community with Online Sh...,[We're here to provide a safe shopping experie...
9,8,4,8_Malicious price increases in NYC during an e...,[Malicious price increases in NYC during an em...,[In attempts to lengthen runways marketing bud...


In [39]:
for k in list(df['Topic']):
    df = topic_model.get_topic_info()
    df = df[df['Topic'] == k]

    print("Name:",list(df['Name'])[0][0])
    print("Representation:",list(df['Representation'])[0][0])
    for i in df['Representative_Docs']:
        for j in i:
            print("Document: " + j + '\n')

Name: -
Representation: Cheap deals and help during an emergency
Document: My work is capitalizing on the  demand for packaged food and making us stay open as opposed to closing for all our health and safety

Document: We have AMAZING CHEAP DEALS! FOR THE  going on to help you???





 

And Resonable  / 

Just DM US!

Document: Of all the things to panic buy in an emergency, I don't get why toilet paper is so important. If you're afraid of the worst case scenario, just wash up in the tub and use your money on food. Y'all crazy.

Name: 0
Representation: Panic buying at grocery stores during COVID-19 pandemic
Document: People posting and sharing photos of of half to completely empty shelves calling those people "dumb" or "idiots." All while shopping at the grocery store. lol

Document: The actions of some are so selfish. If I were CEO of a grocery store, from 7-9 am would be a time for people over 65 to shop; show ID. I just saw a young couple with 300 rolls of tp. No one is t

In [38]:
df = topic_model.get_topic_info()
list(df['Topic'])

[-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

## Fine-tune the Topic Representations using Generative Large Language Models

In [None]:
from transformers import pipeline
from bertopic.representation import TextGeneration

df = df[df['Topic'] == 2]
documents = [i for i in df['Representative_Docs']]
documents = documents[0]

keywords = [i for i in df['Representation']]
keywords = keywords[0]

prompt = "I have a topic described by the following keywords: "  + ",".join(keywords) + "." + "I have topic that contains the following documents:" +  "|".join(documents) + "\nBased on the above information, can you give a short label of the topic?"
print(prompt)

In [None]:
# Create your representation model
generator = pipeline('text2text-generation', model='google/flan-t5-base')
representation_model = TextGeneration(generator, prompt = prompt)

In [None]:
representation_model